/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 *
 * tcp.c, Code implementing the TCP protocol.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <socket_impl.h>
#include <socket_inet.h>
#include <sys/sysmacros.h>
#include <sys/promif.h>
#include <sys/socket.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <net/if_types.h>
#include <sys/salib.h>

#include "ipv4.h"
#include "ipv4_impl.h"
#include "mac.h"
#include "mac_impl.h"
#include "v4_sum_impl.h"
#include <sys/bootdebug.h>
#include "tcp_inet.h"
#include "tcp_sack.h"
#include <inet/common.h>
#include <inet/mib2.h>

/*
 * We need to redefine BUMP_MIB/UPDATE_MIB to not have DTrace probes.
 */
#undef BUMP_MIB
#define	BUMP_MIB(x)	(x)++

#undef UPDATE_MIB
#define	UPDATE_MIB(x, y)	x += y

/*
 * MIB-2 stuff for SNMP
 */
mib2_tcp_t	tcp_mib;	/* SNMP fixed size info */

/* The TCP mib does not include the following errors. */
static uint_t tcp_cksum_errors;
static uint_t tcp_drops;

/* Macros for timestamp comparisons */
#define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
#define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)

/*
 * Parameters for TCP Initial Send Sequence number (ISS) generation.
 * The ISS is calculated by adding three components: a time component
 * which grows by 1 every 4096 nanoseconds (versus every 4 microseconds
 * suggested by RFC 793, page 27);
 * a per-connection component which grows by 125000 for every new connection;
 * and an "extra" component that grows by a random amount centered
 * approximately on 64000.  This causes the ISS generator to cycle every
 * 4.89 hours if no TCP connections are made, and faster if connections are
 * made.
 */
#define	ISS_INCR	250000
#define	ISS_NSEC_SHT	0

static uint32_t tcp_iss_incr_extra;	/* Incremented for each connection */
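/*
 * A worked check of the 4.89 hour figure quoted above (a sketch, not
 * used by the code): the time component advances once per 4096 ns,
 * i.e. about 244141 increments per second, so with no connections the
 * 32-bit sequence space wraps after roughly 2^32 / 244141 ~= 17592
 * seconds, which is about 4.89 hours.
 */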
#define	TCP_XMIT_LOWATER	4096
#define	TCP_XMIT_HIWATER	49152
#define	TCP_RECV_LOWATER	2048
#define	TCP_RECV_HIWATER	49152

/*
 * PAWS needs a timer for 24 days.  This is the number of ms in 24 days.
 */
#define	PAWS_TIMEOUT	((uint32_t)(24*24*60*60*1000))

/*
 * TCP options struct returned from tcp_parse_options.
 */
typedef struct tcp_opt_s {
	uint32_t	tcp_opt_mss;
	uint32_t	tcp_opt_wscale;
	uint32_t	tcp_opt_ts_val;
	uint32_t	tcp_opt_ts_ecr;
	tcp_t		*tcp;
} tcp_opt_t;

/*
 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
 */

#ifdef _BIG_ENDIAN
#define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
	(TCPOPT_TSTAMP << 8) | 10)
#else
#define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
	(TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif

/*
 * Flags returned from tcp_parse_options.
 */
#define	TCP_OPT_MSS_PRESENT	1
#define	TCP_OPT_WSCALE_PRESENT	2
#define	TCP_OPT_TSTAMP_PRESENT	4
#define	TCP_OPT_SACK_OK_PRESENT	8
#define	TCP_OPT_SACK_PRESENT	16

/* TCP option length */
#define	TCPOPT_NOP_LEN		1
#define	TCPOPT_MAXSEG_LEN	4
#define	TCPOPT_WS_LEN		3
#define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
#define	TCPOPT_TSTAMP_LEN	10
#define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
#define	TCPOPT_SACK_OK_LEN	2
#define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
#define	TCPOPT_REAL_SACK_LEN	4
#define	TCPOPT_MAX_SACK_LEN	36
#define	TCPOPT_HEADER_LEN	2

/* TCP cwnd burst factor. */
#define	TCP_CWND_INFINITE	65535
#define	TCP_CWND_SS		3
#define	TCP_CWND_NORMAL		5

/* Named Dispatch Parameter Management Structure */
typedef struct tcpparam_s {
	uint32_t	tcp_param_min;
	uint32_t	tcp_param_max;
	uint32_t	tcp_param_val;
	char		*tcp_param_name;
} tcpparam_t;

/* Max size IP datagram is 64k - 1 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (struct ip) + \
	sizeof (tcph_t)))

/* Max of the above */
#define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4

/* Largest TCP port number */
#define	TCP_MAX_PORT	(64 * 1024 - 1)

/* Round up the value to the nearest mss. */
#define	MSS_ROUNDUP(value, mss)	((((value) - 1) / (mss) + 1) * (mss))
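/*
 * For example (a sketch): MSS_ROUNDUP(8193, 1460) evaluates to
 * ((8192 / 1460) + 1) * 1460 = 6 * 1460 = 8760, the smallest multiple
 * of the MSS that can hold 8193 bytes.
 */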
#define	MS	1L
#define	SECONDS	(1000 * MS)
#define	MINUTES	(60 * SECONDS)
#define	HOURS	(60 * MINUTES)
#define	DAYS	(24 * HOURS)

/* All NDD params in the core TCP became static variables. */
static int	tcp_time_wait_interval = 1 * MINUTES;
static int	tcp_conn_req_max_q = 128;
static int	tcp_conn_req_max_q0 = 1024;
static int	tcp_conn_req_min = 1;
static int	tcp_conn_grace_period = 0 * SECONDS;
static int	tcp_cwnd_max_ = 1024 * 1024;
static int	tcp_smallest_nonpriv_port = 1024;
static int	tcp_ip_abort_cinterval = 3 * MINUTES;
static int	tcp_ip_abort_linterval = 3 * MINUTES;
static int	tcp_ip_abort_interval = 8 * MINUTES;
static int	tcp_ip_notify_cinterval = 10 * SECONDS;
static int	tcp_ip_notify_interval = 10 * SECONDS;
static int	tcp_ipv4_ttl = 64;
static int	tcp_mss_def_ipv4 = 536;
static int	tcp_mss_max_ipv4 = TCP_MSS_MAX_IPV4;
static int	tcp_mss_min = 108;
static int	tcp_naglim_def = (4*1024)-1;
static int	tcp_rexmit_interval_initial = 3 * SECONDS;
static int	tcp_rexmit_interval_max = 60 * SECONDS;
static int	tcp_rexmit_interval_min = 400 * MS;
static int	tcp_dupack_fast_retransmit = 3;
static int	tcp_smallest_anon_port = 32 * 1024;
static int	tcp_largest_anon_port = TCP_MAX_PORT;
static int	tcp_xmit_lowat = TCP_XMIT_LOWATER;
static int	tcp_recv_hiwat_minmss = 4;
static int	tcp_fin_wait_2_flush_interval = 1 * MINUTES;
static int	tcp_max_buf = 1024 * 1024;
static int	tcp_wscale_always = 1;
static int	tcp_tstamp_always = 1;
static int	tcp_tstamp_if_wscale = 1;
static int	tcp_rexmit_interval_extra = 0;
static int	tcp_slow_start_after_idle = 2;
static int	tcp_slow_start_initial = 2;
static int	tcp_sack_permitted = 2;
static int	tcp_ecn_permitted = 2;

/* Extra room to fit in headers. */
static uint_t	tcp_wroff_xtra;

/* Hint for next port to try. */
static in_port_t	tcp_next_port_to_try = 32*1024;

/*
 * Figure out the value of window scale option.  Note that the rwnd is
 * ASSUMED to be rounded up to the nearest MSS before the calculation.
 * We cannot find the scale value and then do a round up of tcp_rwnd
 * because the scale value may not be correct after that.
 */
#define	SET_WS_VALUE(tcp) \
{ \
	int i; \
	uint32_t rwnd = (tcp)->tcp_rwnd; \
	for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; \
	    i++, rwnd >>= 1) \
		; \
	(tcp)->tcp_rcv_ws = i; \
}
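/*
 * For example (a sketch): with tcp_rwnd = 49152 and TCP_MAXWIN = 65535,
 * the loop body never runs and tcp_rcv_ws ends up 0; with a 128 KB rwnd
 * the window must be halved twice before it fits in the 16-bit window
 * field, so tcp_rcv_ws ends up 2.
 */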
/*
 * Set ECN capable transport (ECT) code point in IP header.
 *
 * Note that there are 2 ECT code points '01' and '10', which are called
 * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 * point ECT(0) for TCP as described in RFC 2481.
 */
#define	SET_ECT(tcp, iph) \
	if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
		/* We need to clear the code point first. */ \
		((struct ip *)(iph))->ip_tos &= 0xFC; \
		((struct ip *)(iph))->ip_tos |= IPH_ECN_ECT0; \
	}

/*
 * The format argument to pass to tcp_display().
 * DISP_PORT_ONLY means that the returned string has only port info.
 * DISP_ADDR_AND_PORT means that the returned string also contains the
 * remote and local IP address.
 */
#define	DISP_PORT_ONLY		1
#define	DISP_ADDR_AND_PORT	2

/*
 * TCP reassembly macros.  We hide starting and ending sequence numbers in
 * b_next and b_prev of messages on the reassembly queue.  The messages are
 * chained using b_cont.  These macros are used in tcp_reass() so we don't
 * have to see the ugly casts and assignments.
 * Note: use uintptr_t to suppress the gcc warning.
 */
#define	TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))
#define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
					(mblk_t *)((uintptr_t)(u)))
#define	TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))
#define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
					(mblk_t *)((uintptr_t)(u)))

#define	TCP_TIMER_RESTART(tcp, intvl) \
	(tcp)->tcp_rto_timeout = prom_gettime() + intvl; \
	(tcp)->tcp_timer_running = B_TRUE;
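/*
 * A note on the timing model (a sketch): inetboot has no interrupt-driven
 * timers, so TCP_TIMER_RESTART only records an absolute deadline,
 * prom_gettime() + intvl, in milliseconds.  The deadline is acted on
 * lazily, e.g. by tcp_state_wait(), which calls tcp_timer() when it
 * notices that prom_gettime() has passed tcp_rto_timeout.
 */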
static int	tcp_accept_comm(tcp_t *, tcp_t *, mblk_t *, uint_t);
static mblk_t	*tcp_ack_mp(tcp_t *);
static in_port_t tcp_bindi(in_port_t, in_addr_t *, boolean_t, boolean_t);
static uint16_t	tcp_cksum(uint16_t *, uint32_t);
static void	tcp_clean_death(int, tcp_t *, int err);
static tcp_t	*tcp_conn_request(tcp_t *, mblk_t *mp, uint_t, uint_t);
static char	*tcp_display(tcp_t *, char *, char);
static int	tcp_drain_input(tcp_t *, int, int);
static void	tcp_drain_needed(int, tcp_t *);
static boolean_t tcp_drop_q0(tcp_t *);
static mblk_t	*tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
static int	tcp_header_len(struct inetgram *);
static in_port_t tcp_report_ports(uint16_t *, enum Ports);
static int	tcp_input(int);
static void	tcp_iss_init(tcp_t *);
static tcp_t	*tcp_lookup_ipv4(struct ip *, tcpha_t *, int, int *);
static tcp_t	*tcp_lookup_listener_ipv4(in_addr_t, in_port_t, int *);
static int	tcp_conn_check(tcp_t *);
static int	tcp_close(int);
static void	tcp_close_detached(tcp_t *);
static void	tcp_eager_cleanup(tcp_t *, boolean_t, int);
static void	tcp_eager_unlink(tcp_t *);
static void	tcp_free(tcp_t *);
static int	tcp_header_init_ipv4(tcp_t *);
static void	tcp_mss_set(tcp_t *, uint32_t);
static int	tcp_parse_options(tcph_t *, tcp_opt_t *);
static boolean_t tcp_paws_check(tcp_t *, tcph_t *, tcp_opt_t *);
static void	tcp_process_options(tcp_t *, tcph_t *);
static int	tcp_random(void);
static void	tcp_random_init(void);
static mblk_t	*tcp_reass(tcp_t *, mblk_t *, uint32_t);
static void	tcp_reass_elim_overlap(tcp_t *, mblk_t *);
static void	tcp_rcv_drain(int sock_id, tcp_t *);
static void	tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t);
static void	tcp_rput_data(tcp_t *, mblk_t *, int);
static int	tcp_rwnd_set(tcp_t *, uint32_t);
static int32_t	tcp_sack_rxmit(tcp_t *, int);
static void	tcp_set_cksum(mblk_t *);
static void	tcp_set_rto(tcp_t *, int32_t);
static void	tcp_ss_rexmit(tcp_t *, int);
static int	tcp_state_wait(int, tcp_t *, int);
static void	tcp_timer(tcp_t *, int);
static void	tcp_time_wait_append(tcp_t *);
static void	tcp_time_wait_collector(void);
static void	tcp_time_wait_processing(tcp_t *, mblk_t *, uint32_t,
    uint32_t, int, tcph_t *, int sock_id);
static void	tcp_time_wait_remove(tcp_t *);
static in_port_t tcp_update_next_port(in_port_t);
static int	tcp_verify_cksum(mblk_t *);
static void	tcp_wput_data(tcp_t *, mblk_t *, int);
static void	tcp_xmit_ctl(char *, tcp_t *, mblk_t *, uint32_t, uint32_t,
    int, uint_t, int);
static void	tcp_xmit_early_reset(char *, int, mblk_t *, uint32_t, uint32_t,
    int, uint_t);
static int	tcp_xmit_end(tcp_t *, int);
static void	tcp_xmit_listeners_reset(int, mblk_t *, uint_t);
static mblk_t	*tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *,
    mblk_t **, uint32_t, boolean_t, uint32_t *, boolean_t);
static int	tcp_init_values(tcp_t *, struct inetboot_socket *);

#if DEBUG > 1
#define	TCP_DUMP_PACKET(str, mp) \
{ \
	int len = (mp)->b_wptr - (mp)->b_rptr; \
\
	printf("%s: dump TCP(%d): \n", (str), len); \
	hexdump((char *)(mp)->b_rptr, len); \
}
#else
#define	TCP_DUMP_PACKET(str, mp)
#endif

#ifdef DEBUG
#define	DEBUG_1(str, arg)		printf(str, (arg))
#define	DEBUG_2(str, arg1, arg2)	printf(str, (arg1), (arg2))
#define	DEBUG_3(str, arg1, arg2, arg3)	printf(str, (arg1), (arg2), (arg3))
#else
#define	DEBUG_1(str, arg)
#define	DEBUG_2(str, arg1, arg2)
#define	DEBUG_3(str, arg1, arg2, arg3)
#endif

/* Whether it is the first time TCP is used. */
static boolean_t tcp_initialized = B_FALSE;

/* TCP time wait list. */
static tcp_t *tcp_time_wait_head;
static tcp_t *tcp_time_wait_tail;
static uint32_t tcp_cum_timewait;
/* When the tcp_time_wait_collector is run. */
static uint32_t tcp_time_wait_runtime;

#define	TCP_RUN_TIME_WAIT_COLLECTOR() \
	if (prom_gettime() > tcp_time_wait_runtime) \
		tcp_time_wait_collector();
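/*
 * A usage note (a sketch): because there is no timer interrupt, the
 * externally visible entry points (tcp_socket_init, tcp_input, tcp_send,
 * tcp_close, tcp_connect, ...) invoke TCP_RUN_TIME_WAIT_COLLECTOR()
 * first, so TIME_WAIT endpoints are reaped as a side effect of normal
 * socket activity rather than by a background thread.
 */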
/*
 * Accept will return with an error if there is no connection coming in
 * after this (in ms).
 */
static int tcp_accept_timeout = 60000;

/*
 * Initialize the TCP-specific parts of a socket.
 */
void
tcp_socket_init(struct inetboot_socket *isp)
{
	/* Do some initializations. */
	if (!tcp_initialized) {
		tcp_random_init();
		/* Extra head room for the MAC layer address. */
		if ((tcp_wroff_xtra = mac_get_hdr_len()) & 0x3) {
			tcp_wroff_xtra = (tcp_wroff_xtra & ~0x3) + 0x4;
		}
		/* Schedule the first time wait cleanup time */
		tcp_time_wait_runtime = prom_gettime() +
		    tcp_time_wait_interval;
		tcp_initialized = B_TRUE;
	}
	TCP_RUN_TIME_WAIT_COLLECTOR();

	isp->proto = IPPROTO_TCP;
	isp->input[TRANSPORT_LVL] = tcp_input;
	/* Socket layer should call tcp_send() directly. */
	isp->output[TRANSPORT_LVL] = NULL;
	isp->close[TRANSPORT_LVL] = tcp_close;
	isp->headerlen[TRANSPORT_LVL] = tcp_header_len;
	isp->ports = tcp_report_ports;
	if ((isp->pcb = bkmem_alloc(sizeof (tcp_t))) == NULL) {
		errno = ENOBUFS;
		return;
	}
	if ((errno = tcp_init_values((tcp_t *)isp->pcb, isp)) != 0) {
		bkmem_free(isp->pcb, sizeof (tcp_t));
		return;
	}
	/*
	 * This is set last because this field is used to determine if
	 * a socket is in use or not.
	 */
	isp->type = INETBOOT_STREAM;
}

/*
 * Return the size of a TCP header including TCP option.
 */
static int
tcp_header_len(struct inetgram *igm)
{
	mblk_t *pkt;
	int ipvers;

	/* Just returns the standard TCP header without option */
	if (igm == NULL)
		return (sizeof (tcph_t));

	if ((pkt = igm->igm_mp) == NULL)
		return (0);

	ipvers = ((struct ip *)pkt->b_rptr)->ip_v;
	if (ipvers == IPV4_VERSION) {
		return (TCP_HDR_LENGTH((tcph_t *)(pkt + IPH_HDR_LENGTH(pkt))));
	} else {
		dprintf("tcp_header_len: non-IPv4 packet.\n");
		return (0);
	}
}

/*
 * Return the requested port number in network order.
 */
static in_port_t
tcp_report_ports(uint16_t *tcphp, enum Ports request)
{
	if (request == SOURCE)
		return (*(uint16_t *)(((tcph_t *)tcphp)->th_lport));
	return (*(uint16_t *)(((tcph_t *)tcphp)->th_fport));
}

/*
 * Because inetboot is not interrupt driven, TCP can only poll.  This
 * means that there can be packets stuck in the NIC buffer waiting to
 * be processed.  Thus we need to drain them before, for example, sending
 * anything, because an ACK may actually be stuck there.
 *
 * The timeout argument determines how long we should wait for draining.
 */
static int
tcp_drain_input(tcp_t *tcp, int sock_id, int timeout)
{
	struct inetgram	*in_gram;
	struct inetgram	*old_in_gram;
	int		old_timeout;
	mblk_t		*mp;
	int		i;

	dprintf("tcp_drain_input(%d): %s\n", sock_id,
	    tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));

	/*
	 * Since the driver uses the in_timeout value in the socket
	 * structure to determine the timeout value, we need to save
	 * the original one so that we can restore that after draining.
	 */
	old_timeout = sockets[sock_id].in_timeout;
	sockets[sock_id].in_timeout = timeout;

	/*
	 * We do this because the input queue may have some user
	 * data already.
	 */
	old_in_gram = sockets[sock_id].inq;
	sockets[sock_id].inq = NULL;

	/* Go out and check the wire */
	for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) {
		if (sockets[sock_id].input[i] != NULL) {
			if (sockets[sock_id].input[i](sock_id) < 0) {
				sockets[sock_id].in_timeout = old_timeout;
				if (sockets[sock_id].inq != NULL)
					nuke_grams(&sockets[sock_id].inq);
				sockets[sock_id].inq = old_in_gram;
				return (-1);
			}
		}
	}
#if DEBUG
	printf("tcp_drain_input: done with checking packets\n");
#endif
	while ((in_gram = sockets[sock_id].inq) != NULL) {
		/* Remove unknown inetgrams from the head of inq. */
		if (in_gram->igm_level != TRANSPORT_LVL) {
#if DEBUG
			printf("tcp_drain_input: unexpected packet "
			    "level %d frame found\n", in_gram->igm_level);
#endif
			del_gram(&sockets[sock_id].inq, in_gram, B_TRUE);
			continue;
		}
		mp = in_gram->igm_mp;
		del_gram(&sockets[sock_id].inq, in_gram, B_FALSE);
		bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
		tcp_rput_data(tcp, mp, sock_id);
		sockets[sock_id].in_timeout = old_timeout;

		/*
		 * The other side may have closed this connection or
		 * RST us.  But we need to continue to process other
		 * packets in the socket's queue because they may
		 * belong to other TCP connections.
		 */
		if (sockets[sock_id].pcb == NULL)
			tcp = NULL;
	}

	if (tcp == NULL || sockets[sock_id].pcb == NULL) {
		if (sockets[sock_id].so_error != 0)
			return (-1);
		else
			return (0);
	}
#if DEBUG
	printf("tcp_drain_input: done with processing packets\n");
#endif
	sockets[sock_id].in_timeout = old_timeout;
	sockets[sock_id].inq = old_in_gram;

	/*
	 * Data may have been received so indicate it is available
	 */
	tcp_drain_needed(sock_id, tcp);
	return (0);
}
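/*
 * A usage note (a sketch): callers such as tcp_send(), tcp_shutdown()
 * and tcp_close() below invoke tcp_drain_input(tcp, sock_id, 5) with a
 * short 5 ms poll before transmitting, so that any ACK or RST already
 * sitting in the NIC buffer is processed first.
 */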
/*
 * The receive entry point for upper layer to call to get data.  Note
 * that this follows the current architecture in which lower layer
 * receive routines have been called already.  Thus if the inq of the
 * socket is not NULL, the packets must be for us.
 */
static int
tcp_input(int sock_id)
{
	struct inetgram	*in_gram;
	mblk_t		*mp;
	tcp_t		*tcp;

	TCP_RUN_TIME_WAIT_COLLECTOR();

	if ((tcp = sockets[sock_id].pcb) == NULL)
		return (-1);

	while ((in_gram = sockets[sock_id].inq) != NULL) {
		/* Remove unknown inetgrams from the head of inq. */
		if (in_gram->igm_level != TRANSPORT_LVL) {
#ifdef DEBUG
			printf("tcp_input: unexpected packet "
			    "level %d frame found\n", in_gram->igm_level);
#endif
			del_gram(&sockets[sock_id].inq, in_gram, B_TRUE);
			continue;
		}
		mp = in_gram->igm_mp;
		del_gram(&sockets[sock_id].inq, in_gram, B_FALSE);
		bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
		tcp_rput_data(tcp, mp, sock_id);
		/* The TCP may be gone because it gets a RST. */
		if (sockets[sock_id].pcb == NULL)
			return (-1);
	}

	/* Flush the receive list. */
	if (tcp->tcp_rcv_list != NULL) {
		tcp_rcv_drain(sock_id, tcp);
	} else {
		/* The other side has closed the connection, report this up. */
		if (tcp->tcp_state == TCPS_CLOSE_WAIT) {
			sockets[sock_id].so_state |= SS_CANTRCVMORE;
			return (0);
		}
	}
	return (0);
}

/*
 * The send entry point for upper layer to call to send data.  In order
 * to minimize changes to the core TCP code, we need to put the
 * data into mblks.
 */
int
tcp_send(int sock_id, tcp_t *tcp, const void *msg, int len)
{
	mblk_t	*mp;
	mblk_t	*head = NULL;
	mblk_t	*tail;
	int	mss = tcp->tcp_mss;
	int	cnt = 0;
	int	win_size;
	char	*buf = (char *)msg;

	TCP_RUN_TIME_WAIT_COLLECTOR();

	/* We don't want to append 0 size mblk. */
	if (len == 0)
		return (0);
	while (len > 0) {
		if (len < mss) {
			mss = len;
		}
		/*
		 * If we cannot allocate more buffer, stop here and
		 * the number of bytes buffered will be returned.
		 *
		 * Note that we follow the core TCP optimization that
		 * each mblk contains only MSS bytes data.
		 */
		if ((mp = allocb(mss + tcp->tcp_ip_hdr_len +
		    TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0)) == NULL) {
			break;
		}
		mp->b_rptr += tcp->tcp_hdr_len + tcp_wroff_xtra;
		bcopy(buf, mp->b_rptr, mss);
		mp->b_wptr = mp->b_rptr + mss;
		buf += mss;
		cnt += mss;
		len -= mss;

		if (head == NULL) {
			head = mp;
			tail = mp;
		} else {
			tail->b_cont = mp;
			tail = mp;
		}
	}

	/*
	 * Since inetboot is not interrupt driven, there may be
	 * some ACKs in the MAC's buffer.  Drain them first,
	 * otherwise, we may not be able to send.
	 *
	 * We expect an ACK in two cases:
	 *
	 * 1) We have un-ACK'ed data.
	 *
	 * 2) All ACK's have been received and the sender's window has been
	 * closed.  We need an ACK back to open the window so that we can
	 * send.  In this case, call tcp_drain_input() if the window size is
	 * less than 2 * MSS.
	 */

	/* window size = MIN(swnd, cwnd) - unacked bytes */
	win_size = (tcp->tcp_swnd > tcp->tcp_cwnd) ? tcp->tcp_cwnd :
	    tcp->tcp_swnd;
	win_size -= tcp->tcp_snxt;
	win_size += tcp->tcp_suna;
	if (win_size < (2 * tcp->tcp_mss))
		if (tcp_drain_input(tcp, sock_id, 5) < 0)
			return (-1);

	tcp_wput_data(tcp, head, sock_id);
	/*
	 * errno should be reset here as it may be
	 * set to ETIMEDOUT.  This may be set by
	 * the MAC driver in case it has timed out
	 * waiting for ARP reply.  Any segment which
	 * was not transmitted because of ARP timeout
	 * will be retransmitted by TCP.
	 */
	if (errno == ETIMEDOUT)
		errno = 0;
	return (cnt);
}
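/*
 * A worked example of the window check above (a sketch): with
 * tcp_swnd = 8760, tcp_cwnd = 2920 and 1460 bytes outstanding
 * (tcp_snxt - tcp_suna), the usable window is
 * MIN(8760, 2920) - 1460 = 1460.  That is below 2 * MSS for a
 * 1460-byte MSS, so tcp_send() polls for ACKs before transmitting.
 */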
689 */ 690 if (errno == ETIMEDOUT) 691 errno = 0; 692 return (cnt); 693 } 694 695 /* Free up all TCP related stuff */ 696 static void 697 tcp_free(tcp_t *tcp) 698 { 699 if (tcp->tcp_iphc != NULL) { 700 bkmem_free((caddr_t)tcp->tcp_iphc, tcp->tcp_iphc_len); 701 tcp->tcp_iphc = NULL; 702 } 703 if (tcp->tcp_xmit_head != NULL) { 704 freemsg(tcp->tcp_xmit_head); 705 tcp->tcp_xmit_head = NULL; 706 } 707 if (tcp->tcp_rcv_list != NULL) { 708 freemsg(tcp->tcp_rcv_list); 709 tcp->tcp_rcv_list = NULL; 710 } 711 if (tcp->tcp_reass_head != NULL) { 712 freemsg(tcp->tcp_reass_head); 713 tcp->tcp_reass_head = NULL; 714 } 715 if (tcp->tcp_sack_info != NULL) { 716 bkmem_free((caddr_t)tcp->tcp_sack_info, 717 sizeof (tcp_sack_info_t)); 718 tcp->tcp_sack_info = NULL; 719 } 720 } 721 722 static void 723 tcp_close_detached(tcp_t *tcp) 724 { 725 if (tcp->tcp_listener != NULL) 726 tcp_eager_unlink(tcp); 727 tcp_free(tcp); 728 bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 729 } 730 731 /* 732 * If we are an eager connection hanging off a listener that hasn't 733 * formally accepted the connection yet, get off its list and blow off 734 * any data that we have accumulated. 735 */ 736 static void 737 tcp_eager_unlink(tcp_t *tcp) 738 { 739 tcp_t *listener = tcp->tcp_listener; 740 741 assert(listener != NULL); 742 if (tcp->tcp_eager_next_q0 != NULL) { 743 assert(tcp->tcp_eager_prev_q0 != NULL); 744 745 /* Remove the eager tcp from q0 */ 746 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 747 tcp->tcp_eager_prev_q0; 748 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 749 tcp->tcp_eager_next_q0; 750 listener->tcp_conn_req_cnt_q0--; 751 } else { 752 tcp_t **tcpp = &listener->tcp_eager_next_q; 753 tcp_t *prev = NULL; 754 755 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 756 if (tcpp[0] == tcp) { 757 if (listener->tcp_eager_last_q == tcp) { 758 /* 759 * If we are unlinking the last 760 * element on the list, adjust 761 * tail pointer. Set tail pointer 762 * to nil when list is empty. 763 */ 764 assert(tcp->tcp_eager_next_q == NULL); 765 if (listener->tcp_eager_last_q == 766 listener->tcp_eager_next_q) { 767 listener->tcp_eager_last_q = 768 NULL; 769 } else { 770 /* 771 * We won't get here if there 772 * is only one eager in the 773 * list. 774 */ 775 assert(prev != NULL); 776 listener->tcp_eager_last_q = 777 prev; 778 } 779 } 780 tcpp[0] = tcp->tcp_eager_next_q; 781 tcp->tcp_eager_next_q = NULL; 782 tcp->tcp_eager_last_q = NULL; 783 listener->tcp_conn_req_cnt_q--; 784 break; 785 } 786 prev = tcpp[0]; 787 } 788 } 789 tcp->tcp_listener = NULL; 790 } 791 792 /* 793 * Reset any eager connection hanging off this listener 794 * and then reclaim it's resources. 795 */ 796 static void 797 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only, int sock_id) 798 { 799 tcp_t *eager; 800 801 if (!q0_only) { 802 /* First cleanup q */ 803 while ((eager = listener->tcp_eager_next_q) != NULL) { 804 assert(listener->tcp_eager_last_q != NULL); 805 tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 806 eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, 807 sock_id); 808 tcp_close_detached(eager); 809 } 810 assert(listener->tcp_eager_last_q == NULL); 811 } 812 /* Then cleanup q0 */ 813 while ((eager = listener->tcp_eager_next_q0) != listener) { 814 tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 815 eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, sock_id); 816 tcp_close_detached(eager); 817 } 818 } 819 820 /* 821 * To handle the shutdown request. 
/*
 * To handle the shutdown request.  Called from shutdown().
 */
int
tcp_shutdown(int sock_id)
{
	tcp_t	*tcp;

	DEBUG_1("tcp_shutdown: sock_id %x\n", sock_id);

	if ((tcp = sockets[sock_id].pcb) == NULL) {
		return (-1);
	}

	/*
	 * Since inetboot is not interrupt driven, there may be
	 * some ACKs in the MAC's buffer.  Drain them first,
	 * otherwise, we may not be able to send.
	 */
	if (tcp_drain_input(tcp, sock_id, 5) < 0) {
		/*
		 * If we return now without freeing TCP, there will be
		 * a memory leak.
		 */
		if (sockets[sock_id].pcb != NULL)
			tcp_clean_death(sock_id, tcp, 0);
		return (-1);
	}

	DEBUG_1("tcp_shutdown: tcp_state %x\n", tcp->tcp_state);
	switch (tcp->tcp_state) {

	case TCPS_SYN_RCVD:
		/*
		 * Shutdown during the connect 3-way handshake
		 */
	case TCPS_ESTABLISHED:
		/*
		 * Transmit the FIN,
		 * wait for the FIN to be ACKed,
		 * then remain in FIN_WAIT_2.
		 */
		dprintf("tcp_shutdown: sending fin\n");
		if (tcp_xmit_end(tcp, sock_id) == 0 &&
		    tcp_state_wait(sock_id, tcp, TCPS_FIN_WAIT_2) < 0) {
			/* During the wait, TCP may be gone... */
			if (sockets[sock_id].pcb == NULL)
				return (-1);
		}
		dprintf("tcp_shutdown: done\n");
		break;

	default:
		break;

	}
	return (0);
}
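/*
 * The state walk for the half-close above (a sketch, following standard
 * TCP semantics rather than anything specific to this file): sending the
 * FIN moves an ESTABLISHED connection to FIN_WAIT_1; once the peer ACKs
 * the FIN, tcp_state_wait() returns with the connection parked in
 * FIN_WAIT_2, where it can still receive data from the peer.
 */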
958 */ 959 (void) tcp_xmit_end(tcp, sock_id); 960 if (sockets[sock_id].pcb == NULL) 961 return (0); 962 963 /* 964 * If lingering on close then wait until the fin is acked, 965 * the SO_LINGER time passes, or a reset is sent/received. 966 */ 967 if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && 968 !(tcp->tcp_fin_acked) && 969 tcp->tcp_state >= TCPS_ESTABLISHED) { 970 uint32_t stoptime; /* in ms */ 971 972 tcp->tcp_client_errno = 0; 973 stoptime = prom_gettime() + 974 (tcp->tcp_lingertime * 1000); 975 while (!(tcp->tcp_fin_acked) && 976 tcp->tcp_state >= TCPS_ESTABLISHED && 977 tcp->tcp_client_errno == 0 && 978 ((int32_t)(stoptime - prom_gettime()) > 0)) { 979 if (tcp_drain_input(tcp, sock_id, 5) < 0) { 980 if (sockets[sock_id].pcb != NULL) { 981 tcp_clean_death(sock_id, 982 tcp, 0); 983 } 984 return (-1); 985 } 986 } 987 tcp->tcp_client_errno = 0; 988 } 989 if (tcp_state_wait(sock_id, tcp, TCPS_TIME_WAIT) < 0) { 990 /* During the wait, TCP may be gone... */ 991 if (sockets[sock_id].pcb == NULL) 992 return (0); 993 msg = "tcp_close, couldn't detach"; 994 } else { 995 return (0); 996 } 997 break; 998 } 999 1000 /* Something went wrong... Send a RST and report the error */ 1001 if (msg != NULL) { 1002 if (tcp->tcp_state == TCPS_ESTABLISHED || 1003 tcp->tcp_state == TCPS_CLOSE_WAIT) 1004 BUMP_MIB(tcp_mib.tcpEstabResets); 1005 if (tcp->tcp_state == TCPS_SYN_SENT || 1006 tcp->tcp_state == TCPS_SYN_RCVD) 1007 BUMP_MIB(tcp_mib.tcpAttemptFails); 1008 tcp_xmit_ctl(msg, tcp, NULL, tcp->tcp_snxt, 0, TH_RST, 0, 1009 sock_id); 1010 } 1011 1012 tcp_free(tcp); 1013 bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 1014 sockets[sock_id].pcb = NULL; 1015 return (error); 1016 } 1017 1018 /* To make an endpoint a listener. */ 1019 int 1020 tcp_listen(int sock_id, int backlog) 1021 { 1022 tcp_t *tcp; 1023 1024 if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) { 1025 errno = EINVAL; 1026 return (-1); 1027 } 1028 /* We allow calling listen() multiple times to change the backlog. */ 1029 if (tcp->tcp_state > TCPS_LISTEN || tcp->tcp_state < TCPS_BOUND) { 1030 errno = EOPNOTSUPP; 1031 return (-1); 1032 } 1033 /* The following initialization should only be done once. */ 1034 if (tcp->tcp_state != TCPS_LISTEN) { 1035 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 1036 tcp->tcp_eager_next_q = NULL; 1037 tcp->tcp_state = TCPS_LISTEN; 1038 tcp->tcp_second_ctimer_threshold = tcp_ip_abort_linterval; 1039 } 1040 if ((tcp->tcp_conn_req_max = backlog) > tcp_conn_req_max_q) { 1041 tcp->tcp_conn_req_max = tcp_conn_req_max_q; 1042 } 1043 if (tcp->tcp_conn_req_max < tcp_conn_req_min) { 1044 tcp->tcp_conn_req_max = tcp_conn_req_min; 1045 } 1046 return (0); 1047 } 1048 1049 /* To accept connections. */ 1050 int 1051 tcp_accept(int sock_id, struct sockaddr *addr, socklen_t *addr_len) 1052 { 1053 tcp_t *listener; 1054 tcp_t *eager; 1055 int sd, new_sock_id; 1056 struct sockaddr_in *new_addr = (struct sockaddr_in *)addr; 1057 int timeout; 1058 1059 /* Sanity check. 
/* To accept connections. */
int
tcp_accept(int sock_id, struct sockaddr *addr, socklen_t *addr_len)
{
	tcp_t			*listener;
	tcp_t			*eager;
	int			sd, new_sock_id;
	struct sockaddr_in	*new_addr = (struct sockaddr_in *)addr;
	int			timeout;

	/* Sanity check. */
	if ((listener = (tcp_t *)(sockets[sock_id].pcb)) == NULL ||
	    new_addr == NULL || addr_len == NULL ||
	    *addr_len < sizeof (struct sockaddr_in) ||
	    listener->tcp_state != TCPS_LISTEN) {
		errno = EINVAL;
		return (-1);
	}

	if (sockets[sock_id].in_timeout > tcp_accept_timeout)
		timeout = prom_gettime() + sockets[sock_id].in_timeout;
	else
		timeout = prom_gettime() + tcp_accept_timeout;
	while (listener->tcp_eager_next_q == NULL &&
	    timeout > prom_gettime()) {
#if DEBUG
		printf("tcp_accept: Waiting in tcp_accept()\n");
#endif
		if (tcp_drain_input(listener, sock_id, 5) < 0) {
			return (-1);
		}
	}
	/* If there is an eager, don't timeout... */
	if (timeout <= prom_gettime() && listener->tcp_eager_next_q == NULL) {
#if DEBUG
		printf("tcp_accept: timeout\n");
#endif
		errno = ETIMEDOUT;
		return (-1);
	}
#if DEBUG
	printf("tcp_accept: got a connection\n");
#endif

	/* Now create the socket for this new TCP. */
	if ((sd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
		return (-1);
	}
	if ((new_sock_id = so_check_fd(sd, &errno)) == -1)
		/* This should not happen! */
		prom_panic("so_check_fd() fails in tcp_accept()");
	/* Free the TCP PCB in the original socket. */
	bkmem_free((caddr_t)(sockets[new_sock_id].pcb), sizeof (tcp_t));
	/* Dequeue the eager and attach it to the socket. */
	eager = listener->tcp_eager_next_q;
	listener->tcp_eager_next_q = eager->tcp_eager_next_q;
	if (listener->tcp_eager_last_q == eager)
		listener->tcp_eager_last_q = NULL;
	eager->tcp_eager_next_q = NULL;
	sockets[new_sock_id].pcb = eager;
	listener->tcp_conn_req_cnt_q--;

	/* Copy in the address info. */
	bcopy(&eager->tcp_remote, &new_addr->sin_addr.s_addr,
	    sizeof (in_addr_t));
	bcopy(&eager->tcp_fport, &new_addr->sin_port, sizeof (in_port_t));
	new_addr->sin_family = AF_INET;

#ifdef DEBUG
	printf("tcp_accept(), new sock_id: %d\n", sd);
#endif
	return (sd);
}

/* Update the next anonymous port to use. */
static in_port_t
tcp_update_next_port(in_port_t port)
{
	/* Don't allow the port to fall out of the anonymous port range. */
	if (port < tcp_smallest_anon_port || port > tcp_largest_anon_port)
		port = (in_port_t)tcp_smallest_anon_port;

	if (port < tcp_smallest_nonpriv_port)
		port = (in_port_t)tcp_smallest_nonpriv_port;
	return (port);
}
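/*
 * A worked example (a sketch): tcp_bind() below picks ephemeral ports by
 * calling tcp_update_next_port(++tcp_next_port_to_try).  Since the port
 * is a 16-bit in_port_t, incrementing past 65535 wraps to 0, which the
 * range check above resets to tcp_smallest_anon_port (32768), so the
 * hint cycles through the anonymous port range.
 */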
1156 */ 1157 if (reuseaddr && tcp->tcp_state > TCPS_LISTEN) { 1158 continue; 1159 } 1160 if (tcp->tcp_bound_source != INADDR_ANY && 1161 *addr != INADDR_ANY && 1162 tcp->tcp_bound_source != *addr) { 1163 continue; 1164 } 1165 if (bind_to_req_port_only) { 1166 return (0); 1167 } 1168 if (--count > 0) { 1169 port = tcp_update_next_port(++port); 1170 goto try_again; 1171 } else { 1172 return (0); 1173 } 1174 } 1175 return (port); 1176 } 1177 1178 /* To handle the bind request. */ 1179 int 1180 tcp_bind(int sock_id) 1181 { 1182 tcp_t *tcp; 1183 in_port_t requested_port, allocated_port; 1184 boolean_t bind_to_req_port_only; 1185 boolean_t reuseaddr; 1186 1187 if ((tcp = (tcp_t *)sockets[sock_id].pcb) == NULL) { 1188 errno = EINVAL; 1189 return (-1); 1190 } 1191 1192 if (tcp->tcp_state >= TCPS_BOUND) { 1193 /* We don't allow multiple bind(). */ 1194 errno = EPROTO; 1195 return (-1); 1196 } 1197 1198 requested_port = ntohs(sockets[sock_id].bind.sin_port); 1199 1200 /* The bound source can be INADDR_ANY. */ 1201 tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr; 1202 1203 tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source; 1204 1205 /* Verify the port is available. */ 1206 if (requested_port == 0) 1207 bind_to_req_port_only = B_FALSE; 1208 else /* T_BIND_REQ and requested_port != 0 */ 1209 bind_to_req_port_only = B_TRUE; 1210 1211 if (requested_port == 0) { 1212 requested_port = tcp_update_next_port(++tcp_next_port_to_try); 1213 } 1214 reuseaddr = sockets[sock_id].so_opt & SO_REUSEADDR; 1215 allocated_port = tcp_bindi(requested_port, &(tcp->tcp_bound_source), 1216 reuseaddr, bind_to_req_port_only); 1217 1218 if (allocated_port == 0) { 1219 errno = EADDRINUSE; 1220 return (-1); 1221 } 1222 tcp->tcp_lport = htons(allocated_port); 1223 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 1224 sockets[sock_id].bind.sin_port = tcp->tcp_lport; 1225 tcp->tcp_state = TCPS_BOUND; 1226 return (0); 1227 } 1228 1229 /* 1230 * Check for duplicate TCP connections. 1231 */ 1232 static int 1233 tcp_conn_check(tcp_t *tcp) 1234 { 1235 int i; 1236 tcp_t *tmp_tcp; 1237 1238 for (i = 0; i < MAXSOCKET; i++) { 1239 if (sockets[i].type != INETBOOT_STREAM) 1240 continue; 1241 /* Socket may not be closed but the TCP can be gone. */ 1242 if ((tmp_tcp = (tcp_t *)sockets[i].pcb) == NULL) 1243 continue; 1244 /* We only care about TCP in states later than SYN_SENT. */ 1245 if (tmp_tcp->tcp_state < TCPS_SYN_SENT) 1246 continue; 1247 if (tmp_tcp->tcp_lport != tcp->tcp_lport || 1248 tmp_tcp->tcp_fport != tcp->tcp_fport || 1249 tmp_tcp->tcp_bound_source != tcp->tcp_bound_source || 1250 tmp_tcp->tcp_remote != tcp->tcp_remote) { 1251 continue; 1252 } else { 1253 return (-1); 1254 } 1255 } 1256 return (0); 1257 } 1258 1259 /* To handle a connect request. */ 1260 int 1261 tcp_connect(int sock_id) 1262 { 1263 tcp_t *tcp; 1264 in_addr_t dstaddr; 1265 in_port_t dstport; 1266 tcph_t *tcph; 1267 int mss; 1268 mblk_t *syn_mp; 1269 1270 if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) { 1271 errno = EINVAL; 1272 return (-1); 1273 } 1274 1275 TCP_RUN_TIME_WAIT_COLLECTOR(); 1276 1277 dstaddr = sockets[sock_id].remote.sin_addr.s_addr; 1278 dstport = sockets[sock_id].remote.sin_port; 1279 1280 /* 1281 * Check for attempt to connect to INADDR_ANY or non-unicast addrress. 1282 * We don't have enough info to check for broadcast addr, except 1283 * for the all 1 broadcast. 
1284 */ 1285 if (dstaddr == INADDR_ANY || IN_CLASSD(ntohl(dstaddr)) || 1286 dstaddr == INADDR_BROADCAST) { 1287 /* 1288 * SunOS 4.x and 4.3 BSD allow an application 1289 * to connect a TCP socket to INADDR_ANY. 1290 * When they do this, the kernel picks the 1291 * address of one interface and uses it 1292 * instead. The kernel usually ends up 1293 * picking the address of the loopback 1294 * interface. This is an undocumented feature. 1295 * However, we provide the same thing here 1296 * in order to have source and binary 1297 * compatibility with SunOS 4.x. 1298 * Update the T_CONN_REQ (sin/sin6) since it is used to 1299 * generate the T_CONN_CON. 1300 * 1301 * Fail this for inetboot TCP. 1302 */ 1303 errno = EINVAL; 1304 return (-1); 1305 } 1306 1307 /* It is not bound to any address yet... */ 1308 if (tcp->tcp_bound_source == INADDR_ANY) { 1309 ipv4_getipaddr(&(sockets[sock_id].bind.sin_addr)); 1310 /* We don't have an address! */ 1311 if (ntohl(sockets[sock_id].bind.sin_addr.s_addr) == 1312 INADDR_ANY) { 1313 errno = EPROTO; 1314 return (-1); 1315 } 1316 tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr; 1317 tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source; 1318 } 1319 1320 /* 1321 * Don't let an endpoint connect to itself. 1322 */ 1323 if (dstaddr == tcp->tcp_ipha->ip_src.s_addr && 1324 dstport == tcp->tcp_lport) { 1325 errno = EINVAL; 1326 return (-1); 1327 } 1328 1329 tcp->tcp_ipha->ip_dst.s_addr = dstaddr; 1330 tcp->tcp_remote = dstaddr; 1331 tcph = tcp->tcp_tcph; 1332 *(uint16_t *)tcph->th_fport = dstport; 1333 tcp->tcp_fport = dstport; 1334 1335 /* 1336 * Don't allow this connection to completely duplicate 1337 * an existing connection. 1338 */ 1339 if (tcp_conn_check(tcp) < 0) { 1340 errno = EADDRINUSE; 1341 return (-1); 1342 } 1343 1344 /* 1345 * Just make sure our rwnd is at 1346 * least tcp_recv_hiwat_mss * MSS 1347 * large, and round up to the nearest 1348 * MSS. 1349 * 1350 * We do the round up here because 1351 * we need to get the interface 1352 * MTU first before we can do the 1353 * round up. 1354 */ 1355 mss = tcp->tcp_mss - tcp->tcp_hdr_len; 1356 tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 1357 tcp_recv_hiwat_minmss * mss); 1358 tcp->tcp_rwnd_max = tcp->tcp_rwnd; 1359 SET_WS_VALUE(tcp); 1360 U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), 1361 tcp->tcp_tcph->th_win); 1362 if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always) 1363 tcp->tcp_snd_ws_ok = B_TRUE; 1364 1365 /* 1366 * Set tcp_snd_ts_ok to true 1367 * so that tcp_xmit_mp will 1368 * include the timestamp 1369 * option in the SYN segment. 1370 */ 1371 if (tcp_tstamp_always || 1372 (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) { 1373 tcp->tcp_snd_ts_ok = B_TRUE; 1374 } 1375 1376 if (tcp_sack_permitted == 2 || 1377 tcp->tcp_snd_sack_ok) { 1378 assert(tcp->tcp_sack_info == NULL); 1379 if ((tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc( 1380 sizeof (tcp_sack_info_t))) == NULL) { 1381 tcp->tcp_snd_sack_ok = B_FALSE; 1382 } else { 1383 tcp->tcp_snd_sack_ok = B_TRUE; 1384 } 1385 } 1386 /* 1387 * Should we use ECN? Note that the current 1388 * default value (SunOS 5.9) of tcp_ecn_permitted 1389 * is 2. The reason for doing this is that there 1390 * are equipments out there that will drop ECN 1391 * enabled IP packets. Setting it to 1 avoids 1392 * compatibility problems. 
1393 */ 1394 if (tcp_ecn_permitted == 2) 1395 tcp->tcp_ecn_ok = B_TRUE; 1396 1397 tcp_iss_init(tcp); 1398 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 1399 tcp->tcp_active_open = B_TRUE; 1400 1401 tcp->tcp_state = TCPS_SYN_SENT; 1402 syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, tcp->tcp_iss, B_FALSE, 1403 NULL, B_FALSE); 1404 if (syn_mp != NULL) { 1405 int ret; 1406 1407 /* Dump the packet when debugging. */ 1408 TCP_DUMP_PACKET("tcp_connect", syn_mp); 1409 /* Send out the SYN packet. */ 1410 ret = ipv4_tcp_output(sock_id, syn_mp); 1411 freeb(syn_mp); 1412 /* 1413 * errno ETIMEDOUT is set by the mac driver 1414 * in case it is not able to receive ARP reply. 1415 * TCP will retransmit this segment so we can 1416 * ignore the ARP timeout. 1417 */ 1418 if ((ret < 0) && (errno != ETIMEDOUT)) { 1419 return (-1); 1420 } 1421 /* tcp_state_wait() will finish the 3 way handshake. */ 1422 return (tcp_state_wait(sock_id, tcp, TCPS_ESTABLISHED)); 1423 } else { 1424 errno = ENOBUFS; 1425 return (-1); 1426 } 1427 } 1428 1429 /* 1430 * Common accept code. Called by tcp_conn_request. 1431 * cr_pkt is the SYN packet. 1432 */ 1433 static int 1434 tcp_accept_comm(tcp_t *listener, tcp_t *acceptor, mblk_t *cr_pkt, 1435 uint_t ip_hdr_len) 1436 { 1437 tcph_t *tcph; 1438 1439 #ifdef DEBUG 1440 printf("tcp_accept_comm #######################\n"); 1441 #endif 1442 1443 /* 1444 * When we get here, we know that the acceptor header template 1445 * has already been initialized. 1446 * However, it may not match the listener if the listener 1447 * includes options... 1448 * It may also not match the listener if the listener is v6 and 1449 * and the acceptor is v4 1450 */ 1451 acceptor->tcp_lport = listener->tcp_lport; 1452 1453 if (listener->tcp_ipversion == acceptor->tcp_ipversion) { 1454 if (acceptor->tcp_iphc_len != listener->tcp_iphc_len) { 1455 /* 1456 * Listener had options of some sort; acceptor inherits. 1457 * Free up the acceptor template and allocate one 1458 * of the right size. 
1459 */ 1460 bkmem_free(acceptor->tcp_iphc, acceptor->tcp_iphc_len); 1461 acceptor->tcp_iphc = bkmem_zalloc( 1462 listener->tcp_iphc_len); 1463 if (acceptor->tcp_iphc == NULL) { 1464 acceptor->tcp_iphc_len = 0; 1465 return (ENOMEM); 1466 } 1467 acceptor->tcp_iphc_len = listener->tcp_iphc_len; 1468 } 1469 acceptor->tcp_hdr_len = listener->tcp_hdr_len; 1470 acceptor->tcp_ip_hdr_len = listener->tcp_ip_hdr_len; 1471 acceptor->tcp_tcp_hdr_len = listener->tcp_tcp_hdr_len; 1472 1473 /* 1474 * Copy the IP+TCP header template from listener to acceptor 1475 */ 1476 bcopy(listener->tcp_iphc, acceptor->tcp_iphc, 1477 listener->tcp_hdr_len); 1478 acceptor->tcp_ipha = (struct ip *)acceptor->tcp_iphc; 1479 acceptor->tcp_tcph = (tcph_t *)(acceptor->tcp_iphc + 1480 acceptor->tcp_ip_hdr_len); 1481 } else { 1482 prom_panic("tcp_accept_comm: version not equal"); 1483 } 1484 1485 /* Copy our new dest and fport from the connection request packet */ 1486 if (acceptor->tcp_ipversion == IPV4_VERSION) { 1487 struct ip *ipha; 1488 1489 ipha = (struct ip *)cr_pkt->b_rptr; 1490 acceptor->tcp_ipha->ip_dst = ipha->ip_src; 1491 acceptor->tcp_remote = ipha->ip_src.s_addr; 1492 acceptor->tcp_ipha->ip_src = ipha->ip_dst; 1493 acceptor->tcp_bound_source = ipha->ip_dst.s_addr; 1494 tcph = (tcph_t *)&cr_pkt->b_rptr[ip_hdr_len]; 1495 } else { 1496 prom_panic("tcp_accept_comm: not IPv4"); 1497 } 1498 bcopy(tcph->th_lport, acceptor->tcp_tcph->th_fport, sizeof (in_port_t)); 1499 bcopy(acceptor->tcp_tcph->th_fport, &acceptor->tcp_fport, 1500 sizeof (in_port_t)); 1501 /* 1502 * For an all-port proxy listener, the local port is determined by 1503 * the port number field in the SYN packet. 1504 */ 1505 if (listener->tcp_lport == 0) { 1506 acceptor->tcp_lport = *(in_port_t *)tcph->th_fport; 1507 bcopy(tcph->th_fport, acceptor->tcp_tcph->th_lport, 1508 sizeof (in_port_t)); 1509 } 1510 /* Inherit various TCP parameters from the listener */ 1511 acceptor->tcp_naglim = listener->tcp_naglim; 1512 acceptor->tcp_first_timer_threshold = 1513 listener->tcp_first_timer_threshold; 1514 acceptor->tcp_second_timer_threshold = 1515 listener->tcp_second_timer_threshold; 1516 1517 acceptor->tcp_first_ctimer_threshold = 1518 listener->tcp_first_ctimer_threshold; 1519 acceptor->tcp_second_ctimer_threshold = 1520 listener->tcp_second_ctimer_threshold; 1521 1522 acceptor->tcp_xmit_hiwater = listener->tcp_xmit_hiwater; 1523 1524 acceptor->tcp_state = TCPS_LISTEN; 1525 tcp_iss_init(acceptor); 1526 1527 /* Process all TCP options. */ 1528 tcp_process_options(acceptor, tcph); 1529 1530 /* Is the other end ECN capable? */ 1531 if (tcp_ecn_permitted >= 1 && 1532 (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 1533 acceptor->tcp_ecn_ok = B_TRUE; 1534 } 1535 1536 /* 1537 * listener->tcp_rq->q_hiwat should be the default window size or a 1538 * window size changed via SO_RCVBUF option. First round up the 1539 * acceptor's tcp_rwnd to the nearest MSS. Then find out the window 1540 * scale option value if needed. Call tcp_rwnd_set() to finish the 1541 * setting. 1542 * 1543 * Note if there is a rpipe metric associated with the remote host, 1544 * we should not inherit receive window size from listener. 1545 */ 1546 acceptor->tcp_rwnd = MSS_ROUNDUP( 1547 (acceptor->tcp_rwnd == 0 ? listener->tcp_rwnd_max : 1548 acceptor->tcp_rwnd), acceptor->tcp_mss); 1549 if (acceptor->tcp_snd_ws_ok) 1550 SET_WS_VALUE(acceptor); 1551 /* 1552 * Note that this is the only place tcp_rwnd_set() is called for 1553 * accepting a connection. 
/*
 * Defense for the SYN attack -
 * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest
 *    one that doesn't have the dontdrop bit set.
 * 2. Don't drop a SYN request before its first timeout.  This gives every
 *    request at least until the first timeout to complete its 3-way
 *    handshake.
 * 3. The current threshold is - # of timeouts > q0len/4 => SYN alert on;
 *    # of timeouts drops back to <= q0len/32 => SYN alert off.
 */
static boolean_t
tcp_drop_q0(tcp_t *tcp)
{
	tcp_t	*eager;

	assert(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
	/*
	 * New one is added after next_q0 so prev_q0 points to the oldest.
	 * Also do not drop any established connections that are deferred
	 * on q0 due to q being full.
	 */

	eager = tcp->tcp_eager_prev_q0;
	while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) {
		/* XXX should move the eager to the head */
		eager = eager->tcp_eager_prev_q0;
		if (eager == tcp) {
			eager = tcp->tcp_eager_prev_q0;
			break;
		}
	}
	dprintf("tcp_drop_q0: listen half-open queue (max=%d) overflow"
	    " (%d pending) on %s, drop one", tcp_conn_req_max_q0,
	    tcp->tcp_conn_req_cnt_q0,
	    tcp_display(tcp, NULL, DISP_PORT_ONLY));

	BUMP_MIB(tcp_mib.tcpHalfOpenDrop);
	bkmem_free((caddr_t)eager, sizeof (tcp_t));
	return (B_TRUE);
}

/* ARGSUSED */
static tcp_t *
tcp_conn_request(tcp_t *tcp, mblk_t *mp, uint_t sock_id, uint_t ip_hdr_len)
{
	tcp_t		*eager;
	struct ip	*ipha;
	int		err;

#ifdef DEBUG
	printf("tcp_conn_request ###################\n");
#endif

	if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) {
		BUMP_MIB(tcp_mib.tcpListenDrop);
		dprintf("tcp_conn_request: listen backlog (max=%d) "
		    "overflow (%d pending) on %s",
		    tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q,
		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
		return (NULL);
	}

	assert(OK_32PTR(mp->b_rptr));

	if (tcp->tcp_conn_req_cnt_q0 >=
	    tcp->tcp_conn_req_max + tcp_conn_req_max_q0) {
		/*
		 * Q0 is full.  Drop a pending half-open req from the queue
		 * to make room for the new SYN req.  Also mark the time we
		 * drop a SYN.
		 */
		tcp->tcp_last_rcv_lbolt = prom_gettime();
		if (!tcp_drop_q0(tcp)) {
			freemsg(mp);
			BUMP_MIB(tcp_mib.tcpListenDropQ0);
			dprintf("tcp_conn_request: listen half-open queue "
			    "(max=%d) full (%d pending) on %s",
			    tcp_conn_req_max_q0,
			    tcp->tcp_conn_req_cnt_q0,
			    tcp_display(tcp, NULL, DISP_PORT_ONLY));
			return (NULL);
		}
	}

	ipha = (struct ip *)mp->b_rptr;
	if (IN_CLASSD(ntohl(ipha->ip_src.s_addr)) ||
	    ipha->ip_src.s_addr == INADDR_BROADCAST ||
	    ipha->ip_src.s_addr == INADDR_ANY ||
	    ipha->ip_dst.s_addr == INADDR_BROADCAST) {
		freemsg(mp);
		return (NULL);
	}
	/*
	 * We allow the connection to proceed by generating a detached
	 * tcp state vector and putting it in the eager queue.  When an
	 * accept happens, it will be dequeued sequentially.
	 */
1658 */ 1659 if ((eager = (tcp_t *)bkmem_alloc(sizeof (tcp_t))) == NULL) { 1660 freemsg(mp); 1661 errno = ENOBUFS; 1662 return (NULL); 1663 } 1664 if ((errno = tcp_init_values(eager, NULL)) != 0) { 1665 freemsg(mp); 1666 bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1667 return (NULL); 1668 } 1669 1670 /* 1671 * Eager connection inherits address form from its listener, 1672 * but its packet form comes from the version of the received 1673 * SYN segment. 1674 */ 1675 eager->tcp_family = tcp->tcp_family; 1676 1677 err = tcp_accept_comm(tcp, eager, mp, ip_hdr_len); 1678 if (err) { 1679 bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1680 return (NULL); 1681 } 1682 1683 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 1684 eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1685 tcp->tcp_eager_next_q0 = eager; 1686 eager->tcp_eager_prev_q0 = tcp; 1687 1688 /* Set tcp_listener before adding it to tcp_conn_fanout */ 1689 eager->tcp_listener = tcp; 1690 tcp->tcp_conn_req_cnt_q0++; 1691 1692 return (eager); 1693 } 1694 1695 /* 1696 * To get around the non-interrupt problem of inetboot. 1697 * Keep on processing packets until a certain state is reached or the 1698 * TCP is destroyed because of getting a RST packet. 1699 */ 1700 static int 1701 tcp_state_wait(int sock_id, tcp_t *tcp, int state) 1702 { 1703 int i; 1704 struct inetgram *in_gram; 1705 mblk_t *mp; 1706 int timeout; 1707 boolean_t changed = B_FALSE; 1708 1709 /* 1710 * We need to make sure that the MAC does not wait longer 1711 * than RTO for any packet so that TCP can do retransmission. 1712 * But if the MAC timeout is less than tcp_rto, we are fine 1713 * and do not need to change it. 1714 */ 1715 timeout = sockets[sock_id].in_timeout; 1716 if (timeout > tcp->tcp_rto) { 1717 sockets[sock_id].in_timeout = tcp->tcp_rto; 1718 changed = B_TRUE; 1719 } 1720 retry: 1721 if (sockets[sock_id].inq == NULL) { 1722 /* Go out and check the wire */ 1723 for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) { 1724 if (sockets[sock_id].input[i] != NULL) { 1725 if (sockets[sock_id].input[i](sock_id) < 0) { 1726 if (changed) { 1727 sockets[sock_id].in_timeout = 1728 timeout; 1729 } 1730 return (-1); 1731 } 1732 } 1733 } 1734 } 1735 1736 while ((in_gram = sockets[sock_id].inq) != NULL) { 1737 if (tcp != NULL && tcp->tcp_state == state) 1738 break; 1739 1740 /* Remove unknown inetgrams from the head of inq. */ 1741 if (in_gram->igm_level != TRANSPORT_LVL) { 1742 #ifdef DEBUG 1743 printf("tcp_state_wait for state %d: unexpected " 1744 "packet level %d frame found\n", state, 1745 in_gram->igm_level); 1746 #endif 1747 del_gram(&sockets[sock_id].inq, in_gram, B_TRUE); 1748 continue; 1749 } 1750 mp = in_gram->igm_mp; 1751 del_gram(&sockets[sock_id].inq, in_gram, B_FALSE); 1752 bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 1753 tcp_rput_data(tcp, mp, sock_id); 1754 1755 /* 1756 * The other side may have closed this connection or 1757 * RST us. But we need to continue to process other 1758 * packets in the socket's queue because they may be 1759 * belong to another TCP connections. 1760 */ 1761 if (sockets[sock_id].pcb == NULL) { 1762 tcp = NULL; 1763 } 1764 } 1765 1766 /* If the other side has closed the connection, just return. 
/*
 * To get around the non-interrupt problem of inetboot.
 * Keep on processing packets until a certain state is reached or the
 * TCP is destroyed because of getting a RST packet.
 */
static int
tcp_state_wait(int sock_id, tcp_t *tcp, int state)
{
	int		i;
	struct inetgram	*in_gram;
	mblk_t		*mp;
	int		timeout;
	boolean_t	changed = B_FALSE;

	/*
	 * We need to make sure that the MAC does not wait longer
	 * than RTO for any packet so that TCP can do retransmission.
	 * But if the MAC timeout is less than tcp_rto, we are fine
	 * and do not need to change it.
	 */
	timeout = sockets[sock_id].in_timeout;
	if (timeout > tcp->tcp_rto) {
		sockets[sock_id].in_timeout = tcp->tcp_rto;
		changed = B_TRUE;
	}
retry:
	if (sockets[sock_id].inq == NULL) {
		/* Go out and check the wire */
		for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) {
			if (sockets[sock_id].input[i] != NULL) {
				if (sockets[sock_id].input[i](sock_id) < 0) {
					if (changed) {
						sockets[sock_id].in_timeout =
						    timeout;
					}
					return (-1);
				}
			}
		}
	}

	while ((in_gram = sockets[sock_id].inq) != NULL) {
		if (tcp != NULL && tcp->tcp_state == state)
			break;

		/* Remove unknown inetgrams from the head of inq. */
		if (in_gram->igm_level != TRANSPORT_LVL) {
#ifdef DEBUG
			printf("tcp_state_wait for state %d: unexpected "
			    "packet level %d frame found\n", state,
			    in_gram->igm_level);
#endif
			del_gram(&sockets[sock_id].inq, in_gram, B_TRUE);
			continue;
		}
		mp = in_gram->igm_mp;
		del_gram(&sockets[sock_id].inq, in_gram, B_FALSE);
		bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
		tcp_rput_data(tcp, mp, sock_id);

		/*
		 * The other side may have closed this connection or
		 * RST us.  But we need to continue to process other
		 * packets in the socket's queue because they may
		 * belong to other TCP connections.
		 */
		if (sockets[sock_id].pcb == NULL) {
			tcp = NULL;
		}
	}

	/* If the other side has closed the connection, just return. */
	if (tcp == NULL || sockets[sock_id].pcb == NULL) {
#ifdef DEBUG
		printf("tcp_state_wait other side dead: state %d "
		    "error %d\n", state, sockets[sock_id].so_error);
#endif
		if (sockets[sock_id].so_error != 0)
			return (-1);
		else
			return (0);
	}
	/*
	 * TCPS_ALL_ACKED is not a valid TCP state, it is just used as an
	 * indicator to tcp_state_wait to mean that it is being called
	 * to wait till we have received acks for all the new segments sent.
	 */
	if ((state == TCPS_ALL_ACKED) && (tcp->tcp_suna == tcp->tcp_snxt)) {
		goto done;
	}
	if (tcp->tcp_state != state) {
		if (prom_gettime() > tcp->tcp_rto_timeout)
			tcp_timer(tcp, sock_id);
		goto retry;
	}
done:
	if (changed)
		sockets[sock_id].in_timeout = timeout;

	tcp_drain_needed(sock_id, tcp);
	return (0);
}

/* Verify the checksum of a segment. */
static int
tcp_verify_cksum(mblk_t *mp)
{
	struct ip	*iph;
	tcpha_t		*tcph;
	int		len;
	uint16_t	old_sum;

	iph = (struct ip *)mp->b_rptr;
	tcph = (tcpha_t *)(iph + 1);
	len = ntohs(iph->ip_len);

	/*
	 * Calculate the TCP checksum.  Need to include the pseudo header,
	 * which is similar to the real IP header starting at the TTL field.
	 */
	iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH);
	old_sum = tcph->tha_sum;
	tcph->tha_sum = 0;
	iph->ip_ttl = 0;
	if (old_sum == tcp_cksum((uint16_t *)&(iph->ip_ttl),
	    len - IP_SIMPLE_HDR_LENGTH + 12)) {
		return (0);
	} else {
		tcp_cksum_errors++;
		return (-1);
	}
}
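/*
 * Why the 12-byte fudge above works (a sketch): the TCP pseudo-header
 * consists of source address (4), destination address (4), a zero byte,
 * the protocol byte, and the 16-bit TCP length.  By zeroing ip_ttl and
 * storing the TCP length in ip_sum, the 12 bytes starting at ip_ttl
 * (TTL, protocol, checksum field, then the two addresses) carry exactly
 * those values, and since the one's-complement sum is order-independent,
 * one pass of tcp_cksum() covers pseudo-header plus segment.
 */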
*/ 1886 static tcp_t * 1887 tcp_lookup_eager_ipv4(tcp_t *listener, struct ip *iph, tcpha_t *tcph) 1888 { 1889 tcp_t *tcp; 1890 1891 #ifdef DEBUG 1892 printf("tcp_lookup_eager_ipv4 ###############\n"); 1893 #endif 1894 for (tcp = listener->tcp_eager_next_q; tcp != NULL; 1895 tcp = tcp->tcp_eager_next_q) { 1896 if (tcph->tha_lport == tcp->tcp_fport && 1897 tcph->tha_fport == tcp->tcp_lport && 1898 iph->ip_src.s_addr == tcp->tcp_remote && 1899 iph->ip_dst.s_addr == tcp->tcp_bound_source) { 1900 return (tcp); 1901 } 1902 } 1903 1904 for (tcp = listener->tcp_eager_next_q0; tcp != listener; 1905 tcp = tcp->tcp_eager_next_q0) { 1906 if (tcph->tha_lport == tcp->tcp_fport && 1907 tcph->tha_fport == tcp->tcp_lport && 1908 iph->ip_src.s_addr == tcp->tcp_remote && 1909 iph->ip_dst.s_addr == tcp->tcp_bound_source) { 1910 return (tcp); 1911 } 1912 } 1913 #ifdef DEBUG 1914 printf("No eager found\n"); 1915 #endif 1916 return (NULL); 1917 } 1918 1919 /* To destroy a TCP control block. */ 1920 static void 1921 tcp_clean_death(int sock_id, tcp_t *tcp, int err) 1922 { 1923 tcp_free(tcp); 1924 if (tcp->tcp_state == TCPS_TIME_WAIT) 1925 tcp_time_wait_remove(tcp); 1926 1927 if (sock_id >= 0) { 1928 sockets[sock_id].pcb = NULL; 1929 if (err != 0) 1930 sockets[sock_id].so_error = err; 1931 } 1932 bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 1933 } 1934 1935 /* 1936 * tcp_rwnd_set() is called to adjust the receive window to a desired value. 1937 * We do not allow the receive window to shrink. After setting rwnd, 1938 * set the flow control hiwat of the stream. 1939 * 1940 * This function is called in 2 cases: 1941 * 1942 * 1) Before data transfer begins, in tcp_accept_comm() for accepting a 1943 * connection (passive open) and in tcp_rput_data() for active connect. 1944 * This is called after tcp_mss_set() when the desired MSS value is known. 1945 * This makes sure that our window size is a multiple of the other side's 1946 * MSS. 1947 * 2) Handling SO_RCVBUF option. 1948 * 1949 * It is ASSUMED that the requested size is a multiple of the current MSS. 1950 * 1951 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the 1952 * user requests it. 1953 */ 1954 static int 1955 tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) 1956 { 1957 uint32_t mss = tcp->tcp_mss; 1958 uint32_t old_max_rwnd; 1959 uint32_t max_transmittable_rwnd; 1960 1961 if (tcp->tcp_rwnd_max != 0) 1962 old_max_rwnd = tcp->tcp_rwnd_max; 1963 else 1964 old_max_rwnd = tcp->tcp_rwnd; 1965 1966 /* 1967 * Insist on a receive window that is at least 1968 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid 1969 * funny TCP interactions of Nagle algorithm, SWS avoidance 1970 * and delayed acknowledgement. 1971 */ 1972 rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss); 1973 1974 /* 1975 * If window size info has already been exchanged, TCP should not 1976 * shrink the window. Shrinking the window is doable if done carefully. 1977 * We may add that support later. But so far there is not a real 1978 * need to do that. 1979 */ 1980 if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) { 1981 /* MSS may have changed, do a round up again. */ 1982 rwnd = MSS_ROUNDUP(old_max_rwnd, mss); 1983 } 1984 1985 /* 1986 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check 1987 * can be applied even before the window scale option is decided.
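* For illustration (hypothetical numbers): with mss = 1460 and a negotiated tcp_rcv_ws of 0, max_transmittable_rwnd below is TCP_MAXWIN << 0 = 65535, so a requested rwnd of 100000 would be trimmed to 65535 - (65535 % 1460) = 64240, an exact multiple of the mss.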
1988 */ 1989 max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws; 1990 if (rwnd > max_transmittable_rwnd) { 1991 rwnd = max_transmittable_rwnd - 1992 (max_transmittable_rwnd % mss); 1993 if (rwnd < mss) 1994 rwnd = max_transmittable_rwnd; 1995 /* 1996 * If we're over the limit we may have to back down tcp_rwnd. 1997 * The increment below won't work for us. So we set all three 1998 * here and the increment below will have no effect. 1999 */ 2000 tcp->tcp_rwnd = old_max_rwnd = rwnd; 2001 } 2002 2003 /* 2004 * Increment the current rwnd by the amount the maximum grew (we 2005 * can not overwrite it since we might be in the middle of a 2006 * connection.) 2007 */ 2008 tcp->tcp_rwnd += rwnd - old_max_rwnd; 2009 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 2010 if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 2011 tcp->tcp_cwnd_max = rwnd; 2012 tcp->tcp_rwnd_max = rwnd; 2013 2014 return (rwnd); 2015 } 2016 2017 /* 2018 * Extract option values from a tcp header. We put any found values into the 2019 * tcpopt struct and return a bitmask saying which options were found. 2020 */ 2021 static int 2022 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) 2023 { 2024 uchar_t *endp; 2025 int len; 2026 uint32_t mss; 2027 uchar_t *up = (uchar_t *)tcph; 2028 int found = 0; 2029 int32_t sack_len; 2030 tcp_seq sack_begin, sack_end; 2031 tcp_t *tcp; 2032 2033 endp = up + TCP_HDR_LENGTH(tcph); 2034 up += TCP_MIN_HEADER_LENGTH; 2035 while (up < endp) { 2036 len = endp - up; 2037 switch (*up) { 2038 case TCPOPT_EOL: 2039 break; 2040 2041 case TCPOPT_NOP: 2042 up++; 2043 continue; 2044 2045 case TCPOPT_MAXSEG: 2046 if (len < TCPOPT_MAXSEG_LEN || 2047 up[1] != TCPOPT_MAXSEG_LEN) 2048 break; 2049 2050 mss = BE16_TO_U16(up+2); 2051 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 2052 tcpopt->tcp_opt_mss = mss; 2053 found |= TCP_OPT_MSS_PRESENT; 2054 2055 up += TCPOPT_MAXSEG_LEN; 2056 continue; 2057 2058 case TCPOPT_WSCALE: 2059 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 2060 break; 2061 2062 if (up[2] > TCP_MAX_WINSHIFT) 2063 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 2064 else 2065 tcpopt->tcp_opt_wscale = up[2]; 2066 found |= TCP_OPT_WSCALE_PRESENT; 2067 2068 up += TCPOPT_WS_LEN; 2069 continue; 2070 2071 case TCPOPT_SACK_PERMITTED: 2072 if (len < TCPOPT_SACK_OK_LEN || 2073 up[1] != TCPOPT_SACK_OK_LEN) 2074 break; 2075 found |= TCP_OPT_SACK_OK_PRESENT; 2076 up += TCPOPT_SACK_OK_LEN; 2077 continue; 2078 2079 case TCPOPT_SACK: 2080 if (len <= 2 || up[1] <= 2 || len < up[1]) 2081 break; 2082 2083 /* If TCP is not interested in SACK blks... */ 2084 if ((tcp = tcpopt->tcp) == NULL) { 2085 up += up[1]; 2086 continue; 2087 } 2088 sack_len = up[1] - TCPOPT_HEADER_LEN; 2089 up += TCPOPT_HEADER_LEN; 2090 2091 /* 2092 * If the list is empty, allocate one and assume 2093 * nothing is sack'ed. 2094 */ 2095 assert(tcp->tcp_sack_info != NULL); 2096 if (tcp->tcp_notsack_list == NULL) { 2097 tcp_notsack_update(&(tcp->tcp_notsack_list), 2098 tcp->tcp_suna, tcp->tcp_snxt, 2099 &(tcp->tcp_num_notsack_blk), 2100 &(tcp->tcp_cnt_notsack_list)); 2101 2102 /* 2103 * Make sure tcp_notsack_list is not NULL. 2104 * This happens when kmem_alloc(KM_NOSLEEP) 2105 * returns NULL. 
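* In that case the SACK blocks in this segment are simply skipped (up is advanced past them below) and option parsing continues. For reference, a SACK option carrying n blocks occupies 2 + 8n bytes: kind (5), length, then n (begin, end) pairs of 32-bit sequence numbers, which is why sack_len is consumed 8 bytes at a time.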
2106 */ 2107 if (tcp->tcp_notsack_list == NULL) { 2108 up += sack_len; 2109 continue; 2110 } 2111 tcp->tcp_fack = tcp->tcp_suna; 2112 } 2113 2114 while (sack_len > 0) { 2115 if (up + 8 > endp) { 2116 up = endp; 2117 break; 2118 } 2119 sack_begin = BE32_TO_U32(up); 2120 up += 4; 2121 sack_end = BE32_TO_U32(up); 2122 up += 4; 2123 sack_len -= 8; 2124 /* 2125 * Bounds checking. Make sure the SACK 2126 * info is within tcp_suna and tcp_snxt. 2127 * If this SACK blk is out of bound, ignore 2128 * it but continue to parse the following 2129 * blks. 2130 */ 2131 if (SEQ_LEQ(sack_end, sack_begin) || 2132 SEQ_LT(sack_begin, tcp->tcp_suna) || 2133 SEQ_GT(sack_end, tcp->tcp_snxt)) { 2134 continue; 2135 } 2136 tcp_notsack_insert(&(tcp->tcp_notsack_list), 2137 sack_begin, sack_end, 2138 &(tcp->tcp_num_notsack_blk), 2139 &(tcp->tcp_cnt_notsack_list)); 2140 if (SEQ_GT(sack_end, tcp->tcp_fack)) { 2141 tcp->tcp_fack = sack_end; 2142 } 2143 } 2144 found |= TCP_OPT_SACK_PRESENT; 2145 continue; 2146 2147 case TCPOPT_TSTAMP: 2148 if (len < TCPOPT_TSTAMP_LEN || 2149 up[1] != TCPOPT_TSTAMP_LEN) 2150 break; 2151 2152 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); 2153 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); 2154 2155 found |= TCP_OPT_TSTAMP_PRESENT; 2156 2157 up += TCPOPT_TSTAMP_LEN; 2158 continue; 2159 2160 default: 2161 if (len <= 1 || len < (int)up[1] || up[1] == 0) 2162 break; 2163 up += up[1]; 2164 continue; 2165 } 2166 break; 2167 } 2168 return (found); 2169 } 2170 2171 /* 2172 * Set the mss associated with a particular tcp based on its current value, 2173 * and a new one passed in. Observe minimums and maximums, and reset 2174 * other state variables that we want to view as multiples of mss. 2175 * 2176 * This function is called in various places mainly because 2177 * 1) Various things (tcp_mss, tcp_cwnd, ...) need to be adjusted when the 2178 * other side's SYN/SYN-ACK packet arrives. 2179 * 2) PMTUd may get us a new MSS. 2180 * 3) If the other side stops sending us the timestamp option, we need to 2181 * increase the MSS size to use the extra bytes available. 2182 */ 2183 static void 2184 tcp_mss_set(tcp_t *tcp, uint32_t mss) 2185 { 2186 uint32_t mss_max; 2187 2188 mss_max = tcp_mss_max_ipv4; 2189 2190 if (mss < tcp_mss_min) 2191 mss = tcp_mss_min; 2192 if (mss > mss_max) 2193 mss = mss_max; 2194 /* 2195 * Unless naglim has been set by our client to 2196 * a non-mss value, force naglim to track mss. 2197 * This can help to aggregate small writes. 2198 */ 2199 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) 2200 tcp->tcp_naglim = mss; 2201 /* 2202 * TCP should be able to buffer at least 4 MSS data for obvious 2203 * performance reasons. 2204 */ 2205 if ((mss << 2) > tcp->tcp_xmit_hiwater) 2206 tcp->tcp_xmit_hiwater = mss << 2; 2207 tcp->tcp_mss = mss; 2208 /* 2209 * Initialize cwnd according to draft-floyd-incr-init-win-01.txt. 2210 * Previously, we used tcp_slow_start_initial to control the size 2211 * of the initial cwnd. Now, when tcp_slow_start_initial * mss 2212 * is smaller than the cwnd calculated from the formula suggested in 2213 * the draft, we use tcp_slow_start_initial * mss as the cwnd. 2214 * Otherwise, use the cwnd from the draft's formula. The default 2215 * of tcp_slow_start_initial is 2. 2216 */ 2217 tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss, 2218 MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))); 2219 tcp->tcp_cwnd_cnt = 0; 2220 } 2221 2222 /* 2223 * Process all TCP options in a SYN segment.
2224 * 2225 * This function sets up the correct tcp_mss value according to the 2226 * MSS option value and our header size. It also sets up the window scale 2227 * and timestamp values, and initialize SACK info blocks. But it does not 2228 * change receive window size after setting the tcp_mss value. The caller 2229 * should do the appropriate change. 2230 */ 2231 void 2232 tcp_process_options(tcp_t *tcp, tcph_t *tcph) 2233 { 2234 int options; 2235 tcp_opt_t tcpopt; 2236 uint32_t mss_max; 2237 char *tmp_tcph; 2238 2239 tcpopt.tcp = NULL; 2240 options = tcp_parse_options(tcph, &tcpopt); 2241 2242 /* 2243 * Process MSS option. Note that MSS option value does not account 2244 * for IP or TCP options. This means that it is equal to MTU - minimum 2245 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 2246 * IPv6. 2247 */ 2248 if (!(options & TCP_OPT_MSS_PRESENT)) { 2249 tcpopt.tcp_opt_mss = tcp_mss_def_ipv4; 2250 } else { 2251 if (tcp->tcp_ipversion == IPV4_VERSION) 2252 mss_max = tcp_mss_max_ipv4; 2253 if (tcpopt.tcp_opt_mss < tcp_mss_min) 2254 tcpopt.tcp_opt_mss = tcp_mss_min; 2255 else if (tcpopt.tcp_opt_mss > mss_max) 2256 tcpopt.tcp_opt_mss = mss_max; 2257 } 2258 2259 /* Process Window Scale option. */ 2260 if (options & TCP_OPT_WSCALE_PRESENT) { 2261 tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 2262 tcp->tcp_snd_ws_ok = B_TRUE; 2263 } else { 2264 tcp->tcp_snd_ws = B_FALSE; 2265 tcp->tcp_snd_ws_ok = B_FALSE; 2266 tcp->tcp_rcv_ws = B_FALSE; 2267 } 2268 2269 /* Process Timestamp option. */ 2270 if ((options & TCP_OPT_TSTAMP_PRESENT) && 2271 (tcp->tcp_snd_ts_ok || !tcp->tcp_active_open)) { 2272 tmp_tcph = (char *)tcp->tcp_tcph; 2273 2274 tcp->tcp_snd_ts_ok = B_TRUE; 2275 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 2276 tcp->tcp_last_rcv_lbolt = prom_gettime(); 2277 assert(OK_32PTR(tmp_tcph)); 2278 assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 2279 2280 /* Fill in our template header with basic timestamp option. */ 2281 tmp_tcph += tcp->tcp_tcp_hdr_len; 2282 tmp_tcph[0] = TCPOPT_NOP; 2283 tmp_tcph[1] = TCPOPT_NOP; 2284 tmp_tcph[2] = TCPOPT_TSTAMP; 2285 tmp_tcph[3] = TCPOPT_TSTAMP_LEN; 2286 tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN; 2287 tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN; 2288 tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4); 2289 } else { 2290 tcp->tcp_snd_ts_ok = B_FALSE; 2291 } 2292 2293 /* 2294 * Process SACK options. If SACK is enabled for this connection, 2295 * then allocate the SACK info structure. 2296 */ 2297 if ((options & TCP_OPT_SACK_OK_PRESENT) && 2298 (tcp->tcp_snd_sack_ok || 2299 (tcp_sack_permitted != 0 && !tcp->tcp_active_open))) { 2300 /* This should be true only in the passive case. */ 2301 if (tcp->tcp_sack_info == NULL) { 2302 tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc( 2303 sizeof (tcp_sack_info_t)); 2304 } 2305 if (tcp->tcp_sack_info == NULL) { 2306 tcp->tcp_snd_sack_ok = B_FALSE; 2307 } else { 2308 tcp->tcp_snd_sack_ok = B_TRUE; 2309 if (tcp->tcp_snd_ts_ok) { 2310 tcp->tcp_max_sack_blk = 3; 2311 } else { 2312 tcp->tcp_max_sack_blk = 4; 2313 } 2314 } 2315 } else { 2316 /* 2317 * Resetting tcp_snd_sack_ok to B_FALSE so that 2318 * no SACK info will be used for this 2319 * connection. This assumes that SACK usage 2320 * permission is negotiated. This may need 2321 * to be changed once this is clarified. 
2322 */ 2323 if (tcp->tcp_sack_info != NULL) { 2324 bkmem_free((caddr_t)tcp->tcp_sack_info, 2325 sizeof (tcp_sack_info_t)); 2326 tcp->tcp_sack_info = NULL; 2327 } 2328 tcp->tcp_snd_sack_ok = B_FALSE; 2329 } 2330 2331 /* 2332 * Now we know the exact TCP/IP header length, subtract 2333 * that from tcp_mss to get our side's MSS. 2334 */ 2335 tcp->tcp_mss -= tcp->tcp_hdr_len; 2336 /* 2337 * Here we assume that the other side's header size will be equal to 2338 * our header size. We calculate the real MSS accordingly. Need to 2339 * take into account additional things IPsec puts in. 2340 * 2341 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) 2342 */ 2343 tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len - 2344 (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH); 2345 2346 /* 2347 * Set MSS to the smaller one of both ends of the connection. 2348 * We should not have called tcp_mss_set() before, but our 2349 * side of the MSS should have been set to a proper value 2350 * by tcp_adapt_ire(). tcp_mss_set() will also set up the 2351 * STREAM head parameters properly. 2352 * 2353 * If we have a larger-than-16-bit window but the other side 2354 * didn't want to do window scale, tcp_rwnd_set() will take 2355 * care of that. 2356 */ 2357 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); 2358 } 2359 2360 /* 2361 * This function does PAWS protection check. Returns B_TRUE if the 2362 * segment passes the PAWS test, else returns B_FALSE. 2363 */ 2364 boolean_t 2365 tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) 2366 { 2367 uint8_t flags; 2368 int options; 2369 uint8_t *up; 2370 2371 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 2372 /* 2373 * If the timestamp option is aligned nicely, get values inline, 2374 * otherwise call general routine to parse. Only do that 2375 * if the timestamp is the only option. 2376 */ 2377 if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH + 2378 TCPOPT_REAL_TS_LEN && 2379 OK_32PTR((up = ((uint8_t *)tcph) + 2380 TCP_MIN_HEADER_LENGTH)) && 2381 *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { 2382 tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); 2383 tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); 2384 2385 options = TCP_OPT_TSTAMP_PRESENT; 2386 } else { 2387 if (tcp->tcp_snd_sack_ok) { 2388 tcpoptp->tcp = tcp; 2389 } else { 2390 tcpoptp->tcp = NULL; 2391 } 2392 options = tcp_parse_options(tcph, tcpoptp); 2393 } 2394 2395 if (options & TCP_OPT_TSTAMP_PRESENT) { 2396 /* 2397 * Do PAWS per RFC 1323 section 4.2. Accept RST 2398 * regardless of the timestamp, page 18 RFC 1323.bis. 2399 */ 2400 if ((flags & TH_RST) == 0 && 2401 TSTMP_LT(tcpoptp->tcp_opt_ts_val, 2402 tcp->tcp_ts_recent)) { 2403 if (TSTMP_LT(prom_gettime(), 2404 tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) { 2405 /* This segment is not acceptable. */ 2406 return (B_FALSE); 2407 } else { 2408 /* 2409 * Connection has been idle for 2410 * too long. Reset the timestamp 2411 * and assume the segment is valid. 2412 */ 2413 tcp->tcp_ts_recent = 2414 tcpoptp->tcp_opt_ts_val; 2415 } 2416 } 2417 } else { 2418 /* 2419 * If we don't get a timestamp on every packet, we 2420 * figure we can't really trust 'em, so we stop sending 2421 * and parsing them.
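* Dropping the 12-byte timestamp option also frees TCP option space: the mss grows by TCPOPT_REAL_TS_LEN below, and a SACK option can then carry 4 blocks (2 + 8 * 4 = 34 bytes fit in the 40-byte option space) instead of 3 (2 + 8 * 3 = 26 bytes alongside the timestamps).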
*/ 2423 tcp->tcp_snd_ts_ok = B_FALSE; 2424 2425 tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 2426 tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 2427 tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); 2428 tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); 2429 if (tcp->tcp_snd_sack_ok) { 2430 assert(tcp->tcp_sack_info != NULL); 2431 tcp->tcp_max_sack_blk = 4; 2432 } 2433 } 2434 return (B_TRUE); 2435 } 2436 2437 /* 2438 * tcp_get_seg_mp() is called to get the pointer to a segment in the 2439 * send queue which starts at the given seq. no. 2440 * 2441 * Parameters: 2442 * tcp_t *tcp: the tcp instance pointer. 2443 * uint32_t seq: the starting seq. no of the requested segment. 2444 * int32_t *off: after the execution, *off will be the offset to 2445 * the returned mblk which points to the requested seq no. 2446 * 2447 * Return: 2448 * A mblk_t pointer pointing to the requested segment in send queue. 2449 */ 2450 static mblk_t * 2451 tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 2452 { 2453 int32_t cnt; 2454 mblk_t *mp; 2455 2456 /* Defensive coding. Make sure we don't send incorrect data. */ 2457 if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt) || 2458 off == NULL) { 2459 return (NULL); 2460 } 2461 cnt = seq - tcp->tcp_suna; 2462 mp = tcp->tcp_xmit_head; 2463 while (cnt > 0 && mp) { 2464 cnt -= mp->b_wptr - mp->b_rptr; 2465 if (cnt < 0) { 2466 cnt += mp->b_wptr - mp->b_rptr; 2467 break; 2468 } 2469 mp = mp->b_cont; 2470 } 2471 assert(mp != NULL); 2472 *off = cnt; 2473 return (mp); 2474 } 2475 2476 /* 2477 * This function handles all retransmissions if SACK is enabled for this 2478 * connection. First it calculates how many segments can be retransmitted 2479 * based on tcp_pipe. Then it goes through the notsack list to find eligible 2480 * segments. A segment is eligible if sack_cnt for that segment is greater 2481 * than or equal to tcp_dupack_fast_retransmit. After it has retransmitted 2482 * all eligible segments, it checks to see if TCP can send some new segments 2483 * (fast recovery). If it can, it returns 1. Otherwise it returns 0. 2484 * 2485 * Parameters: 2486 * tcp_t *tcp: the tcp structure of the connection. 2487 * 2488 * Return: 2489 * 1 if the pipe is not full (new data can be sent), 0 otherwise 2490 */ 2491 static int32_t 2492 tcp_sack_rxmit(tcp_t *tcp, int sock_id) 2493 { 2494 notsack_blk_t *notsack_blk; 2495 int32_t usable_swnd; 2496 int32_t mss; 2497 uint32_t seg_len; 2498 mblk_t *xmit_mp; 2499 2500 assert(tcp->tcp_sack_info != NULL); 2501 assert(tcp->tcp_notsack_list != NULL); 2502 assert(tcp->tcp_rexmit == B_FALSE); 2503 2504 /* Defensive coding in case there is a bug... */ 2505 if (tcp->tcp_notsack_list == NULL) { 2506 return (0); 2507 } 2508 notsack_blk = tcp->tcp_notsack_list; 2509 mss = tcp->tcp_mss; 2510 2511 /* 2512 * Limit the amount of outstanding data in the network to be 2513 * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 2514 */ 2515 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 2516 2517 /* At least retransmit 1 MSS of data. */ 2518 if (usable_swnd <= 0) { 2519 usable_swnd = mss; 2520 } 2521 2522 /* Make sure no new RTT samples will be taken.
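Setting tcp_csuna to tcp_snxt below means only ACKs for data sent after this point can update the RTT estimate, so retransmitted segments cannot contribute ambiguous samples (in the spirit of Karn's algorithm).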
*/ 2523 tcp->tcp_csuna = tcp->tcp_snxt; 2524 2525 notsack_blk = tcp->tcp_notsack_list; 2526 while (usable_swnd > 0) { 2527 mblk_t *snxt_mp, *tmp_mp; 2528 tcp_seq begin = tcp->tcp_sack_snxt; 2529 tcp_seq end; 2530 int32_t off; 2531 2532 for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { 2533 if (SEQ_GT(notsack_blk->end, begin) && 2534 (notsack_blk->sack_cnt >= 2535 tcp_dupack_fast_retransmit)) { 2536 end = notsack_blk->end; 2537 if (SEQ_LT(begin, notsack_blk->begin)) { 2538 begin = notsack_blk->begin; 2539 } 2540 break; 2541 } 2542 } 2543 /* 2544 * All holes are filled. Manipulate tcp_cwnd to send more 2545 * if we can. Note that after the SACK recovery, tcp_cwnd is 2546 * set to tcp_cwnd_ssthresh. 2547 */ 2548 if (notsack_blk == NULL) { 2549 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 2550 if (usable_swnd <= 0) { 2551 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; 2552 assert(tcp->tcp_cwnd > 0); 2553 return (0); 2554 } else { 2555 usable_swnd = usable_swnd / mss; 2556 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + 2557 MAX(usable_swnd * mss, mss); 2558 return (1); 2559 } 2560 } 2561 2562 /* 2563 * Note that we may send more than usable_swnd allows here 2564 * because of round off, but no more than 1 MSS of data. 2565 */ 2566 seg_len = end - begin; 2567 if (seg_len > mss) 2568 seg_len = mss; 2569 snxt_mp = tcp_get_seg_mp(tcp, begin, &off); 2570 assert(snxt_mp != NULL); 2571 /* This should not happen. Defensive coding again... */ 2572 if (snxt_mp == NULL) { 2573 return (0); 2574 } 2575 2576 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, 2577 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); 2578 2579 if (xmit_mp == NULL) 2580 return (0); 2581 2582 usable_swnd -= seg_len; 2583 tcp->tcp_pipe += seg_len; 2584 tcp->tcp_sack_snxt = begin + seg_len; 2585 TCP_DUMP_PACKET("tcp_sack_rxmit", xmit_mp); 2586 (void) ipv4_tcp_output(sock_id, xmit_mp); 2587 freeb(xmit_mp); 2588 2589 /* 2590 * Update the send timestamp to avoid false retransmission. 2591 * Note: use uintptr_t to suppress the gcc warning. 2592 */ 2593 snxt_mp->b_prev = (mblk_t *)(uintptr_t)prom_gettime(); 2594 2595 BUMP_MIB(tcp_mib.tcpRetransSegs); 2596 UPDATE_MIB(tcp_mib.tcpRetransBytes, seg_len); 2597 BUMP_MIB(tcp_mib.tcpOutSackRetransSegs); 2598 /* 2599 * Update tcp_rexmit_max to extend this SACK recovery phase. 2600 * This happens when new data sent during fast recovery is 2601 * also lost. If TCP retransmits that new data, it needs 2602 * to extend the SACK recovery phase to avoid starting another 2603 * fast retransmit/recovery unnecessarily. 2604 */ 2605 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { 2606 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; 2607 } 2608 } 2609 return (0); 2610 } 2611 2612 static void 2613 tcp_rput_data(tcp_t *tcp, mblk_t *mp, int sock_id) 2614 { 2615 uchar_t *rptr; 2616 struct ip *iph; 2617 tcp_t *tcp1; 2618 tcpha_t *tcph; 2619 uint32_t seg_ack; 2620 int seg_len; 2621 uint_t ip_hdr_len; 2622 uint32_t seg_seq; 2623 mblk_t *mp1; 2624 uint_t flags; 2625 uint32_t new_swnd = 0; 2626 int mss; 2627 boolean_t ofo_seg = B_FALSE; /* Out of order segment */ 2628 int32_t gap; 2629 int32_t rgap; 2630 tcp_opt_t tcpopt; 2631 int32_t bytes_acked; 2632 int npkt; 2633 uint32_t cwnd; 2634 uint32_t add; 2635 2636 #ifdef DEBUG 2637 printf("tcp_rput_data sock %d mp %x mp_datap %x #################\n", 2638 sock_id, mp, mp->b_datap); 2639 #endif 2640 2641 /* Dump the packet when debugging.
*/ 2642 TCP_DUMP_PACKET("tcp_rput_data", mp); 2643 2644 assert(OK_32PTR(mp->b_rptr)); 2645 2646 rptr = mp->b_rptr; 2647 iph = (struct ip *)rptr; 2648 ip_hdr_len = IPH_HDR_LENGTH(rptr); 2649 if (ip_hdr_len != IP_SIMPLE_HDR_LENGTH) { 2650 #ifdef DEBUG 2651 printf("Not simple IP header\n"); 2652 #endif 2653 /* We cannot handle IP options yet... */ 2654 tcp_drops++; 2655 freeb(mp); 2656 return; 2657 } 2658 /* The TCP header must be aligned. */ 2659 tcph = (tcpha_t *)&rptr[ip_hdr_len]; 2660 seg_seq = ntohl(tcph->tha_seq); 2661 seg_ack = ntohl(tcph->tha_ack); 2662 assert((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 2663 seg_len = (int)(mp->b_wptr - rptr) - 2664 (ip_hdr_len + TCP_HDR_LENGTH(((tcph_t *)tcph))); 2665 /* In inetboot, b_cont should always be NULL. */ 2666 assert(mp->b_cont == NULL); 2667 2668 /* Verify the checksum. */ 2669 if (tcp_verify_cksum(mp) < 0) { 2670 #ifdef DEBUG 2671 printf("tcp_rput_data: wrong cksum\n"); 2672 #endif 2673 freemsg(mp); 2674 return; 2675 } 2676 2677 /* 2678 * This segment is not for us; try to find its 2679 * intended receiver. 2680 */ 2681 if (tcp == NULL || 2682 tcph->tha_lport != tcp->tcp_fport || 2683 tcph->tha_fport != tcp->tcp_lport || 2684 iph->ip_src.s_addr != tcp->tcp_remote || 2685 iph->ip_dst.s_addr != tcp->tcp_bound_source) { 2686 #ifdef DEBUG 2687 printf("tcp_rput_data: not for us, state %d\n", 2688 tcp->tcp_state); 2689 #endif 2690 /* 2691 * First try to find an established connection. If none 2692 * is found, look for a listener. 2693 * 2694 * If a listener is found, we need to check to see if the 2695 * incoming segment is for one of its eagers. If it is, 2696 * give it to the eager. If not, the listener should take care 2697 * of it. 2698 */ 2699 if ((tcp1 = tcp_lookup_ipv4(iph, tcph, TCPS_SYN_SENT, 2700 &sock_id)) != NULL || 2701 (tcp1 = tcp_lookup_listener_ipv4(iph->ip_dst.s_addr, 2702 tcph->tha_fport, &sock_id)) != NULL) { 2703 if (tcp1->tcp_state == TCPS_LISTEN) { 2704 if ((tcp = tcp_lookup_eager_ipv4(tcp1, 2705 iph, tcph)) == NULL) { 2706 /* No eager... sent to listener */ 2707 #ifdef DEBUG 2708 printf("found the listener: %s\n", 2709 tcp_display(tcp1, NULL, 2710 DISP_ADDR_AND_PORT)); 2711 #endif 2712 tcp = tcp1; 2713 } 2714 #ifdef DEBUG 2715 else { 2716 printf("found the eager: %s\n", 2717 tcp_display(tcp, NULL, 2718 DISP_ADDR_AND_PORT)); 2719 } 2720 #endif 2721 } else { 2722 /* Non-listener found... */ 2723 #ifdef DEBUG 2724 printf("found the connection: %s\n", 2725 tcp_display(tcp1, NULL, 2726 DISP_ADDR_AND_PORT)); 2727 #endif 2728 tcp = tcp1; 2729 } 2730 } else { 2731 /* 2732 * No connection for this segment... 2733 * Send a RST to the other side. 2734 */ 2735 tcp_xmit_listeners_reset(sock_id, mp, ip_hdr_len); 2736 return; 2737 } 2738 } 2739 2740 flags = tcph->tha_flags & 0xFF; 2741 BUMP_MIB(tcp_mib.tcpInSegs); 2742 if (tcp->tcp_state == TCPS_TIME_WAIT) { 2743 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 2744 seg_len, (tcph_t *)tcph, sock_id); 2745 return; 2746 } 2747 /* 2748 * From this point we can assume that the tcp is not compressed, 2749 * since we would have branched off to tcp_time_wait_processing() 2750 * in such a case. 2751 */ 2752 assert(tcp != NULL && tcp->tcp_state != TCPS_TIME_WAIT); 2753 2754 /* 2755 * After this point, we know we have the correct TCP, so update 2756 * the receive time. 2757 */ 2758 tcp->tcp_last_recv_time = prom_gettime(); 2759 2760 /* In inetboot, we do not handle urgent pointer...
*/ 2761 if (flags & TH_URG) { 2762 freemsg(mp); 2763 DEBUG_1("tcp_rput_data(%d): received segment with urgent " 2764 "pointer\n", sock_id); 2765 tcp_drops++; 2766 return; 2767 } 2768 2769 switch (tcp->tcp_state) { 2770 case TCPS_LISTEN: 2771 if ((flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) { 2772 if (flags & TH_RST) { 2773 freemsg(mp); 2774 return; 2775 } 2776 if (flags & TH_ACK) { 2777 tcp_xmit_early_reset("TCPS_LISTEN-TH_ACK", 2778 sock_id, mp, seg_ack, 0, TH_RST, 2779 ip_hdr_len); 2780 return; 2781 } 2782 if (!(flags & TH_SYN)) { 2783 freemsg(mp); 2784 return; 2785 } 2786 printf("tcp_rput_data: %d\n", __LINE__); 2787 prom_panic("inetboot"); 2788 } 2789 if (tcp->tcp_conn_req_max > 0) { 2790 tcp = tcp_conn_request(tcp, mp, sock_id, ip_hdr_len); 2791 if (tcp == NULL) { 2792 freemsg(mp); 2793 return; 2794 } 2795 #ifdef DEBUG 2796 printf("tcp_rput_data: new tcp created\n"); 2797 #endif 2798 } 2799 tcp->tcp_irs = seg_seq; 2800 tcp->tcp_rack = seg_seq; 2801 tcp->tcp_rnxt = seg_seq + 1; 2802 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 2803 BUMP_MIB(tcp_mib.tcpPassiveOpens); 2804 goto syn_rcvd; 2805 case TCPS_SYN_SENT: 2806 if (flags & TH_ACK) { 2807 /* 2808 * Note that our stack cannot send data before a 2809 * connection is established; therefore the 2810 * following check is valid. Otherwise, it has 2811 * to be changed. 2812 */ 2813 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 2814 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 2815 if (flags & TH_RST) { 2816 freemsg(mp); 2817 return; 2818 } 2819 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 2820 tcp, mp, seg_ack, 0, TH_RST, 2821 ip_hdr_len, sock_id); 2822 return; 2823 } 2824 assert(tcp->tcp_suna + 1 == seg_ack); 2825 } 2826 if (flags & TH_RST) { 2827 freemsg(mp); 2828 if (flags & TH_ACK) { 2829 tcp_clean_death(sock_id, tcp, ECONNREFUSED); 2830 } 2831 return; 2832 } 2833 if (!(flags & TH_SYN)) { 2834 freemsg(mp); 2835 return; 2836 } 2837 2838 /* Process all TCP options. */ 2839 tcp_process_options(tcp, (tcph_t *)tcph); 2840 /* 2841 * The following changes our rwnd to be a multiple of the 2842 * MIN(peer MSS, our MSS) for performance reasons. 2843 */ 2844 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rwnd, 2845 tcp->tcp_mss)); 2846 2847 /* Is the other end ECN capable? */ 2848 if (tcp->tcp_ecn_ok) { 2849 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 2850 tcp->tcp_ecn_ok = B_FALSE; 2851 } 2852 } 2853 /* 2854 * Clear ECN flags because they may interfere with later 2855 * processing. 2856 */ 2857 flags &= ~(TH_ECE|TH_CWR); 2858 2859 tcp->tcp_irs = seg_seq; 2860 tcp->tcp_rack = seg_seq; 2861 tcp->tcp_rnxt = seg_seq + 1; 2862 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 2863 2864 if (flags & TH_ACK) { 2865 /* One for the SYN */ 2866 tcp->tcp_suna = tcp->tcp_iss + 1; 2867 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 2868 tcp->tcp_state = TCPS_ESTABLISHED; 2869 2870 /* 2871 * If the SYN was retransmitted, we need to reset all 2872 * retransmission info. This is because this 2873 * segment will be treated as a dup ACK. 2874 */ 2875 if (tcp->tcp_rexmit) { 2876 tcp->tcp_rexmit = B_FALSE; 2877 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 2878 tcp->tcp_rexmit_max = tcp->tcp_snxt; 2879 tcp->tcp_snd_burst = TCP_CWND_NORMAL; 2880 2881 /* 2882 * Set tcp_cwnd back to 1 MSS, per 2883 * recommendation from 2884 * draft-floyd-incr-init-win-01.txt, 2885 * Increasing TCP's Initial Window.
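* For illustration (hypothetical mss of 1460): with the default tcp_slow_start_initial of 2, tcp_mss_set() computes MIN(2 * 1460, MIN(4 * 1460, MAX(2 * 1460, 4380))) = 2920 bytes, i.e. two segments; a retransmitted SYN conservatively drops the cwnd back to a single mss here.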
2886 */ 2887 tcp->tcp_cwnd = tcp->tcp_mss; 2888 } 2889 2890 tcp->tcp_swl1 = seg_seq; 2891 tcp->tcp_swl2 = seg_ack; 2892 2893 new_swnd = BE16_TO_U16(((tcph_t *)tcph)->th_win); 2894 tcp->tcp_swnd = new_swnd; 2895 if (new_swnd > tcp->tcp_max_swnd) 2896 tcp->tcp_max_swnd = new_swnd; 2897 2898 /* 2899 * Always send the three-way handshake ack immediately 2900 * in order to make the connection complete as soon as 2901 * possible on the accepting host. 2902 */ 2903 flags |= TH_ACK_NEEDED; 2904 /* 2905 * Check to see if there is data to be sent. If 2906 * yes, set the transmit flag. Then check to see 2907 * if received data processing needs to be done. 2908 * If not, go straight to xmit_check. This short 2909 * cut is OK as we don't support T/TCP. 2910 */ 2911 if (tcp->tcp_unsent) 2912 flags |= TH_XMIT_NEEDED; 2913 2914 if (seg_len == 0) { 2915 freemsg(mp); 2916 goto xmit_check; 2917 } 2918 2919 flags &= ~TH_SYN; 2920 seg_seq++; 2921 break; 2922 } 2923 syn_rcvd: 2924 tcp->tcp_state = TCPS_SYN_RCVD; 2925 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 2926 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 2927 if (mp1 != NULL) { 2928 TCP_DUMP_PACKET("tcp_rput_data replying SYN", mp1); 2929 (void) ipv4_tcp_output(sock_id, mp1); 2930 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 2931 freeb(mp1); 2932 /* 2933 * Let's wait till our SYN has been ACKED since we 2934 * don't have a timer. 2935 */ 2936 if (tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED) < 0) { 2937 freemsg(mp); 2938 return; 2939 } 2940 } 2941 freemsg(mp); 2942 return; 2943 default: 2944 break; 2945 } 2946 mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH((tcph_t *)tcph); 2947 new_swnd = ntohs(tcph->tha_win) << 2948 ((flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); 2949 mss = tcp->tcp_mss; 2950 2951 if (tcp->tcp_snd_ts_ok) { 2952 if (!tcp_paws_check(tcp, (tcph_t *)tcph, &tcpopt)) { 2953 /* 2954 * This segment is not acceptable. 2955 * Drop it and send back an ACK. 2956 */ 2957 freemsg(mp); 2958 flags |= TH_ACK_NEEDED; 2959 goto ack_check; 2960 } 2961 } else if (tcp->tcp_snd_sack_ok) { 2962 assert(tcp->tcp_sack_info != NULL); 2963 tcpopt.tcp = tcp; 2964 /* 2965 * SACK info is already updated in tcp_parse_options. Ignore 2966 * all other TCP options... 2967 */ 2968 (void) tcp_parse_options((tcph_t *)tcph, &tcpopt); 2969 } 2970 try_again:; 2971 gap = seg_seq - tcp->tcp_rnxt; 2972 rgap = tcp->tcp_rwnd - (gap + seg_len); 2973 /* 2974 * gap is the amount of sequence space between what we expect to see 2975 * and what we got for seg_seq. A positive value for gap means 2976 * something got lost. A negative value means we got some old stuff. 2977 */ 2978 if (gap < 0) { 2979 /* Old stuff present. Is the SYN in there? */ 2980 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && 2981 (seg_len != 0)) { 2982 flags &= ~TH_SYN; 2983 seg_seq++; 2984 /* Recompute the gaps after noting the SYN. */ 2985 goto try_again; 2986 } 2987 BUMP_MIB(tcp_mib.tcpInDataDupSegs); 2988 UPDATE_MIB(tcp_mib.tcpInDataDupBytes, 2989 (seg_len > -gap ? -gap : seg_len)); 2990 /* Remove the old stuff from seg_len. */ 2991 seg_len += gap; 2992 /* 2993 * Anything left? 2994 * Make sure to check for unack'd FIN when rest of data 2995 * has been previously ack'd. 2996 */ 2997 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 2998 /* 2999 * Resets are only valid if they lie within our offered 3000 * window. If the RST bit is set, we just ignore this 3001 * segment. 3002 */ 3003 if (flags & TH_RST) { 3004 freemsg(mp); 3005 return; 3006 } 3007 3008 /* 3009 * This segment is "unacceptable".
None of its 3010 * sequence space lies within our advertised window. 3011 * 3012 * Adjust seg_len to the original value for tracing. 3013 */ 3014 seg_len -= gap; 3015 #ifdef DEBUG 3016 printf("tcp_rput: unacceptable, gap %d, rgap " 3017 "%d, flags 0x%x, seg_seq %u, seg_ack %u, " 3018 "seg_len %d, rnxt %u, snxt %u, %s", 3019 gap, rgap, flags, seg_seq, seg_ack, 3020 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 3021 tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 3022 #endif 3023 3024 /* 3025 * Arrange to send an ACK in response to the 3026 * unacceptable segment per RFC 793 page 69. There 3027 * is only one small difference between ours and the 3028 * acceptability test in the RFC - we accept an ACK-only 3029 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 3030 * will be generated. 3031 * 3032 * Note that we have to ACK an ACK-only packet at least 3033 * for stacks that send 0-length keep-alives with 3034 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 3035 * section 4.2.3.6. As long as we don't ever generate 3036 * an unacceptable packet in response to an incoming 3037 * packet that is unacceptable, it should not cause 3038 * "ACK wars". 3039 */ 3040 flags |= TH_ACK_NEEDED; 3041 3042 /* 3043 * Continue processing this segment in order to use the 3044 * ACK information it contains, but skip all other 3045 * sequence-number processing. Processing the ACK 3046 * information is necessary in order to 3047 * re-synchronize connections that may have lost 3048 * synchronization. 3049 * 3050 * We clear seg_len and flag fields related to 3051 * sequence number processing as they are not 3052 * to be trusted for an unacceptable segment. 3053 */ 3054 seg_len = 0; 3055 flags &= ~(TH_SYN | TH_FIN | TH_URG); 3056 goto process_ack; 3057 } 3058 3059 /* Fix seg_seq, and chew the gap off the front. */ 3060 seg_seq = tcp->tcp_rnxt; 3061 do { 3062 mblk_t *mp2; 3063 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 3064 (uintptr_t)UINT_MAX); 3065 gap += (uint_t)(mp->b_wptr - mp->b_rptr); 3066 if (gap > 0) { 3067 mp->b_rptr = mp->b_wptr - gap; 3068 break; 3069 } 3070 mp2 = mp; 3071 mp = mp->b_cont; 3072 freeb(mp2); 3073 } while (gap < 0); 3074 } 3075 /* 3076 * rgap is the amount of receive window remaining after accounting 3077 * for this segment. A negative value means the segment overruns the 3078 * window; -rgap is the amount out of window. 3079 */ 3080 if (rgap < 0) { 3081 mblk_t *mp2; 3082 3083 if (tcp->tcp_rwnd == 0) 3084 BUMP_MIB(tcp_mib.tcpInWinProbe); 3085 else { 3086 BUMP_MIB(tcp_mib.tcpInDataPastWinSegs); 3087 UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap); 3088 } 3089 3090 /* 3091 * seg_len does not include the FIN, so if more than 3092 * just the FIN is out of window, we act like we don't 3093 * see it. (If just the FIN is out of window, rgap 3094 * will be zero and we will go ahead and acknowledge 3095 * the FIN.) 3096 */ 3097 flags &= ~TH_FIN; 3098 3099 /* Fix seg_len and make sure there is something left. */ 3100 seg_len += rgap; 3101 if (seg_len <= 0) { 3102 /* 3103 * Resets are only valid if they lie within our offered 3104 * window. If the RST bit is set, we just ignore this 3105 * segment. 3106 */ 3107 if (flags & TH_RST) { 3108 freemsg(mp); 3109 return; 3110 } 3111 3112 /* Per RFC 793, we need to send back an ACK. */ 3113 flags |= TH_ACK_NEEDED; 3114 3115 /* 3116 * If this is a zero window probe, continue to 3117 * process the ACK part. But we need to set seg_len 3118 * to 0 to avoid data processing. Otherwise just 3119 * drop the segment and send back an ACK.
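* Worked example (hypothetical numbers): with tcp_rwnd == 0, a one-byte zero window probe arriving at seg_seq == tcp_rnxt gives gap = 0 and rgap = 0 - (0 + 1) = -1, so seg_len was clipped to 0 above and the check below lets the probe's ACK and window update through.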
3119 */ 3120 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 3121 flags &= ~(TH_SYN | TH_URG); 3122 seg_len = 0; 3123 /* Let's see if we can update our rwnd */ 3124 tcp_rcv_drain(sock_id, tcp); 3125 goto process_ack; 3126 } else { 3127 freemsg(mp); 3128 goto ack_check; 3129 } 3130 } 3131 /* Pitch out of window stuff off the end. */ 3132 rgap = seg_len; 3133 mp2 = mp; 3134 do { 3135 assert((uintptr_t)(mp2->b_wptr - 3136 mp2->b_rptr) <= (uintptr_t)INT_MAX); 3137 rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 3138 if (rgap < 0) { 3139 mp2->b_wptr += rgap; 3140 if ((mp1 = mp2->b_cont) != NULL) { 3141 mp2->b_cont = NULL; 3142 freemsg(mp1); 3143 } 3144 break; 3145 } 3146 } while ((mp2 = mp2->b_cont) != NULL); 3147 } 3148 ok:; 3149 /* 3150 * TCP should check ECN info for segments inside the window only. 3151 * Therefore the check should be done here. 3152 */ 3153 if (tcp->tcp_ecn_ok) { 3154 uchar_t tos = ((struct ip *)rptr)->ip_tos; 3155 3156 if (flags & TH_CWR) { 3157 tcp->tcp_ecn_echo_on = B_FALSE; 3158 } 3159 /* 3160 * Note that both ECN_CE and CWR can be set in the 3161 * same segment. In this case, we once again turn 3162 * on ECN_ECHO. 3163 */ 3164 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 3165 tcp->tcp_ecn_echo_on = B_TRUE; 3166 } 3167 } 3168 3169 /* 3170 * Check whether we can update tcp_ts_recent. This test is 3171 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 3172 * Extensions for High Performance: An Update", Internet Draft. 3173 */ 3174 if (tcp->tcp_snd_ts_ok && 3175 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 3176 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 3177 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 3178 tcp->tcp_last_rcv_lbolt = prom_gettime(); 3179 } 3180 3181 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 3182 /* 3183 * FIN in an out of order segment. We record this in 3184 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. 3185 * Clear the FIN so that any check on FIN flag will fail. 3186 * Remember that FIN also counts in the sequence number 3187 * space. So we need to ack out of order FIN-only segments. 3188 */ 3189 if (flags & TH_FIN) { 3190 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 3191 tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 3192 flags &= ~TH_FIN; 3193 flags |= TH_ACK_NEEDED; 3194 } 3195 if (seg_len > 0) { 3196 /* Fill in the SACK blk list. */ 3197 if (tcp->tcp_snd_sack_ok) { 3198 assert(tcp->tcp_sack_info != NULL); 3199 tcp_sack_insert(tcp->tcp_sack_list, 3200 seg_seq, seg_seq + seg_len, 3201 &(tcp->tcp_num_sack_blk)); 3202 } 3203 3204 /* 3205 * Attempt reassembly and see if we have something 3206 * ready to go. 3207 */ 3208 mp = tcp_reass(tcp, mp, seg_seq); 3209 /* Always ack out of order packets */ 3210 flags |= TH_ACK_NEEDED | TH_PUSH; 3211 if (mp != NULL) { 3212 assert((uintptr_t)(mp->b_wptr - 3213 mp->b_rptr) <= (uintptr_t)INT_MAX); 3214 seg_len = mp->b_cont ? msgdsize(mp) : 3215 (int)(mp->b_wptr - mp->b_rptr); 3216 seg_seq = tcp->tcp_rnxt; 3217 /* 3218 * If a gap is filled and the seq num and len 3219 * of the gap match those of a previously 3220 * received FIN, put the FIN flag back in. 3221 */ 3222 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3223 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3224 flags |= TH_FIN; 3225 tcp->tcp_valid_bits &= 3226 ~TCP_OFO_FIN_VALID; 3227 } 3228 } else { 3229 /* 3230 * Keep going even with NULL mp. 3231 * There may be a useful ACK or something else 3232 * we don't want to miss. 3233 * 3234 * But TCP should not perform fast retransmit 3235 * because of the ack number.
TCP uses 3236 * seg_len == 0 to determine if it is a pure 3237 * ACK. And this is not a pure ACK. 3238 */ 3239 seg_len = 0; 3240 ofo_seg = B_TRUE; 3241 } 3242 } 3243 } else if (seg_len > 0) { 3244 BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 3245 UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 3246 /* 3247 * If an out of order FIN was received before, and the seq 3248 * num and len of the new segment match that of the FIN, 3249 * put the FIN flag back in. 3250 */ 3251 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3252 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3253 flags |= TH_FIN; 3254 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 3255 } 3256 } 3257 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 3258 if (flags & TH_RST) { 3259 freemsg(mp); 3260 switch (tcp->tcp_state) { 3261 case TCPS_SYN_RCVD: 3262 (void) tcp_clean_death(sock_id, tcp, ECONNREFUSED); 3263 break; 3264 case TCPS_ESTABLISHED: 3265 case TCPS_FIN_WAIT_1: 3266 case TCPS_FIN_WAIT_2: 3267 case TCPS_CLOSE_WAIT: 3268 (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3269 break; 3270 case TCPS_CLOSING: 3271 case TCPS_LAST_ACK: 3272 (void) tcp_clean_death(sock_id, tcp, 0); 3273 break; 3274 default: 3275 assert(tcp->tcp_state != TCPS_TIME_WAIT); 3276 (void) tcp_clean_death(sock_id, tcp, ENXIO); 3277 break; 3278 } 3279 return; 3280 } 3281 if (flags & TH_SYN) { 3282 /* 3283 * See RFC 793, Page 71 3284 * 3285 * The seq number must be in the window as it should 3286 * be "fixed" above. If it is outside window, it should 3287 * be already rejected. Note that we allow seg_seq to be 3288 * rnxt + rwnd because we want to accept 0 window probe. 3289 */ 3290 assert(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 3291 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 3292 freemsg(mp); 3293 /* 3294 * If the ACK flag is not set, just use our snxt as the 3295 * seq number of the RST segment. 3296 */ 3297 if (!(flags & TH_ACK)) { 3298 seg_ack = tcp->tcp_snxt; 3299 } 3300 tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, 3301 seg_seq + 1, TH_RST|TH_ACK, 0, sock_id); 3302 assert(tcp->tcp_state != TCPS_TIME_WAIT); 3303 (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3304 return; 3305 } 3306 3307 process_ack: 3308 if (!(flags & TH_ACK)) { 3309 #ifdef DEBUG 3310 printf("No ack in segment, dropped it, seq:%x\n", seg_seq); 3311 #endif 3312 freemsg(mp); 3313 goto xmit_check; 3314 } 3315 } 3316 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 3317 3318 if (tcp->tcp_state == TCPS_SYN_RCVD) { 3319 tcp_t *listener = tcp->tcp_listener; 3320 #ifdef DEBUG 3321 printf("Done with eager 3-way handshake\n"); 3322 #endif 3323 /* 3324 * NOTE: RFC 793 pg. 72 says this should be 'bytes_acked < 0' 3325 * but that would mean we have an ack that ignored our SYN. 
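* In SYN_RCVD tcp_suna still equals tcp_iss, and an acceptable ACK must cover at least our SYN, so bytes_acked = seg_ack - tcp_suna must be at least 1; anything less (or an ACK beyond tcp_snxt) draws a RST below.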
3326 */ 3327 if (bytes_acked < 1 || SEQ_GT(seg_ack, tcp->tcp_snxt)) { 3328 freemsg(mp); 3329 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 3330 tcp, NULL, seg_ack, 0, TH_RST, 0, sock_id); 3331 return; 3332 } 3333 3334 /* 3335 * If the conn_req_q is full, defer processing 3336 * until space is available after accept() 3337 * processing 3338 */ 3339 if (listener->tcp_conn_req_cnt_q < 3340 listener->tcp_conn_req_max) { 3341 tcp_t *tail; 3342 3343 listener->tcp_conn_req_cnt_q0--; 3344 listener->tcp_conn_req_cnt_q++; 3345 3346 /* Move from SYN_RCVD to ESTABLISHED list */ 3347 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 3348 tcp->tcp_eager_prev_q0; 3349 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 3350 tcp->tcp_eager_next_q0; 3351 tcp->tcp_eager_prev_q0 = NULL; 3352 tcp->tcp_eager_next_q0 = NULL; 3353 3354 /* 3355 * Insert at end of the queue because sockfs 3356 * sends down T_CONN_RES in chronological 3357 * order. Leaving the older conn indications 3358 * at the front of the queue helps reduce search 3359 * time. 3360 */ 3361 tail = listener->tcp_eager_last_q; 3362 if (tail != NULL) { 3363 tail->tcp_eager_next_q = tcp; 3364 } else { 3365 listener->tcp_eager_next_q = tcp; 3366 } 3367 listener->tcp_eager_last_q = tcp; 3368 tcp->tcp_eager_next_q = NULL; 3369 } else { 3370 /* 3371 * Defer connection on q0 and set deferred 3372 * connection bit true 3373 */ 3374 tcp->tcp_conn_def_q0 = B_TRUE; 3375 3376 /* take tcp out of q0 ... */ 3377 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 3378 tcp->tcp_eager_next_q0; 3379 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 3380 tcp->tcp_eager_prev_q0; 3381 3382 /* ... and place it at the end of q0 */ 3383 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 3384 tcp->tcp_eager_next_q0 = listener; 3385 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 3386 listener->tcp_eager_prev_q0 = tcp; 3387 } 3388 3389 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 3390 bytes_acked--; 3391 3392 /* 3393 * If SYN was retransmitted, need to reset all 3394 * retransmission info as this segment will be 3395 * treated as a dup ACK. 3396 */ 3397 if (tcp->tcp_rexmit) { 3398 tcp->tcp_rexmit = B_FALSE; 3399 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 3400 tcp->tcp_rexmit_max = tcp->tcp_snxt; 3401 tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3402 tcp->tcp_ms_we_have_waited = 0; 3403 tcp->tcp_cwnd = mss; 3404 } 3405 3406 /* 3407 * We set the send window to zero here. 3408 * This is needed if there is data to be 3409 * processed already on the queue. 3410 * Later (at the swnd_update label), when the 3411 * "new_swnd > tcp_swnd" condition is satisfied, 3412 * the XMIT_NEEDED flag is set in the current 3413 * (SYN_RCVD) state. This ensures tcp_wput_data() is 3414 * called if there is already data on queue in 3415 * this state. 3416 */ 3417 tcp->tcp_swnd = 0; 3418 3419 if (new_swnd > tcp->tcp_max_swnd) 3420 tcp->tcp_max_swnd = new_swnd; 3421 tcp->tcp_swl1 = seg_seq; 3422 tcp->tcp_swl2 = seg_ack; 3423 tcp->tcp_state = TCPS_ESTABLISHED; 3424 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 3425 } 3426 /* This code follows 4.4BSD-Lite2 mostly. */ 3427 if (bytes_acked < 0) 3428 goto est; 3429 3430 /* 3431 * If TCP is ECN capable and the congestion experience bit is 3432 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 3433 * done once per window (or more loosely, per RTT).
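* Worked example (hypothetical numbers): with mss = 1460 and tcp_cwnd = 14600 (ten segments, send window not limiting), an ECE-marked ACK gives npkt = (14600 >> 1) / 1460 = 5, so tcp_cwnd_ssthresh and tcp_cwnd both become 7300; tcp_cwr then suppresses further reductions until seg_ack passes tcp_cwr_snd_max.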
3434 */ 3435 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 3436 tcp->tcp_cwr = B_FALSE; 3437 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 3438 if (!tcp->tcp_cwr) { 3439 npkt = (MIN(tcp->tcp_cwnd, tcp->tcp_swnd) >> 1) / mss; 3440 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 3441 tcp->tcp_cwnd = npkt * mss; 3442 /* 3443 * If the cwnd is 0, use the timer to clock out 3444 * new segments. This is required by the ECN spec. 3445 */ 3446 if (npkt == 0) { 3447 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3448 /* 3449 * This makes sure that when the ACK comes 3450 * back, we will increase tcp_cwnd by 1 MSS. 3451 */ 3452 tcp->tcp_cwnd_cnt = 0; 3453 } 3454 tcp->tcp_cwr = B_TRUE; 3455 /* 3456 * This marks the end of the current window of in 3457 * flight data. That is why we don't use 3458 * tcp_suna + tcp_swnd. Only data in flight can 3459 * provide ECN info. 3460 */ 3461 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3462 tcp->tcp_ecn_cwr_sent = B_FALSE; 3463 } 3464 } 3465 3466 mp1 = tcp->tcp_xmit_head; 3467 if (bytes_acked == 0) { 3468 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 3469 int dupack_cnt; 3470 3471 BUMP_MIB(tcp_mib.tcpInDupAck); 3472 /* 3473 * Fast retransmit. When we have seen exactly three 3474 * identical ACKs while we have unacked data 3475 * outstanding we take it as a hint that our peer 3476 * dropped something. 3477 * 3478 * If TCP is retransmitting, don't do fast retransmit. 3479 */ 3480 if (mp1 != NULL && tcp->tcp_suna != tcp->tcp_snxt && 3481 ! tcp->tcp_rexmit) { 3482 /* Do Limited Transmit */ 3483 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 3484 tcp_dupack_fast_retransmit) { 3485 /* 3486 * RFC 3042 3487 * 3488 * What we need to do is temporarily 3489 * increase tcp_cwnd so that new 3490 * data can be sent if it is allowed 3491 * by the receive window (tcp_rwnd). 3492 * tcp_wput_data() will take care of 3493 * the rest. 3494 * 3495 * If the connection is SACK capable, 3496 * only do limited xmit when there 3497 * is SACK info. 3498 * 3499 * Note how tcp_cwnd is incremented. 3500 * The first dup ACK will increase 3501 * it by 1 MSS. The second dup ACK 3502 * will increase it by 2 MSS. This 3503 * means that only 1 new segment will 3504 * be sent for each dup ACK. 3505 */ 3506 if (tcp->tcp_unsent > 0 && 3507 (!tcp->tcp_snd_sack_ok || 3508 (tcp->tcp_snd_sack_ok && 3509 tcp->tcp_notsack_list != NULL))) { 3510 tcp->tcp_cwnd += mss << 3511 (tcp->tcp_dupack_cnt - 1); 3512 flags |= TH_LIMIT_XMIT; 3513 } 3514 } else if (dupack_cnt == 3515 tcp_dupack_fast_retransmit) { 3516 3517 BUMP_MIB(tcp_mib.tcpOutFastRetrans); 3518 /* 3519 * If we have reduced tcp_ssthresh 3520 * because of ECN, do not reduce it again 3521 * unless it is already one window of data 3522 * away. After one window of data, tcp_cwr 3523 * should then be cleared. Note that 3524 * for non ECN capable connection, tcp_cwr 3525 * should always be false. 3526 * 3527 * Adjust cwnd since the duplicate 3528 * ack indicates that a packet was 3529 * dropped (due to congestion.) 3530 */ 3531 if (!tcp->tcp_cwr) { 3532 npkt = (MIN(tcp->tcp_cwnd, 3533 tcp->tcp_swnd) >> 1) / mss; 3534 if (npkt < 2) 3535 npkt = 2; 3536 tcp->tcp_cwnd_ssthresh = npkt * mss; 3537 tcp->tcp_cwnd = (npkt + 3538 tcp->tcp_dupack_cnt) * mss; 3539 } 3540 if (tcp->tcp_ecn_ok) { 3541 tcp->tcp_cwr = B_TRUE; 3542 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3543 tcp->tcp_ecn_cwr_sent = B_FALSE; 3544 } 3545 3546 /* 3547 * We do Hoe's algorithm. 
Refer to her 3548 * paper "Improving the Start-up Behavior 3549 * of a Congestion Control Scheme for TCP," 3550 * which appeared in SIGCOMM'96. 3551 * 3552 * Save highest seq no we have sent so far. 3553 * Be careful about the invisible FIN byte. 3554 */ 3555 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 3556 (tcp->tcp_unsent == 0)) { 3557 tcp->tcp_rexmit_max = tcp->tcp_fss; 3558 } else { 3559 tcp->tcp_rexmit_max = tcp->tcp_snxt; 3560 } 3561 3562 /* 3563 * Do not allow bursty traffic during 3564 * fast recovery. Refer to Fall and Floyd's 3565 * paper "Simulation-based Comparisons of 3566 * Tahoe, Reno and SACK TCP" (in CCR ??) 3567 * This is a best current practice. 3568 */ 3569 tcp->tcp_snd_burst = TCP_CWND_SS; 3570 3571 /* 3572 * For SACK: 3573 * Calculate tcp_pipe, which is the 3574 * estimated number of bytes in the 3575 * network. 3576 * 3577 * tcp_fack is the highest sack'ed seq num 3578 * TCP has received. 3579 * 3580 * tcp_pipe is explained in the above-quoted 3581 * Fall and Floyd paper. tcp_fack is 3582 * explained in Mathis and Mahdavi's 3583 * "Forward Acknowledgment: Refining TCP 3584 * Congestion Control" in SIGCOMM '96. 3585 */ 3586 if (tcp->tcp_snd_sack_ok) { 3587 assert(tcp->tcp_sack_info != NULL); 3588 if (tcp->tcp_notsack_list != NULL) { 3589 tcp->tcp_pipe = tcp->tcp_snxt - 3590 tcp->tcp_fack; 3591 tcp->tcp_sack_snxt = seg_ack; 3592 flags |= TH_NEED_SACK_REXMIT; 3593 } else { 3594 /* 3595 * Always initialize tcp_pipe 3596 * even though we don't have 3597 * any SACK info. If later 3598 * we get SACK info and 3599 * tcp_pipe is not initialized, 3600 * funny things will happen. 3601 */ 3602 tcp->tcp_pipe = 3603 tcp->tcp_cwnd_ssthresh; 3604 } 3605 } else { 3606 flags |= TH_REXMIT_NEEDED; 3607 } /* tcp_snd_sack_ok */ 3608 3609 } else { 3610 /* 3611 * Here we perform congestion 3612 * avoidance, but NOT slow start. 3613 * This is known as the Fast 3614 * Recovery Algorithm. 3615 */ 3616 if (tcp->tcp_snd_sack_ok && 3617 tcp->tcp_notsack_list != NULL) { 3618 flags |= TH_NEED_SACK_REXMIT; 3619 tcp->tcp_pipe -= mss; 3620 if (tcp->tcp_pipe < 0) 3621 tcp->tcp_pipe = 0; 3622 } else { 3623 /* 3624 * We know that one more packet has 3625 * left the pipe, thus we can update 3626 * cwnd. 3627 */ 3628 cwnd = tcp->tcp_cwnd + mss; 3629 if (cwnd > tcp->tcp_cwnd_max) 3630 cwnd = tcp->tcp_cwnd_max; 3631 tcp->tcp_cwnd = cwnd; 3632 flags |= TH_XMIT_NEEDED; 3633 } 3634 } 3635 } 3636 } else if (tcp->tcp_zero_win_probe) { 3637 /* 3638 * If the window has opened, need to arrange 3639 * to send additional data. 3640 */ 3641 if (new_swnd != 0) { 3642 /* tcp_suna != tcp_snxt */ 3643 /* Packet contains a window update */ 3644 BUMP_MIB(tcp_mib.tcpInWinUpdate); 3645 tcp->tcp_zero_win_probe = 0; 3646 tcp->tcp_timer_backoff = 0; 3647 tcp->tcp_ms_we_have_waited = 0; 3648 3649 /* 3650 * Transmit starting with tcp_suna since 3651 * the one byte probe is not ack'ed. 3652 * If TCP has sent more than one identical 3653 * probe, tcp_rexmit will be set. That means 3654 * tcp_ss_rexmit() will send out the one 3655 * byte along with new data. Otherwise, 3656 * fake the retransmission. 3657 */ 3658 flags |= TH_XMIT_NEEDED; 3659 if (!tcp->tcp_rexmit) { 3660 tcp->tcp_rexmit = B_TRUE; 3661 tcp->tcp_dupack_cnt = 0; 3662 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 3663 tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 3664 } 3665 } 3666 } 3667 goto swnd_update; 3668 } 3669 3670 /* 3671 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73.
3672 * If the ACK value acks something that we have not yet sent, it might 3673 * be an old duplicate segment. Send an ACK to re-synchronize the 3674 * other side. 3675 * Note: reset in response to unacceptable ACK in SYN_RECEIVE 3676 * state is handled above, so we can always just drop the segment and 3677 * send an ACK here. 3678 * 3679 * Should we send ACKs in response to ACK only segments? 3680 */ 3681 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 3682 BUMP_MIB(tcp_mib.tcpInAckUnsent); 3683 /* drop the received segment */ 3684 freemsg(mp); 3685 3686 /* Send back an ACK. */ 3687 mp = tcp_ack_mp(tcp); 3688 3689 if (mp == NULL) { 3690 return; 3691 } 3692 BUMP_MIB(tcp_mib.tcpOutAck); 3693 (void) ipv4_tcp_output(sock_id, mp); 3694 freeb(mp); 3695 return; 3696 } 3697 3698 /* 3699 * TCP gets a new ACK; update the notsack'ed list to delete those 3700 * blocks that are covered by this ACK. 3701 */ 3702 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 3703 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, 3704 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); 3705 } 3706 3707 /* 3708 * If we got an ACK after fast retransmit, check to see 3709 * if it is a partial ACK. If it is not and the congestion 3710 * window was inflated to account for the other side's 3711 * cached packets, retract it. If it is, do Hoe's algorithm. 3712 */ 3713 if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) { 3714 assert(tcp->tcp_rexmit == B_FALSE); 3715 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { 3716 tcp->tcp_dupack_cnt = 0; 3717 /* 3718 * Restore the original tcp_cwnd_ssthresh after 3719 * the fast retransmit phase. 3720 */ 3721 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { 3722 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; 3723 } 3724 tcp->tcp_rexmit_max = seg_ack; 3725 tcp->tcp_cwnd_cnt = 0; 3726 tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3727 3728 /* 3729 * Remove all notsack info to avoid confusion with 3730 * the next fast retransmit/recovery phase. 3731 */ 3732 if (tcp->tcp_snd_sack_ok && 3733 tcp->tcp_notsack_list != NULL) { 3734 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 3735 } 3736 } else { 3737 if (tcp->tcp_snd_sack_ok && 3738 tcp->tcp_notsack_list != NULL) { 3739 flags |= TH_NEED_SACK_REXMIT; 3740 tcp->tcp_pipe -= mss; 3741 if (tcp->tcp_pipe < 0) 3742 tcp->tcp_pipe = 0; 3743 } else { 3744 /* 3745 * Hoe's algorithm: 3746 * 3747 * Retransmit the unack'ed segment and 3748 * restart fast recovery. Note that we 3749 * need to scale back tcp_cwnd to the 3750 * original value when we started fast 3751 * recovery. This is to prevent overly 3752 * aggressive behaviour in sending new 3753 * segments. 3754 */ 3755 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + 3756 tcp_dupack_fast_retransmit * mss; 3757 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; 3758 BUMP_MIB(tcp_mib.tcpOutFastRetrans); 3759 flags |= TH_REXMIT_NEEDED; 3760 } 3761 } 3762 } else { 3763 tcp->tcp_dupack_cnt = 0; 3764 if (tcp->tcp_rexmit) { 3765 /* 3766 * TCP is retransmitting. If the ACK ack's all 3767 * outstanding data, update tcp_rexmit_max and 3768 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt 3769 * to the correct value. 3770 * 3771 * Note that SEQ_LEQ() is used. This is to avoid 3772 * unnecessary fast retransmit caused by dup ACKs 3773 * received when TCP does slow start retransmission 3774 * after a time out. During this phase, TCP may 3775 * send out segments which are already received. 3776 * This causes dup ACKs to be sent back.
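* Worked example (hypothetical sequence numbers): if tcp_rexmit_max is 5000 and a partial ACK for 3000 arrives, tcp_rexmit_nxt advances to 3000 and TH_XMIT_NEEDED keeps the slow start retransmission going; only an ACK beyond 5000 clears tcp_rexmit below.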
*/ 3778 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 3779 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 3780 tcp->tcp_rexmit_nxt = seg_ack; 3781 } 3782 if (seg_ack != tcp->tcp_rexmit_max) { 3783 flags |= TH_XMIT_NEEDED; 3784 } 3785 } else { 3786 tcp->tcp_rexmit = B_FALSE; 3787 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 3788 tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3789 } 3790 tcp->tcp_ms_we_have_waited = 0; 3791 } 3792 } 3793 3794 BUMP_MIB(tcp_mib.tcpInAckSegs); 3795 UPDATE_MIB(tcp_mib.tcpInAckBytes, bytes_acked); 3796 tcp->tcp_suna = seg_ack; 3797 if (tcp->tcp_zero_win_probe != 0) { 3798 tcp->tcp_zero_win_probe = 0; 3799 tcp->tcp_timer_backoff = 0; 3800 } 3801 3802 /* 3803 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 3804 * Note that it cannot be the SYN being ack'ed; the code flow 3805 * would not reach here in that case. 3806 */ 3807 if (mp1 == NULL) { 3808 goto fin_acked; 3809 } 3810 3811 /* 3812 * Update the congestion window. 3813 * 3814 * If TCP is not ECN capable or TCP is ECN capable but the 3815 * congestion experience bit is not set, increase the tcp_cwnd as 3816 * usual. 3817 */ 3818 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 3819 cwnd = tcp->tcp_cwnd; 3820 add = mss; 3821 3822 if (cwnd >= tcp->tcp_cwnd_ssthresh) { 3823 /* 3824 * This is to prevent an increase of less than 1 MSS of 3825 * tcp_cwnd. With partial increase, tcp_wput_data() 3826 * may send out tinygrams in order to preserve mblk 3827 * boundaries. 3828 * 3829 * By initializing tcp_cwnd_cnt to new tcp_cwnd and 3830 * decrementing it by 1 MSS for every ACK, tcp_cwnd is 3831 * increased by 1 MSS for every RTT. 3832 */ 3833 if (tcp->tcp_cwnd_cnt <= 0) { 3834 tcp->tcp_cwnd_cnt = cwnd + add; 3835 } else { 3836 tcp->tcp_cwnd_cnt -= add; 3837 add = 0; 3838 } 3839 } 3840 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); 3841 } 3842 3843 /* Can we update the RTT estimates? */ 3844 if (tcp->tcp_snd_ts_ok) { 3845 /* Ignore zero timestamp echo-reply. */ 3846 if (tcpopt.tcp_opt_ts_ecr != 0) { 3847 tcp_set_rto(tcp, (int32_t)(prom_gettime() - 3848 tcpopt.tcp_opt_ts_ecr)); 3849 } 3850 3851 /* If needed, restart the timer. */ 3852 if (tcp->tcp_set_timer == 1) { 3853 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3854 tcp->tcp_set_timer = 0; 3855 } 3856 /* 3857 * Update tcp_csuna in case the other side stops sending 3858 * us timestamps. 3859 */ 3860 tcp->tcp_csuna = tcp->tcp_snxt; 3861 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 3862 /* 3863 * An ACK sequence we haven't seen before, so get the RTT 3864 * and update the RTO. 3865 * Note: use uintptr_t to suppress the gcc warning. 3866 */ 3867 tcp_set_rto(tcp, (int32_t)(prom_gettime() - 3868 (uint32_t)(uintptr_t)mp1->b_prev)); 3869 3870 /* Remember the last sequence to be ACKed */ 3871 tcp->tcp_csuna = seg_ack; 3872 if (tcp->tcp_set_timer == 1) { 3873 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3874 tcp->tcp_set_timer = 0; 3875 } 3876 } else { 3877 BUMP_MIB(tcp_mib.tcpRttNoUpdate); 3878 } 3879 3880 /* Eat acknowledged bytes off the xmit queue. */ 3881 for (;;) { 3882 mblk_t *mp2; 3883 uchar_t *wptr; 3884 3885 wptr = mp1->b_wptr; 3886 assert((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); 3887 bytes_acked -= (int)(wptr - mp1->b_rptr); 3888 if (bytes_acked < 0) { 3889 mp1->b_rptr = wptr + bytes_acked; 3890 break; 3891 } 3892 mp1->b_prev = NULL; 3893 mp2 = mp1; 3894 mp1 = mp1->b_cont; 3895 freeb(mp2); 3896 if (bytes_acked == 0) { 3897 if (mp1 == NULL) { 3898 /* Everything is ack'ed, clear the tail.
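For illustration (hypothetical sizes): if 1000 bytes are acked and the head mblk holds 600 bytes, that mblk is freed outright and the loop above advances the next mblk's b_rptr by the remaining 400 bytes.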
 */
3899				tcp->tcp_xmit_tail = NULL;
3900				goto pre_swnd_update;
3901			}
3902			if (mp2 != tcp->tcp_xmit_tail)
3903				break;
3904			tcp->tcp_xmit_tail = mp1;
3905			assert((uintptr_t)(mp1->b_wptr -
3906			    mp1->b_rptr) <= (uintptr_t)INT_MAX);
3907			tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr -
3908			    mp1->b_rptr);
3909			break;
3910		}
3911		if (mp1 == NULL) {
3912			/*
3913			 * More was acked but there is nothing more
3914			 * outstanding.  This means that the FIN was
3915			 * just acked or that we're talking to a clown.
3916			 */
3917	fin_acked:
3918			assert(tcp->tcp_fin_sent);
3919			tcp->tcp_xmit_tail = NULL;
3920			if (tcp->tcp_fin_sent) {
3921				tcp->tcp_fin_acked = B_TRUE;
3922			} else {
3923				/*
3924				 * We should never get here because
3925				 * we have already checked that the
3926				 * number of bytes ack'ed should be
3927				 * smaller than or equal to what we
3928				 * have sent so far (it is the
3929				 * acceptability check of the ACK).
3930				 * We can only get here if the send
3931				 * queue is corrupted.
3932				 *
3933				 * Terminate the connection and
3934				 * panic the system.  It is better
3935				 * for us to panic instead of
3936				 * continuing, to avoid further disaster.
3937				 */
3938				tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt,
3939				    tcp->tcp_rnxt, TH_RST|TH_ACK, 0, sock_id);
3940				printf("Memory corruption "
3941				    "detected for connection %s.\n",
3942				    tcp_display(tcp, NULL,
3943				    DISP_ADDR_AND_PORT));
3944				/* We should never get here... */
3945				prom_panic("tcp_rput_data");
3946				return;
3947			}
3948			goto pre_swnd_update;
3949		}
3950		assert(mp2 != tcp->tcp_xmit_tail);
3951	}
3952	if (tcp->tcp_unsent) {
3953		flags |= TH_XMIT_NEEDED;
3954	}
3955	pre_swnd_update:
3956	tcp->tcp_xmit_head = mp1;
3957	swnd_update:
3958	/*
3959	 * The following check is different from most other implementations.
3960	 * For bi-directional transfer, when segments are dropped, the
3961	 * "normal" check will not accept a window update in those
3962	 * retransmitted segments.  Failing to do that, TCP may send out
3963	 * segments which are outside the receiver's window.  As TCP accepts
3964	 * the ack in those retransmitted segments, if the window update in
3965	 * the same segment is not accepted, TCP will incorrectly calculate
3966	 * that it can send more segments.  This can create a deadlock
3967	 * with the receiver if its window becomes zero.
3968	 */
3969	if (SEQ_LT(tcp->tcp_swl2, seg_ack) ||
3970	    SEQ_LT(tcp->tcp_swl1, seg_seq) ||
3971	    (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) {
3972		/*
3973		 * The criteria for update are:
3974		 *
3975		 * 1. the segment acknowledges some data.  Or
3976		 * 2. the segment is new, i.e. it has a higher seq num.  Or
3977		 * 3. the segment is not old and the advertised window is
3978		 * larger than the previous advertised window.
3979		 */
3980		if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
3981			flags |= TH_XMIT_NEEDED;
3982		tcp->tcp_swnd = new_swnd;
3983		if (new_swnd > tcp->tcp_max_swnd)
3984			tcp->tcp_max_swnd = new_swnd;
3985		tcp->tcp_swl1 = seg_seq;
3986		tcp->tcp_swl2 = seg_ack;
3987	}
3988	est:
3989	if (tcp->tcp_state > TCPS_ESTABLISHED) {
3990		switch (tcp->tcp_state) {
3991		case TCPS_FIN_WAIT_1:
3992			if (tcp->tcp_fin_acked) {
3993				tcp->tcp_state = TCPS_FIN_WAIT_2;
3994				/*
3995				 * We implement the non-standard BSD/SunOS
3996				 * FIN_WAIT_2 flushing algorithm.
3997				 * If there is no user attached to this
3998				 * TCP endpoint, then this TCP struct
3999				 * could hang around forever in FIN_WAIT_2
4000				 * state if the peer forgets to send us
4001				 * a FIN.  To prevent this, we wait only
4002				 * 2*MSL (a convenient time value) for
4003				 * the FIN to arrive.
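				 * (For a sense of scale, assuming the
				 * textbook MSL of 2 minutes,
				 * 2 * MSL = 2 * 120 * SECONDS = 240000 ms,
				 * i.e. about four minutes; implementations
				 * have used anywhere from that up to the
				 * 675 seconds noted below.)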
If it doesn't show up,
4004				 * we flush the TCP endpoint.  This algorithm,
4005				 * though a violation of RFC-793, has worked
4006				 * for over 10 years in BSD systems.
4007				 * Note: SunOS 4.x waits 675 seconds before
4008				 * flushing the FIN_WAIT_2 connection.
4009				 */
4010				TCP_TIMER_RESTART(tcp,
4011				    tcp_fin_wait_2_flush_interval);
4012			}
4013			break;
4014		case TCPS_FIN_WAIT_2:
4015			break;	/* Shutdown hook? */
4016		case TCPS_LAST_ACK:
4017			freemsg(mp);
4018			if (tcp->tcp_fin_acked) {
4019				(void) tcp_clean_death(sock_id, tcp, 0);
4020				return;
4021			}
4022			goto xmit_check;
4023		case TCPS_CLOSING:
4024			if (tcp->tcp_fin_acked) {
4025				tcp->tcp_state = TCPS_TIME_WAIT;
4026				tcp_time_wait_append(tcp);
4027				TCP_TIMER_RESTART(tcp, tcp_time_wait_interval);
4028			}
4029			/*FALLTHRU*/
4030		case TCPS_CLOSE_WAIT:
4031			freemsg(mp);
4032			goto xmit_check;
4033		default:
4034			assert(tcp->tcp_state != TCPS_TIME_WAIT);
4035			break;
4036		}
4037	}
4038	if (flags & TH_FIN) {
4039		/* Make sure we ack the fin */
4040		flags |= TH_ACK_NEEDED;
4041		if (!tcp->tcp_fin_rcvd) {
4042			tcp->tcp_fin_rcvd = B_TRUE;
4043			tcp->tcp_rnxt++;
4044			U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
4045
4046			switch (tcp->tcp_state) {
4047			case TCPS_SYN_RCVD:
4048			case TCPS_ESTABLISHED:
4049				tcp->tcp_state = TCPS_CLOSE_WAIT;
4050				/* Keepalive? */
4051				break;
4052			case TCPS_FIN_WAIT_1:
4053				if (!tcp->tcp_fin_acked) {
4054					tcp->tcp_state = TCPS_CLOSING;
4055					break;
4056				}
4057				/* FALLTHRU */
4058			case TCPS_FIN_WAIT_2:
4059				tcp->tcp_state = TCPS_TIME_WAIT;
4060				tcp_time_wait_append(tcp);
4061				TCP_TIMER_RESTART(tcp, tcp_time_wait_interval);
4062				if (seg_len) {
4063					/*
4064					 * implies data piggybacked on FIN.
4065					 * break to handle data.
4066					 */
4067					break;
4068				}
4069				freemsg(mp);
4070				goto ack_check;
4071			}
4072		}
4073	}
4074	if (mp == NULL)
4075		goto xmit_check;
4076	if (seg_len == 0) {
4077		freemsg(mp);
4078		goto xmit_check;
4079	}
4080	if (mp->b_rptr == mp->b_wptr) {
4081		/*
4082		 * The header has been consumed, so we remove the
4083		 * zero-length mblk here.
4084		 */
4085		mp1 = mp;
4086		mp = mp->b_cont;
4087		freeb(mp1);
4088	}
4089	/*
4090	 * ACK every other segment, unless the input queue is empty,
4091	 * as we don't have a timer available.
4092	 */
4093	if (++tcp->tcp_rack_cnt == 2 || sockets[sock_id].inq == NULL) {
4094		flags |= TH_ACK_NEEDED;
4095		tcp->tcp_rack_cnt = 0;
4096	}
4097	tcp->tcp_rnxt += seg_len;
4098	U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
4099
4100	/* Update SACK list */
4101	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
4102		tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt,
4103		    &(tcp->tcp_num_sack_blk));
4104	}
4105
4106	if (tcp->tcp_listener) {
4107		/*
4108		 * Side queue inbound data until the accept happens.
4109		 * tcp_accept/tcp_rput drains this when the accept happens.
4110		 */
4111		tcp_rcv_enqueue(tcp, mp, seg_len);
4112	} else {
4113		/* Just queue the data until the app calls read. */
4114		tcp_rcv_enqueue(tcp, mp, seg_len);
4115		/*
4116		 * Make sure the timer is running if we have data waiting
4117		 * for a push bit.  This provides resiliency against
4118		 * implementations that do not correctly generate push bits.
4119		 */
4120		if (tcp->tcp_rcv_list != NULL)
4121			flags |= TH_TIMER_NEEDED;
4122	}
4123
4124	xmit_check:
4125	/* Is there anything left to do? */
4126	if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED|
4127	    TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_TIMER_NEEDED)) == 0)
4128		return;
4129
4130	/* Any transmit work to do and a non-zero window?
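	 *
	 * "flags" acts as a work list here.  For example, a partial ACK
	 * during SACK-based recovery can leave
	 * flags = (TH_NEED_SACK_REXMIT | TH_ACK_NEEDED), in which case the
	 * block below first retransmits the missing blocks via
	 * tcp_sack_rxmit() and then falls through to ack_check to emit
	 * the ACK.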
 */
4131	if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4132	    TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4133		if (flags & TH_REXMIT_NEEDED) {
4134			uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4135
4136			if (snd_size > mss)
4137				snd_size = mss;
4138			if (snd_size > tcp->tcp_swnd)
4139				snd_size = tcp->tcp_swnd;
4140			mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4141			    NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4142			    B_TRUE);
4143
4144			if (mp1 != NULL) {
4145				/* use uintptr_t to suppress the gcc warning */
4146				tcp->tcp_xmit_head->b_prev =
4147				    (mblk_t *)(uintptr_t)prom_gettime();
4148				tcp->tcp_csuna = tcp->tcp_snxt;
4149				BUMP_MIB(tcp_mib.tcpRetransSegs);
4150				UPDATE_MIB(tcp_mib.tcpRetransBytes, snd_size);
4151				(void) ipv4_tcp_output(sock_id, mp1);
4152				freeb(mp1);
4153			}
4154		}
4155		if (flags & TH_NEED_SACK_REXMIT) {
4156			if (tcp_sack_rxmit(tcp, sock_id) != 0) {
4157				flags |= TH_XMIT_NEEDED;
4158			}
4159		}
4160		/*
4161		 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4162		 * out a new segment.  Note that tcp_rexmit should not be
4163		 * set; otherwise TH_LIMIT_XMIT should not be set.
4164		 */
4165		if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4166			if (!tcp->tcp_rexmit) {
4167				tcp_wput_data(tcp, NULL, sock_id);
4168			} else {
4169				tcp_ss_rexmit(tcp, sock_id);
4170			}
4171			/*
4172			 * The TCP could be closed in tcp_state_wait via
4173			 * tcp_wput_data (tcp_ss_rexmit could call
4174			 * tcp_wput_data as well).
4175			 */
4176			if (sockets[sock_id].pcb == NULL)
4177				return;
4178		}
4179		/*
4180		 * Adjust tcp_cwnd back to normal value after sending
4181		 * new data segments.
4182		 */
4183		if (flags & TH_LIMIT_XMIT) {
4184			tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4185		}
4186
4187		/* Anything more to do? */
4188		if ((flags & (TH_ACK_NEEDED|TH_TIMER_NEEDED)) == 0)
4189			return;
4190	}
4191	ack_check:
4192	if (flags & TH_ACK_NEEDED) {
4193		/*
4194		 * Time to send an ack for some reason.
4195		 */
4196		if ((mp1 = tcp_ack_mp(tcp)) != NULL) {
4197			TCP_DUMP_PACKET("tcp_rput_data: ack mp", mp1);
4198			(void) ipv4_tcp_output(sock_id, mp1);
4199			BUMP_MIB(tcp_mib.tcpOutAck);
4200			freeb(mp1);
4201		}
4202	}
4203	}
4204
4205	/*
4206	 * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start
4207	 * retransmission after a timeout.
4208	 *
4209	 * To limit the number of duplicate segments, we limit the number
4210	 * of segments sent at one time to tcp_snd_burst, the burst variable.
4211	 */
4212	static void
4213	tcp_ss_rexmit(tcp_t *tcp, int sock_id)
4214	{
4215		uint32_t	snxt;
4216		uint32_t	smax;
4217		int32_t		win;
4218		int32_t		mss;
4219		int32_t		off;
4220		int32_t		burst = tcp->tcp_snd_burst;
4221		mblk_t		*snxt_mp;
4222
4223		/*
4224		 * Note that tcp_rexmit can be set even though TCP has retransmitted
4225		 * all unack'ed segments.
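		 *
		 * A worked example of the TH_REXMIT_NEEDED size clamp above:
		 * with tcp_snxt - tcp_suna = 4000 bytes outstanding,
		 * mss = 1460 and tcp_swnd = 1000, snd_size is capped first
		 * to 1460 and then to 1000, so a single window-limited
		 * segment starting at tcp_suna is retransmitted;
		 * tcp_xmit_mp() may trim it further and returns the size
		 * actually used back in snd_size.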
4226	 */
4227	if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
4228		smax = tcp->tcp_rexmit_max;
4229		snxt = tcp->tcp_rexmit_nxt;
4230		if (SEQ_LT(snxt, tcp->tcp_suna)) {
4231			snxt = tcp->tcp_suna;
4232		}
4233		win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
4234		win -= snxt - tcp->tcp_suna;
4235		mss = tcp->tcp_mss;
4236		snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
4237
4238		while (SEQ_LT(snxt, smax) && (win > 0) &&
4239		    (burst > 0) && (snxt_mp != NULL)) {
4240			mblk_t	*xmit_mp;
4241			mblk_t	*old_snxt_mp = snxt_mp;
4242			uint32_t cnt = mss;
4243
4244			if (win < cnt) {
4245				cnt = win;
4246			}
4247			if (SEQ_GT(snxt + cnt, smax)) {
4248				cnt = smax - snxt;
4249			}
4250			xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
4251			    &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
4252
4253			if (xmit_mp == NULL)
4254				return;
4255
4256			(void) ipv4_tcp_output(sock_id, xmit_mp);
4257			freeb(xmit_mp);
4258
4259			snxt += cnt;
4260			win -= cnt;
4261			/*
4262			 * Update the send timestamp to avoid false
4263			 * retransmission.
4264			 * Note.  use uintptr_t to suppress the gcc warning.
4265			 */
4266			old_snxt_mp->b_prev =
4267			    (mblk_t *)(uintptr_t)prom_gettime();
4268			BUMP_MIB(tcp_mib.tcpRetransSegs);
4269			UPDATE_MIB(tcp_mib.tcpRetransBytes, cnt);
4270
4271			tcp->tcp_rexmit_nxt = snxt;
4272			burst--;
4273		}
4274		/*
4275		 * If we have transmitted all we have at the time
4276		 * we started the retransmission, we can leave
4277		 * the rest of the job to tcp_wput_data().  But we
4278		 * need to check the send window first.  If the
4279		 * win is not 0, go on with tcp_wput_data().
4280		 */
4281		if (SEQ_LT(snxt, smax) || win == 0) {
4282			return;
4283		}
4284	}
4285	/* Only call tcp_wput_data() if there is data to be sent. */
4286	if (tcp->tcp_unsent) {
4287		tcp_wput_data(tcp, NULL, sock_id);
4288	}
4289	}
4290
4291	/*
4292	 * tcp_timer is the timer service routine.  It handles all timer events for
4293	 * a tcp instance except keepalives.  It figures out from the state of the
4294	 * tcp instance what kind of action needs to be done at the time it is called.
4295	 */
4296	static void
4297	tcp_timer(tcp_t *tcp, int sock_id)
4298	{
4299		mblk_t		*mp;
4300		uint32_t	first_threshold;
4301		uint32_t	second_threshold;
4302		uint32_t	ms;
4303		uint32_t	mss;
4304
4305		first_threshold = tcp->tcp_first_timer_threshold;
4306		second_threshold = tcp->tcp_second_timer_threshold;
4307		switch (tcp->tcp_state) {
4308		case TCPS_IDLE:
4309		case TCPS_BOUND:
4310		case TCPS_LISTEN:
4311			return;
4312		case TCPS_SYN_RCVD:
4313		case TCPS_SYN_SENT:
4314			first_threshold = tcp->tcp_first_ctimer_threshold;
4315			second_threshold = tcp->tcp_second_ctimer_threshold;
4316			break;
4317		case TCPS_ESTABLISHED:
4318		case TCPS_FIN_WAIT_1:
4319		case TCPS_CLOSING:
4320		case TCPS_CLOSE_WAIT:
4321		case TCPS_LAST_ACK:
4322			/* If we have data to rexmit */
4323			if (tcp->tcp_suna != tcp->tcp_snxt) {
4324				int32_t time_to_wait;
4325
4326				BUMP_MIB(tcp_mib.tcpTimRetrans);
4327				if (tcp->tcp_xmit_head == NULL)
4328					break;
4329				/* use uintptr_t to suppress the gcc warning */
4330				time_to_wait = (int32_t)(prom_gettime() -
4331				    (uint32_t)(uintptr_t)tcp->tcp_xmit_head->b_prev);
4332				time_to_wait = tcp->tcp_rto - time_to_wait;
4333				if (time_to_wait > 0) {
4334					/*
4335					 * Timer fired too early, so restart it.
4336					 */
4337					TCP_TIMER_RESTART(tcp, time_to_wait);
4338					return;
4339				}
4340				/*
4341				 * When we probe zero windows, we force the swnd open.
4342				 * If our peer acks with a closed window, swnd will be
4343				 * set to zero by tcp_rput().
As long as we are
4344				 * receiving acks, tcp_rput will
4345				 * reset 'tcp_ms_we_have_waited' so as not to trip the
4346				 * first and second interval actions.  NOTE: the timer
4347				 * interval is allowed to continue its exponential
4348				 * backoff.
4349				 */
4350				if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
4351					DEBUG_1("tcp_timer (%d): zero win", sock_id);
4352					break;
4353				} else {
4354					/*
4355					 * After retransmission, we need to do
4356					 * slow start.  Set the ssthresh to one
4357					 * half of current effective window and
4358					 * cwnd to one MSS.  Also reset
4359					 * tcp_cwnd_cnt.
4360					 *
4361					 * Note that if tcp_ssthresh is reduced because
4362					 * of ECN, do not reduce it again unless it is
4363					 * already one window of data away (tcp_cwr
4364					 * should then be cleared) or this is a
4365					 * timeout for a retransmitted segment.
4366					 */
4367					uint32_t npkt;
4368
4369					if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
4370						npkt = (MIN((tcp->tcp_timer_backoff ?
4371						    tcp->tcp_cwnd_ssthresh :
4372						    tcp->tcp_cwnd),
4373						    tcp->tcp_swnd) >> 1) /
4374						    tcp->tcp_mss;
4375						if (npkt < 2)
4376							npkt = 2;
4377						tcp->tcp_cwnd_ssthresh = npkt *
4378						    tcp->tcp_mss;
4379					}
4380					tcp->tcp_cwnd = tcp->tcp_mss;
4381					tcp->tcp_cwnd_cnt = 0;
4382					if (tcp->tcp_ecn_ok) {
4383						tcp->tcp_cwr = B_TRUE;
4384						tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4385						tcp->tcp_ecn_cwr_sent = B_FALSE;
4386					}
4387				}
4388				break;
4389			}
4390			/*
4391			 * We have something to send yet we cannot send.  The
4392			 * reason can be:
4393			 *
4394			 * 1. Zero send window: we need to do zero window probe.
4395			 * 2. Zero cwnd: because of ECN, we need to "clock out"
4396			 * segments.
4397			 * 3. SWS avoidance: receiver may have shrunk window,
4398			 * reset our knowledge.
4399			 *
4400			 * Note that condition 2 can happen with either 1 or
4401			 * 3.  But 1 and 3 are exclusive.
4402			 */
4403			if (tcp->tcp_unsent != 0) {
4404				if (tcp->tcp_cwnd == 0) {
4405					/*
4406					 * Set tcp_cwnd to 1 MSS so that a
4407					 * new segment can be sent out.  We
4408					 * are "clocking out" new data when
4409					 * the network is really congested.
4410					 */
4411					assert(tcp->tcp_ecn_ok);
4412					tcp->tcp_cwnd = tcp->tcp_mss;
4413				}
4414				if (tcp->tcp_swnd == 0) {
4415					/* Extend window for zero window probe */
4416					tcp->tcp_swnd++;
4417					tcp->tcp_zero_win_probe = B_TRUE;
4418					BUMP_MIB(tcp_mib.tcpOutWinProbe);
4419				} else {
4420					/*
4421					 * Handle timeout from sender SWS avoidance.
4422					 * Reset our knowledge of the max send window
4423					 * since the receiver might have reduced its
4424					 * receive buffer.  Avoid setting tcp_max_swnd
4425					 * to one since that will essentially disable
4426					 * the SWS checks.
4427					 *
4428					 * Note that since we don't have a SWS
4429					 * state variable, if the timeout is set
4430					 * for ECN but not for SWS, this
4431					 * code will also be executed.  This is
4432					 * fine as tcp_max_swnd is updated
4433					 * constantly and it will not affect
4434					 * anything.
4435					 */
4436					tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
4437				}
4438				tcp_wput_data(tcp, NULL, sock_id);
4439				return;
4440			}
4441			/* Is there a FIN that needs to be retransmitted? */
4442			if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
4443			    !tcp->tcp_fin_acked)
4444				break;
4445			/* Nothing to do, return without restarting timer. */
4446			return;
4447		case TCPS_FIN_WAIT_2:
4448			/*
4449			 * User closed the TCP endpoint and peer ACK'ed our FIN.
4450			 * We waited some time for the peer's FIN, but it hasn't
4451			 * arrived.  We flush the connection now to avoid
4452			 * the case where the peer has rebooted.
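		 *
		 * (On the zero-window handling above: the probe works by
		 * artificially opening the send window by one byte
		 * (tcp_swnd++), letting tcp_wput_data() emit one byte past
		 * the advertised window.  The peer answers with an ACK
		 * restating its window, and once the window reopens, the
		 * ACK processing in tcp_rput_data() clears
		 * tcp_zero_win_probe and tcp_timer_backoff.)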
4453 */ 4454 /* FALLTHRU */ 4455 case TCPS_TIME_WAIT: 4456 (void) tcp_clean_death(sock_id, tcp, 0); 4457 return; 4458 default: 4459 DEBUG_3("tcp_timer (%d): strange state (%d) %s", sock_id, 4460 tcp->tcp_state, tcp_display(tcp, NULL, 4461 DISP_PORT_ONLY)); 4462 return; 4463 } 4464 if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { 4465 /* 4466 * For zero window probe, we need to send indefinitely, 4467 * unless we have not heard from the other side for some 4468 * time... 4469 */ 4470 if ((tcp->tcp_zero_win_probe == 0) || 4471 ((prom_gettime() - tcp->tcp_last_recv_time) > 4472 second_threshold)) { 4473 BUMP_MIB(tcp_mib.tcpTimRetransDrop); 4474 /* 4475 * If TCP is in SYN_RCVD state, send back a 4476 * RST|ACK as BSD does. Note that tcp_zero_win_probe 4477 * should be zero in TCPS_SYN_RCVD state. 4478 */ 4479 if (tcp->tcp_state == TCPS_SYN_RCVD) { 4480 tcp_xmit_ctl("tcp_timer: RST sent on timeout " 4481 "in SYN_RCVD", 4482 tcp, NULL, tcp->tcp_snxt, 4483 tcp->tcp_rnxt, TH_RST | TH_ACK, 0, sock_id); 4484 } 4485 (void) tcp_clean_death(sock_id, tcp, 4486 tcp->tcp_client_errno ? 4487 tcp->tcp_client_errno : ETIMEDOUT); 4488 return; 4489 } else { 4490 /* 4491 * Set tcp_ms_we_have_waited to second_threshold 4492 * so that in next timeout, we will do the above 4493 * check (lbolt - tcp_last_recv_time). This is 4494 * also to avoid overflow. 4495 * 4496 * We don't need to decrement tcp_timer_backoff 4497 * to avoid overflow because it will be decremented 4498 * later if new timeout value is greater than 4499 * tcp_rexmit_interval_max. In the case when 4500 * tcp_rexmit_interval_max is greater than 4501 * second_threshold, it means that we will wait 4502 * longer than second_threshold to send the next 4503 * window probe. 4504 */ 4505 tcp->tcp_ms_we_have_waited = second_threshold; 4506 } 4507 } else if (ms > first_threshold && tcp->tcp_rtt_sa != 0) { 4508 /* 4509 * We have been retransmitting for too long... The RTT 4510 * we calculated is probably incorrect. Reinitialize it. 4511 * Need to compensate for 0 tcp_rtt_sa. Reset 4512 * tcp_rtt_update so that we won't accidentally cache a 4513 * bad value. But only do this if this is not a zero 4514 * window probe. 4515 */ 4516 if (tcp->tcp_zero_win_probe == 0) { 4517 tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + 4518 (tcp->tcp_rtt_sa >> 5); 4519 tcp->tcp_rtt_sa = 0; 4520 tcp->tcp_rtt_update = 0; 4521 } 4522 } 4523 tcp->tcp_timer_backoff++; 4524 if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 4525 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < 4526 tcp_rexmit_interval_min) { 4527 /* 4528 * This means the original RTO is tcp_rexmit_interval_min. 4529 * So we will use tcp_rexmit_interval_min as the RTO value 4530 * and do the backoff. 4531 */ 4532 ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff; 4533 } else { 4534 ms <<= tcp->tcp_timer_backoff; 4535 } 4536 if (ms > tcp_rexmit_interval_max) { 4537 ms = tcp_rexmit_interval_max; 4538 /* 4539 * ms is at max, decrement tcp_timer_backoff to avoid 4540 * overflow. 4541 */ 4542 tcp->tcp_timer_backoff--; 4543 } 4544 tcp->tcp_ms_we_have_waited += ms; 4545 if (tcp->tcp_zero_win_probe == 0) { 4546 tcp->tcp_rto = ms; 4547 } 4548 TCP_TIMER_RESTART(tcp, ms); 4549 /* 4550 * This is after a timeout and tcp_rto is backed off. Set 4551 * tcp_set_timer to 1 so that next time RTO is updated, we will 4552 * restart the timer with a correct value. 
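	 *
	 * A worked example of the backoff computation above, assuming
	 * tcp_rtt_sa = 800 (i.e. a smoothed RTT of ~100 ms scaled by 8),
	 * tcp_rtt_sd = 50, tcp_rexmit_interval_extra = 0 and
	 * tcp_timer_backoff = 2 after two consecutive timeouts:
	 *
	 *	ms = (800 >> 3) + 50 + 0 + (800 >> 5) = 100 + 50 + 25 = 175
	 *	ms <<= 2, giving a 700 ms retransmit timer,
	 *
	 * clamped above by tcp_rexmit_interval_max, and seeded with
	 * tcp_rexmit_interval_min << tcp_timer_backoff when the raw value
	 * falls below the minimum.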
4553 */ 4554 tcp->tcp_set_timer = 1; 4555 mss = tcp->tcp_snxt - tcp->tcp_suna; 4556 if (mss > tcp->tcp_mss) 4557 mss = tcp->tcp_mss; 4558 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) 4559 mss = tcp->tcp_swnd; 4560 4561 if ((mp = tcp->tcp_xmit_head) != NULL) { 4562 /* use uintptr_t to suppress the gcc warning */ 4563 mp->b_prev = (mblk_t *)(uintptr_t)prom_gettime(); 4564 } 4565 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, 4566 B_TRUE); 4567 if (mp == NULL) 4568 return; 4569 tcp->tcp_csuna = tcp->tcp_snxt; 4570 BUMP_MIB(tcp_mib.tcpRetransSegs); 4571 UPDATE_MIB(tcp_mib.tcpRetransBytes, mss); 4572 /* Dump the packet when debugging. */ 4573 TCP_DUMP_PACKET("tcp_timer", mp); 4574 4575 (void) ipv4_tcp_output(sock_id, mp); 4576 freeb(mp); 4577 4578 /* 4579 * When slow start after retransmission begins, start with 4580 * this seq no. tcp_rexmit_max marks the end of special slow 4581 * start phase. tcp_snd_burst controls how many segments 4582 * can be sent because of an ack. 4583 */ 4584 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 4585 tcp->tcp_snd_burst = TCP_CWND_SS; 4586 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 4587 (tcp->tcp_unsent == 0)) { 4588 tcp->tcp_rexmit_max = tcp->tcp_fss; 4589 } else { 4590 tcp->tcp_rexmit_max = tcp->tcp_snxt; 4591 } 4592 tcp->tcp_rexmit = B_TRUE; 4593 tcp->tcp_dupack_cnt = 0; 4594 4595 /* 4596 * Remove all rexmit SACK blk to start from fresh. 4597 */ 4598 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 4599 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 4600 tcp->tcp_num_notsack_blk = 0; 4601 tcp->tcp_cnt_notsack_list = 0; 4602 } 4603 } 4604 4605 /* 4606 * The TCP normal data output path. 4607 * NOTE: the logic of the fast path is duplicated from this function. 4608 */ 4609 static void 4610 tcp_wput_data(tcp_t *tcp, mblk_t *mp, int sock_id) 4611 { 4612 int len; 4613 mblk_t *local_time; 4614 mblk_t *mp1; 4615 uchar_t *rptr; 4616 uint32_t snxt; 4617 int tail_unsent; 4618 int tcpstate; 4619 int usable = 0; 4620 mblk_t *xmit_tail; 4621 int32_t num_burst_seg; 4622 int32_t mss; 4623 int32_t num_sack_blk = 0; 4624 int32_t tcp_hdr_len; 4625 ipaddr_t *dst; 4626 ipaddr_t *src; 4627 4628 #ifdef DEBUG 4629 printf("tcp_wput_data(%d) ##############################\n", sock_id); 4630 #endif 4631 tcpstate = tcp->tcp_state; 4632 if (mp == NULL) { 4633 /* Really tacky... but we need this for detached closes. */ 4634 len = tcp->tcp_unsent; 4635 goto data_null; 4636 } 4637 4638 /* 4639 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 4640 * or before a connection attempt has begun. 4641 * 4642 * The following should not happen in inetboot.... 4643 */ 4644 if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 4645 (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4646 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4647 printf("tcp_wput_data: data after ordrel, %s\n", 4648 tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 4649 } 4650 freemsg(mp); 4651 return; 4652 } 4653 4654 /* Strip empties */ 4655 for (;;) { 4656 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 4657 (uintptr_t)INT_MAX); 4658 len = (int)(mp->b_wptr - mp->b_rptr); 4659 if (len > 0) 4660 break; 4661 mp1 = mp; 4662 mp = mp->b_cont; 4663 freeb(mp1); 4664 if (mp == NULL) { 4665 return; 4666 } 4667 } 4668 4669 /* If we are the first on the list ... 
 */
4670	if (tcp->tcp_xmit_head == NULL) {
4671		tcp->tcp_xmit_head = mp;
4672		tcp->tcp_xmit_tail = mp;
4673		tcp->tcp_xmit_tail_unsent = len;
4674	} else {
4675		tcp->tcp_xmit_last->b_cont = mp;
4676		len += tcp->tcp_unsent;
4677	}
4678
4679	/* Tack on however many more positive length mblks we have */
4680	if ((mp1 = mp->b_cont) != NULL) {
4681		do {
4682			int tlen;
4683			assert((uintptr_t)(mp1->b_wptr -
4684			    mp1->b_rptr) <= (uintptr_t)INT_MAX);
4685			tlen = (int)(mp1->b_wptr - mp1->b_rptr);
4686			if (tlen <= 0) {
4687				mp->b_cont = mp1->b_cont;
4688				freeb(mp1);
4689			} else {
4690				len += tlen;
4691				mp = mp1;
4692			}
4693		} while ((mp1 = mp->b_cont) != NULL);
4694	}
4695	tcp->tcp_xmit_last = mp;
4696	tcp->tcp_unsent = len;
4697
4698	data_null:
4699	snxt = tcp->tcp_snxt;
4700	xmit_tail = tcp->tcp_xmit_tail;
4701	tail_unsent = tcp->tcp_xmit_tail_unsent;
4702
4703	/*
4704	 * Note that tcp_mss has been adjusted to take into account the
4705	 * timestamp option if applicable.  Because SACK options do not
4706	 * appear in every TCP segment and they are of variable length,
4707	 * they cannot be included in tcp_mss.  Thus we need to calculate
4708	 * the actual segment length when we need to send a segment which
4709	 * includes SACK options.
4710	 */
4711	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
4712		int32_t	opt_len;
4713
4714		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
4715		    tcp->tcp_num_sack_blk);
4716		opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
4717		    2 + TCPOPT_HEADER_LEN;
4718		mss = tcp->tcp_mss - opt_len;
4719		tcp_hdr_len = tcp->tcp_hdr_len + opt_len;
4720	} else {
4721		mss = tcp->tcp_mss;
4722		tcp_hdr_len = tcp->tcp_hdr_len;
4723	}
4724
4725	if ((tcp->tcp_suna == snxt) &&
4726	    (prom_gettime() - tcp->tcp_last_recv_time) >= tcp->tcp_rto) {
4727		tcp->tcp_cwnd = MIN(tcp_slow_start_after_idle * mss,
4728		    MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss)));
4729	}
4730	if (tcpstate == TCPS_SYN_RCVD) {
4731		/*
4732		 * The three-way connection establishment handshake is not
4733		 * complete yet.  We want to queue the data for transmission
4734		 * after entering ESTABLISHED state (RFC793).  Setting usable
4735		 * to zero causes a jump to the "done" label, effectively
4736		 * leaving the data on the queue.
4737		 */
4738
4739		usable = 0;
4740	} else {
4741		int usable_r = tcp->tcp_swnd;
4742
4743		/*
4744		 * In the special case when cwnd is zero, which can only
4745		 * happen if the connection is ECN capable, return now.
4746		 * New segments are sent using tcp_timer().  The timer
4747		 * is set in tcp_rput_data().
4748		 */
4749		if (tcp->tcp_cwnd == 0) {
4750			/*
4751			 * Note that tcp_cwnd is 0 before 3-way handshake is
4752			 * finished.
4753			 */
4754			assert(tcp->tcp_ecn_ok ||
4755			    tcp->tcp_state < TCPS_ESTABLISHED);
4756			return;
4757		}
4758
4759		/* usable = MIN(swnd, cwnd) - unacked_bytes */
4760		if (usable_r > tcp->tcp_cwnd)
4761			usable_r = tcp->tcp_cwnd;
4762
4763		/* NOTE: trouble if xmitting while SYN not acked? */
4764		usable_r -= snxt;
4765		usable_r += tcp->tcp_suna;
4766
4767		/* usable = MIN(usable, unsent) */
4768		if (usable_r > len)
4769			usable_r = len;
4770
4771		/* usable = MAX(usable, {1 for urgent, 0 for data}) */
4772		if (usable_r != 0)
4773			usable = usable_r;
4774	}
4775
4776	/* use uintptr_t to suppress the gcc warning */
4777	local_time = (mblk_t *)(uintptr_t)prom_gettime();
4778
4779	/*
4780	 * "Our" Nagle Algorithm.  This is not the same as in the old
4781	 * BSD.  This is more in line with the true intent of Nagle.
4782	 *
4783	 * The conditions are:
4784	 * 1.
The amount of unsent data (or amount of data which can be
4785	 *    sent, whichever is smaller) is less than Nagle limit.
4786	 * 2. The last sent size is also less than Nagle limit.
4787	 * 3. There is unack'ed data.
4788	 * 4. Urgent pointer is not set.  Send urgent data ignoring the
4789	 *    Nagle algorithm.  This reduces the probability that urgent
4790	 *    bytes get "merged" together.
4791	 * 5. The app has not closed the connection.  This eliminates the
4792	 *    wait time of the receiving side waiting for the last piece of
4793	 *    (small) data.
4794	 *
4795	 * If all are satisfied, exit without sending anything.  Note
4796	 * that Nagle limit can be smaller than 1 MSS.  Nagle limit is
4797	 * the smaller of 1 MSS and global tcp_naglim_def (which defaults
4798	 * to 4095).
4799	 */
4800	if (usable < (int)tcp->tcp_naglim &&
4801	    tcp->tcp_naglim > tcp->tcp_last_sent_len &&
4802	    snxt != tcp->tcp_suna &&
4803	    !(tcp->tcp_valid_bits & TCP_URG_VALID))
4804		goto done;
4805
4806	num_burst_seg = tcp->tcp_snd_burst;
4807	for (;;) {
4808		tcph_t	*tcph;
4809		mblk_t	*new_mp;
4810
4811		if (num_burst_seg-- == 0)
4812			goto done;
4813
4814		len = mss;
4815		if (len > usable) {
4816			len = usable;
4817			if (len <= 0) {
4818				/* Terminate the loop */
4819				goto done;
4820			}
4821			/*
4822			 * Sender silly-window avoidance.
4823			 * Ignore this if we are going to send a
4824			 * zero window probe out.
4825			 *
4826			 * TODO: force data into microscopic window ??
4827			 *	==> (!pushed || (unsent > usable))
4828			 */
4829			if (len < (tcp->tcp_max_swnd >> 1) &&
4830			    (tcp->tcp_unsent - (snxt - tcp->tcp_snxt)) > len &&
4831			    !((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4832			    len == 1) && (! tcp->tcp_zero_win_probe)) {
4833				/*
4834				 * If the retransmit timer is not running
4835				 * we start it so that we will retransmit
4836				 * in the case when the receiver has
4837				 * decremented the window.
4838				 */
4839				if (snxt == tcp->tcp_snxt &&
4840				    snxt == tcp->tcp_suna) {
4841					/*
4842					 * We are not supposed to send
4843					 * anything.  So let's wait a little
4844					 * bit longer before breaking SWS
4845					 * avoidance.
4846					 *
4847					 * What should the value be?
4848					 * Suggestion: MAX(init rexmit time,
4849					 * tcp->tcp_rto)
4850					 */
4851					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4852				}
4853				goto done;
4854			}
4855		}
4856
4857		tcph = tcp->tcp_tcph;
4858
4859		usable -= len;	/* Approximate - can be adjusted later */
4860		if (usable > 0)
4861			tcph->th_flags[0] = TH_ACK;
4862		else
4863			tcph->th_flags[0] = (TH_ACK | TH_PUSH);
4864
4865		U32_TO_ABE32(snxt, tcph->th_seq);
4866
4867		if (tcp->tcp_valid_bits) {
4868			uchar_t	*prev_rptr = xmit_tail->b_rptr;
4869			uint32_t prev_snxt = tcp->tcp_snxt;
4870
4871			if (tail_unsent == 0) {
4872				assert(xmit_tail->b_cont != NULL);
4873				xmit_tail = xmit_tail->b_cont;
4874				prev_rptr = xmit_tail->b_rptr;
4875				tail_unsent = (int)(xmit_tail->b_wptr -
4876				    xmit_tail->b_rptr);
4877			} else {
4878				xmit_tail->b_rptr = xmit_tail->b_wptr -
4879				    tail_unsent;
4880			}
4881			mp = tcp_xmit_mp(tcp, xmit_tail, len, NULL, NULL,
4882			    snxt, B_FALSE, (uint32_t *)&len, B_FALSE);
4883			/* Restore tcp_snxt so we get amount sent right.
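			 *
			 * (A worked example of the Nagle test earlier in
			 * this function, assuming tcp_naglim = 1460: with
			 * usable = 300, tcp_last_sent_len = 100, and
			 * unacknowledged data outstanding (snxt != suna),
			 * all conditions hold and the small segment is held
			 * back until the outstanding data is acked; had
			 * everything been acked, the 300 bytes would be
			 * sent immediately.)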
*/ 4884 tcp->tcp_snxt = prev_snxt; 4885 if (prev_rptr == xmit_tail->b_rptr) 4886 xmit_tail->b_prev = local_time; 4887 else 4888 xmit_tail->b_rptr = prev_rptr; 4889 4890 if (mp == NULL) 4891 break; 4892 4893 mp1 = mp->b_cont; 4894 4895 snxt += len; 4896 tcp->tcp_last_sent_len = (ushort_t)len; 4897 while (mp1->b_cont) { 4898 xmit_tail = xmit_tail->b_cont; 4899 xmit_tail->b_prev = local_time; 4900 mp1 = mp1->b_cont; 4901 } 4902 tail_unsent = xmit_tail->b_wptr - mp1->b_wptr; 4903 BUMP_MIB(tcp_mib.tcpOutDataSegs); 4904 UPDATE_MIB(tcp_mib.tcpOutDataBytes, len); 4905 /* Dump the packet when debugging. */ 4906 TCP_DUMP_PACKET("tcp_wput_data (valid bits)", mp); 4907 (void) ipv4_tcp_output(sock_id, mp); 4908 freeb(mp); 4909 continue; 4910 } 4911 4912 snxt += len; /* Adjust later if we don't send all of len */ 4913 BUMP_MIB(tcp_mib.tcpOutDataSegs); 4914 UPDATE_MIB(tcp_mib.tcpOutDataBytes, len); 4915 4916 if (tail_unsent) { 4917 /* Are the bytes above us in flight? */ 4918 rptr = xmit_tail->b_wptr - tail_unsent; 4919 if (rptr != xmit_tail->b_rptr) { 4920 tail_unsent -= len; 4921 len += tcp_hdr_len; 4922 tcp->tcp_ipha->ip_len = htons(len); 4923 mp = dupb(xmit_tail); 4924 if (!mp) 4925 break; 4926 mp->b_rptr = rptr; 4927 goto must_alloc; 4928 } 4929 } else { 4930 xmit_tail = xmit_tail->b_cont; 4931 assert((uintptr_t)(xmit_tail->b_wptr - 4932 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 4933 tail_unsent = (int)(xmit_tail->b_wptr - 4934 xmit_tail->b_rptr); 4935 } 4936 4937 tail_unsent -= len; 4938 tcp->tcp_last_sent_len = (ushort_t)len; 4939 4940 len += tcp_hdr_len; 4941 if (tcp->tcp_ipversion == IPV4_VERSION) 4942 tcp->tcp_ipha->ip_len = htons(len); 4943 4944 xmit_tail->b_prev = local_time; 4945 4946 mp = dupb(xmit_tail); 4947 if (mp == NULL) 4948 goto out_of_mem; 4949 4950 len = tcp_hdr_len; 4951 /* 4952 * There are four reasons to allocate a new hdr mblk: 4953 * 1) The bytes above us are in use by another packet 4954 * 2) We don't have good alignment 4955 * 3) The mblk is being shared 4956 * 4) We don't have enough room for a header 4957 */ 4958 rptr = mp->b_rptr - len; 4959 if (!OK_32PTR(rptr) || 4960 rptr < mp->b_datap) { 4961 /* NOTE: we assume allocb returns an OK_32PTR */ 4962 4963 must_alloc:; 4964 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 4965 tcp_wroff_xtra, 0); 4966 if (mp1 == NULL) { 4967 freemsg(mp); 4968 goto out_of_mem; 4969 } 4970 mp1->b_cont = mp; 4971 mp = mp1; 4972 /* Leave room for Link Level header */ 4973 len = tcp_hdr_len; 4974 rptr = &mp->b_rptr[tcp_wroff_xtra]; 4975 mp->b_wptr = &rptr[len]; 4976 } 4977 4978 if (tcp->tcp_snd_ts_ok) { 4979 /* use uintptr_t to suppress the gcc warning */ 4980 U32_TO_BE32((uint32_t)(uintptr_t)local_time, 4981 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 4982 U32_TO_BE32(tcp->tcp_ts_recent, 4983 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 4984 } else { 4985 assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 4986 } 4987 4988 mp->b_rptr = rptr; 4989 4990 /* Copy the template header. */ 4991 dst = (ipaddr_t *)rptr; 4992 src = (ipaddr_t *)tcp->tcp_iphc; 4993 dst[0] = src[0]; 4994 dst[1] = src[1]; 4995 dst[2] = src[2]; 4996 dst[3] = src[3]; 4997 dst[4] = src[4]; 4998 dst[5] = src[5]; 4999 dst[6] = src[6]; 5000 dst[7] = src[7]; 5001 dst[8] = src[8]; 5002 dst[9] = src[9]; 5003 len = tcp->tcp_hdr_len; 5004 if (len -= 40) { 5005 len >>= 2; 5006 dst += 10; 5007 src += 10; 5008 do { 5009 *dst++ = *src++; 5010 } while (--len); 5011 } 5012 5013 /* 5014 * Set tcph to point to the header of the outgoing packet, 5015 * not to the template header. 
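		 *
		 * (On the template copy above: the ten word-sized
		 * assignments move exactly 40 bytes, i.e. a 20-byte IPv4
		 * header plus a 20-byte TCP header with no options, as an
		 * unrolled loop.  When tcp_hdr_len exceeds 40 because the
		 * template already carries TCP options, e.g. timestamps,
		 * the residual word-copy loop picks up the remainder one
		 * 32-bit word at a time.)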
5016 */ 5017 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 5018 5019 /* 5020 * Set the ECN info in the TCP header if it is not a zero 5021 * window probe. Zero window probe is only sent in 5022 * tcp_wput_data() and tcp_timer(). 5023 */ 5024 if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 5025 SET_ECT(tcp, rptr); 5026 5027 if (tcp->tcp_ecn_echo_on) 5028 tcph->th_flags[0] |= TH_ECE; 5029 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 5030 tcph->th_flags[0] |= TH_CWR; 5031 tcp->tcp_ecn_cwr_sent = B_TRUE; 5032 } 5033 } 5034 5035 /* Fill in SACK options */ 5036 if (num_sack_blk > 0) { 5037 uchar_t *wptr = rptr + tcp->tcp_hdr_len; 5038 sack_blk_t *tmp; 5039 int32_t i; 5040 5041 wptr[0] = TCPOPT_NOP; 5042 wptr[1] = TCPOPT_NOP; 5043 wptr[2] = TCPOPT_SACK; 5044 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5045 sizeof (sack_blk_t); 5046 wptr += TCPOPT_REAL_SACK_LEN; 5047 5048 tmp = tcp->tcp_sack_list; 5049 for (i = 0; i < num_sack_blk; i++) { 5050 U32_TO_BE32(tmp[i].begin, wptr); 5051 wptr += sizeof (tcp_seq); 5052 U32_TO_BE32(tmp[i].end, wptr); 5053 wptr += sizeof (tcp_seq); 5054 } 5055 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 5056 << 4); 5057 } 5058 5059 if (tail_unsent) { 5060 mp1 = mp->b_cont; 5061 if (mp1 == NULL) 5062 mp1 = mp; 5063 /* 5064 * If we're a little short, tack on more mblks 5065 * as long as we don't need to split an mblk. 5066 */ 5067 while (tail_unsent < 0 && 5068 tail_unsent + (int)(xmit_tail->b_cont->b_wptr - 5069 xmit_tail->b_cont->b_rptr) <= 0) { 5070 xmit_tail = xmit_tail->b_cont; 5071 /* Stash for rtt use later */ 5072 xmit_tail->b_prev = local_time; 5073 mp1->b_cont = dupb(xmit_tail); 5074 mp1 = mp1->b_cont; 5075 assert((uintptr_t)(xmit_tail->b_wptr - 5076 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 5077 tail_unsent += (int)(xmit_tail->b_wptr - 5078 xmit_tail->b_rptr); 5079 if (mp1 == NULL) { 5080 freemsg(mp); 5081 goto out_of_mem; 5082 } 5083 } 5084 /* Trim back any surplus on the last mblk */ 5085 if (tail_unsent > 0) 5086 mp1->b_wptr -= tail_unsent; 5087 if (tail_unsent < 0) { 5088 uint32_t ip_len; 5089 5090 /* 5091 * We did not send everything we could in 5092 * order to preserve mblk boundaries. 5093 */ 5094 usable -= tail_unsent; 5095 snxt += tail_unsent; 5096 tcp->tcp_last_sent_len += tail_unsent; 5097 UPDATE_MIB(tcp_mib.tcpOutDataBytes, 5098 tail_unsent); 5099 /* Adjust the IP length field. */ 5100 ip_len = ntohs(((struct ip *)rptr)->ip_len) + 5101 tail_unsent; 5102 ((struct ip *)rptr)->ip_len = htons(ip_len); 5103 tail_unsent = 0; 5104 } 5105 } 5106 5107 if (mp == NULL) 5108 goto out_of_mem; 5109 5110 /* 5111 * Performance hit! We need to pullup the whole message 5112 * in order to do checksum and for the MAC output routine. 
 */
5114		if (mp->b_cont != NULL) {
5115			int mp_size;
5116	#ifdef DEBUG
5117			printf("Multiple mblk %d\n", msgdsize(mp));
5118	#endif
5119			new_mp = allocb(msgdsize(mp) + tcp_wroff_xtra, 0);
5120			new_mp->b_rptr += tcp_wroff_xtra;
5121			new_mp->b_wptr = new_mp->b_rptr;
5122			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
5123				mp_size = mp1->b_wptr - mp1->b_rptr;
5124				bcopy(mp1->b_rptr, new_mp->b_wptr, mp_size);
5125				new_mp->b_wptr += mp_size;
5126			}
5127			/* Free the original chain that was just copied. */
5128			freemsg(mp);
5129			mp = new_mp;
5130		}
5131		tcp_set_cksum(mp);
5132		((struct ip *)mp->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl;
5133		TCP_DUMP_PACKET("tcp_wput_data", mp);
5134		(void) ipv4_tcp_output(sock_id, mp);
5135		freemsg(mp);
5136	}
5137	out_of_mem:;
5138	/* Pretend that all we were trying to send really got sent */
5139	if (tail_unsent < 0) {
5140		do {
5141			xmit_tail = xmit_tail->b_cont;
5142			xmit_tail->b_prev = local_time;
5143			assert((uintptr_t)(xmit_tail->b_wptr -
5144			    xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
5145			tail_unsent += (int)(xmit_tail->b_wptr -
5146			    xmit_tail->b_rptr);
5147		} while (tail_unsent < 0);
5148	}
5149	done:;
5150	tcp->tcp_xmit_tail = xmit_tail;
5151	tcp->tcp_xmit_tail_unsent = tail_unsent;
5152	len = tcp->tcp_snxt - snxt;
5153	if (len) {
5154		/*
5155		 * If new data was sent, need to update the notsack
5156		 * list, which is, after all, data blocks that have
5157		 * not been sack'ed by the receiver.  New data is
5158		 * not sack'ed.
5159		 */
5160		if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
5161			/* len is a negative value. */
5162			tcp->tcp_pipe -= len;
5163			tcp_notsack_update(&(tcp->tcp_notsack_list),
5164			    tcp->tcp_snxt, snxt,
5165			    &(tcp->tcp_num_notsack_blk),
5166			    &(tcp->tcp_cnt_notsack_list));
5167		}
5168		tcp->tcp_snxt = snxt + tcp->tcp_fin_sent;
5169		tcp->tcp_rack = tcp->tcp_rnxt;
5170		tcp->tcp_rack_cnt = 0;
5171		if ((snxt + len) == tcp->tcp_suna) {
5172			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
5173		}
5174		/*
5175		 * Note that len is the amount we just sent but with a negative
5176		 * sign.  We update tcp_unsent here since we may come back to
5177		 * tcp_wput_data from tcp_state_wait.
5178		 */
5179		len += tcp->tcp_unsent;
5180		tcp->tcp_unsent = len;
5181
5182		/*
5183		 * Let's wait till all the segments have been acked, since we
5184		 * don't have a timer.
5185		 */
5186		(void) tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED);
5187		return;
5188	} else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) {
5189		/*
5190		 * Didn't send anything.  Make sure the timer is running
5191		 * so that we will probe a zero window.
5192		 */
5193		TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
5194	}
5195
5196	/* Note that len is the amount we just sent but with a negative sign */
5197	len += tcp->tcp_unsent;
5198	tcp->tcp_unsent = len;
5199
5200	}
5201
5202	static void
5203	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
5204	    uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcph_t *tcph,
5205	    int sock_id)
5206	{
5207		int32_t		bytes_acked;
5208		int32_t		gap;
5209		int32_t		rgap;
5210		tcp_opt_t	tcpopt;
5211		uint_t		flags;
5212		uint32_t	new_swnd = 0;
5213
5214	#ifdef DEBUG
5215		printf("Time wait processing called ###############\n");
5216	#endif
5217
5218		/* Just make sure we send the right sock_id to tcp_clean_death */
5219		if ((sockets[sock_id].pcb == NULL) || (sockets[sock_id].pcb != tcp))
5220			sock_id = -1;
5221
5222		flags = (unsigned int)tcph->th_flags[0] & 0xFF;
5223		new_swnd = BE16_TO_U16(tcph->th_win) <<
5224		    ((tcph->th_flags[0] & TH_SYN) ?
0 : tcp->tcp_snd_ws); 5225 if (tcp->tcp_snd_ts_ok) { 5226 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 5227 freemsg(mp); 5228 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5229 tcp->tcp_rnxt, TH_ACK, 0, -1); 5230 return; 5231 } 5232 } 5233 gap = seg_seq - tcp->tcp_rnxt; 5234 rgap = tcp->tcp_rwnd - (gap + seg_len); 5235 if (gap < 0) { 5236 BUMP_MIB(tcp_mib.tcpInDataDupSegs); 5237 UPDATE_MIB(tcp_mib.tcpInDataDupBytes, 5238 (seg_len > -gap ? -gap : seg_len)); 5239 seg_len += gap; 5240 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 5241 if (flags & TH_RST) { 5242 freemsg(mp); 5243 return; 5244 } 5245 if ((flags & TH_FIN) && seg_len == -1) { 5246 /* 5247 * When TCP receives a duplicate FIN in 5248 * TIME_WAIT state, restart the 2 MSL timer. 5249 * See page 73 in RFC 793. Make sure this TCP 5250 * is already on the TIME_WAIT list. If not, 5251 * just restart the timer. 5252 */ 5253 tcp_time_wait_remove(tcp); 5254 tcp_time_wait_append(tcp); 5255 TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 5256 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5257 tcp->tcp_rnxt, TH_ACK, 0, -1); 5258 freemsg(mp); 5259 return; 5260 } 5261 flags |= TH_ACK_NEEDED; 5262 seg_len = 0; 5263 goto process_ack; 5264 } 5265 5266 /* Fix seg_seq, and chew the gap off the front. */ 5267 seg_seq = tcp->tcp_rnxt; 5268 } 5269 5270 if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 5271 /* 5272 * Make sure that when we accept the connection, pick 5273 * an ISS greater than (tcp_snxt + ISS_INCR/2) for the 5274 * old connection. 5275 * 5276 * The next ISS generated is equal to tcp_iss_incr_extra 5277 * + ISS_INCR/2 + other components depending on the 5278 * value of tcp_strong_iss. We pre-calculate the new 5279 * ISS here and compare with tcp_snxt to determine if 5280 * we need to make adjustment to tcp_iss_incr_extra. 5281 * 5282 * Note that since we are now in the global queue 5283 * perimeter and need to do a lateral_put() to the 5284 * listener queue, there can be other connection requests/ 5285 * attempts while the lateral_put() is going on. That 5286 * means what we calculate here may not be correct. This 5287 * is extremely difficult to solve unless TCP and IP 5288 * modules are merged and there is no perimeter, but just 5289 * locks. The above calculation is ugly and is a 5290 * waste of CPU cycles... 5291 */ 5292 uint32_t new_iss = tcp_iss_incr_extra; 5293 int32_t adj; 5294 5295 /* Add time component and min random (i.e. 1). */ 5296 new_iss += (prom_gettime() >> ISS_NSEC_SHT) + 1; 5297 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 5298 /* 5299 * New ISS not guaranteed to be ISS_INCR/2 5300 * ahead of the current tcp_snxt, so add the 5301 * difference to tcp_iss_incr_extra. 5302 */ 5303 tcp_iss_incr_extra += adj; 5304 } 5305 tcp_clean_death(sock_id, tcp, 0); 5306 5307 /* 5308 * This is a passive open. Right now we do not 5309 * do anything... 5310 */ 5311 freemsg(mp); 5312 return; 5313 } 5314 5315 /* 5316 * rgap is the amount of stuff received out of window. A negative 5317 * value is the amount out of window. 5318 */ 5319 if (rgap < 0) { 5320 BUMP_MIB(tcp_mib.tcpInDataPastWinSegs); 5321 UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap); 5322 /* Fix seg_len and make sure there is something left. */ 5323 seg_len += rgap; 5324 if (seg_len <= 0) { 5325 if (flags & TH_RST) { 5326 freemsg(mp); 5327 return; 5328 } 5329 flags |= TH_ACK_NEEDED; 5330 seg_len = 0; 5331 goto process_ack; 5332 } 5333 } 5334 /* 5335 * Check whether we can update tcp_ts_recent. This test is 5336 * NOT the one in RFC 1323 3.4. 
It is from Braden, 1993, "TCP 5337 * Extensions for High Performance: An Update", Internet Draft. 5338 */ 5339 if (tcp->tcp_snd_ts_ok && 5340 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 5341 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 5342 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 5343 tcp->tcp_last_rcv_lbolt = prom_gettime(); 5344 } 5345 5346 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 5347 /* Always ack out of order packets */ 5348 flags |= TH_ACK_NEEDED; 5349 seg_len = 0; 5350 } else if (seg_len > 0) { 5351 BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 5352 UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 5353 } 5354 if (flags & TH_RST) { 5355 freemsg(mp); 5356 (void) tcp_clean_death(sock_id, tcp, 0); 5357 return; 5358 } 5359 if (flags & TH_SYN) { 5360 freemsg(mp); 5361 tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, seg_seq + 1, 5362 TH_RST|TH_ACK, 0, -1); 5363 /* 5364 * Do not delete the TCP structure if it is in 5365 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 5366 */ 5367 return; 5368 } 5369 process_ack: 5370 if (flags & TH_ACK) { 5371 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 5372 if (bytes_acked <= 0) { 5373 if (bytes_acked == 0 && seg_len == 0 && 5374 new_swnd == tcp->tcp_swnd) 5375 BUMP_MIB(tcp_mib.tcpInDupAck); 5376 } else { 5377 /* Acks something not sent */ 5378 flags |= TH_ACK_NEEDED; 5379 } 5380 } 5381 freemsg(mp); 5382 if (flags & TH_ACK_NEEDED) { 5383 /* 5384 * Time to send an ack for some reason. 5385 */ 5386 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5387 tcp->tcp_rnxt, TH_ACK, 0, -1); 5388 } 5389 } 5390 5391 static int 5392 tcp_init_values(tcp_t *tcp, struct inetboot_socket *isp) 5393 { 5394 int err; 5395 5396 tcp->tcp_family = AF_INET; 5397 tcp->tcp_ipversion = IPV4_VERSION; 5398 5399 /* 5400 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 5401 * will be close to tcp_rexmit_interval_initial. By doing this, we 5402 * allow the algorithm to adjust slowly to large fluctuations of RTT 5403 * during first few transmissions of a connection as seen in slow 5404 * links. 5405 */ 5406 tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2; 5407 tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1; 5408 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 5409 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 5410 tcp_conn_grace_period; 5411 if (tcp->tcp_rto < tcp_rexmit_interval_min) 5412 tcp->tcp_rto = tcp_rexmit_interval_min; 5413 tcp->tcp_timer_backoff = 0; 5414 tcp->tcp_ms_we_have_waited = 0; 5415 tcp->tcp_last_recv_time = prom_gettime(); 5416 tcp->tcp_cwnd_max = tcp_cwnd_max_; 5417 tcp->tcp_snd_burst = TCP_CWND_INFINITE; 5418 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 5419 /* For Ethernet, the mtu returned is actually 1550... */ 5420 if (mac_get_type() == IFT_ETHER) { 5421 tcp->tcp_if_mtu = mac_get_mtu() - 50; 5422 } else { 5423 tcp->tcp_if_mtu = mac_get_mtu(); 5424 } 5425 tcp->tcp_mss = tcp->tcp_if_mtu; 5426 5427 tcp->tcp_first_timer_threshold = tcp_ip_notify_interval; 5428 tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval; 5429 tcp->tcp_second_timer_threshold = tcp_ip_abort_interval; 5430 /* 5431 * Fix it to tcp_ip_abort_linterval later if it turns out to be a 5432 * passive open. 5433 */ 5434 tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval; 5435 5436 tcp->tcp_naglim = tcp_naglim_def; 5437 5438 /* NOTE: ISS is now set in tcp_adapt_ire(). 
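	 *
	 * (A worked example of the RTO seeding above, assuming
	 * tcp_rexmit_interval_initial = 3000 ms and both
	 * tcp_rexmit_interval_extra and tcp_conn_grace_period = 0:
	 *
	 *	tcp_rtt_sa = 3000 << 2 = 12000
	 *	tcp_rtt_sd = 3000 >> 1 = 1500
	 *	tcp_rto    = (12000 >> 3) + 1500 + (12000 >> 5)
	 *		   = 1500 + 1500 + 375 = 3375 ms
	 *
	 * which is indeed close to the configured initial interval.)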
*/ 5439 5440 /* Initialize the header template */ 5441 if (tcp->tcp_ipversion == IPV4_VERSION) { 5442 err = tcp_header_init_ipv4(tcp); 5443 } 5444 if (err) 5445 return (err); 5446 5447 /* 5448 * Init the window scale to the max so tcp_rwnd_set() won't pare 5449 * down tcp_rwnd. tcp_adapt_ire() will set the right value later. 5450 */ 5451 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 5452 tcp->tcp_xmit_lowater = tcp_xmit_lowat; 5453 if (isp != NULL) { 5454 tcp->tcp_xmit_hiwater = isp->so_sndbuf; 5455 tcp->tcp_rwnd = isp->so_rcvbuf; 5456 tcp->tcp_rwnd_max = isp->so_rcvbuf; 5457 } 5458 tcp->tcp_state = TCPS_IDLE; 5459 return (0); 5460 } 5461 5462 /* 5463 * Initialize the IPv4 header. Loses any record of any IP options. 5464 */ 5465 static int 5466 tcp_header_init_ipv4(tcp_t *tcp) 5467 { 5468 tcph_t *tcph; 5469 5470 /* 5471 * This is a simple initialization. If there's 5472 * already a template, it should never be too small, 5473 * so reuse it. Otherwise, allocate space for the new one. 5474 */ 5475 if (tcp->tcp_iphc != NULL) { 5476 assert(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 5477 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 5478 } else { 5479 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 5480 tcp->tcp_iphc = bkmem_zalloc(tcp->tcp_iphc_len); 5481 if (tcp->tcp_iphc == NULL) { 5482 tcp->tcp_iphc_len = 0; 5483 return (ENOMEM); 5484 } 5485 } 5486 tcp->tcp_ipha = (struct ip *)tcp->tcp_iphc; 5487 tcp->tcp_ipversion = IPV4_VERSION; 5488 5489 /* 5490 * Note that it does not include TCP options yet. It will 5491 * after the connection is established. 5492 */ 5493 tcp->tcp_hdr_len = sizeof (struct ip) + sizeof (tcph_t); 5494 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 5495 tcp->tcp_ip_hdr_len = sizeof (struct ip); 5496 tcp->tcp_ipha->ip_v = IP_VERSION; 5497 /* We don't support IP options... */ 5498 tcp->tcp_ipha->ip_hl = IP_SIMPLE_HDR_LENGTH_IN_WORDS; 5499 tcp->tcp_ipha->ip_p = IPPROTO_TCP; 5500 /* We are not supposed to do PMTU discovery... */ 5501 tcp->tcp_ipha->ip_sum = 0; 5502 5503 tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (struct ip)); 5504 tcp->tcp_tcph = tcph; 5505 tcph->th_offset_and_rsrvd[0] = (5 << 4); 5506 return (0); 5507 } 5508 5509 /* 5510 * Send out a control packet on the tcp connection specified. This routine 5511 * is typically called where we need a simple ACK or RST generated. 5512 * 5513 * This function is called with or without a mp. 5514 */ 5515 static void 5516 tcp_xmit_ctl(char *str, tcp_t *tcp, mblk_t *mp, uint32_t seq, 5517 uint32_t ack, int ctl, uint_t ip_hdr_len, int sock_id) 5518 { 5519 uchar_t *rptr; 5520 tcph_t *tcph; 5521 struct ip *iph = NULL; 5522 int tcp_hdr_len; 5523 int tcp_ip_hdr_len; 5524 5525 tcp_hdr_len = tcp->tcp_hdr_len; 5526 tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; 5527 5528 if (mp) { 5529 assert(ip_hdr_len != 0); 5530 rptr = mp->b_rptr; 5531 tcph = (tcph_t *)(rptr + ip_hdr_len); 5532 /* Don't reply to a RST segment. */ 5533 if (tcph->th_flags[0] & TH_RST) { 5534 freeb(mp); 5535 return; 5536 } 5537 freemsg(mp); 5538 rptr = NULL; 5539 } else { 5540 assert(ip_hdr_len == 0); 5541 } 5542 /* If a text string is passed in with the request, print it out. 
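	 *
	 * (On the template initialized above: with no IP options and no
	 * TCP options yet, the template is 20 + 20 bytes; ip_hl = 5 words
	 * and th_offset_and_rsrvd[0] = (5 << 4) encodes a 5-word TCP data
	 * offset in the high nibble.  Every option added later bumps that
	 * nibble by the option's size in 32-bit words.)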
*/ 5543 if (str != NULL) { 5544 dprintf("tcp_xmit_ctl(%d): '%s', seq 0x%x, ack 0x%x, " 5545 "ctl 0x%x\n", sock_id, str, seq, ack, ctl); 5546 } 5547 mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0); 5548 if (mp == NULL) { 5549 dprintf("tcp_xmit_ctl(%d): Cannot allocate memory\n", sock_id); 5550 return; 5551 } 5552 rptr = &mp->b_rptr[tcp_wroff_xtra]; 5553 mp->b_rptr = rptr; 5554 mp->b_wptr = &rptr[tcp_hdr_len]; 5555 bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); 5556 5557 iph = (struct ip *)rptr; 5558 iph->ip_len = htons(tcp_hdr_len); 5559 5560 tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; 5561 tcph->th_flags[0] = (uint8_t)ctl; 5562 if (ctl & TH_RST) { 5563 BUMP_MIB(tcp_mib.tcpOutRsts); 5564 BUMP_MIB(tcp_mib.tcpOutControl); 5565 /* 5566 * Don't send TSopt w/ TH_RST packets per RFC 1323. 5567 */ 5568 if (tcp->tcp_snd_ts_ok && tcp->tcp_state > TCPS_SYN_SENT) { 5569 mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; 5570 *(mp->b_wptr) = TCPOPT_EOL; 5571 iph->ip_len = htons(tcp_hdr_len - 5572 TCPOPT_REAL_TS_LEN); 5573 tcph->th_offset_and_rsrvd[0] -= (3 << 4); 5574 } 5575 } 5576 if (ctl & TH_ACK) { 5577 uint32_t now = prom_gettime(); 5578 5579 if (tcp->tcp_snd_ts_ok) { 5580 U32_TO_BE32(now, 5581 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5582 U32_TO_BE32(tcp->tcp_ts_recent, 5583 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5584 } 5585 tcp->tcp_rack = ack; 5586 tcp->tcp_rack_cnt = 0; 5587 BUMP_MIB(tcp_mib.tcpOutAck); 5588 } 5589 BUMP_MIB(tcp_mib.tcpOutSegs); 5590 U32_TO_BE32(seq, tcph->th_seq); 5591 U32_TO_BE32(ack, tcph->th_ack); 5592 5593 tcp_set_cksum(mp); 5594 iph->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5595 TCP_DUMP_PACKET("tcp_xmit_ctl", mp); 5596 (void) ipv4_tcp_output(sock_id, mp); 5597 freeb(mp); 5598 } 5599 5600 /* Generate an ACK-only (no data) segment for a TCP endpoint */ 5601 static mblk_t * 5602 tcp_ack_mp(tcp_t *tcp) 5603 { 5604 if (tcp->tcp_valid_bits) { 5605 /* 5606 * For the complex case where we have to send some 5607 * controls (FIN or SYN), let tcp_xmit_mp do it. 5608 * When sending an ACK-only segment (no data) 5609 * into a zero window, always set the seq number to 5610 * suna, since snxt will be extended past the window. 5611 * If we used snxt, the receiver might consider the ACK 5612 * unacceptable. 5613 */ 5614 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 5615 (tcp->tcp_zero_win_probe) ? 5616 tcp->tcp_suna : 5617 tcp->tcp_snxt, B_FALSE, NULL, B_FALSE)); 5618 } else { 5619 /* Generate a simple ACK */ 5620 uchar_t *rptr; 5621 tcph_t *tcph; 5622 mblk_t *mp1; 5623 int32_t tcp_hdr_len; 5624 int32_t num_sack_blk = 0; 5625 int32_t sack_opt_len; 5626 5627 /* 5628 * Allocate space for TCP + IP headers 5629 * and link-level header 5630 */ 5631 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 5632 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 5633 tcp->tcp_num_sack_blk); 5634 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 5635 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 5636 tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; 5637 } else { 5638 tcp_hdr_len = tcp->tcp_hdr_len; 5639 } 5640 mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, 0); 5641 if (mp1 == NULL) 5642 return (NULL); 5643 5644 /* copy in prototype TCP + IP header */ 5645 rptr = mp1->b_rptr + tcp_wroff_xtra; 5646 mp1->b_rptr = rptr; 5647 mp1->b_wptr = rptr + tcp_hdr_len; 5648 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 5649 5650 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 5651 5652 /* 5653 * Set the TCP sequence number. 
5654 * When sending an ACK-only segment (no data) 5655 * into a zero window, always set the seq number to 5656 * suna, since snxt will be extended past the window. 5657 * If we used snxt, the receiver might consider the ACK 5658 * unacceptable. 5659 */ 5660 U32_TO_ABE32((tcp->tcp_zero_win_probe) ? 5661 tcp->tcp_suna : tcp->tcp_snxt, tcph->th_seq); 5662 5663 /* Set up the TCP flag field. */ 5664 tcph->th_flags[0] = (uchar_t)TH_ACK; 5665 if (tcp->tcp_ecn_echo_on) 5666 tcph->th_flags[0] |= TH_ECE; 5667 5668 tcp->tcp_rack = tcp->tcp_rnxt; 5669 tcp->tcp_rack_cnt = 0; 5670 5671 /* fill in timestamp option if in use */ 5672 if (tcp->tcp_snd_ts_ok) { 5673 uint32_t llbolt = (uint32_t)prom_gettime(); 5674 5675 U32_TO_BE32(llbolt, 5676 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5677 U32_TO_BE32(tcp->tcp_ts_recent, 5678 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5679 } 5680 5681 /* Fill in SACK options */ 5682 if (num_sack_blk > 0) { 5683 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 5684 sack_blk_t *tmp; 5685 int32_t i; 5686 5687 wptr[0] = TCPOPT_NOP; 5688 wptr[1] = TCPOPT_NOP; 5689 wptr[2] = TCPOPT_SACK; 5690 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5691 sizeof (sack_blk_t); 5692 wptr += TCPOPT_REAL_SACK_LEN; 5693 5694 tmp = tcp->tcp_sack_list; 5695 for (i = 0; i < num_sack_blk; i++) { 5696 U32_TO_BE32(tmp[i].begin, wptr); 5697 wptr += sizeof (tcp_seq); 5698 U32_TO_BE32(tmp[i].end, wptr); 5699 wptr += sizeof (tcp_seq); 5700 } 5701 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 5702 << 4); 5703 } 5704 5705 ((struct ip *)rptr)->ip_len = htons(tcp_hdr_len); 5706 tcp_set_cksum(mp1); 5707 ((struct ip *)rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5708 return (mp1); 5709 } 5710 } 5711 5712 /* 5713 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 5714 * ip and tcp header ready to pass down to IP. If the mp passed in is 5715 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 5716 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 5717 * otherwise it will dup partial mblks.) 5718 * Otherwise, an appropriate ACK packet will be generated. This 5719 * routine is not usually called to send new data for the first time. It 5720 * is mostly called out of the timer for retransmits, and to generate ACKs. 5721 * 5722 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 5723 * be adjusted by *offset. And after dupb(), the offset and the ending mblk 5724 * of the original mblk chain will be returned in *offset and *end_mp. 5725 */ 5726 static mblk_t * 5727 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 5728 mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 5729 boolean_t rexmit) 5730 { 5731 int data_length; 5732 int32_t off = 0; 5733 uint_t flags; 5734 mblk_t *mp1; 5735 mblk_t *mp2; 5736 mblk_t *new_mp; 5737 uchar_t *rptr; 5738 tcph_t *tcph; 5739 int32_t num_sack_blk = 0; 5740 int32_t sack_opt_len = 0; 5741 5742 /* Allocate for our maximum TCP header + link-level */ 5743 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 5744 tcp_wroff_xtra, 0); 5745 if (mp1 == NULL) 5746 return (NULL); 5747 data_length = 0; 5748 5749 /* 5750 * Note that tcp_mss has been adjusted to take into account the 5751 * timestamp option if applicable. Because SACK options do not 5752 * appear in every TCP segments and they are of variable lengths, 5753 * they cannot be included in tcp_mss. 
Thus we need to calculate
5754	 * the actual segment length when we need to send a segment which
5755	 * includes SACK options.
5756	 */
5757	if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
5758		num_sack_blk = MIN(tcp->tcp_max_sack_blk,
5759		    tcp->tcp_num_sack_blk);
5760		sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
5761		    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
5762		if (max_to_send + sack_opt_len > tcp->tcp_mss)
5763			max_to_send -= sack_opt_len;
5764	}
5765
5766	if (offset != NULL) {
5767		off = *offset;
5768		/* We use offset as an indicator that end_mp is not NULL. */
5769		*end_mp = NULL;
5770	}
5771	for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) {
5772		/* This could be faster with cooperation from downstream */
5773		if (mp2 != mp1 && !sendall &&
5774		    data_length + (int)(mp->b_wptr - mp->b_rptr) >
5775		    max_to_send)
5776			/*
5777			 * Don't send the next mblk since the whole mblk
5778			 * does not fit.
5779			 */
5780			break;
5781		mp2->b_cont = dupb(mp);
5782		mp2 = mp2->b_cont;
5783		if (mp2 == NULL) {
5784			freemsg(mp1);
5785			return (NULL);
5786		}
5787		mp2->b_rptr += off;
5788		assert((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
5789		    (uintptr_t)INT_MAX);
5790
5791		data_length += (int)(mp2->b_wptr - mp2->b_rptr);
5792		if (data_length > max_to_send) {
5793			mp2->b_wptr -= data_length - max_to_send;
5794			data_length = max_to_send;
5795			off = mp2->b_wptr - mp->b_rptr;
5796			break;
5797		} else {
5798			off = 0;
5799		}
5800	}
5801	if (offset != NULL) {
5802		*offset = off;
5803		*end_mp = mp;
5804	}
5805	if (seg_len != NULL) {
5806		*seg_len = data_length;
5807	}
5808
5809	rptr = mp1->b_rptr + tcp_wroff_xtra;
5810	mp1->b_rptr = rptr;
5811	mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len;
5812	bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len);
5813	tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len];
5814	U32_TO_ABE32(seq, tcph->th_seq);
5815
5816	/*
5817	 * Using tcp_unsent to determine whether the PUSH bit should be set
5818	 * assumes that this function was called from tcp_wput_data.  Thus,
5819	 * when called to retransmit data, the setting of the PUSH bit may
5820	 * appear somewhat random in that it might get set when it should
5821	 * not.  This should not pose any performance issues.
5822	 */
5823	if (data_length != 0 && (tcp->tcp_unsent == 0 ||
5824	    tcp->tcp_unsent == data_length)) {
5825		flags = TH_ACK | TH_PUSH;
5826	} else {
5827		flags = TH_ACK;
5828	}
5829
5830	if (tcp->tcp_ecn_ok) {
5831		if (tcp->tcp_ecn_echo_on)
5832			flags |= TH_ECE;
5833
5834		/*
5835		 * Only set ECT bit and ECN_CWR if a segment contains new data.
5836		 * There is no TCP flow control for non-data segments, and
5837		 * only data segments are transmitted reliably.
5838		 */
5839		if (data_length > 0 && !rexmit) {
5840			SET_ECT(tcp, rptr);
5841			if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
5842				flags |= TH_CWR;
5843				tcp->tcp_ecn_cwr_sent = B_TRUE;
5844			}
5845		}
5846	}
5847
5848	if (tcp->tcp_valid_bits) {
5849		uint32_t u1;
5850
5851		if ((tcp->tcp_valid_bits & TCP_ISS_VALID) &&
5852		    seq == tcp->tcp_iss) {
5853			uchar_t	*wptr;
5854
5855			/*
5856			 * Tack on the MSS option.  It is always needed
5857			 * for both active and passive open.
5858			 */
5859			wptr = mp1->b_wptr;
5860			wptr[0] = TCPOPT_MAXSEG;
5861			wptr[1] = TCPOPT_MAXSEG_LEN;
5862			wptr += 2;
5863			/*
5864			 * MSS option value should be interface MTU - MIN
5865			 * TCP/IP header.
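			 *
			 * (A worked example, assuming tcp_if_mtu = 1500 as
			 * set up for Ethernet in tcp_init_values():
			 * u1 = 1500 - 20 - 20 = 1460, so the bytes emitted
			 * below are 0x02 0x04 0x05 0xB4, i.e. kind 2,
			 * length 4, MSS 1460 in network byte order.)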
			 */
			u1 = tcp->tcp_if_mtu - IP_SIMPLE_HDR_LENGTH -
			    TCP_MIN_HEADER_LENGTH;
			U16_TO_BE16(u1, wptr);
			mp1->b_wptr = wptr + 2;
			/* Update the offset to cover the additional word */
			tcph->th_offset_and_rsrvd[0] += (1 << 4);

			/*
			 * Note that the following way of filling in
			 * TCP options is not optimal.  Some NOPs can
			 * be saved.  But there is no need at this time
			 * to optimize it.  When it is needed, we will
			 * do it.
			 */
			switch (tcp->tcp_state) {
			case TCPS_SYN_SENT:
				flags = TH_SYN;

				if (tcp->tcp_snd_ws_ok) {
					wptr = mp1->b_wptr;
					wptr[0] = TCPOPT_NOP;
					wptr[1] = TCPOPT_WSCALE;
					wptr[2] = TCPOPT_WS_LEN;
					wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
					mp1->b_wptr += TCPOPT_REAL_WS_LEN;
					tcph->th_offset_and_rsrvd[0] +=
					    (1 << 4);
				}

				if (tcp->tcp_snd_ts_ok) {
					uint32_t llbolt;

					llbolt = prom_gettime();
					wptr = mp1->b_wptr;
					wptr[0] = TCPOPT_NOP;
					wptr[1] = TCPOPT_NOP;
					wptr[2] = TCPOPT_TSTAMP;
					wptr[3] = TCPOPT_TSTAMP_LEN;
					wptr += 4;
					U32_TO_BE32(llbolt, wptr);
					wptr += 4;
					assert(tcp->tcp_ts_recent == 0);
					U32_TO_BE32(0L, wptr);
					mp1->b_wptr += TCPOPT_REAL_TS_LEN;
					tcph->th_offset_and_rsrvd[0] +=
					    (3 << 4);
				}

				if (tcp->tcp_snd_sack_ok) {
					wptr = mp1->b_wptr;
					wptr[0] = TCPOPT_NOP;
					wptr[1] = TCPOPT_NOP;
					wptr[2] = TCPOPT_SACK_PERMITTED;
					wptr[3] = TCPOPT_SACK_OK_LEN;
					mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN;
					tcph->th_offset_and_rsrvd[0] +=
					    (1 << 4);
				}

				/*
				 * Set up all the bits to tell the other
				 * side we are ECN capable.
				 */
				if (tcp->tcp_ecn_ok) {
					flags |= (TH_ECE | TH_CWR);
				}
				break;
			case TCPS_SYN_RCVD:
				flags |= TH_SYN;

				if (tcp->tcp_snd_ws_ok) {
					wptr = mp1->b_wptr;
					wptr[0] = TCPOPT_NOP;
					wptr[1] = TCPOPT_WSCALE;
					wptr[2] = TCPOPT_WS_LEN;
					wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
					mp1->b_wptr += TCPOPT_REAL_WS_LEN;
					tcph->th_offset_and_rsrvd[0] += (1 << 4);
				}

				if (tcp->tcp_snd_sack_ok) {
					wptr = mp1->b_wptr;
					wptr[0] = TCPOPT_NOP;
					wptr[1] = TCPOPT_NOP;
					wptr[2] = TCPOPT_SACK_PERMITTED;
					wptr[3] = TCPOPT_SACK_OK_LEN;
					mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN;
					tcph->th_offset_and_rsrvd[0] +=
					    (1 << 4);
				}

				/*
				 * If the other side is ECN capable, reply
				 * that we are also ECN capable.
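				 * Per RFC 3168, a passive opener echoes
				 * only ECE in its SYN-ACK; TH_CWR is set
				 * only on the active open SYN (see the
				 * TCPS_SYN_SENT case above).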
				 */
				if (tcp->tcp_ecn_ok) {
					flags |= TH_ECE;
				}
				break;
			default:
				break;
			}
			/* allocb() of adequate mblk assures space */
			assert((uintptr_t)(mp1->b_wptr -
			    mp1->b_rptr) <= (uintptr_t)INT_MAX);
			if (flags & TH_SYN)
				BUMP_MIB(tcp_mib.tcpOutControl);
		}
		if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
		    (seq + data_length) == tcp->tcp_fss) {
			if (!tcp->tcp_fin_acked) {
				flags |= TH_FIN;
				BUMP_MIB(tcp_mib.tcpOutControl);
			}
			if (!tcp->tcp_fin_sent) {
				tcp->tcp_fin_sent = B_TRUE;
				switch (tcp->tcp_state) {
				case TCPS_SYN_RCVD:
				case TCPS_ESTABLISHED:
					tcp->tcp_state = TCPS_FIN_WAIT_1;
					break;
				case TCPS_CLOSE_WAIT:
					tcp->tcp_state = TCPS_LAST_ACK;
					break;
				}
				if (tcp->tcp_suna == tcp->tcp_snxt)
					TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
				tcp->tcp_snxt = tcp->tcp_fss + 1;
			}
		}
	}
	tcph->th_flags[0] = (uchar_t)flags;
	tcp->tcp_rack = tcp->tcp_rnxt;
	tcp->tcp_rack_cnt = 0;

	if (tcp->tcp_snd_ts_ok) {
		if (tcp->tcp_state != TCPS_SYN_SENT) {
			uint32_t llbolt = prom_gettime();

			U32_TO_BE32(llbolt,
			    (char *)tcph+TCP_MIN_HEADER_LENGTH+4);
			U32_TO_BE32(tcp->tcp_ts_recent,
			    (char *)tcph+TCP_MIN_HEADER_LENGTH+8);
		}
	}

	if (num_sack_blk > 0) {
		uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len;
		sack_blk_t *tmp;
		int32_t i;

		wptr[0] = TCPOPT_NOP;
		wptr[1] = TCPOPT_NOP;
		wptr[2] = TCPOPT_SACK;
		wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk *
		    sizeof (sack_blk_t);
		wptr += TCPOPT_REAL_SACK_LEN;

		tmp = tcp->tcp_sack_list;
		for (i = 0; i < num_sack_blk; i++) {
			U32_TO_BE32(tmp[i].begin, wptr);
			wptr += sizeof (tcp_seq);
			U32_TO_BE32(tmp[i].end, wptr);
			wptr += sizeof (tcp_seq);
		}
		tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4);
	}
	assert((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX);
	data_length += (int)(mp1->b_wptr - rptr);
	if (tcp->tcp_ipversion == IPV4_VERSION)
		((struct ip *)rptr)->ip_len = htons(data_length);

	/*
	 * Performance hit!  We need to pullup the whole message
	 * in order to compute the checksum and for the MAC output
	 * routine.
	 */
	if (mp1->b_cont != NULL) {
		int mp_size;
#ifdef DEBUG
		printf("Multiple mblk %d\n", msgdsize(mp1));
#endif
		mp2 = mp1;
		new_mp = allocb(msgdsize(mp1) + tcp_wroff_xtra, 0);
		if (new_mp == NULL) {
			/* Bail if we cannot get one contiguous buffer. */
			freemsg(mp1);
			return (NULL);
		}
		new_mp->b_rptr += tcp_wroff_xtra;
		new_mp->b_wptr = new_mp->b_rptr;
		while (mp1 != NULL) {
			mp_size = mp1->b_wptr - mp1->b_rptr;
			bcopy(mp1->b_rptr, new_mp->b_wptr, mp_size);
			new_mp->b_wptr += mp_size;
			mp1 = mp1->b_cont;
		}
		freemsg(mp2);
		mp1 = new_mp;
	}
	tcp_set_cksum(mp1);
	/*
	 * Fill in the TTL field, as it is 0 in the header template.
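	 * The TTL byte must stay zero until now: tcp_set_cksum() (defined
	 * below) folds the pseudo-header into the checksum in place,
	 * starting at the TTL field, so a nonzero TTL would corrupt the sum.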
	 */
	((struct ip *)mp1->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl;

	return (mp1);
}

/*
 * Generate a "no listener here" reset in response to the
 * connection request contained within 'mp'.
 */
static void
tcp_xmit_listeners_reset(int sock_id, mblk_t *mp, uint_t ip_hdr_len)
{
	uchar_t		*rptr;
	uint32_t	seg_len;
	tcph_t		*tcph;
	uint32_t	seg_seq;
	uint32_t	seg_ack;
	uint_t		flags;

	rptr = mp->b_rptr;

	tcph = (tcph_t *)&rptr[ip_hdr_len];
	seg_seq = BE32_TO_U32(tcph->th_seq);
	seg_ack = BE32_TO_U32(tcph->th_ack);
	flags = tcph->th_flags[0];

	seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len);
	if (flags & TH_RST) {
		freeb(mp);
	} else if (flags & TH_ACK) {
		tcp_xmit_early_reset("no tcp, reset",
		    sock_id, mp, seg_ack, 0, TH_RST, ip_hdr_len);
	} else {
		if (flags & TH_SYN)
			seg_len++;
		tcp_xmit_early_reset("no tcp, reset/ack", sock_id,
		    mp, 0, seg_seq + seg_len,
		    TH_RST | TH_ACK, ip_hdr_len);
	}
}

/* Non overlapping byte exchanger */
static void
tcp_xchg(uchar_t *a, uchar_t *b, int len)
{
	uchar_t	uch;

	while (len-- > 0) {
		uch = a[len];
		a[len] = b[len];
		b[len] = uch;
	}
}

/*
 * Generate a reset based on an inbound packet for which there is no active
 * tcp state that we can find.
 */
static void
tcp_xmit_early_reset(char *str, int sock_id, mblk_t *mp, uint32_t seq,
    uint32_t ack, int ctl, uint_t ip_hdr_len)
{
	struct ip	*iph = NULL;
	ushort_t	len;
	tcph_t		*tcph;
	int		i;
	ipaddr_t	addr;
	mblk_t		*new_mp;

	if (str != NULL) {
		dprintf("tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, "
		    "flags 0x%x\n", str, seq, ack, ctl);
	}

	/*
	 * We skip reversing the source route here.
	 * (For now we replace all IP options with EOL.)
	 */
	iph = (struct ip *)mp->b_rptr;
	for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++)
		mp->b_rptr[i] = IPOPT_EOL;
	/*
	 * Make sure that the src address is not a limited broadcast
	 * address.  Complete broadcast address checking for the src
	 * address is not possible, since we don't know the netmask
	 * of the src addr.
	 * No check for the destination address is done, since
	 * IP will not pass up a packet with a broadcast dest address
	 * to TCP.
	 */
	if (iph->ip_src.s_addr == INADDR_ANY ||
	    iph->ip_src.s_addr == INADDR_BROADCAST) {
		freemsg(mp);
		return;
	}

	tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
	if (tcph->th_flags[0] & TH_RST) {
		freemsg(mp);
		return;
	}
	/*
	 * Now copy the original header to a new buffer.  The reason
	 * for doing this is that we need to put extra room before
	 * the header for the MAC layer address.  The original mblk
	 * does not have this extra head room.
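	 * (tcp_wroff_xtra bytes are reserved in front of the copied
	 * header so the MAC layer can prepend its header without
	 * another allocation or copy.)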
	 */
	len = ip_hdr_len + sizeof (tcph_t);
	if ((new_mp = allocb(len + tcp_wroff_xtra, 0)) == NULL) {
		freemsg(mp);
		return;
	}
	new_mp->b_rptr += tcp_wroff_xtra;
	bcopy(mp->b_rptr, new_mp->b_rptr, len);
	new_mp->b_wptr = new_mp->b_rptr + len;
	freemsg(mp);
	mp = new_mp;
	iph = (struct ip *)mp->b_rptr;
	tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];

	tcph->th_offset_and_rsrvd[0] = (5 << 4);
	tcp_xchg(tcph->th_fport, tcph->th_lport, 2);
	U32_TO_BE32(ack, tcph->th_ack);
	U32_TO_BE32(seq, tcph->th_seq);
	U16_TO_BE16(0, tcph->th_win);
	bzero(tcph->th_sum, sizeof (int16_t));
	tcph->th_flags[0] = (uint8_t)ctl;
	if (ctl & TH_RST) {
		BUMP_MIB(tcp_mib.tcpOutRsts);
		BUMP_MIB(tcp_mib.tcpOutControl);
	}

	iph->ip_len = htons(len);
	/* Swap addresses */
	addr = iph->ip_src.s_addr;
	iph->ip_src = iph->ip_dst;
	iph->ip_dst.s_addr = addr;
	iph->ip_id = 0;
	iph->ip_ttl = 0;
	tcp_set_cksum(mp);
	iph->ip_ttl = (uint8_t)tcp_ipv4_ttl;

	/* Dump the packet when debugging. */
	TCP_DUMP_PACKET("tcp_xmit_early_reset", mp);
	(void) ipv4_tcp_output(sock_id, mp);
	freemsg(mp);
}

static void
tcp_set_cksum(mblk_t *mp)
{
	struct ip	*iph;
	tcpha_t		*tcph;
	int		len;

	iph = (struct ip *)mp->b_rptr;
	tcph = (tcpha_t *)(iph + 1);
	len = ntohs(iph->ip_len);
	/*
	 * Calculate the TCP checksum.  We need to include the pseudo
	 * header, which is similar to the real IP header starting at
	 * the TTL field.
	 */
	iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH);
	tcph->tha_sum = 0;
	tcph->tha_sum = tcp_cksum((uint16_t *)&(iph->ip_ttl),
	    len - IP_SIMPLE_HDR_LENGTH + 12);
	iph->ip_sum = 0;
}

static uint16_t
tcp_cksum(uint16_t *buf, uint32_t len)
{
	/*
	 * Compute the Internet Checksum for "len" bytes
	 * beginning at location "buf".
	 */
	int32_t sum = 0;

	while (len > 1) {
		/* This is the inner loop */
		sum += *buf++;
		len -= 2;
	}

	/* Add left-over byte, if any */
	if (len > 0)
		sum += *(unsigned char *)buf * 256;

	/* Fold 32-bit sum to 16 bits */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	return ((uint16_t)~sum);
}

/*
 * Type three generator adapted from the random() function in 4.4 BSD:
 */

/*
 * Copyright (c) 1983, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/* Type 3 -- x**31 + x**3 + 1 */
#define	DEG_3		31
#define	SEP_3		3


/* Protected by tcp_random_lock */
static int tcp_randtbl[DEG_3 + 1];

static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1];
static int *tcp_random_rptr = &tcp_randtbl[1];

static int *tcp_random_state = &tcp_randtbl[1];
static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1];

static void
tcp_random_init(void)
{
	int i;
	uint32_t hrt;
	uint32_t wallclock;
	uint32_t result;

	/*
	 * XXX We don't have high resolution time in standalone...  The
	 * following is just some approximation of the comment below.
	 *
	 * Use the high-res timer and the current time for the seed.
	 * gethrtime() returns a longlong, which may contain resolution
	 * down to nanoseconds.  The current time will either be a 32-bit
	 * or a 64-bit quantity.  XOR the two together in a 64-bit result
	 * variable.  Convert the result to a 32-bit value by multiplying
	 * the high-order 32-bits by the low-order 32-bits.
	 *
	 * XXX We don't have gethrtime() in the PROM, nor the wallclock....
	 */

	hrt = prom_gettime();
	wallclock = (uint32_t)time(NULL);
	result = wallclock ^ hrt;
	tcp_random_state[0] = result;

	for (i = 1; i < DEG_3; i++)
		tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1]
		    + 12345;
	tcp_random_fptr = &tcp_random_state[SEP_3];
	tcp_random_rptr = &tcp_random_state[0];
	for (i = 0; i < 10 * DEG_3; i++)
		(void) tcp_random();
}

/*
 * tcp_random: Return a random number in the range [1, 128K].
 * This range is selected to be approximately centered on TCP_ISS / 2,
 * and easy to compute.  We get this value by generating a 32-bit random
 * number, selecting out the high-order 17 bits, and then adding one so
 * that we never return zero.
 */
static int
tcp_random(void)
{
	int i;

	*tcp_random_fptr += *tcp_random_rptr;

	/*
	 * The high-order bits are more random than the low-order bits,
	 * so we select out the high-order 17 bits and add one so that
	 * we never return zero.
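	 * ((x >> 15) & 0x1ffff) keeps bits 15 through 31, a value in
	 * [0, 0x1ffff]; adding one maps that to [1, 0x20000], i.e. 1..128K.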
	 */
	i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1;
	if (++tcp_random_fptr >= tcp_random_end_ptr) {
		tcp_random_fptr = tcp_random_state;
		++tcp_random_rptr;
	} else if (++tcp_random_rptr >= tcp_random_end_ptr)
		tcp_random_rptr = tcp_random_state;

	return (i);
}

/*
 * Generate the ISS, taking into account that NDD changes may happen
 * halfway through.  (If the iss is not zero, set it.)
 */
static void
tcp_iss_init(tcp_t *tcp)
{
	tcp_iss_incr_extra += (ISS_INCR >> 1);
	tcp->tcp_iss = tcp_iss_incr_extra;
	tcp->tcp_iss += (prom_gettime() >> ISS_NSEC_SHT) + tcp_random();
	tcp->tcp_valid_bits = TCP_ISS_VALID;
	tcp->tcp_fss = tcp->tcp_iss - 1;
	tcp->tcp_suna = tcp->tcp_iss;
	tcp->tcp_snxt = tcp->tcp_iss + 1;
	tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
	tcp->tcp_csuna = tcp->tcp_snxt;
}

/*
 * Diagnostic routine used to return a string associated with the tcp state.
 * Note that if the caller does not supply a buffer, it will use an internal
 * static string.  This means that if multiple threads call this function at
 * the same time, output can be corrupted...  Note also that this function
 * does not check the size of the supplied buffer.  The caller has to make
 * sure that it is big enough.
 */
static char *
tcp_display(tcp_t *tcp, char *sup_buf, char format)
{
	char		buf1[30];
	static char	priv_buf[INET_ADDRSTRLEN * 2 + 80];
	char		*buf;
	char		*cp;
	char		local_addrbuf[INET_ADDRSTRLEN];
	char		remote_addrbuf[INET_ADDRSTRLEN];
	struct in_addr	addr;

	if (sup_buf != NULL)
		buf = sup_buf;
	else
		buf = priv_buf;

	if (tcp == NULL)
		return ("NULL_TCP");
	switch (tcp->tcp_state) {
	case TCPS_CLOSED:
		cp = "TCP_CLOSED";
		break;
	case TCPS_IDLE:
		cp = "TCP_IDLE";
		break;
	case TCPS_BOUND:
		cp = "TCP_BOUND";
		break;
	case TCPS_LISTEN:
		cp = "TCP_LISTEN";
		break;
	case TCPS_SYN_SENT:
		cp = "TCP_SYN_SENT";
		break;
	case TCPS_SYN_RCVD:
		cp = "TCP_SYN_RCVD";
		break;
	case TCPS_ESTABLISHED:
		cp = "TCP_ESTABLISHED";
		break;
	case TCPS_CLOSE_WAIT:
		cp = "TCP_CLOSE_WAIT";
		break;
	case TCPS_FIN_WAIT_1:
		cp = "TCP_FIN_WAIT_1";
		break;
	case TCPS_CLOSING:
		cp = "TCP_CLOSING";
		break;
	case TCPS_LAST_ACK:
		cp = "TCP_LAST_ACK";
		break;
	case TCPS_FIN_WAIT_2:
		cp = "TCP_FIN_WAIT_2";
		break;
	case TCPS_TIME_WAIT:
		cp = "TCP_TIME_WAIT";
		break;
	default:
		(void) sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
		cp = buf1;
		break;
	}
	switch (format) {
	case DISP_ADDR_AND_PORT:
		/*
		 * Note that we use the remote address in the tcp
		 * structure.  This means that it will print out
		 * the real destination address, not the next hop's
		 * address if source routing is used.
		 */
		addr.s_addr = tcp->tcp_bound_source;
		bcopy(inet_ntoa(addr), local_addrbuf, sizeof (local_addrbuf));
		addr.s_addr = tcp->tcp_remote;
		bcopy(inet_ntoa(addr), remote_addrbuf, sizeof (remote_addrbuf));
		(void) snprintf(buf, sizeof (priv_buf), "[%s.%u, %s.%u] %s",
		    local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf,
		    ntohs(tcp->tcp_fport), cp);
		break;
	case DISP_PORT_ONLY:
	default:
		(void) snprintf(buf, sizeof (priv_buf), "[%u, %u] %s",
		    ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp);
		break;
	}

	return (buf);
}

/*
 * Add a new piece to the tcp reassembly queue.  If the gap at the beginning
 * is filled, return as much as we can.  The message passed in may be
 * multi-part, chained using b_cont.  "start" is the starting sequence
 * number for this piece.
 */
static mblk_t *
tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start)
{
	uint32_t	end;
	mblk_t		*mp1;
	mblk_t		*mp2;
	mblk_t		*next_mp;
	uint32_t	u1;

	/* Walk through all the new pieces. */
	do {
		assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <=
		    (uintptr_t)INT_MAX);
		end = start + (int)(mp->b_wptr - mp->b_rptr);
		next_mp = mp->b_cont;
		if (start == end) {
			/* Empty.  Blast it. */
			freeb(mp);
			continue;
		}
		mp->b_cont = NULL;
		TCP_REASS_SET_SEQ(mp, start);
		TCP_REASS_SET_END(mp, end);
		mp1 = tcp->tcp_reass_tail;
		if (!mp1) {
			tcp->tcp_reass_tail = mp;
			tcp->tcp_reass_head = mp;
			BUMP_MIB(tcp_mib.tcpInDataUnorderSegs);
			UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start);
			continue;
		}
		/* New stuff completely beyond tail? */
		if (SEQ_GEQ(start, TCP_REASS_END(mp1))) {
			/* Link it on end. */
			mp1->b_cont = mp;
			tcp->tcp_reass_tail = mp;
			BUMP_MIB(tcp_mib.tcpInDataUnorderSegs);
			UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start);
			continue;
		}
		mp1 = tcp->tcp_reass_head;
		u1 = TCP_REASS_SEQ(mp1);
		/* New stuff at the front? */
		if (SEQ_LT(start, u1)) {
			/* Yes...  Check for overlap. */
			mp->b_cont = mp1;
			tcp->tcp_reass_head = mp;
			tcp_reass_elim_overlap(tcp, mp);
			continue;
		}
		/*
		 * The new piece fits somewhere between the head and tail.
		 * We find our slot, where mp1 precedes us and mp2 trails.
		 */
		for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) {
			u1 = TCP_REASS_SEQ(mp2);
			if (SEQ_LEQ(start, u1))
				break;
		}
		/* Link ourselves in */
		mp->b_cont = mp2;
		mp1->b_cont = mp;

		/* Trim overlap with following mblk(s) first */
		tcp_reass_elim_overlap(tcp, mp);

		/* Trim overlap with preceding mblk */
		tcp_reass_elim_overlap(tcp, mp1);

	} while (start = end, mp = next_mp);
	mp1 = tcp->tcp_reass_head;
	/*
	 * Anything ready to go?
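	 * Data can be passed up only if the first mblk on the queue now
	 * begins exactly at tcp_rnxt; otherwise the gap at the front of
	 * the sequence space is still unfilled.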
	 */
	if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt)
		return (NULL);
	/* Eat what we can off the queue */
	for (;;) {
		mp = mp1->b_cont;
		end = TCP_REASS_END(mp1);
		TCP_REASS_SET_SEQ(mp1, 0);
		TCP_REASS_SET_END(mp1, 0);
		if (!mp) {
			tcp->tcp_reass_tail = NULL;
			break;
		}
		if (end != TCP_REASS_SEQ(mp)) {
			mp1->b_cont = NULL;
			break;
		}
		mp1 = mp;
	}
	mp1 = tcp->tcp_reass_head;
	tcp->tcp_reass_head = mp;
	return (mp1);
}

/* Eliminate any overlap that mp may have over later mblks */
static void
tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp)
{
	uint32_t	end;
	mblk_t		*mp1;
	uint32_t	u1;

	end = TCP_REASS_END(mp);
	while ((mp1 = mp->b_cont) != NULL) {
		u1 = TCP_REASS_SEQ(mp1);
		if (!SEQ_GT(end, u1))
			break;
		if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) {
			mp->b_wptr -= end - u1;
			TCP_REASS_SET_END(mp, u1);
			BUMP_MIB(tcp_mib.tcpInDataPartDupSegs);
			UPDATE_MIB(tcp_mib.tcpInDataPartDupBytes, end - u1);
			break;
		}
		mp->b_cont = mp1->b_cont;
		freeb(mp1);
		BUMP_MIB(tcp_mib.tcpInDataDupSegs);
		UPDATE_MIB(tcp_mib.tcpInDataDupBytes, end - u1);
	}
	if (!mp1)
		tcp->tcp_reass_tail = mp;
}

/*
 * Remove a connection from the list of detached TIME_WAIT connections.
 */
static void
tcp_time_wait_remove(tcp_t *tcp)
{
	if (tcp->tcp_time_wait_expire == 0) {
		assert(tcp->tcp_time_wait_next == NULL);
		assert(tcp->tcp_time_wait_prev == NULL);
		return;
	}
	assert(tcp->tcp_state == TCPS_TIME_WAIT);
	if (tcp == tcp_time_wait_head) {
		assert(tcp->tcp_time_wait_prev == NULL);
		tcp_time_wait_head = tcp->tcp_time_wait_next;
		if (tcp_time_wait_head != NULL) {
			tcp_time_wait_head->tcp_time_wait_prev = NULL;
		} else {
			tcp_time_wait_tail = NULL;
		}
	} else if (tcp == tcp_time_wait_tail) {
		assert(tcp != tcp_time_wait_head);
		assert(tcp->tcp_time_wait_next == NULL);
		tcp_time_wait_tail = tcp->tcp_time_wait_prev;
		assert(tcp_time_wait_tail != NULL);
		tcp_time_wait_tail->tcp_time_wait_next = NULL;
	} else {
		assert(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
		assert(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
		tcp->tcp_time_wait_prev->tcp_time_wait_next =
		    tcp->tcp_time_wait_next;
		tcp->tcp_time_wait_next->tcp_time_wait_prev =
		    tcp->tcp_time_wait_prev;
	}
	tcp->tcp_time_wait_next = NULL;
	tcp->tcp_time_wait_prev = NULL;
	tcp->tcp_time_wait_expire = 0;
}

/*
 * Add a connection to the list of detached TIME_WAIT connections
 * and set its time to expire...
 */
static void
tcp_time_wait_append(tcp_t *tcp)
{
	tcp->tcp_time_wait_expire = prom_gettime() + tcp_time_wait_interval;
	if (tcp->tcp_time_wait_expire == 0)
		tcp->tcp_time_wait_expire = 1;

	if (tcp_time_wait_head == NULL) {
		assert(tcp_time_wait_tail == NULL);
		tcp_time_wait_head = tcp;
	} else {
		assert(tcp_time_wait_tail != NULL);
		assert(tcp_time_wait_tail->tcp_state == TCPS_TIME_WAIT);
		tcp_time_wait_tail->tcp_time_wait_next = tcp;
		tcp->tcp_time_wait_prev = tcp_time_wait_tail;
	}
	tcp_time_wait_tail = tcp;

	/* for ndd stats about compression */
	tcp_cum_timewait++;
}

/*
 * Periodic qtimeout routine run on the default queue.
 * Performs 2 functions.
 * 1. Does TIME_WAIT compression on all recently added tcps.  List
 *    traversal is done backwards from the tail.
 * 2. Blows away all tcps whose TIME_WAIT has expired.  List traversal
 *    is done forwards from the head.
 */
void
tcp_time_wait_collector(void)
{
	tcp_t *tcp;
	uint32_t now;

	/*
	 * In order to reap TIME_WAITs reliably, we should use a
	 * source of time that is not adjustable by the user.
	 */
	now = prom_gettime();
	while ((tcp = tcp_time_wait_head) != NULL) {
		/*
		 * Compare times using modular arithmetic, since
		 * lbolt can wrap over.
		 */
		if ((int32_t)(now - tcp->tcp_time_wait_expire) < 0) {
			break;
		}
		/*
		 * Note that the err must be 0 as there is no socket
		 * associated with this TCP...
		 */
		(void) tcp_clean_death(-1, tcp, 0);
	}
	/* Schedule next run time. */
	tcp_time_wait_runtime = prom_gettime() + 10000;
}

void
tcp_time_wait_report(void)
{
	tcp_t *tcp;

	printf("Current time %u\n", prom_gettime());
	for (tcp = tcp_time_wait_head; tcp != NULL;
	    tcp = tcp->tcp_time_wait_next) {
		printf("%s expires at %u\n", tcp_display(tcp, NULL,
		    DISP_ADDR_AND_PORT), tcp->tcp_time_wait_expire);
	}
}

/*
 * Send up all messages queued on tcp_rcv_list.
 * Have to set tcp_co_norm since we use putnext.
 */
static void
tcp_rcv_drain(int sock_id, tcp_t *tcp)
{
	mblk_t *mp;
	struct inetgram *in_gram;
	mblk_t *in_mp;
	int len;

	/* Don't drain if the app has not finished reading all the data. */
	if (sockets[sock_id].so_rcvbuf <= 0)
		return;

	/* We might have come here just to update the rwnd. */
	if (tcp->tcp_rcv_list == NULL)
		goto win_update;

	if ((in_gram = (struct inetgram *)bkmem_zalloc(
	    sizeof (struct inetgram))) == NULL) {
		return;
	}
	if ((in_mp = allocb(tcp->tcp_rcv_cnt, 0)) == NULL) {
		bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
		return;
	}
	in_gram->igm_level = APP_LVL;
	in_gram->igm_mp = in_mp;
	in_gram->igm_id = 0;

	while ((mp = tcp->tcp_rcv_list) != NULL) {
		tcp->tcp_rcv_list = mp->b_cont;
		len = mp->b_wptr - mp->b_rptr;
		bcopy(mp->b_rptr, in_mp->b_wptr, len);
		in_mp->b_wptr += len;
		freeb(mp);
	}

	tcp->tcp_rcv_last_tail = NULL;
	tcp->tcp_rcv_cnt = 0;
	add_grams(&sockets[sock_id].inq, in_gram);

	/* This means that so_rcvbuf can be less than 0. */
	sockets[sock_id].so_rcvbuf -= in_mp->b_wptr - in_mp->b_rptr;
win_update:
	/*
	 * Increase the receive window to the max.  But we need to do
	 * receiver SWS avoidance.  This means that we need to check
	 * that the increase of the receive window is at least 1 MSS.
	 */
	if (sockets[sock_id].so_rcvbuf > 0 &&
	    (tcp->tcp_rwnd_max - tcp->tcp_rwnd >= tcp->tcp_mss)) {
		tcp->tcp_rwnd = tcp->tcp_rwnd_max;
		U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
		    tcp->tcp_tcph->th_win);
	}
}

/*
 * Wrapper routine for recvfrom to call.
 */
void
tcp_rcv_drain_sock(int sock_id)
{
	tcp_t *tcp;

	if ((tcp = sockets[sock_id].pcb) == NULL)
		return;
	tcp_rcv_drain(sock_id, tcp);
}

/*
 * If the inq == NULL and the tcp_rcv_list != NULL, we have data that
 * recvfrom could read.
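 * (This happens when the application's receive buffer was full, so
 * incoming data had to be held on tcp_rcv_list instead of being
 * delivered; see the so_rcvbuf check in tcp_rcv_drain() above.)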
 * Place a magic message in the inq to let recvfrom know that it
 * needs to call tcp_rcv_drain_sock to pull up the data.
 */
static void
tcp_drain_needed(int sock_id, tcp_t *tcp)
{
	struct inetgram *in_gram;

#ifdef DEBUG
	printf("tcp_drain_needed: inq %x, tcp_rcv_list %x\n",
	    sockets[sock_id].inq, tcp->tcp_rcv_list);
#endif
	if ((sockets[sock_id].inq != NULL) ||
	    (tcp->tcp_rcv_list == NULL))
		return;

	if ((in_gram = (struct inetgram *)bkmem_zalloc(
	    sizeof (struct inetgram))) == NULL)
		return;

	in_gram->igm_level = APP_LVL;
	in_gram->igm_mp = NULL;
	in_gram->igm_id = TCP_CALLB_MAGIC_ID;

	add_grams(&sockets[sock_id].inq, in_gram);
}

/*
 * Queue data on tcp_rcv_list which is a b_next chain.
 * Each element of the chain is a b_cont chain.
 *
 * M_DATA messages are added to the current element.
 * Other messages are added as new (b_next) elements.
 */
static void
tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
{
	assert(seg_len == msgdsize(mp));
	if (tcp->tcp_rcv_list == NULL) {
		tcp->tcp_rcv_list = mp;
	} else {
		tcp->tcp_rcv_last_tail->b_cont = mp;
	}
	while (mp->b_cont)
		mp = mp->b_cont;
	tcp->tcp_rcv_last_tail = mp;
	tcp->tcp_rcv_cnt += seg_len;
	tcp->tcp_rwnd -= seg_len;
#ifdef DEBUG
	printf("tcp_rcv_enqueue rwnd %d\n", tcp->tcp_rwnd);
#endif
	U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win);
}

/* The minimum of the smoothed mean deviation used in RTO calculation. */
#define	TCP_SD_MIN	400

/*
 * Set RTO for this connection.  The formula is from Jacobson and Karels'
 * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
 * are the same as those in Appendix A.2 of that paper.
 *
 * m = new measurement
 * sa = smoothed RTT average (8 * average estimates).
 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
 */
static void
tcp_set_rto(tcp_t *tcp, int32_t rtt)
{
	int32_t		m = rtt;
	uint32_t	sa = tcp->tcp_rtt_sa;
	uint32_t	sv = tcp->tcp_rtt_sd;
	uint32_t	rto;

	BUMP_MIB(tcp_mib.tcpRttUpdate);
	tcp->tcp_rtt_update++;

	/* A nonzero tcp_rtt_sa means there is a previous sample to update. */
	if (sa != 0) {
		/*
		 * Update average estimator:
		 *	new rtt = 7/8 old rtt + 1/8 Error
		 */

		/* m is now Error in estimate. */
		m -= sa >> 3;
		if ((int32_t)(sa += m) <= 0) {
			/*
			 * Don't allow the smoothed average to be negative.
			 * We use 0 to denote reinitialization of the
			 * variables.
			 */
			sa = 1;
		}

		/*
		 * Update deviation estimator:
		 *	new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
		 */
		if (m < 0)
			m = -m;
		m -= sv >> 2;
		sv += m;
	} else {
		/*
		 * This follows BSD's implementation.  So the reinitialized
		 * RTO is 3 * m.  We cannot go less than 2 because if the
		 * link is bandwidth dominated, doubling the window size
		 * during slow start means doubling the RTT.  We want to be
		 * more conservative when we reinitialize our estimates.  3
		 * is just a convenient number.
		 */
		sa = m << 3;
		sv = m << 1;
	}
	if (sv < TCP_SD_MIN) {
		/*
		 * We do not know whether sa captures the delayed-ACK
		 * effect, since in a long train of segments a receiver
		 * does not delay its ACKs.
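		 * In that case the smoothed deviation can collapse toward
		 * zero, leaving the RTO with no margin for a later,
		 * genuinely delayed ACK.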
		 * So set the minimum of sv to TCP_SD_MIN, which defaults
		 * to 400 ms, twice the BSD delayed-ACK timeout (DATO).
		 * That means the minimum of the mean deviation is 100 ms.
		 */
		sv = TCP_SD_MIN;
	}
	tcp->tcp_rtt_sa = sa;
	tcp->tcp_rtt_sd = sv;
	/*
	 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
	 *
	 * Add tcp_rexmit_interval_extra in case of an extreme environment
	 * where the algorithm fails to work.  The default value of
	 * tcp_rexmit_interval_extra should be 0.
	 *
	 * As we use a finer grained clock than BSD and update
	 * RTO for every ACK, add in another .25 of RTT to the
	 * deviation of RTO to accommodate burstiness of 1/4 of
	 * window size.
	 */
	rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5);

	if (rto > tcp_rexmit_interval_max) {
		tcp->tcp_rto = tcp_rexmit_interval_max;
	} else if (rto < tcp_rexmit_interval_min) {
		tcp->tcp_rto = tcp_rexmit_interval_min;
	} else {
		tcp->tcp_rto = rto;
	}

	/* Now, we can reset tcp_timer_backoff to use the new RTO... */
	tcp->tcp_timer_backoff = 0;
}

/*
 * Initiate closedown sequence on an active connection.
 * Return value zero for OK return, non-zero for error return.
 */
static int
tcp_xmit_end(tcp_t *tcp, int sock_id)
{
	mblk_t	*mp;

	if (tcp->tcp_state < TCPS_SYN_RCVD ||
	    tcp->tcp_state > TCPS_CLOSE_WAIT) {
		/*
		 * Invalid state; only states TCPS_SYN_RCVD,
		 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid.
		 */
		return (-1);
	}

	tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent;
	tcp->tcp_valid_bits |= TCP_FSS_VALID;
	/*
	 * If there is nothing more unsent, send the FIN now.
	 * Otherwise, it will go out with the last segment.
	 */
	if (tcp->tcp_unsent == 0) {
		mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL,
		    tcp->tcp_fss, B_FALSE, NULL, B_FALSE);

		if (mp != NULL) {
			/* Dump the packet when debugging. */
			TCP_DUMP_PACKET("tcp_xmit_end", mp);
			(void) ipv4_tcp_output(sock_id, mp);
			freeb(mp);
		} else {
			/*
			 * Couldn't allocate msg.  Pretend we got it out.
			 * Wait for rexmit timeout.
			 */
			tcp->tcp_snxt = tcp->tcp_fss + 1;
			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
		}

		/*
		 * If needed, update tcp_rexmit_nxt as tcp_snxt has
		 * changed.
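		 * (tcp_rexmit_nxt pointed at the old tcp_fss, which
		 * tcp_snxt has just been advanced past.)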
		 */
		if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) {
			tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
		}
	} else {
		tcp_wput_data(tcp, NULL, B_FALSE);
	}

	return (0);
}

int
tcp_opt_set(tcp_t *tcp, int level, int option, const void *optval,
    socklen_t optlen)
{
	/* Start with no error; the cases below set errno on failure. */
	errno = 0;
	switch (level) {
	case SOL_SOCKET: {
		switch (option) {
		case SO_RCVBUF:
			if (optlen == sizeof (int)) {
				int val = *(int *)optval;

				if (val > tcp_max_buf) {
					errno = ENOBUFS;
					break;
				}
				/* Silently ignore zero */
				if (val != 0) {
					val = MSS_ROUNDUP(val, tcp->tcp_mss);
					(void) tcp_rwnd_set(tcp, val);
				}
			} else {
				errno = EINVAL;
			}
			break;
		case SO_SNDBUF:
			if (optlen == sizeof (int)) {
				tcp->tcp_xmit_hiwater = *(int *)optval;
				if (tcp->tcp_xmit_hiwater > tcp_max_buf)
					tcp->tcp_xmit_hiwater = tcp_max_buf;
			} else {
				errno = EINVAL;
			}
			break;
		case SO_LINGER:
			if (optlen == sizeof (struct linger)) {
				struct linger *lgr = (struct linger *)optval;

				if (lgr->l_onoff) {
					tcp->tcp_linger = 1;
					tcp->tcp_lingertime = lgr->l_linger;
				} else {
					tcp->tcp_linger = 0;
					tcp->tcp_lingertime = 0;
				}
			} else {
				errno = EINVAL;
			}
			break;
		default:
			errno = ENOPROTOOPT;
			break;
		}
		break;
	} /* case SOL_SOCKET */
	case IPPROTO_TCP: {
		switch (option) {
		default:
			errno = ENOPROTOOPT;
			break;
		}
		break;
	} /* case IPPROTO_TCP */
	case IPPROTO_IP: {
		switch (option) {
		default:
			errno = ENOPROTOOPT;
			break;
		}
		break;
	} /* case IPPROTO_IP */
	default:
		errno = ENOPROTOOPT;
		break;
	} /* switch (level) */

	if (errno != 0)
		return (-1);
	else
		return (0);
}
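
/*
 * Usage sketch (illustrative only, not part of the original source): a
 * caller holding socket descriptor sd could cap the send buffer like
 * this, assuming the usual plumbing from setsockopt() down to here:
 *
 *	int val = 32 * 1024;
 *	tcp_t *tcp = sockets[sd].pcb;
 *
 *	if (tcp_opt_set(tcp, SOL_SOCKET, SO_SNDBUF, &val,
 *	    sizeof (val)) < 0)
 *		printf("SO_SNDBUF failed, errno %d\n", errno);
 */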