1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 * 26 * tcp.c, Code implementing the TCP protocol. 27 */ 28 29 #pragma ident "%Z%%M% %I% %E% SMI" 30 31 #include <sys/types.h> 32 #include <socket_impl.h> 33 #include <socket_inet.h> 34 #include <sys/sysmacros.h> 35 #include <sys/promif.h> 36 #include <sys/socket.h> 37 #include <netinet/in_systm.h> 38 #include <netinet/in.h> 39 #include <netinet/ip.h> 40 #include <netinet/tcp.h> 41 #include <net/if_types.h> 42 #include <sys/salib.h> 43 44 #include "ipv4.h" 45 #include "ipv4_impl.h" 46 #include "mac.h" 47 #include "mac_impl.h" 48 #include "v4_sum_impl.h" 49 #include <sys/bootdebug.h> 50 #include "tcp_inet.h" 51 #include "tcp_sack.h" 52 #include <inet/common.h> 53 #include <inet/mib2.h> 54 55 /* 56 * We need to redefine BUMP_MIB/UPDATE_MIB to not have DTrace probes. 57 */ 58 #undef BUMP_MIB 59 #define BUMP_MIB(x) (x)++ 60 61 #undef UPDATE_MIB 62 #define UPDATE_MIB(x, y) x += y 63 64 /* 65 * MIB-2 stuff for SNMP 66 */ 67 mib2_tcp_t tcp_mib; /* SNMP fixed size info */ 68 69 /* The TCP mib does not include the following errors. */ 70 static uint_t tcp_cksum_errors; 71 static uint_t tcp_drops; 72 73 /* Macros for timestamp comparisons */ 74 #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 75 #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 76 77 /* 78 * Parameters for TCP Initial Send Sequence number (ISS) generation. 79 * The ISS is calculated by adding three components: a time component 80 * which grows by 1 every 4096 nanoseconds (versus every 4 microseconds 81 * suggested by RFC 793, page 27); 82 * a per-connection component which grows by 125000 for every new connection; 83 * and an "extra" component that grows by a random amount centered 84 * approximately on 64000. This causes the the ISS generator to cycle every 85 * 4.89 hours if no TCP connections are made, and faster if connections are 86 * made. 87 */ 88 #define ISS_INCR 250000 89 #define ISS_NSEC_SHT 0 90 91 static uint32_t tcp_iss_incr_extra; /* Incremented for each connection */ 92 93 #define TCP_XMIT_LOWATER 4096 94 #define TCP_XMIT_HIWATER 49152 95 #define TCP_RECV_LOWATER 2048 96 #define TCP_RECV_HIWATER 49152 97 98 /* 99 * PAWS needs a timer for 24 days. This is the number of ms in 24 days 100 */ 101 #define PAWS_TIMEOUT ((uint32_t)(24*24*60*60*1000)) 102 103 /* 104 * TCP options struct returned from tcp_parse_options. 105 */ 106 typedef struct tcp_opt_s { 107 uint32_t tcp_opt_mss; 108 uint32_t tcp_opt_wscale; 109 uint32_t tcp_opt_ts_val; 110 uint32_t tcp_opt_ts_ecr; 111 tcp_t *tcp; 112 } tcp_opt_t; 113 114 /* 115 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing 116 */ 117 118 #ifdef _BIG_ENDIAN 119 #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 120 (TCPOPT_TSTAMP << 8) | 10) 121 #else 122 #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ 123 (TCPOPT_NOP << 8) | TCPOPT_NOP) 124 #endif 125 126 /* 127 * Flags returned from tcp_parse_options. 128 */ 129 #define TCP_OPT_MSS_PRESENT 1 130 #define TCP_OPT_WSCALE_PRESENT 2 131 #define TCP_OPT_TSTAMP_PRESENT 4 132 #define TCP_OPT_SACK_OK_PRESENT 8 133 #define TCP_OPT_SACK_PRESENT 16 134 135 /* TCP option length */ 136 #define TCPOPT_NOP_LEN 1 137 #define TCPOPT_MAXSEG_LEN 4 138 #define TCPOPT_WS_LEN 3 139 #define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1) 140 #define TCPOPT_TSTAMP_LEN 10 141 #define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2) 142 #define TCPOPT_SACK_OK_LEN 2 143 #define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2) 144 #define TCPOPT_REAL_SACK_LEN 4 145 #define TCPOPT_MAX_SACK_LEN 36 146 #define TCPOPT_HEADER_LEN 2 147 148 /* TCP cwnd burst factor. */ 149 #define TCP_CWND_INFINITE 65535 150 #define TCP_CWND_SS 3 151 #define TCP_CWND_NORMAL 5 152 153 /* Named Dispatch Parameter Management Structure */ 154 typedef struct tcpparam_s { 155 uint32_t tcp_param_min; 156 uint32_t tcp_param_max; 157 uint32_t tcp_param_val; 158 char *tcp_param_name; 159 } tcpparam_t; 160 161 /* Max size IP datagram is 64k - 1 */ 162 #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (struct ip) + \ 163 sizeof (tcph_t))) 164 165 /* Max of the above */ 166 #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 167 168 /* Largest TCP port number */ 169 #define TCP_MAX_PORT (64 * 1024 - 1) 170 171 /* Round up the value to the nearest mss. */ 172 #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) 173 174 #define MS 1L 175 #define SECONDS (1000 * MS) 176 #define MINUTES (60 * SECONDS) 177 #define HOURS (60 * MINUTES) 178 #define DAYS (24 * HOURS) 179 180 /* All NDD params in the core TCP became static variables. */ 181 static int tcp_time_wait_interval = 1 * MINUTES; 182 static int tcp_conn_req_max_q = 128; 183 static int tcp_conn_req_max_q0 = 1024; 184 static int tcp_conn_req_min = 1; 185 static int tcp_conn_grace_period = 0 * SECONDS; 186 static int tcp_cwnd_max_ = 1024 * 1024; 187 static int tcp_smallest_nonpriv_port = 1024; 188 static int tcp_ip_abort_cinterval = 3 * MINUTES; 189 static int tcp_ip_abort_linterval = 3 * MINUTES; 190 static int tcp_ip_abort_interval = 8 * MINUTES; 191 static int tcp_ip_notify_cinterval = 10 * SECONDS; 192 static int tcp_ip_notify_interval = 10 * SECONDS; 193 static int tcp_ipv4_ttl = 64; 194 static int tcp_mss_def_ipv4 = 536; 195 static int tcp_mss_max_ipv4 = TCP_MSS_MAX_IPV4; 196 static int tcp_mss_min = 108; 197 static int tcp_naglim_def = (4*1024)-1; 198 static int tcp_rexmit_interval_initial = 3 * SECONDS; 199 static int tcp_rexmit_interval_max = 60 * SECONDS; 200 static int tcp_rexmit_interval_min = 400 * MS; 201 static int tcp_dupack_fast_retransmit = 3; 202 static int tcp_smallest_anon_port = 32 * 1024; 203 static int tcp_largest_anon_port = TCP_MAX_PORT; 204 static int tcp_xmit_lowat = TCP_XMIT_LOWATER; 205 static int tcp_recv_hiwat_minmss = 4; 206 static int tcp_fin_wait_2_flush_interval = 1 * MINUTES; 207 static int tcp_max_buf = 1024 * 1024; 208 static int tcp_wscale_always = 1; 209 static int tcp_tstamp_always = 1; 210 static int tcp_tstamp_if_wscale = 1; 211 static int tcp_rexmit_interval_extra = 0; 212 static int tcp_slow_start_after_idle = 2; 213 static int tcp_slow_start_initial = 2; 214 static int tcp_sack_permitted = 2; 215 static int tcp_ecn_permitted = 2; 216 217 /* Extra room to fit in headers. */ 218 static uint_t tcp_wroff_xtra; 219 220 /* Hint for next port to try. */ 221 static in_port_t tcp_next_port_to_try = 32*1024; 222 223 /* 224 * Figure out the value of window scale opton. Note that the rwnd is 225 * ASSUMED to be rounded up to the nearest MSS before the calculation. 226 * We cannot find the scale value and then do a round up of tcp_rwnd 227 * because the scale value may not be correct after that. 228 */ 229 #define SET_WS_VALUE(tcp) \ 230 { \ 231 int i; \ 232 uint32_t rwnd = (tcp)->tcp_rwnd; \ 233 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; \ 234 i++, rwnd >>= 1) \ 235 ; \ 236 (tcp)->tcp_rcv_ws = i; \ 237 } 238 239 /* 240 * Set ECN capable transport (ECT) code point in IP header. 241 * 242 * Note that there are 2 ECT code points '01' and '10', which are called 243 * ECT(1) and ECT(0) respectively. Here we follow the original ECT code 244 * point ECT(0) for TCP as described in RFC 2481. 245 */ 246 #define SET_ECT(tcp, iph) \ 247 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 248 /* We need to clear the code point first. */ \ 249 ((struct ip *)(iph))->ip_tos &= 0xFC; \ 250 ((struct ip *)(iph))->ip_tos |= IPH_ECN_ECT0; \ 251 } 252 253 /* 254 * The format argument to pass to tcp_display(). 255 * DISP_PORT_ONLY means that the returned string has only port info. 256 * DISP_ADDR_AND_PORT means that the returned string also contains the 257 * remote and local IP address. 258 */ 259 #define DISP_PORT_ONLY 1 260 #define DISP_ADDR_AND_PORT 2 261 262 /* 263 * TCP reassembly macros. We hide starting and ending sequence numbers in 264 * b_next and b_prev of messages on the reassembly queue. The messages are 265 * chained using b_cont. These macros are used in tcp_reass() so we don't 266 * have to see the ugly casts and assignments. 267 * Note. use uintptr_t to suppress the gcc warning. 268 */ 269 #define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next)) 270 #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \ 271 (mblk_t *)((uintptr_t)(u))) 272 #define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev)) 273 #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \ 274 (mblk_t *)((uintptr_t)(u))) 275 276 #define TCP_TIMER_RESTART(tcp, intvl) \ 277 (tcp)->tcp_rto_timeout = prom_gettime() + intvl; \ 278 (tcp)->tcp_timer_running = B_TRUE; 279 280 static int tcp_accept_comm(tcp_t *, tcp_t *, mblk_t *, uint_t); 281 static mblk_t *tcp_ack_mp(tcp_t *); 282 static in_port_t tcp_bindi(in_port_t, in_addr_t *, boolean_t, boolean_t); 283 static uint16_t tcp_cksum(uint16_t *, uint32_t); 284 static void tcp_clean_death(int, tcp_t *, int err); 285 static tcp_t *tcp_conn_request(tcp_t *, mblk_t *mp, uint_t, uint_t); 286 static char *tcp_display(tcp_t *, char *, char); 287 static int tcp_drain_input(tcp_t *, int, int); 288 static void tcp_drain_needed(int, tcp_t *); 289 static boolean_t tcp_drop_q0(tcp_t *); 290 static mblk_t *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *); 291 static int tcp_header_len(struct inetgram *); 292 static in_port_t tcp_report_ports(uint16_t *, enum Ports); 293 static int tcp_input(int); 294 static void tcp_iss_init(tcp_t *); 295 static tcp_t *tcp_lookup_ipv4(struct ip *, tcpha_t *, int, int *); 296 static tcp_t *tcp_lookup_listener_ipv4(in_addr_t, in_port_t, int *); 297 static int tcp_conn_check(tcp_t *); 298 static int tcp_close(int); 299 static void tcp_close_detached(tcp_t *); 300 static void tcp_eager_cleanup(tcp_t *, boolean_t, int); 301 static void tcp_eager_unlink(tcp_t *); 302 static void tcp_free(tcp_t *); 303 static int tcp_header_init_ipv4(tcp_t *); 304 static void tcp_mss_set(tcp_t *, uint32_t); 305 static int tcp_parse_options(tcph_t *, tcp_opt_t *); 306 static boolean_t tcp_paws_check(tcp_t *, tcph_t *, tcp_opt_t *); 307 static void tcp_process_options(tcp_t *, tcph_t *); 308 static int tcp_random(void); 309 static void tcp_random_init(void); 310 static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t); 311 static void tcp_reass_elim_overlap(tcp_t *, mblk_t *); 312 static void tcp_rcv_drain(int sock_id, tcp_t *); 313 static void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t); 314 static void tcp_rput_data(tcp_t *, mblk_t *, int); 315 static int tcp_rwnd_set(tcp_t *, uint32_t); 316 static int32_t tcp_sack_rxmit(tcp_t *, int); 317 static void tcp_set_cksum(mblk_t *); 318 static void tcp_set_rto(tcp_t *, int32_t); 319 static void tcp_ss_rexmit(tcp_t *, int); 320 static int tcp_state_wait(int, tcp_t *, int); 321 static void tcp_timer(tcp_t *, int); 322 static void tcp_time_wait_append(tcp_t *); 323 static void tcp_time_wait_collector(void); 324 static void tcp_time_wait_processing(tcp_t *, mblk_t *, uint32_t, 325 uint32_t, int, tcph_t *, int sock_id); 326 static void tcp_time_wait_remove(tcp_t *); 327 static in_port_t tcp_update_next_port(in_port_t); 328 static int tcp_verify_cksum(mblk_t *); 329 static void tcp_wput_data(tcp_t *, mblk_t *, int); 330 static void tcp_xmit_ctl(char *, tcp_t *, mblk_t *, uint32_t, uint32_t, 331 int, uint_t, int); 332 static void tcp_xmit_early_reset(char *, int, mblk_t *, uint32_t, uint32_t, 333 int, uint_t); 334 static int tcp_xmit_end(tcp_t *, int); 335 static void tcp_xmit_listeners_reset(int, mblk_t *, uint_t); 336 static mblk_t *tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *, 337 mblk_t **, uint32_t, boolean_t, uint32_t *, boolean_t); 338 static int tcp_init_values(tcp_t *, struct inetboot_socket *); 339 340 #if DEBUG > 1 341 #define TCP_DUMP_PACKET(str, mp) \ 342 { \ 343 int len = (mp)->b_wptr - (mp)->b_rptr; \ 344 \ 345 printf("%s: dump TCP(%d): \n", (str), len); \ 346 hexdump((char *)(mp)->b_rptr, len); \ 347 } 348 #else 349 #define TCP_DUMP_PACKET(str, mp) 350 #endif 351 352 #ifdef DEBUG 353 #define DEBUG_1(str, arg) printf(str, (arg)) 354 #define DEBUG_2(str, arg1, arg2) printf(str, (arg1), (arg2)) 355 #define DEBUG_3(str, arg1, arg2, arg3) printf(str, (arg1), (arg2), (arg3)) 356 #else 357 #define DEBUG_1(str, arg) 358 #define DEBUG_2(str, arg1, arg2) 359 #define DEBUG_3(str, arg1, arg2, arg3) 360 #endif 361 362 /* Whether it is the first time TCP is used. */ 363 static boolean_t tcp_initialized = B_FALSE; 364 365 /* TCP time wait list. */ 366 static tcp_t *tcp_time_wait_head; 367 static tcp_t *tcp_time_wait_tail; 368 static uint32_t tcp_cum_timewait; 369 /* When the tcp_time_wait_collector is run. */ 370 static uint32_t tcp_time_wait_runtime; 371 372 #define TCP_RUN_TIME_WAIT_COLLECTOR() \ 373 if (prom_gettime() > tcp_time_wait_runtime) \ 374 tcp_time_wait_collector(); 375 376 /* 377 * Accept will return with an error if there is no connection coming in 378 * after this (in ms). 379 */ 380 static int tcp_accept_timeout = 60000; 381 382 /* 383 * Initialize the TCP-specific parts of a socket. 384 */ 385 void 386 tcp_socket_init(struct inetboot_socket *isp) 387 { 388 /* Do some initializations. */ 389 if (!tcp_initialized) { 390 tcp_random_init(); 391 /* Extra head room for the MAC layer address. */ 392 if ((tcp_wroff_xtra = mac_get_hdr_len()) & 0x3) { 393 tcp_wroff_xtra = (tcp_wroff_xtra & ~0x3) + 0x4; 394 } 395 /* Schedule the first time wait cleanup time */ 396 tcp_time_wait_runtime = prom_gettime() + tcp_time_wait_interval; 397 tcp_initialized = B_TRUE; 398 } 399 TCP_RUN_TIME_WAIT_COLLECTOR(); 400 401 isp->proto = IPPROTO_TCP; 402 isp->input[TRANSPORT_LVL] = tcp_input; 403 /* Socket layer should call tcp_send() directly. */ 404 isp->output[TRANSPORT_LVL] = NULL; 405 isp->close[TRANSPORT_LVL] = tcp_close; 406 isp->headerlen[TRANSPORT_LVL] = tcp_header_len; 407 isp->ports = tcp_report_ports; 408 if ((isp->pcb = bkmem_alloc(sizeof (tcp_t))) == NULL) { 409 errno = ENOBUFS; 410 return; 411 } 412 if ((errno = tcp_init_values((tcp_t *)isp->pcb, isp)) != 0) { 413 bkmem_free(isp->pcb, sizeof (tcp_t)); 414 return; 415 } 416 /* 417 * This is set last because this field is used to determine if 418 * a socket is in use or not. 419 */ 420 isp->type = INETBOOT_STREAM; 421 } 422 423 /* 424 * Return the size of a TCP header including TCP option. 425 */ 426 static int 427 tcp_header_len(struct inetgram *igm) 428 { 429 mblk_t *pkt; 430 int ipvers; 431 432 /* Just returns the standard TCP header without option */ 433 if (igm == NULL) 434 return (sizeof (tcph_t)); 435 436 if ((pkt = igm->igm_mp) == NULL) 437 return (0); 438 439 ipvers = ((struct ip *)pkt->b_rptr)->ip_v; 440 if (ipvers == IPV4_VERSION) { 441 return (TCP_HDR_LENGTH((tcph_t *)(pkt + IPH_HDR_LENGTH(pkt)))); 442 } else { 443 dprintf("tcp_header_len: non-IPv4 packet.\n"); 444 return (0); 445 } 446 } 447 448 /* 449 * Return the requested port number in network order. 450 */ 451 static in_port_t 452 tcp_report_ports(uint16_t *tcphp, enum Ports request) 453 { 454 if (request == SOURCE) 455 return (*(uint16_t *)(((tcph_t *)tcphp)->th_lport)); 456 return (*(uint16_t *)(((tcph_t *)tcphp)->th_fport)); 457 } 458 459 /* 460 * Because inetboot is not interrupt driven, TCP can only poll. This 461 * means that there can be packets stuck in the NIC buffer waiting to 462 * be processed. Thus we need to drain them before, for example, sending 463 * anything because an ACK may actually be stuck there. 464 * 465 * The timeout arguments determine how long we should wait for draining. 466 */ 467 static int 468 tcp_drain_input(tcp_t *tcp, int sock_id, int timeout) 469 { 470 struct inetgram *in_gram; 471 struct inetgram *old_in_gram; 472 int old_timeout; 473 mblk_t *mp; 474 int i; 475 476 dprintf("tcp_drain_input(%d): %s\n", sock_id, 477 tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 478 479 /* 480 * Since the driver uses the in_timeout value in the socket 481 * structure to determine the timeout value, we need to save 482 * the original one so that we can restore that after draining. 483 */ 484 old_timeout = sockets[sock_id].in_timeout; 485 sockets[sock_id].in_timeout = timeout; 486 487 /* 488 * We do this because the input queue may have some user 489 * data already. 490 */ 491 old_in_gram = sockets[sock_id].inq; 492 sockets[sock_id].inq = NULL; 493 494 /* Go out and check the wire */ 495 for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) { 496 if (sockets[sock_id].input[i] != NULL) { 497 if (sockets[sock_id].input[i](sock_id) < 0) { 498 sockets[sock_id].in_timeout = old_timeout; 499 if (sockets[sock_id].inq != NULL) 500 nuke_grams(&sockets[sock_id].inq); 501 sockets[sock_id].inq = old_in_gram; 502 return (-1); 503 } 504 } 505 } 506 #if DEBUG 507 printf("tcp_drain_input: done with checking packets\n"); 508 #endif 509 while ((in_gram = sockets[sock_id].inq) != NULL) { 510 /* Remove unknown inetgrams from the head of inq. */ 511 if (in_gram->igm_level != TRANSPORT_LVL) { 512 #if DEBUG 513 printf("tcp_drain_input: unexpected packet " 514 "level %d frame found\n", in_gram->igm_level); 515 #endif 516 del_gram(&sockets[sock_id].inq, in_gram, B_TRUE); 517 continue; 518 } 519 mp = in_gram->igm_mp; 520 del_gram(&sockets[sock_id].inq, in_gram, B_FALSE); 521 bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 522 tcp_rput_data(tcp, mp, sock_id); 523 sockets[sock_id].in_timeout = old_timeout; 524 525 /* 526 * The other side may have closed this connection or 527 * RST us. But we need to continue to process other 528 * packets in the socket's queue because they may be 529 * belong to another TCP connections. 530 */ 531 if (sockets[sock_id].pcb == NULL) 532 tcp = NULL; 533 } 534 535 if (tcp == NULL || sockets[sock_id].pcb == NULL) { 536 if (sockets[sock_id].so_error != 0) 537 return (-1); 538 else 539 return (0); 540 } 541 #if DEBUG 542 printf("tcp_drain_input: done with processing packets\n"); 543 #endif 544 sockets[sock_id].in_timeout = old_timeout; 545 sockets[sock_id].inq = old_in_gram; 546 547 /* 548 * Data may have been received so indicate it is available 549 */ 550 tcp_drain_needed(sock_id, tcp); 551 return (0); 552 } 553 554 /* 555 * The receive entry point for upper layer to call to get data. Note 556 * that this follows the current architecture that lower layer receive 557 * routines have been called already. Thus if the inq of socket is 558 * not NULL, the packets must be for us. 559 */ 560 static int 561 tcp_input(int sock_id) 562 { 563 struct inetgram *in_gram; 564 mblk_t *mp; 565 tcp_t *tcp; 566 567 TCP_RUN_TIME_WAIT_COLLECTOR(); 568 569 if ((tcp = sockets[sock_id].pcb) == NULL) 570 return (-1); 571 572 while ((in_gram = sockets[sock_id].inq) != NULL) { 573 /* Remove unknown inetgrams from the head of inq. */ 574 if (in_gram->igm_level != TRANSPORT_LVL) { 575 #ifdef DEBUG 576 printf("tcp_input: unexpected packet " 577 "level %d frame found\n", in_gram->igm_level); 578 #endif 579 del_gram(&sockets[sock_id].inq, in_gram, B_TRUE); 580 continue; 581 } 582 mp = in_gram->igm_mp; 583 del_gram(&sockets[sock_id].inq, in_gram, B_FALSE); 584 bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 585 tcp_rput_data(tcp, mp, sock_id); 586 /* The TCP may be gone because it gets a RST. */ 587 if (sockets[sock_id].pcb == NULL) 588 return (-1); 589 } 590 591 /* Flush the receive list. */ 592 if (tcp->tcp_rcv_list != NULL) { 593 tcp_rcv_drain(sock_id, tcp); 594 } else { 595 /* The other side has closed the connection, report this up. */ 596 if (tcp->tcp_state == TCPS_CLOSE_WAIT) { 597 sockets[sock_id].so_state |= SS_CANTRCVMORE; 598 return (0); 599 } 600 } 601 return (0); 602 } 603 604 /* 605 * The send entry point for upper layer to call to send data. In order 606 * to minimize changes to the core TCP code, we need to put the 607 * data into mblks. 608 */ 609 int 610 tcp_send(int sock_id, tcp_t *tcp, const void *msg, int len) 611 { 612 mblk_t *mp; 613 mblk_t *head = NULL; 614 mblk_t *tail; 615 int mss = tcp->tcp_mss; 616 int cnt = 0; 617 int win_size; 618 char *buf = (char *)msg; 619 620 TCP_RUN_TIME_WAIT_COLLECTOR(); 621 622 /* We don't want to append 0 size mblk. */ 623 if (len == 0) 624 return (0); 625 while (len > 0) { 626 if (len < mss) { 627 mss = len; 628 } 629 /* 630 * If we cannot allocate more buffer, stop here and 631 * the number of bytes buffered will be returned. 632 * 633 * Note that we follow the core TCP optimization that 634 * each mblk contains only MSS bytes data. 635 */ 636 if ((mp = allocb(mss + tcp->tcp_ip_hdr_len + 637 TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0)) == NULL) { 638 break; 639 } 640 mp->b_rptr += tcp->tcp_hdr_len + tcp_wroff_xtra; 641 bcopy(buf, mp->b_rptr, mss); 642 mp->b_wptr = mp->b_rptr + mss; 643 buf += mss; 644 cnt += mss; 645 len -= mss; 646 647 if (head == NULL) { 648 head = mp; 649 tail = mp; 650 } else { 651 tail->b_cont = mp; 652 tail = mp; 653 } 654 } 655 656 /* 657 * Since inetboot is not interrupt driven, there may be 658 * some ACKs in the MAC's buffer. Drain them first, 659 * otherwise, we may not be able to send. 660 * 661 * We expect an ACK in two cases: 662 * 663 * 1) We have un-ACK'ed data. 664 * 665 * 2) All ACK's have been received and the sender's window has been 666 * closed. We need an ACK back to open the window so that we can 667 * send. In this case, call tcp_drain_input() if the window size is 668 * less than 2 * MSS. 669 */ 670 671 /* window size = MIN(swnd, cwnd) - unacked bytes */ 672 win_size = (tcp->tcp_swnd > tcp->tcp_cwnd) ? tcp->tcp_cwnd : 673 tcp->tcp_swnd; 674 win_size -= tcp->tcp_snxt; 675 win_size += tcp->tcp_suna; 676 if (win_size < (2 * tcp->tcp_mss)) 677 if (tcp_drain_input(tcp, sock_id, 5) < 0) 678 return (-1); 679 680 tcp_wput_data(tcp, head, sock_id); 681 return (cnt); 682 } 683 684 /* Free up all TCP related stuff */ 685 static void 686 tcp_free(tcp_t *tcp) 687 { 688 if (tcp->tcp_iphc != NULL) { 689 bkmem_free((caddr_t)tcp->tcp_iphc, tcp->tcp_iphc_len); 690 tcp->tcp_iphc = NULL; 691 } 692 if (tcp->tcp_xmit_head != NULL) { 693 freemsg(tcp->tcp_xmit_head); 694 tcp->tcp_xmit_head = NULL; 695 } 696 if (tcp->tcp_rcv_list != NULL) { 697 freemsg(tcp->tcp_rcv_list); 698 tcp->tcp_rcv_list = NULL; 699 } 700 if (tcp->tcp_reass_head != NULL) { 701 freemsg(tcp->tcp_reass_head); 702 tcp->tcp_reass_head = NULL; 703 } 704 if (tcp->tcp_sack_info != NULL) { 705 bkmem_free((caddr_t)tcp->tcp_sack_info, 706 sizeof (tcp_sack_info_t)); 707 tcp->tcp_sack_info = NULL; 708 } 709 } 710 711 static void 712 tcp_close_detached(tcp_t *tcp) 713 { 714 if (tcp->tcp_listener != NULL) 715 tcp_eager_unlink(tcp); 716 tcp_free(tcp); 717 bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 718 } 719 720 /* 721 * If we are an eager connection hanging off a listener that hasn't 722 * formally accepted the connection yet, get off his list and blow off 723 * any data that we have accumulated. 724 */ 725 static void 726 tcp_eager_unlink(tcp_t *tcp) 727 { 728 tcp_t *listener = tcp->tcp_listener; 729 730 assert(listener != NULL); 731 if (tcp->tcp_eager_next_q0 != NULL) { 732 assert(tcp->tcp_eager_prev_q0 != NULL); 733 734 /* Remove the eager tcp from q0 */ 735 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 736 tcp->tcp_eager_prev_q0; 737 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 738 tcp->tcp_eager_next_q0; 739 listener->tcp_conn_req_cnt_q0--; 740 } else { 741 tcp_t **tcpp = &listener->tcp_eager_next_q; 742 tcp_t *prev = NULL; 743 744 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 745 if (tcpp[0] == tcp) { 746 if (listener->tcp_eager_last_q == tcp) { 747 /* 748 * If we are unlinking the last 749 * element on the list, adjust 750 * tail pointer. Set tail pointer 751 * to nil when list is empty. 752 */ 753 assert(tcp->tcp_eager_next_q == NULL); 754 if (listener->tcp_eager_last_q == 755 listener->tcp_eager_next_q) { 756 listener->tcp_eager_last_q = 757 NULL; 758 } else { 759 /* 760 * We won't get here if there 761 * is only one eager in the 762 * list. 763 */ 764 assert(prev != NULL); 765 listener->tcp_eager_last_q = 766 prev; 767 } 768 } 769 tcpp[0] = tcp->tcp_eager_next_q; 770 tcp->tcp_eager_next_q = NULL; 771 tcp->tcp_eager_last_q = NULL; 772 listener->tcp_conn_req_cnt_q--; 773 break; 774 } 775 prev = tcpp[0]; 776 } 777 } 778 tcp->tcp_listener = NULL; 779 } 780 781 /* 782 * Reset any eager connection hanging off this listener 783 * and then reclaim it's resources. 784 */ 785 static void 786 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only, int sock_id) 787 { 788 tcp_t *eager; 789 790 if (!q0_only) { 791 /* First cleanup q */ 792 while ((eager = listener->tcp_eager_next_q) != NULL) { 793 assert(listener->tcp_eager_last_q != NULL); 794 tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 795 eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, 796 sock_id); 797 tcp_close_detached(eager); 798 } 799 assert(listener->tcp_eager_last_q == NULL); 800 } 801 /* Then cleanup q0 */ 802 while ((eager = listener->tcp_eager_next_q0) != listener) { 803 tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 804 eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, sock_id); 805 tcp_close_detached(eager); 806 } 807 } 808 809 /* 810 * To handle the shutdown request. Called from shutdown() 811 */ 812 int 813 tcp_shutdown(int sock_id) 814 { 815 tcp_t *tcp; 816 817 DEBUG_1("tcp_shutdown: sock_id %x\n", sock_id); 818 819 if ((tcp = sockets[sock_id].pcb) == NULL) { 820 return (-1); 821 } 822 823 /* 824 * Since inetboot is not interrupt driven, there may be 825 * some ACKs in the MAC's buffer. Drain them first, 826 * otherwise, we may not be able to send. 827 */ 828 if (tcp_drain_input(tcp, sock_id, 5) < 0) { 829 /* 830 * If we return now without freeing TCP, there will be 831 * a memory leak. 832 */ 833 if (sockets[sock_id].pcb != NULL) 834 tcp_clean_death(sock_id, tcp, 0); 835 return (-1); 836 } 837 838 DEBUG_1("tcp_shutdown: tcp_state %x\n", tcp->tcp_state); 839 switch (tcp->tcp_state) { 840 841 case TCPS_SYN_RCVD: 842 /* 843 * Shutdown during the connect 3-way handshake 844 */ 845 case TCPS_ESTABLISHED: 846 /* 847 * Transmit the FIN 848 * wait for the FIN to be ACKed, 849 * then remain in FIN_WAIT_2 850 */ 851 dprintf("tcp_shutdown: sending fin\n"); 852 if (tcp_xmit_end(tcp, sock_id) == 0 && 853 tcp_state_wait(sock_id, tcp, TCPS_FIN_WAIT_2) < 0) { 854 /* During the wait, TCP may be gone... */ 855 if (sockets[sock_id].pcb == NULL) 856 return (-1); 857 } 858 dprintf("tcp_shutdown: done\n"); 859 break; 860 861 default: 862 break; 863 864 } 865 return (0); 866 } 867 868 /* To handle closing of the socket */ 869 static int 870 tcp_close(int sock_id) 871 { 872 char *msg; 873 tcp_t *tcp; 874 int error = 0; 875 876 if ((tcp = sockets[sock_id].pcb) == NULL) { 877 return (-1); 878 } 879 880 TCP_RUN_TIME_WAIT_COLLECTOR(); 881 882 /* 883 * Since inetboot is not interrupt driven, there may be 884 * some ACKs in the MAC's buffer. Drain them first, 885 * otherwise, we may not be able to send. 886 */ 887 if (tcp_drain_input(tcp, sock_id, 5) < 0) { 888 /* 889 * If we return now without freeing TCP, there will be 890 * a memory leak. 891 */ 892 if (sockets[sock_id].pcb != NULL) 893 tcp_clean_death(sock_id, tcp, 0); 894 return (-1); 895 } 896 897 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 898 /* Cleanup for listener */ 899 tcp_eager_cleanup(tcp, 0, sock_id); 900 } 901 902 msg = NULL; 903 switch (tcp->tcp_state) { 904 case TCPS_CLOSED: 905 case TCPS_IDLE: 906 case TCPS_BOUND: 907 case TCPS_LISTEN: 908 break; 909 case TCPS_SYN_SENT: 910 msg = "tcp_close, during connect"; 911 break; 912 case TCPS_SYN_RCVD: 913 /* 914 * Close during the connect 3-way handshake 915 * but here there may or may not be pending data 916 * already on queue. Process almost same as in 917 * the ESTABLISHED state. 918 */ 919 /* FALLTHRU */ 920 default: 921 /* 922 * If SO_LINGER has set a zero linger time, abort the 923 * connection with a reset. 924 */ 925 if (tcp->tcp_linger && tcp->tcp_lingertime == 0) { 926 msg = "tcp_close, zero lingertime"; 927 break; 928 } 929 930 /* 931 * Abort connection if there is unread data queued. 932 */ 933 if (tcp->tcp_rcv_list != NULL || 934 tcp->tcp_reass_head != NULL) { 935 msg = "tcp_close, unread data"; 936 break; 937 } 938 if (tcp->tcp_state <= TCPS_LISTEN) 939 break; 940 941 /* 942 * Transmit the FIN before detaching the tcp_t. 943 * After tcp_detach returns this queue/perimeter 944 * no longer owns the tcp_t thus others can modify it. 945 * The TCP could be closed in tcp_state_wait called by 946 * tcp_wput_data called by tcp_xmit_end. 947 */ 948 (void) tcp_xmit_end(tcp, sock_id); 949 if (sockets[sock_id].pcb == NULL) 950 return (0); 951 952 /* 953 * If lingering on close then wait until the fin is acked, 954 * the SO_LINGER time passes, or a reset is sent/received. 955 */ 956 if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && 957 !(tcp->tcp_fin_acked) && 958 tcp->tcp_state >= TCPS_ESTABLISHED) { 959 uint32_t stoptime; /* in ms */ 960 961 tcp->tcp_client_errno = 0; 962 stoptime = prom_gettime() + 963 (tcp->tcp_lingertime * 1000); 964 while (!(tcp->tcp_fin_acked) && 965 tcp->tcp_state >= TCPS_ESTABLISHED && 966 tcp->tcp_client_errno == 0 && 967 ((int32_t)(stoptime - prom_gettime()) > 0)) { 968 if (tcp_drain_input(tcp, sock_id, 5) < 0) { 969 if (sockets[sock_id].pcb != NULL) { 970 tcp_clean_death(sock_id, 971 tcp, 0); 972 } 973 return (-1); 974 } 975 } 976 tcp->tcp_client_errno = 0; 977 } 978 if (tcp_state_wait(sock_id, tcp, TCPS_TIME_WAIT) < 0) { 979 /* During the wait, TCP may be gone... */ 980 if (sockets[sock_id].pcb == NULL) 981 return (0); 982 msg = "tcp_close, couldn't detach"; 983 } else { 984 return (0); 985 } 986 break; 987 } 988 989 /* Something went wrong... Send a RST and report the error */ 990 if (msg != NULL) { 991 if (tcp->tcp_state == TCPS_ESTABLISHED || 992 tcp->tcp_state == TCPS_CLOSE_WAIT) 993 BUMP_MIB(tcp_mib.tcpEstabResets); 994 if (tcp->tcp_state == TCPS_SYN_SENT || 995 tcp->tcp_state == TCPS_SYN_RCVD) 996 BUMP_MIB(tcp_mib.tcpAttemptFails); 997 tcp_xmit_ctl(msg, tcp, NULL, tcp->tcp_snxt, 0, TH_RST, 0, 998 sock_id); 999 } 1000 1001 tcp_free(tcp); 1002 bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 1003 sockets[sock_id].pcb = NULL; 1004 return (error); 1005 } 1006 1007 /* To make an endpoint a listener. */ 1008 int 1009 tcp_listen(int sock_id, int backlog) 1010 { 1011 tcp_t *tcp; 1012 1013 if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) { 1014 errno = EINVAL; 1015 return (-1); 1016 } 1017 /* We allow calling listen() multiple times to change the backlog. */ 1018 if (tcp->tcp_state > TCPS_LISTEN || tcp->tcp_state < TCPS_BOUND) { 1019 errno = EOPNOTSUPP; 1020 return (-1); 1021 } 1022 /* The following initialization should only be done once. */ 1023 if (tcp->tcp_state != TCPS_LISTEN) { 1024 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 1025 tcp->tcp_eager_next_q = NULL; 1026 tcp->tcp_state = TCPS_LISTEN; 1027 tcp->tcp_second_ctimer_threshold = tcp_ip_abort_linterval; 1028 } 1029 if ((tcp->tcp_conn_req_max = backlog) > tcp_conn_req_max_q) { 1030 tcp->tcp_conn_req_max = tcp_conn_req_max_q; 1031 } 1032 if (tcp->tcp_conn_req_max < tcp_conn_req_min) { 1033 tcp->tcp_conn_req_max = tcp_conn_req_min; 1034 } 1035 return (0); 1036 } 1037 1038 /* To accept connections. */ 1039 int 1040 tcp_accept(int sock_id, struct sockaddr *addr, socklen_t *addr_len) 1041 { 1042 tcp_t *listener; 1043 tcp_t *eager; 1044 int sd, new_sock_id; 1045 struct sockaddr_in *new_addr = (struct sockaddr_in *)addr; 1046 int timeout; 1047 1048 /* Sanity check. */ 1049 if ((listener = (tcp_t *)(sockets[sock_id].pcb)) == NULL || 1050 new_addr == NULL || addr_len == NULL || 1051 *addr_len < sizeof (struct sockaddr_in) || 1052 listener->tcp_state != TCPS_LISTEN) { 1053 errno = EINVAL; 1054 return (-1); 1055 } 1056 1057 if (sockets[sock_id].in_timeout > tcp_accept_timeout) 1058 timeout = prom_gettime() + sockets[sock_id].in_timeout; 1059 else 1060 timeout = prom_gettime() + tcp_accept_timeout; 1061 while (listener->tcp_eager_next_q == NULL && 1062 timeout > prom_gettime()) { 1063 #if DEBUG 1064 printf("tcp_accept: Waiting in tcp_accept()\n"); 1065 #endif 1066 if (tcp_drain_input(listener, sock_id, 5) < 0) { 1067 return (-1); 1068 } 1069 } 1070 /* If there is an eager, don't timeout... */ 1071 if (timeout <= prom_gettime() && listener->tcp_eager_next_q == NULL) { 1072 #if DEBUG 1073 printf("tcp_accept: timeout\n"); 1074 #endif 1075 errno = ETIMEDOUT; 1076 return (-1); 1077 } 1078 #if DEBUG 1079 printf("tcp_accept: got a connection\n"); 1080 #endif 1081 1082 /* Now create the socket for this new TCP. */ 1083 if ((sd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { 1084 return (-1); 1085 } 1086 if ((new_sock_id = so_check_fd(sd, &errno)) == -1) 1087 /* This should not happen! */ 1088 prom_panic("so_check_fd() fails in tcp_accept()"); 1089 /* Free the TCP PCB in the original socket. */ 1090 bkmem_free((caddr_t)(sockets[new_sock_id].pcb), sizeof (tcp_t)); 1091 /* Dequeue the eager and attach it to the socket. */ 1092 eager = listener->tcp_eager_next_q; 1093 listener->tcp_eager_next_q = eager->tcp_eager_next_q; 1094 if (listener->tcp_eager_last_q == eager) 1095 listener->tcp_eager_last_q = NULL; 1096 eager->tcp_eager_next_q = NULL; 1097 sockets[new_sock_id].pcb = eager; 1098 listener->tcp_conn_req_cnt_q--; 1099 1100 /* Copy in the address info. */ 1101 bcopy(&eager->tcp_remote, &new_addr->sin_addr.s_addr, 1102 sizeof (in_addr_t)); 1103 bcopy(&eager->tcp_fport, &new_addr->sin_port, sizeof (in_port_t)); 1104 new_addr->sin_family = AF_INET; 1105 1106 #ifdef DEBUG 1107 printf("tcp_accept(), new sock_id: %d\n", sd); 1108 #endif 1109 return (sd); 1110 } 1111 1112 /* Update the next anonymous port to use. */ 1113 static in_port_t 1114 tcp_update_next_port(in_port_t port) 1115 { 1116 /* Don't allow the port to fall out of the anonymous port range. */ 1117 if (port < tcp_smallest_anon_port || port > tcp_largest_anon_port) 1118 port = (in_port_t)tcp_smallest_anon_port; 1119 1120 if (port < tcp_smallest_nonpriv_port) 1121 port = (in_port_t)tcp_smallest_nonpriv_port; 1122 return (port); 1123 } 1124 1125 /* To check whether a bind to a port is allowed. */ 1126 static in_port_t 1127 tcp_bindi(in_port_t port, in_addr_t *addr, boolean_t reuseaddr, 1128 boolean_t bind_to_req_port_only) 1129 { 1130 int i, count; 1131 tcp_t *tcp; 1132 1133 count = tcp_largest_anon_port - tcp_smallest_anon_port; 1134 try_again: 1135 for (i = 0; i < MAXSOCKET; i++) { 1136 if (sockets[i].type != INETBOOT_STREAM || 1137 ((tcp = (tcp_t *)sockets[i].pcb) == NULL) || 1138 ntohs(tcp->tcp_lport) != port) { 1139 continue; 1140 } 1141 /* 1142 * Both TCPs have the same port. If SO_REUSEDADDR is 1143 * set and the bound TCP has a state greater than 1144 * TCPS_LISTEN, it is fine. 1145 */ 1146 if (reuseaddr && tcp->tcp_state > TCPS_LISTEN) { 1147 continue; 1148 } 1149 if (tcp->tcp_bound_source != INADDR_ANY && 1150 *addr != INADDR_ANY && 1151 tcp->tcp_bound_source != *addr) { 1152 continue; 1153 } 1154 if (bind_to_req_port_only) { 1155 return (0); 1156 } 1157 if (--count > 0) { 1158 port = tcp_update_next_port(++port); 1159 goto try_again; 1160 } else { 1161 return (0); 1162 } 1163 } 1164 return (port); 1165 } 1166 1167 /* To handle the bind request. */ 1168 int 1169 tcp_bind(int sock_id) 1170 { 1171 tcp_t *tcp; 1172 in_port_t requested_port, allocated_port; 1173 boolean_t bind_to_req_port_only; 1174 boolean_t reuseaddr; 1175 1176 if ((tcp = (tcp_t *)sockets[sock_id].pcb) == NULL) { 1177 errno = EINVAL; 1178 return (-1); 1179 } 1180 1181 if (tcp->tcp_state >= TCPS_BOUND) { 1182 /* We don't allow multiple bind(). */ 1183 errno = EPROTO; 1184 return (-1); 1185 } 1186 1187 requested_port = ntohs(sockets[sock_id].bind.sin_port); 1188 1189 /* The bound source can be INADDR_ANY. */ 1190 tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr; 1191 1192 tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source; 1193 1194 /* Verify the port is available. */ 1195 if (requested_port == 0) 1196 bind_to_req_port_only = B_FALSE; 1197 else /* T_BIND_REQ and requested_port != 0 */ 1198 bind_to_req_port_only = B_TRUE; 1199 1200 if (requested_port == 0) { 1201 requested_port = tcp_update_next_port(++tcp_next_port_to_try); 1202 } 1203 reuseaddr = sockets[sock_id].so_opt & SO_REUSEADDR; 1204 allocated_port = tcp_bindi(requested_port, &(tcp->tcp_bound_source), 1205 reuseaddr, bind_to_req_port_only); 1206 1207 if (allocated_port == 0) { 1208 errno = EADDRINUSE; 1209 return (-1); 1210 } 1211 tcp->tcp_lport = htons(allocated_port); 1212 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 1213 sockets[sock_id].bind.sin_port = tcp->tcp_lport; 1214 tcp->tcp_state = TCPS_BOUND; 1215 return (0); 1216 } 1217 1218 /* 1219 * Check for duplicate TCP connections. 1220 */ 1221 static int 1222 tcp_conn_check(tcp_t *tcp) 1223 { 1224 int i; 1225 tcp_t *tmp_tcp; 1226 1227 for (i = 0; i < MAXSOCKET; i++) { 1228 if (sockets[i].type != INETBOOT_STREAM) 1229 continue; 1230 /* Socket may not be closed but the TCP can be gone. */ 1231 if ((tmp_tcp = (tcp_t *)sockets[i].pcb) == NULL) 1232 continue; 1233 /* We only care about TCP in states later than SYN_SENT. */ 1234 if (tmp_tcp->tcp_state < TCPS_SYN_SENT) 1235 continue; 1236 if (tmp_tcp->tcp_lport != tcp->tcp_lport || 1237 tmp_tcp->tcp_fport != tcp->tcp_fport || 1238 tmp_tcp->tcp_bound_source != tcp->tcp_bound_source || 1239 tmp_tcp->tcp_remote != tcp->tcp_remote) { 1240 continue; 1241 } else { 1242 return (-1); 1243 } 1244 } 1245 return (0); 1246 } 1247 1248 /* To handle a connect request. */ 1249 int 1250 tcp_connect(int sock_id) 1251 { 1252 tcp_t *tcp; 1253 in_addr_t dstaddr; 1254 in_port_t dstport; 1255 tcph_t *tcph; 1256 int mss; 1257 mblk_t *syn_mp; 1258 1259 if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) { 1260 errno = EINVAL; 1261 return (-1); 1262 } 1263 1264 TCP_RUN_TIME_WAIT_COLLECTOR(); 1265 1266 dstaddr = sockets[sock_id].remote.sin_addr.s_addr; 1267 dstport = sockets[sock_id].remote.sin_port; 1268 1269 /* 1270 * Check for attempt to connect to INADDR_ANY or non-unicast addrress. 1271 * We don't have enough info to check for broadcast addr, except 1272 * for the all 1 broadcast. 1273 */ 1274 if (dstaddr == INADDR_ANY || IN_CLASSD(ntohl(dstaddr)) || 1275 dstaddr == INADDR_BROADCAST) { 1276 /* 1277 * SunOS 4.x and 4.3 BSD allow an application 1278 * to connect a TCP socket to INADDR_ANY. 1279 * When they do this, the kernel picks the 1280 * address of one interface and uses it 1281 * instead. The kernel usually ends up 1282 * picking the address of the loopback 1283 * interface. This is an undocumented feature. 1284 * However, we provide the same thing here 1285 * in order to have source and binary 1286 * compatibility with SunOS 4.x. 1287 * Update the T_CONN_REQ (sin/sin6) since it is used to 1288 * generate the T_CONN_CON. 1289 * 1290 * Fail this for inetboot TCP. 1291 */ 1292 errno = EINVAL; 1293 return (-1); 1294 } 1295 1296 /* It is not bound to any address yet... */ 1297 if (tcp->tcp_bound_source == INADDR_ANY) { 1298 ipv4_getipaddr(&(sockets[sock_id].bind.sin_addr)); 1299 /* We don't have an address! */ 1300 if (ntohl(sockets[sock_id].bind.sin_addr.s_addr) == 1301 INADDR_ANY) { 1302 errno = EPROTO; 1303 return (-1); 1304 } 1305 tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr; 1306 tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source; 1307 } 1308 1309 /* 1310 * Don't let an endpoint connect to itself. 1311 */ 1312 if (dstaddr == tcp->tcp_ipha->ip_src.s_addr && 1313 dstport == tcp->tcp_lport) { 1314 errno = EINVAL; 1315 return (-1); 1316 } 1317 1318 tcp->tcp_ipha->ip_dst.s_addr = dstaddr; 1319 tcp->tcp_remote = dstaddr; 1320 tcph = tcp->tcp_tcph; 1321 *(uint16_t *)tcph->th_fport = dstport; 1322 tcp->tcp_fport = dstport; 1323 1324 /* 1325 * Don't allow this connection to completely duplicate 1326 * an existing connection. 1327 */ 1328 if (tcp_conn_check(tcp) < 0) { 1329 errno = EADDRINUSE; 1330 return (-1); 1331 } 1332 1333 /* 1334 * Just make sure our rwnd is at 1335 * least tcp_recv_hiwat_mss * MSS 1336 * large, and round up to the nearest 1337 * MSS. 1338 * 1339 * We do the round up here because 1340 * we need to get the interface 1341 * MTU first before we can do the 1342 * round up. 1343 */ 1344 mss = tcp->tcp_mss - tcp->tcp_hdr_len; 1345 tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 1346 tcp_recv_hiwat_minmss * mss); 1347 tcp->tcp_rwnd_max = tcp->tcp_rwnd; 1348 SET_WS_VALUE(tcp); 1349 U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), 1350 tcp->tcp_tcph->th_win); 1351 if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always) 1352 tcp->tcp_snd_ws_ok = B_TRUE; 1353 1354 /* 1355 * Set tcp_snd_ts_ok to true 1356 * so that tcp_xmit_mp will 1357 * include the timestamp 1358 * option in the SYN segment. 1359 */ 1360 if (tcp_tstamp_always || 1361 (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) { 1362 tcp->tcp_snd_ts_ok = B_TRUE; 1363 } 1364 1365 if (tcp_sack_permitted == 2 || 1366 tcp->tcp_snd_sack_ok) { 1367 assert(tcp->tcp_sack_info == NULL); 1368 if ((tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc( 1369 sizeof (tcp_sack_info_t))) == NULL) { 1370 tcp->tcp_snd_sack_ok = B_FALSE; 1371 } else { 1372 tcp->tcp_snd_sack_ok = B_TRUE; 1373 } 1374 } 1375 /* 1376 * Should we use ECN? Note that the current 1377 * default value (SunOS 5.9) of tcp_ecn_permitted 1378 * is 2. The reason for doing this is that there 1379 * are equipments out there that will drop ECN 1380 * enabled IP packets. Setting it to 1 avoids 1381 * compatibility problems. 1382 */ 1383 if (tcp_ecn_permitted == 2) 1384 tcp->tcp_ecn_ok = B_TRUE; 1385 1386 tcp_iss_init(tcp); 1387 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 1388 tcp->tcp_active_open = B_TRUE; 1389 1390 tcp->tcp_state = TCPS_SYN_SENT; 1391 syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, tcp->tcp_iss, B_FALSE, 1392 NULL, B_FALSE); 1393 if (syn_mp != NULL) { 1394 int ret; 1395 1396 /* Dump the packet when debugging. */ 1397 TCP_DUMP_PACKET("tcp_connect", syn_mp); 1398 /* Send out the SYN packet. */ 1399 ret = ipv4_tcp_output(sock_id, syn_mp); 1400 freeb(syn_mp); 1401 if (ret < 0) { 1402 return (-1); 1403 } 1404 /* tcp_state_wait() will finish the 3 way handshake. */ 1405 return (tcp_state_wait(sock_id, tcp, TCPS_ESTABLISHED)); 1406 } else { 1407 errno = ENOBUFS; 1408 return (-1); 1409 } 1410 } 1411 1412 /* 1413 * Common accept code. Called by tcp_conn_request. 1414 * cr_pkt is the SYN packet. 1415 */ 1416 static int 1417 tcp_accept_comm(tcp_t *listener, tcp_t *acceptor, mblk_t *cr_pkt, 1418 uint_t ip_hdr_len) 1419 { 1420 tcph_t *tcph; 1421 1422 #ifdef DEBUG 1423 printf("tcp_accept_comm #######################\n"); 1424 #endif 1425 1426 /* 1427 * When we get here, we know that the acceptor header template 1428 * has already been initialized. 1429 * However, it may not match the listener if the listener 1430 * includes options... 1431 * It may also not match the listener if the listener is v6 and 1432 * and the acceptor is v4 1433 */ 1434 acceptor->tcp_lport = listener->tcp_lport; 1435 1436 if (listener->tcp_ipversion == acceptor->tcp_ipversion) { 1437 if (acceptor->tcp_iphc_len != listener->tcp_iphc_len) { 1438 /* 1439 * Listener had options of some sort; acceptor inherits. 1440 * Free up the acceptor template and allocate one 1441 * of the right size. 1442 */ 1443 bkmem_free(acceptor->tcp_iphc, acceptor->tcp_iphc_len); 1444 acceptor->tcp_iphc = bkmem_zalloc( 1445 listener->tcp_iphc_len); 1446 if (acceptor->tcp_iphc == NULL) { 1447 acceptor->tcp_iphc_len = 0; 1448 return (ENOMEM); 1449 } 1450 acceptor->tcp_iphc_len = listener->tcp_iphc_len; 1451 } 1452 acceptor->tcp_hdr_len = listener->tcp_hdr_len; 1453 acceptor->tcp_ip_hdr_len = listener->tcp_ip_hdr_len; 1454 acceptor->tcp_tcp_hdr_len = listener->tcp_tcp_hdr_len; 1455 1456 /* 1457 * Copy the IP+TCP header template from listener to acceptor 1458 */ 1459 bcopy(listener->tcp_iphc, acceptor->tcp_iphc, 1460 listener->tcp_hdr_len); 1461 acceptor->tcp_ipha = (struct ip *)acceptor->tcp_iphc; 1462 acceptor->tcp_tcph = (tcph_t *)(acceptor->tcp_iphc + 1463 acceptor->tcp_ip_hdr_len); 1464 } else { 1465 prom_panic("tcp_accept_comm: version not equal"); 1466 } 1467 1468 /* Copy our new dest and fport from the connection request packet */ 1469 if (acceptor->tcp_ipversion == IPV4_VERSION) { 1470 struct ip *ipha; 1471 1472 ipha = (struct ip *)cr_pkt->b_rptr; 1473 acceptor->tcp_ipha->ip_dst = ipha->ip_src; 1474 acceptor->tcp_remote = ipha->ip_src.s_addr; 1475 acceptor->tcp_ipha->ip_src = ipha->ip_dst; 1476 acceptor->tcp_bound_source = ipha->ip_dst.s_addr; 1477 tcph = (tcph_t *)&cr_pkt->b_rptr[ip_hdr_len]; 1478 } else { 1479 prom_panic("tcp_accept_comm: not IPv4"); 1480 } 1481 bcopy(tcph->th_lport, acceptor->tcp_tcph->th_fport, sizeof (in_port_t)); 1482 bcopy(acceptor->tcp_tcph->th_fport, &acceptor->tcp_fport, 1483 sizeof (in_port_t)); 1484 /* 1485 * For an all-port proxy listener, the local port is determined by 1486 * the port number field in the SYN packet. 1487 */ 1488 if (listener->tcp_lport == 0) { 1489 acceptor->tcp_lport = *(in_port_t *)tcph->th_fport; 1490 bcopy(tcph->th_fport, acceptor->tcp_tcph->th_lport, 1491 sizeof (in_port_t)); 1492 } 1493 /* Inherit various TCP parameters from the listener */ 1494 acceptor->tcp_naglim = listener->tcp_naglim; 1495 acceptor->tcp_first_timer_threshold = 1496 listener->tcp_first_timer_threshold; 1497 acceptor->tcp_second_timer_threshold = 1498 listener->tcp_second_timer_threshold; 1499 1500 acceptor->tcp_first_ctimer_threshold = 1501 listener->tcp_first_ctimer_threshold; 1502 acceptor->tcp_second_ctimer_threshold = 1503 listener->tcp_second_ctimer_threshold; 1504 1505 acceptor->tcp_xmit_hiwater = listener->tcp_xmit_hiwater; 1506 1507 acceptor->tcp_state = TCPS_LISTEN; 1508 tcp_iss_init(acceptor); 1509 1510 /* Process all TCP options. */ 1511 tcp_process_options(acceptor, tcph); 1512 1513 /* Is the other end ECN capable? */ 1514 if (tcp_ecn_permitted >= 1 && 1515 (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 1516 acceptor->tcp_ecn_ok = B_TRUE; 1517 } 1518 1519 /* 1520 * listener->tcp_rq->q_hiwat should be the default window size or a 1521 * window size changed via SO_RCVBUF option. First round up the 1522 * acceptor's tcp_rwnd to the nearest MSS. Then find out the window 1523 * scale option value if needed. Call tcp_rwnd_set() to finish the 1524 * setting. 1525 * 1526 * Note if there is a rpipe metric associated with the remote host, 1527 * we should not inherit receive window size from listener. 1528 */ 1529 acceptor->tcp_rwnd = MSS_ROUNDUP( 1530 (acceptor->tcp_rwnd == 0 ? listener->tcp_rwnd_max : 1531 acceptor->tcp_rwnd), acceptor->tcp_mss); 1532 if (acceptor->tcp_snd_ws_ok) 1533 SET_WS_VALUE(acceptor); 1534 /* 1535 * Note that this is the only place tcp_rwnd_set() is called for 1536 * accepting a connection. We need to call it here instead of 1537 * after the 3-way handshake because we need to tell the other 1538 * side our rwnd in the SYN-ACK segment. 1539 */ 1540 (void) tcp_rwnd_set(acceptor, acceptor->tcp_rwnd); 1541 1542 return (0); 1543 } 1544 1545 /* 1546 * Defense for the SYN attack - 1547 * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest 1548 * one that doesn't have the dontdrop bit set. 1549 * 2. Don't drop a SYN request before its first timeout. This gives every 1550 * request at least til the first timeout to complete its 3-way handshake. 1551 * 3. The current threshold is - # of timeout > q0len/4 => SYN alert on 1552 * # of timeout drops back to <= q0len/32 => SYN alert off 1553 */ 1554 static boolean_t 1555 tcp_drop_q0(tcp_t *tcp) 1556 { 1557 tcp_t *eager; 1558 1559 assert(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); 1560 /* 1561 * New one is added after next_q0 so prev_q0 points to the oldest 1562 * Also do not drop any established connections that are deferred on 1563 * q0 due to q being full 1564 */ 1565 1566 eager = tcp->tcp_eager_prev_q0; 1567 while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) { 1568 /* XXX should move the eager to the head */ 1569 eager = eager->tcp_eager_prev_q0; 1570 if (eager == tcp) { 1571 eager = tcp->tcp_eager_prev_q0; 1572 break; 1573 } 1574 } 1575 dprintf("tcp_drop_q0: listen half-open queue (max=%d) overflow" 1576 " (%d pending) on %s, drop one", tcp_conn_req_max_q0, 1577 tcp->tcp_conn_req_cnt_q0, 1578 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 1579 1580 BUMP_MIB(tcp_mib.tcpHalfOpenDrop); 1581 bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1582 return (B_TRUE); 1583 } 1584 1585 /* ARGSUSED */ 1586 static tcp_t * 1587 tcp_conn_request(tcp_t *tcp, mblk_t *mp, uint_t sock_id, uint_t ip_hdr_len) 1588 { 1589 tcp_t *eager; 1590 struct ip *ipha; 1591 int err; 1592 1593 #ifdef DEBUG 1594 printf("tcp_conn_request ###################\n"); 1595 #endif 1596 1597 if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { 1598 BUMP_MIB(tcp_mib.tcpListenDrop); 1599 dprintf("tcp_conn_request: listen backlog (max=%d) " 1600 "overflow (%d pending) on %s", 1601 tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, 1602 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 1603 return (NULL); 1604 } 1605 1606 assert(OK_32PTR(mp->b_rptr)); 1607 1608 if (tcp->tcp_conn_req_cnt_q0 >= 1609 tcp->tcp_conn_req_max + tcp_conn_req_max_q0) { 1610 /* 1611 * Q0 is full. Drop a pending half-open req from the queue 1612 * to make room for the new SYN req. Also mark the time we 1613 * drop a SYN. 1614 */ 1615 tcp->tcp_last_rcv_lbolt = prom_gettime(); 1616 if (!tcp_drop_q0(tcp)) { 1617 freemsg(mp); 1618 BUMP_MIB(tcp_mib.tcpListenDropQ0); 1619 dprintf("tcp_conn_request: listen half-open queue " 1620 "(max=%d) full (%d pending) on %s", 1621 tcp_conn_req_max_q0, 1622 tcp->tcp_conn_req_cnt_q0, 1623 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 1624 return (NULL); 1625 } 1626 } 1627 1628 ipha = (struct ip *)mp->b_rptr; 1629 if (IN_CLASSD(ntohl(ipha->ip_src.s_addr)) || 1630 ipha->ip_src.s_addr == INADDR_BROADCAST || 1631 ipha->ip_src.s_addr == INADDR_ANY || 1632 ipha->ip_dst.s_addr == INADDR_BROADCAST) { 1633 freemsg(mp); 1634 return (NULL); 1635 } 1636 /* 1637 * We allow the connection to proceed 1638 * by generating a detached tcp state vector and put it in 1639 * the eager queue. When an accept happens, it will be 1640 * dequeued sequentially. 1641 */ 1642 if ((eager = (tcp_t *)bkmem_alloc(sizeof (tcp_t))) == NULL) { 1643 freemsg(mp); 1644 errno = ENOBUFS; 1645 return (NULL); 1646 } 1647 if ((errno = tcp_init_values(eager, NULL)) != 0) { 1648 freemsg(mp); 1649 bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1650 return (NULL); 1651 } 1652 1653 /* 1654 * Eager connection inherits address form from its listener, 1655 * but its packet form comes from the version of the received 1656 * SYN segment. 1657 */ 1658 eager->tcp_family = tcp->tcp_family; 1659 1660 err = tcp_accept_comm(tcp, eager, mp, ip_hdr_len); 1661 if (err) { 1662 bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1663 return (NULL); 1664 } 1665 1666 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 1667 eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1668 tcp->tcp_eager_next_q0 = eager; 1669 eager->tcp_eager_prev_q0 = tcp; 1670 1671 /* Set tcp_listener before adding it to tcp_conn_fanout */ 1672 eager->tcp_listener = tcp; 1673 tcp->tcp_conn_req_cnt_q0++; 1674 1675 return (eager); 1676 } 1677 1678 /* 1679 * To get around the non-interrupt problem of inetboot. 1680 * Keep on processing packets until a certain state is reached or the 1681 * TCP is destroyed because of getting a RST packet. 1682 */ 1683 static int 1684 tcp_state_wait(int sock_id, tcp_t *tcp, int state) 1685 { 1686 int i; 1687 struct inetgram *in_gram; 1688 mblk_t *mp; 1689 int timeout; 1690 boolean_t changed = B_FALSE; 1691 1692 /* 1693 * We need to make sure that the MAC does not wait longer 1694 * than RTO for any packet so that TCP can do retransmission. 1695 * But if the MAC timeout is less than tcp_rto, we are fine 1696 * and do not need to change it. 1697 */ 1698 timeout = sockets[sock_id].in_timeout; 1699 if (timeout > tcp->tcp_rto) { 1700 sockets[sock_id].in_timeout = tcp->tcp_rto; 1701 changed = B_TRUE; 1702 } 1703 retry: 1704 if (sockets[sock_id].inq == NULL) { 1705 /* Go out and check the wire */ 1706 for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) { 1707 if (sockets[sock_id].input[i] != NULL) { 1708 if (sockets[sock_id].input[i](sock_id) < 0) { 1709 if (changed) { 1710 sockets[sock_id].in_timeout = 1711 timeout; 1712 } 1713 return (-1); 1714 } 1715 } 1716 } 1717 } 1718 1719 while ((in_gram = sockets[sock_id].inq) != NULL) { 1720 if (tcp != NULL && tcp->tcp_state == state) 1721 break; 1722 1723 /* Remove unknown inetgrams from the head of inq. */ 1724 if (in_gram->igm_level != TRANSPORT_LVL) { 1725 #ifdef DEBUG 1726 printf("tcp_state_wait for state %d: unexpected " 1727 "packet level %d frame found\n", state, 1728 in_gram->igm_level); 1729 #endif 1730 del_gram(&sockets[sock_id].inq, in_gram, B_TRUE); 1731 continue; 1732 } 1733 mp = in_gram->igm_mp; 1734 del_gram(&sockets[sock_id].inq, in_gram, B_FALSE); 1735 bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 1736 tcp_rput_data(tcp, mp, sock_id); 1737 1738 /* 1739 * The other side may have closed this connection or 1740 * RST us. But we need to continue to process other 1741 * packets in the socket's queue because they may be 1742 * belong to another TCP connections. 1743 */ 1744 if (sockets[sock_id].pcb == NULL) { 1745 tcp = NULL; 1746 } 1747 } 1748 1749 /* If the other side has closed the connection, just return. */ 1750 if (tcp == NULL || sockets[sock_id].pcb == NULL) { 1751 #ifdef DEBUG 1752 printf("tcp_state_wait other side dead: state %d " 1753 "error %d\n", state, sockets[sock_id].so_error); 1754 #endif 1755 if (sockets[sock_id].so_error != 0) 1756 return (-1); 1757 else 1758 return (0); 1759 } 1760 /* 1761 * TCPS_ALL_ACKED is not a valid TCP state, it is just used as an 1762 * indicator to tcp_state_wait to mean that it is being called 1763 * to wait till we have received acks for all the new segments sent. 1764 */ 1765 if ((state == TCPS_ALL_ACKED) && (tcp->tcp_suna == tcp->tcp_snxt)) { 1766 goto done; 1767 } 1768 if (tcp->tcp_state != state) { 1769 if (prom_gettime() > tcp->tcp_rto_timeout) 1770 tcp_timer(tcp, sock_id); 1771 goto retry; 1772 } 1773 done: 1774 if (changed) 1775 sockets[sock_id].in_timeout = timeout; 1776 1777 tcp_drain_needed(sock_id, tcp); 1778 return (0); 1779 } 1780 1781 /* Verify the checksum of a segment. */ 1782 static int 1783 tcp_verify_cksum(mblk_t *mp) 1784 { 1785 struct ip *iph; 1786 tcpha_t *tcph; 1787 int len; 1788 uint16_t old_sum; 1789 1790 iph = (struct ip *)mp->b_rptr; 1791 tcph = (tcpha_t *)(iph + 1); 1792 len = ntohs(iph->ip_len); 1793 1794 /* 1795 * Calculate the TCP checksum. Need to include the psuedo header, 1796 * which is similar to the real IP header starting at the TTL field. 1797 */ 1798 iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH); 1799 old_sum = tcph->tha_sum; 1800 tcph->tha_sum = 0; 1801 iph->ip_ttl = 0; 1802 if (old_sum == tcp_cksum((uint16_t *)&(iph->ip_ttl), 1803 len - IP_SIMPLE_HDR_LENGTH + 12)) { 1804 return (0); 1805 } else { 1806 tcp_cksum_errors++; 1807 return (-1); 1808 } 1809 } 1810 1811 /* To find a TCP connection matching the incoming segment. */ 1812 static tcp_t * 1813 tcp_lookup_ipv4(struct ip *iph, tcpha_t *tcph, int min_state, int *sock_id) 1814 { 1815 int i; 1816 tcp_t *tcp; 1817 1818 for (i = 0; i < MAXSOCKET; i++) { 1819 if (sockets[i].type == INETBOOT_STREAM && 1820 (tcp = (tcp_t *)sockets[i].pcb) != NULL) { 1821 if (tcph->tha_lport == tcp->tcp_fport && 1822 tcph->tha_fport == tcp->tcp_lport && 1823 iph->ip_src.s_addr == tcp->tcp_remote && 1824 iph->ip_dst.s_addr == tcp->tcp_bound_source && 1825 tcp->tcp_state >= min_state) { 1826 *sock_id = i; 1827 return (tcp); 1828 } 1829 } 1830 } 1831 /* Find it in the time wait list. */ 1832 for (tcp = tcp_time_wait_head; tcp != NULL; 1833 tcp = tcp->tcp_time_wait_next) { 1834 if (tcph->tha_lport == tcp->tcp_fport && 1835 tcph->tha_fport == tcp->tcp_lport && 1836 iph->ip_src.s_addr == tcp->tcp_remote && 1837 iph->ip_dst.s_addr == tcp->tcp_bound_source && 1838 tcp->tcp_state >= min_state) { 1839 *sock_id = -1; 1840 return (tcp); 1841 } 1842 } 1843 return (NULL); 1844 } 1845 1846 /* To find a TCP listening connection matching the incoming segment. */ 1847 static tcp_t * 1848 tcp_lookup_listener_ipv4(in_addr_t addr, in_port_t port, int *sock_id) 1849 { 1850 int i; 1851 tcp_t *tcp; 1852 1853 for (i = 0; i < MAXSOCKET; i++) { 1854 if (sockets[i].type == INETBOOT_STREAM && 1855 (tcp = (tcp_t *)sockets[i].pcb) != NULL) { 1856 if (tcp->tcp_lport == port && 1857 (tcp->tcp_bound_source == addr || 1858 tcp->tcp_bound_source == INADDR_ANY)) { 1859 *sock_id = i; 1860 return (tcp); 1861 } 1862 } 1863 } 1864 1865 return (NULL); 1866 } 1867 1868 /* To find a TCP eager matching the incoming segment. */ 1869 static tcp_t * 1870 tcp_lookup_eager_ipv4(tcp_t *listener, struct ip *iph, tcpha_t *tcph) 1871 { 1872 tcp_t *tcp; 1873 1874 #ifdef DEBUG 1875 printf("tcp_lookup_eager_ipv4 ###############\n"); 1876 #endif 1877 for (tcp = listener->tcp_eager_next_q; tcp != NULL; 1878 tcp = tcp->tcp_eager_next_q) { 1879 if (tcph->tha_lport == tcp->tcp_fport && 1880 tcph->tha_fport == tcp->tcp_lport && 1881 iph->ip_src.s_addr == tcp->tcp_remote && 1882 iph->ip_dst.s_addr == tcp->tcp_bound_source) { 1883 return (tcp); 1884 } 1885 } 1886 1887 for (tcp = listener->tcp_eager_next_q0; tcp != listener; 1888 tcp = tcp->tcp_eager_next_q0) { 1889 if (tcph->tha_lport == tcp->tcp_fport && 1890 tcph->tha_fport == tcp->tcp_lport && 1891 iph->ip_src.s_addr == tcp->tcp_remote && 1892 iph->ip_dst.s_addr == tcp->tcp_bound_source) { 1893 return (tcp); 1894 } 1895 } 1896 #ifdef DEBUG 1897 printf("No eager found\n"); 1898 #endif 1899 return (NULL); 1900 } 1901 1902 /* To destroy a TCP control block. */ 1903 static void 1904 tcp_clean_death(int sock_id, tcp_t *tcp, int err) 1905 { 1906 tcp_free(tcp); 1907 if (tcp->tcp_state == TCPS_TIME_WAIT) 1908 tcp_time_wait_remove(tcp); 1909 1910 if (sock_id >= 0) { 1911 sockets[sock_id].pcb = NULL; 1912 if (err != 0) 1913 sockets[sock_id].so_error = err; 1914 } 1915 bkmem_free((caddr_t)tcp, sizeof (tcp_t)); 1916 } 1917 1918 /* 1919 * tcp_rwnd_set() is called to adjust the receive window to a desired value. 1920 * We do not allow the receive window to shrink. After setting rwnd, 1921 * set the flow control hiwat of the stream. 1922 * 1923 * This function is called in 2 cases: 1924 * 1925 * 1) Before data transfer begins, in tcp_accept_comm() for accepting a 1926 * connection (passive open) and in tcp_rput_data() for active connect. 1927 * This is called after tcp_mss_set() when the desired MSS value is known. 1928 * This makes sure that our window size is a mutiple of the other side's 1929 * MSS. 1930 * 2) Handling SO_RCVBUF option. 1931 * 1932 * It is ASSUMED that the requested size is a multiple of the current MSS. 1933 * 1934 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the 1935 * user requests so. 1936 */ 1937 static int 1938 tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) 1939 { 1940 uint32_t mss = tcp->tcp_mss; 1941 uint32_t old_max_rwnd; 1942 uint32_t max_transmittable_rwnd; 1943 1944 if (tcp->tcp_rwnd_max != 0) 1945 old_max_rwnd = tcp->tcp_rwnd_max; 1946 else 1947 old_max_rwnd = tcp->tcp_rwnd; 1948 1949 /* 1950 * Insist on a receive window that is at least 1951 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid 1952 * funny TCP interactions of Nagle algorithm, SWS avoidance 1953 * and delayed acknowledgement. 1954 */ 1955 rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss); 1956 1957 /* 1958 * If window size info has already been exchanged, TCP should not 1959 * shrink the window. Shrinking window is doable if done carefully. 1960 * We may add that support later. But so far there is not a real 1961 * need to do that. 1962 */ 1963 if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) { 1964 /* MSS may have changed, do a round up again. */ 1965 rwnd = MSS_ROUNDUP(old_max_rwnd, mss); 1966 } 1967 1968 /* 1969 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check 1970 * can be applied even before the window scale option is decided. 1971 */ 1972 max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws; 1973 if (rwnd > max_transmittable_rwnd) { 1974 rwnd = max_transmittable_rwnd - 1975 (max_transmittable_rwnd % mss); 1976 if (rwnd < mss) 1977 rwnd = max_transmittable_rwnd; 1978 /* 1979 * If we're over the limit we may have to back down tcp_rwnd. 1980 * The increment below won't work for us. So we set all three 1981 * here and the increment below will have no effect. 1982 */ 1983 tcp->tcp_rwnd = old_max_rwnd = rwnd; 1984 } 1985 1986 /* 1987 * Increment the current rwnd by the amount the maximum grew (we 1988 * can not overwrite it since we might be in the middle of a 1989 * connection.) 1990 */ 1991 tcp->tcp_rwnd += rwnd - old_max_rwnd; 1992 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 1993 if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 1994 tcp->tcp_cwnd_max = rwnd; 1995 tcp->tcp_rwnd_max = rwnd; 1996 1997 return (rwnd); 1998 } 1999 2000 /* 2001 * Extract option values from a tcp header. We put any found values into the 2002 * tcpopt struct and return a bitmask saying which options were found. 2003 */ 2004 static int 2005 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) 2006 { 2007 uchar_t *endp; 2008 int len; 2009 uint32_t mss; 2010 uchar_t *up = (uchar_t *)tcph; 2011 int found = 0; 2012 int32_t sack_len; 2013 tcp_seq sack_begin, sack_end; 2014 tcp_t *tcp; 2015 2016 endp = up + TCP_HDR_LENGTH(tcph); 2017 up += TCP_MIN_HEADER_LENGTH; 2018 while (up < endp) { 2019 len = endp - up; 2020 switch (*up) { 2021 case TCPOPT_EOL: 2022 break; 2023 2024 case TCPOPT_NOP: 2025 up++; 2026 continue; 2027 2028 case TCPOPT_MAXSEG: 2029 if (len < TCPOPT_MAXSEG_LEN || 2030 up[1] != TCPOPT_MAXSEG_LEN) 2031 break; 2032 2033 mss = BE16_TO_U16(up+2); 2034 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 2035 tcpopt->tcp_opt_mss = mss; 2036 found |= TCP_OPT_MSS_PRESENT; 2037 2038 up += TCPOPT_MAXSEG_LEN; 2039 continue; 2040 2041 case TCPOPT_WSCALE: 2042 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 2043 break; 2044 2045 if (up[2] > TCP_MAX_WINSHIFT) 2046 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 2047 else 2048 tcpopt->tcp_opt_wscale = up[2]; 2049 found |= TCP_OPT_WSCALE_PRESENT; 2050 2051 up += TCPOPT_WS_LEN; 2052 continue; 2053 2054 case TCPOPT_SACK_PERMITTED: 2055 if (len < TCPOPT_SACK_OK_LEN || 2056 up[1] != TCPOPT_SACK_OK_LEN) 2057 break; 2058 found |= TCP_OPT_SACK_OK_PRESENT; 2059 up += TCPOPT_SACK_OK_LEN; 2060 continue; 2061 2062 case TCPOPT_SACK: 2063 if (len <= 2 || up[1] <= 2 || len < up[1]) 2064 break; 2065 2066 /* If TCP is not interested in SACK blks... */ 2067 if ((tcp = tcpopt->tcp) == NULL) { 2068 up += up[1]; 2069 continue; 2070 } 2071 sack_len = up[1] - TCPOPT_HEADER_LEN; 2072 up += TCPOPT_HEADER_LEN; 2073 2074 /* 2075 * If the list is empty, allocate one and assume 2076 * nothing is sack'ed. 2077 */ 2078 assert(tcp->tcp_sack_info != NULL); 2079 if (tcp->tcp_notsack_list == NULL) { 2080 tcp_notsack_update(&(tcp->tcp_notsack_list), 2081 tcp->tcp_suna, tcp->tcp_snxt, 2082 &(tcp->tcp_num_notsack_blk), 2083 &(tcp->tcp_cnt_notsack_list)); 2084 2085 /* 2086 * Make sure tcp_notsack_list is not NULL. 2087 * This happens when kmem_alloc(KM_NOSLEEP) 2088 * returns NULL. 2089 */ 2090 if (tcp->tcp_notsack_list == NULL) { 2091 up += sack_len; 2092 continue; 2093 } 2094 tcp->tcp_fack = tcp->tcp_suna; 2095 } 2096 2097 while (sack_len > 0) { 2098 if (up + 8 > endp) { 2099 up = endp; 2100 break; 2101 } 2102 sack_begin = BE32_TO_U32(up); 2103 up += 4; 2104 sack_end = BE32_TO_U32(up); 2105 up += 4; 2106 sack_len -= 8; 2107 /* 2108 * Bounds checking. Make sure the SACK 2109 * info is within tcp_suna and tcp_snxt. 2110 * If this SACK blk is out of bound, ignore 2111 * it but continue to parse the following 2112 * blks. 2113 */ 2114 if (SEQ_LEQ(sack_end, sack_begin) || 2115 SEQ_LT(sack_begin, tcp->tcp_suna) || 2116 SEQ_GT(sack_end, tcp->tcp_snxt)) { 2117 continue; 2118 } 2119 tcp_notsack_insert(&(tcp->tcp_notsack_list), 2120 sack_begin, sack_end, 2121 &(tcp->tcp_num_notsack_blk), 2122 &(tcp->tcp_cnt_notsack_list)); 2123 if (SEQ_GT(sack_end, tcp->tcp_fack)) { 2124 tcp->tcp_fack = sack_end; 2125 } 2126 } 2127 found |= TCP_OPT_SACK_PRESENT; 2128 continue; 2129 2130 case TCPOPT_TSTAMP: 2131 if (len < TCPOPT_TSTAMP_LEN || 2132 up[1] != TCPOPT_TSTAMP_LEN) 2133 break; 2134 2135 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); 2136 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); 2137 2138 found |= TCP_OPT_TSTAMP_PRESENT; 2139 2140 up += TCPOPT_TSTAMP_LEN; 2141 continue; 2142 2143 default: 2144 if (len <= 1 || len < (int)up[1] || up[1] == 0) 2145 break; 2146 up += up[1]; 2147 continue; 2148 } 2149 break; 2150 } 2151 return (found); 2152 } 2153 2154 /* 2155 * Set the mss associated with a particular tcp based on its current value, 2156 * and a new one passed in. Observe minimums and maximums, and reset 2157 * other state variables that we want to view as multiples of mss. 2158 * 2159 * This function is called in various places mainly because 2160 * 1) Various stuffs, tcp_mss, tcp_cwnd, ... need to be adjusted when the 2161 * other side's SYN/SYN-ACK packet arrives. 2162 * 2) PMTUd may get us a new MSS. 2163 * 3) If the other side stops sending us timestamp option, we need to 2164 * increase the MSS size to use the extra bytes available. 2165 */ 2166 static void 2167 tcp_mss_set(tcp_t *tcp, uint32_t mss) 2168 { 2169 uint32_t mss_max; 2170 2171 mss_max = tcp_mss_max_ipv4; 2172 2173 if (mss < tcp_mss_min) 2174 mss = tcp_mss_min; 2175 if (mss > mss_max) 2176 mss = mss_max; 2177 /* 2178 * Unless naglim has been set by our client to 2179 * a non-mss value, force naglim to track mss. 2180 * This can help to aggregate small writes. 2181 */ 2182 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) 2183 tcp->tcp_naglim = mss; 2184 /* 2185 * TCP should be able to buffer at least 4 MSS data for obvious 2186 * performance reason. 2187 */ 2188 if ((mss << 2) > tcp->tcp_xmit_hiwater) 2189 tcp->tcp_xmit_hiwater = mss << 2; 2190 tcp->tcp_mss = mss; 2191 /* 2192 * Initialize cwnd according to draft-floyd-incr-init-win-01.txt. 2193 * Previously, we use tcp_slow_start_initial to control the size 2194 * of the initial cwnd. Now, when tcp_slow_start_initial * mss 2195 * is smaller than the cwnd calculated from the formula suggested in 2196 * the draft, we use tcp_slow_start_initial * mss as the cwnd. 2197 * Otherwise, use the cwnd from the draft's formula. The default 2198 * of tcp_slow_start_initial is 2. 2199 */ 2200 tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss, 2201 MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))); 2202 tcp->tcp_cwnd_cnt = 0; 2203 } 2204 2205 /* 2206 * Process all TCP option in SYN segment. 2207 * 2208 * This function sets up the correct tcp_mss value according to the 2209 * MSS option value and our header size. It also sets up the window scale 2210 * and timestamp values, and initialize SACK info blocks. But it does not 2211 * change receive window size after setting the tcp_mss value. The caller 2212 * should do the appropriate change. 2213 */ 2214 void 2215 tcp_process_options(tcp_t *tcp, tcph_t *tcph) 2216 { 2217 int options; 2218 tcp_opt_t tcpopt; 2219 uint32_t mss_max; 2220 char *tmp_tcph; 2221 2222 tcpopt.tcp = NULL; 2223 options = tcp_parse_options(tcph, &tcpopt); 2224 2225 /* 2226 * Process MSS option. Note that MSS option value does not account 2227 * for IP or TCP options. This means that it is equal to MTU - minimum 2228 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 2229 * IPv6. 2230 */ 2231 if (!(options & TCP_OPT_MSS_PRESENT)) { 2232 tcpopt.tcp_opt_mss = tcp_mss_def_ipv4; 2233 } else { 2234 if (tcp->tcp_ipversion == IPV4_VERSION) 2235 mss_max = tcp_mss_max_ipv4; 2236 if (tcpopt.tcp_opt_mss < tcp_mss_min) 2237 tcpopt.tcp_opt_mss = tcp_mss_min; 2238 else if (tcpopt.tcp_opt_mss > mss_max) 2239 tcpopt.tcp_opt_mss = mss_max; 2240 } 2241 2242 /* Process Window Scale option. */ 2243 if (options & TCP_OPT_WSCALE_PRESENT) { 2244 tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 2245 tcp->tcp_snd_ws_ok = B_TRUE; 2246 } else { 2247 tcp->tcp_snd_ws = B_FALSE; 2248 tcp->tcp_snd_ws_ok = B_FALSE; 2249 tcp->tcp_rcv_ws = B_FALSE; 2250 } 2251 2252 /* Process Timestamp option. */ 2253 if ((options & TCP_OPT_TSTAMP_PRESENT) && 2254 (tcp->tcp_snd_ts_ok || !tcp->tcp_active_open)) { 2255 tmp_tcph = (char *)tcp->tcp_tcph; 2256 2257 tcp->tcp_snd_ts_ok = B_TRUE; 2258 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 2259 tcp->tcp_last_rcv_lbolt = prom_gettime(); 2260 assert(OK_32PTR(tmp_tcph)); 2261 assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 2262 2263 /* Fill in our template header with basic timestamp option. */ 2264 tmp_tcph += tcp->tcp_tcp_hdr_len; 2265 tmp_tcph[0] = TCPOPT_NOP; 2266 tmp_tcph[1] = TCPOPT_NOP; 2267 tmp_tcph[2] = TCPOPT_TSTAMP; 2268 tmp_tcph[3] = TCPOPT_TSTAMP_LEN; 2269 tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN; 2270 tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN; 2271 tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4); 2272 } else { 2273 tcp->tcp_snd_ts_ok = B_FALSE; 2274 } 2275 2276 /* 2277 * Process SACK options. If SACK is enabled for this connection, 2278 * then allocate the SACK info structure. 2279 */ 2280 if ((options & TCP_OPT_SACK_OK_PRESENT) && 2281 (tcp->tcp_snd_sack_ok || 2282 (tcp_sack_permitted != 0 && !tcp->tcp_active_open))) { 2283 /* This should be true only in the passive case. */ 2284 if (tcp->tcp_sack_info == NULL) { 2285 tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc( 2286 sizeof (tcp_sack_info_t)); 2287 } 2288 if (tcp->tcp_sack_info == NULL) { 2289 tcp->tcp_snd_sack_ok = B_FALSE; 2290 } else { 2291 tcp->tcp_snd_sack_ok = B_TRUE; 2292 if (tcp->tcp_snd_ts_ok) { 2293 tcp->tcp_max_sack_blk = 3; 2294 } else { 2295 tcp->tcp_max_sack_blk = 4; 2296 } 2297 } 2298 } else { 2299 /* 2300 * Resetting tcp_snd_sack_ok to B_FALSE so that 2301 * no SACK info will be used for this 2302 * connection. This assumes that SACK usage 2303 * permission is negotiated. This may need 2304 * to be changed once this is clarified. 2305 */ 2306 if (tcp->tcp_sack_info != NULL) { 2307 bkmem_free((caddr_t)tcp->tcp_sack_info, 2308 sizeof (tcp_sack_info_t)); 2309 tcp->tcp_sack_info = NULL; 2310 } 2311 tcp->tcp_snd_sack_ok = B_FALSE; 2312 } 2313 2314 /* 2315 * Now we know the exact TCP/IP header length, subtract 2316 * that from tcp_mss to get our side's MSS. 2317 */ 2318 tcp->tcp_mss -= tcp->tcp_hdr_len; 2319 /* 2320 * Here we assume that the other side's header size will be equal to 2321 * our header size. We calculate the real MSS accordingly. Need to 2322 * take into additional stuffs IPsec puts in. 2323 * 2324 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) 2325 */ 2326 tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len - 2327 (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH); 2328 2329 /* 2330 * Set MSS to the smaller one of both ends of the connection. 2331 * We should not have called tcp_mss_set() before, but our 2332 * side of the MSS should have been set to a proper value 2333 * by tcp_adapt_ire(). tcp_mss_set() will also set up the 2334 * STREAM head parameters properly. 2335 * 2336 * If we have a larger-than-16-bit window but the other side 2337 * didn't want to do window scale, tcp_rwnd_set() will take 2338 * care of that. 2339 */ 2340 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); 2341 } 2342 2343 /* 2344 * This function does PAWS protection check. Returns B_TRUE if the 2345 * segment passes the PAWS test, else returns B_FALSE. 2346 */ 2347 boolean_t 2348 tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) 2349 { 2350 uint8_t flags; 2351 int options; 2352 uint8_t *up; 2353 2354 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 2355 /* 2356 * If timestamp option is aligned nicely, get values inline, 2357 * otherwise call general routine to parse. Only do that 2358 * if timestamp is the only option. 2359 */ 2360 if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH + 2361 TCPOPT_REAL_TS_LEN && 2362 OK_32PTR((up = ((uint8_t *)tcph) + 2363 TCP_MIN_HEADER_LENGTH)) && 2364 *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { 2365 tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); 2366 tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); 2367 2368 options = TCP_OPT_TSTAMP_PRESENT; 2369 } else { 2370 if (tcp->tcp_snd_sack_ok) { 2371 tcpoptp->tcp = tcp; 2372 } else { 2373 tcpoptp->tcp = NULL; 2374 } 2375 options = tcp_parse_options(tcph, tcpoptp); 2376 } 2377 2378 if (options & TCP_OPT_TSTAMP_PRESENT) { 2379 /* 2380 * Do PAWS per RFC 1323 section 4.2. Accept RST 2381 * regardless of the timestamp, page 18 RFC 1323.bis. 2382 */ 2383 if ((flags & TH_RST) == 0 && 2384 TSTMP_LT(tcpoptp->tcp_opt_ts_val, 2385 tcp->tcp_ts_recent)) { 2386 if (TSTMP_LT(prom_gettime(), 2387 tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) { 2388 /* This segment is not acceptable. */ 2389 return (B_FALSE); 2390 } else { 2391 /* 2392 * Connection has been idle for 2393 * too long. Reset the timestamp 2394 * and assume the segment is valid. 2395 */ 2396 tcp->tcp_ts_recent = 2397 tcpoptp->tcp_opt_ts_val; 2398 } 2399 } 2400 } else { 2401 /* 2402 * If we don't get a timestamp on every packet, we 2403 * figure we can't really trust 'em, so we stop sending 2404 * and parsing them. 2405 */ 2406 tcp->tcp_snd_ts_ok = B_FALSE; 2407 2408 tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 2409 tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 2410 tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); 2411 tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); 2412 if (tcp->tcp_snd_sack_ok) { 2413 assert(tcp->tcp_sack_info != NULL); 2414 tcp->tcp_max_sack_blk = 4; 2415 } 2416 } 2417 return (B_TRUE); 2418 } 2419 2420 /* 2421 * tcp_get_seg_mp() is called to get the pointer to a segment in the 2422 * send queue which starts at the given seq. no. 2423 * 2424 * Parameters: 2425 * tcp_t *tcp: the tcp instance pointer. 2426 * uint32_t seq: the starting seq. no of the requested segment. 2427 * int32_t *off: after the execution, *off will be the offset to 2428 * the returned mblk which points to the requested seq no. 2429 * 2430 * Return: 2431 * A mblk_t pointer pointing to the requested segment in send queue. 2432 */ 2433 static mblk_t * 2434 tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 2435 { 2436 int32_t cnt; 2437 mblk_t *mp; 2438 2439 /* Defensive coding. Make sure we don't send incorrect data. */ 2440 if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt) || 2441 off == NULL) { 2442 return (NULL); 2443 } 2444 cnt = seq - tcp->tcp_suna; 2445 mp = tcp->tcp_xmit_head; 2446 while (cnt > 0 && mp) { 2447 cnt -= mp->b_wptr - mp->b_rptr; 2448 if (cnt < 0) { 2449 cnt += mp->b_wptr - mp->b_rptr; 2450 break; 2451 } 2452 mp = mp->b_cont; 2453 } 2454 assert(mp != NULL); 2455 *off = cnt; 2456 return (mp); 2457 } 2458 2459 /* 2460 * This function handles all retransmissions if SACK is enabled for this 2461 * connection. First it calculates how many segments can be retransmitted 2462 * based on tcp_pipe. Then it goes thru the notsack list to find eligible 2463 * segments. A segment is eligible if sack_cnt for that segment is greater 2464 * than or equal tcp_dupack_fast_retransmit. After it has retransmitted 2465 * all eligible segments, it checks to see if TCP can send some new segments 2466 * (fast recovery). If it can, it returns 1. Otherwise it returns 0. 2467 * 2468 * Parameters: 2469 * tcp_t *tcp: the tcp structure of the connection. 2470 * 2471 * Return: 2472 * 1 if the pipe is not full (new data can be sent), 0 otherwise 2473 */ 2474 static int32_t 2475 tcp_sack_rxmit(tcp_t *tcp, int sock_id) 2476 { 2477 notsack_blk_t *notsack_blk; 2478 int32_t usable_swnd; 2479 int32_t mss; 2480 uint32_t seg_len; 2481 mblk_t *xmit_mp; 2482 2483 assert(tcp->tcp_sack_info != NULL); 2484 assert(tcp->tcp_notsack_list != NULL); 2485 assert(tcp->tcp_rexmit == B_FALSE); 2486 2487 /* Defensive coding in case there is a bug... */ 2488 if (tcp->tcp_notsack_list == NULL) { 2489 return (0); 2490 } 2491 notsack_blk = tcp->tcp_notsack_list; 2492 mss = tcp->tcp_mss; 2493 2494 /* 2495 * Limit the num of outstanding data in the network to be 2496 * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 2497 */ 2498 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 2499 2500 /* At least retransmit 1 MSS of data. */ 2501 if (usable_swnd <= 0) { 2502 usable_swnd = mss; 2503 } 2504 2505 /* Make sure no new RTT samples will be taken. */ 2506 tcp->tcp_csuna = tcp->tcp_snxt; 2507 2508 notsack_blk = tcp->tcp_notsack_list; 2509 while (usable_swnd > 0) { 2510 mblk_t *snxt_mp, *tmp_mp; 2511 tcp_seq begin = tcp->tcp_sack_snxt; 2512 tcp_seq end; 2513 int32_t off; 2514 2515 for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { 2516 if (SEQ_GT(notsack_blk->end, begin) && 2517 (notsack_blk->sack_cnt >= 2518 tcp_dupack_fast_retransmit)) { 2519 end = notsack_blk->end; 2520 if (SEQ_LT(begin, notsack_blk->begin)) { 2521 begin = notsack_blk->begin; 2522 } 2523 break; 2524 } 2525 } 2526 /* 2527 * All holes are filled. Manipulate tcp_cwnd to send more 2528 * if we can. Note that after the SACK recovery, tcp_cwnd is 2529 * set to tcp_cwnd_ssthresh. 2530 */ 2531 if (notsack_blk == NULL) { 2532 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 2533 if (usable_swnd <= 0) { 2534 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; 2535 assert(tcp->tcp_cwnd > 0); 2536 return (0); 2537 } else { 2538 usable_swnd = usable_swnd / mss; 2539 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + 2540 MAX(usable_swnd * mss, mss); 2541 return (1); 2542 } 2543 } 2544 2545 /* 2546 * Note that we may send more than usable_swnd allows here 2547 * because of round off, but no more than 1 MSS of data. 2548 */ 2549 seg_len = end - begin; 2550 if (seg_len > mss) 2551 seg_len = mss; 2552 snxt_mp = tcp_get_seg_mp(tcp, begin, &off); 2553 assert(snxt_mp != NULL); 2554 /* This should not happen. Defensive coding again... */ 2555 if (snxt_mp == NULL) { 2556 return (0); 2557 } 2558 2559 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, 2560 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); 2561 2562 if (xmit_mp == NULL) 2563 return (0); 2564 2565 usable_swnd -= seg_len; 2566 tcp->tcp_pipe += seg_len; 2567 tcp->tcp_sack_snxt = begin + seg_len; 2568 TCP_DUMP_PACKET("tcp_sack_rxmit", xmit_mp); 2569 (void) ipv4_tcp_output(sock_id, xmit_mp); 2570 freeb(xmit_mp); 2571 2572 /* 2573 * Update the send timestamp to avoid false retransmission. 2574 * Note. use uintptr_t to suppress the gcc warning. 2575 */ 2576 snxt_mp->b_prev = (mblk_t *)(uintptr_t)prom_gettime(); 2577 2578 BUMP_MIB(tcp_mib.tcpRetransSegs); 2579 UPDATE_MIB(tcp_mib.tcpRetransBytes, seg_len); 2580 BUMP_MIB(tcp_mib.tcpOutSackRetransSegs); 2581 /* 2582 * Update tcp_rexmit_max to extend this SACK recovery phase. 2583 * This happens when new data sent during fast recovery is 2584 * also lost. If TCP retransmits those new data, it needs 2585 * to extend SACK recover phase to avoid starting another 2586 * fast retransmit/recovery unnecessarily. 2587 */ 2588 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { 2589 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; 2590 } 2591 } 2592 return (0); 2593 } 2594 2595 static void 2596 tcp_rput_data(tcp_t *tcp, mblk_t *mp, int sock_id) 2597 { 2598 uchar_t *rptr; 2599 struct ip *iph; 2600 tcp_t *tcp1; 2601 tcpha_t *tcph; 2602 uint32_t seg_ack; 2603 int seg_len; 2604 uint_t ip_hdr_len; 2605 uint32_t seg_seq; 2606 mblk_t *mp1; 2607 uint_t flags; 2608 uint32_t new_swnd = 0; 2609 int mss; 2610 boolean_t ofo_seg = B_FALSE; /* Out of order segment */ 2611 int32_t gap; 2612 int32_t rgap; 2613 tcp_opt_t tcpopt; 2614 int32_t bytes_acked; 2615 int npkt; 2616 uint32_t cwnd; 2617 uint32_t add; 2618 2619 #ifdef DEBUG 2620 printf("tcp_rput_data sock %d mp %x mp_datap %x #################\n", 2621 sock_id, mp, mp->b_datap); 2622 #endif 2623 2624 /* Dump the packet when debugging. */ 2625 TCP_DUMP_PACKET("tcp_rput_data", mp); 2626 2627 assert(OK_32PTR(mp->b_rptr)); 2628 2629 rptr = mp->b_rptr; 2630 iph = (struct ip *)rptr; 2631 ip_hdr_len = IPH_HDR_LENGTH(rptr); 2632 if (ip_hdr_len != IP_SIMPLE_HDR_LENGTH) { 2633 #ifdef DEBUG 2634 printf("Not simple IP header\n"); 2635 #endif 2636 /* We cannot handle IP option yet... */ 2637 tcp_drops++; 2638 freeb(mp); 2639 return; 2640 } 2641 /* The TCP header must be aligned. */ 2642 tcph = (tcpha_t *)&rptr[ip_hdr_len]; 2643 seg_seq = ntohl(tcph->tha_seq); 2644 seg_ack = ntohl(tcph->tha_ack); 2645 assert((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 2646 seg_len = (int)(mp->b_wptr - rptr) - 2647 (ip_hdr_len + TCP_HDR_LENGTH(((tcph_t *)tcph))); 2648 /* In inetboot, b_cont should always be NULL. */ 2649 assert(mp->b_cont == NULL); 2650 2651 /* Verify the checksum. */ 2652 if (tcp_verify_cksum(mp) < 0) { 2653 #ifdef DEBUG 2654 printf("tcp_rput_data: wrong cksum\n"); 2655 #endif 2656 freemsg(mp); 2657 return; 2658 } 2659 2660 /* 2661 * This segment is not for us, try to find its 2662 * intended receiver. 2663 */ 2664 if (tcp == NULL || 2665 tcph->tha_lport != tcp->tcp_fport || 2666 tcph->tha_fport != tcp->tcp_lport || 2667 iph->ip_src.s_addr != tcp->tcp_remote || 2668 iph->ip_dst.s_addr != tcp->tcp_bound_source) { 2669 #ifdef DEBUG 2670 printf("tcp_rput_data: not for us, state %d\n", 2671 tcp->tcp_state); 2672 #endif 2673 /* 2674 * First try to find a established connection. If none 2675 * is found, look for a listener. 2676 * 2677 * If a listener is found, we need to check to see if the 2678 * incoming segment is for one of its eagers. If it is, 2679 * give it to the eager. If not, listener should take care 2680 * of it. 2681 */ 2682 if ((tcp1 = tcp_lookup_ipv4(iph, tcph, TCPS_SYN_SENT, 2683 &sock_id)) != NULL || 2684 (tcp1 = tcp_lookup_listener_ipv4(iph->ip_dst.s_addr, 2685 tcph->tha_fport, &sock_id)) != NULL) { 2686 if (tcp1->tcp_state == TCPS_LISTEN) { 2687 if ((tcp = tcp_lookup_eager_ipv4(tcp1, 2688 iph, tcph)) == NULL) { 2689 /* No eager... sent to listener */ 2690 #ifdef DEBUG 2691 printf("found the listener: %s\n", 2692 tcp_display(tcp1, NULL, 2693 DISP_ADDR_AND_PORT)); 2694 #endif 2695 tcp = tcp1; 2696 } 2697 #ifdef DEBUG 2698 else { 2699 printf("found the eager: %s\n", 2700 tcp_display(tcp, NULL, 2701 DISP_ADDR_AND_PORT)); 2702 } 2703 #endif 2704 } else { 2705 /* Non listener found... */ 2706 #ifdef DEBUG 2707 printf("found the connection: %s\n", 2708 tcp_display(tcp1, NULL, 2709 DISP_ADDR_AND_PORT)); 2710 #endif 2711 tcp = tcp1; 2712 } 2713 } else { 2714 /* 2715 * No connection for this segment... 2716 * Send a RST to the other side. 2717 */ 2718 tcp_xmit_listeners_reset(sock_id, mp, ip_hdr_len); 2719 return; 2720 } 2721 } 2722 2723 flags = tcph->tha_flags & 0xFF; 2724 BUMP_MIB(tcp_mib.tcpInSegs); 2725 if (tcp->tcp_state == TCPS_TIME_WAIT) { 2726 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 2727 seg_len, (tcph_t *)tcph, sock_id); 2728 return; 2729 } 2730 /* 2731 * From this point we can assume that the tcp is not compressed, 2732 * since we would have branched off to tcp_time_wait_processing() 2733 * in such a case. 2734 */ 2735 assert(tcp != NULL && tcp->tcp_state != TCPS_TIME_WAIT); 2736 2737 /* 2738 * After this point, we know we have the correct TCP, so update 2739 * the receive time. 2740 */ 2741 tcp->tcp_last_recv_time = prom_gettime(); 2742 2743 /* In inetboot, we do not handle urgent pointer... */ 2744 if (flags & TH_URG) { 2745 freemsg(mp); 2746 DEBUG_1("tcp_rput_data(%d): received segment with urgent " 2747 "pointer\n", sock_id); 2748 tcp_drops++; 2749 return; 2750 } 2751 2752 switch (tcp->tcp_state) { 2753 case TCPS_LISTEN: 2754 if ((flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) { 2755 if (flags & TH_RST) { 2756 freemsg(mp); 2757 return; 2758 } 2759 if (flags & TH_ACK) { 2760 tcp_xmit_early_reset("TCPS_LISTEN-TH_ACK", 2761 sock_id, mp, seg_ack, 0, TH_RST, 2762 ip_hdr_len); 2763 return; 2764 } 2765 if (!(flags & TH_SYN)) { 2766 freemsg(mp); 2767 return; 2768 } 2769 printf("tcp_rput_data: %d\n", __LINE__); 2770 prom_panic("inetboot"); 2771 } 2772 if (tcp->tcp_conn_req_max > 0) { 2773 tcp = tcp_conn_request(tcp, mp, sock_id, ip_hdr_len); 2774 if (tcp == NULL) { 2775 freemsg(mp); 2776 return; 2777 } 2778 #ifdef DEBUG 2779 printf("tcp_rput_data: new tcp created\n"); 2780 #endif 2781 } 2782 tcp->tcp_irs = seg_seq; 2783 tcp->tcp_rack = seg_seq; 2784 tcp->tcp_rnxt = seg_seq + 1; 2785 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 2786 BUMP_MIB(tcp_mib.tcpPassiveOpens); 2787 goto syn_rcvd; 2788 case TCPS_SYN_SENT: 2789 if (flags & TH_ACK) { 2790 /* 2791 * Note that our stack cannot send data before a 2792 * connection is established, therefore the 2793 * following check is valid. Otherwise, it has 2794 * to be changed. 2795 */ 2796 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 2797 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 2798 if (flags & TH_RST) { 2799 freemsg(mp); 2800 return; 2801 } 2802 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 2803 tcp, mp, seg_ack, 0, TH_RST, 2804 ip_hdr_len, sock_id); 2805 return; 2806 } 2807 assert(tcp->tcp_suna + 1 == seg_ack); 2808 } 2809 if (flags & TH_RST) { 2810 freemsg(mp); 2811 if (flags & TH_ACK) { 2812 tcp_clean_death(sock_id, tcp, ECONNREFUSED); 2813 } 2814 return; 2815 } 2816 if (!(flags & TH_SYN)) { 2817 freemsg(mp); 2818 return; 2819 } 2820 2821 /* Process all TCP options. */ 2822 tcp_process_options(tcp, (tcph_t *)tcph); 2823 /* 2824 * The following changes our rwnd to be a multiple of the 2825 * MIN(peer MSS, our MSS) for performance reason. 2826 */ 2827 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rwnd, 2828 tcp->tcp_mss)); 2829 2830 /* Is the other end ECN capable? */ 2831 if (tcp->tcp_ecn_ok) { 2832 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 2833 tcp->tcp_ecn_ok = B_FALSE; 2834 } 2835 } 2836 /* 2837 * Clear ECN flags because it may interfere with later 2838 * processing. 2839 */ 2840 flags &= ~(TH_ECE|TH_CWR); 2841 2842 tcp->tcp_irs = seg_seq; 2843 tcp->tcp_rack = seg_seq; 2844 tcp->tcp_rnxt = seg_seq + 1; 2845 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 2846 2847 if (flags & TH_ACK) { 2848 /* One for the SYN */ 2849 tcp->tcp_suna = tcp->tcp_iss + 1; 2850 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 2851 tcp->tcp_state = TCPS_ESTABLISHED; 2852 2853 /* 2854 * If SYN was retransmitted, need to reset all 2855 * retransmission info. This is because this 2856 * segment will be treated as a dup ACK. 2857 */ 2858 if (tcp->tcp_rexmit) { 2859 tcp->tcp_rexmit = B_FALSE; 2860 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 2861 tcp->tcp_rexmit_max = tcp->tcp_snxt; 2862 tcp->tcp_snd_burst = TCP_CWND_NORMAL; 2863 2864 /* 2865 * Set tcp_cwnd back to 1 MSS, per 2866 * recommendation from 2867 * draft-floyd-incr-init-win-01.txt, 2868 * Increasing TCP's Initial Window. 2869 */ 2870 tcp->tcp_cwnd = tcp->tcp_mss; 2871 } 2872 2873 tcp->tcp_swl1 = seg_seq; 2874 tcp->tcp_swl2 = seg_ack; 2875 2876 new_swnd = BE16_TO_U16(((tcph_t *)tcph)->th_win); 2877 tcp->tcp_swnd = new_swnd; 2878 if (new_swnd > tcp->tcp_max_swnd) 2879 tcp->tcp_max_swnd = new_swnd; 2880 2881 /* 2882 * Always send the three-way handshake ack immediately 2883 * in order to make the connection complete as soon as 2884 * possible on the accepting host. 2885 */ 2886 flags |= TH_ACK_NEEDED; 2887 /* 2888 * Check to see if there is data to be sent. If 2889 * yes, set the transmit flag. Then check to see 2890 * if received data processing needs to be done. 2891 * If not, go straight to xmit_check. This short 2892 * cut is OK as we don't support T/TCP. 2893 */ 2894 if (tcp->tcp_unsent) 2895 flags |= TH_XMIT_NEEDED; 2896 2897 if (seg_len == 0) { 2898 freemsg(mp); 2899 goto xmit_check; 2900 } 2901 2902 flags &= ~TH_SYN; 2903 seg_seq++; 2904 break; 2905 } 2906 syn_rcvd: 2907 tcp->tcp_state = TCPS_SYN_RCVD; 2908 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 2909 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 2910 if (mp1 != NULL) { 2911 TCP_DUMP_PACKET("tcp_rput_data replying SYN", mp1); 2912 (void) ipv4_tcp_output(sock_id, mp1); 2913 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 2914 freeb(mp1); 2915 /* 2916 * Let's wait till our SYN has been ACKED since we 2917 * don't have a timer. 2918 */ 2919 if (tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED) < 0) { 2920 freemsg(mp); 2921 return; 2922 } 2923 } 2924 freemsg(mp); 2925 return; 2926 default: 2927 break; 2928 } 2929 mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH((tcph_t *)tcph); 2930 new_swnd = ntohs(tcph->tha_win) << 2931 ((flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); 2932 mss = tcp->tcp_mss; 2933 2934 if (tcp->tcp_snd_ts_ok) { 2935 if (!tcp_paws_check(tcp, (tcph_t *)tcph, &tcpopt)) { 2936 /* 2937 * This segment is not acceptable. 2938 * Drop it and send back an ACK. 2939 */ 2940 freemsg(mp); 2941 flags |= TH_ACK_NEEDED; 2942 goto ack_check; 2943 } 2944 } else if (tcp->tcp_snd_sack_ok) { 2945 assert(tcp->tcp_sack_info != NULL); 2946 tcpopt.tcp = tcp; 2947 /* 2948 * SACK info in already updated in tcp_parse_options. Ignore 2949 * all other TCP options... 2950 */ 2951 (void) tcp_parse_options((tcph_t *)tcph, &tcpopt); 2952 } 2953 try_again:; 2954 gap = seg_seq - tcp->tcp_rnxt; 2955 rgap = tcp->tcp_rwnd - (gap + seg_len); 2956 /* 2957 * gap is the amount of sequence space between what we expect to see 2958 * and what we got for seg_seq. A positive value for gap means 2959 * something got lost. A negative value means we got some old stuff. 2960 */ 2961 if (gap < 0) { 2962 /* Old stuff present. Is the SYN in there? */ 2963 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && 2964 (seg_len != 0)) { 2965 flags &= ~TH_SYN; 2966 seg_seq++; 2967 /* Recompute the gaps after noting the SYN. */ 2968 goto try_again; 2969 } 2970 BUMP_MIB(tcp_mib.tcpInDataDupSegs); 2971 UPDATE_MIB(tcp_mib.tcpInDataDupBytes, 2972 (seg_len > -gap ? -gap : seg_len)); 2973 /* Remove the old stuff from seg_len. */ 2974 seg_len += gap; 2975 /* 2976 * Anything left? 2977 * Make sure to check for unack'd FIN when rest of data 2978 * has been previously ack'd. 2979 */ 2980 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 2981 /* 2982 * Resets are only valid if they lie within our offered 2983 * window. If the RST bit is set, we just ignore this 2984 * segment. 2985 */ 2986 if (flags & TH_RST) { 2987 freemsg(mp); 2988 return; 2989 } 2990 2991 /* 2992 * This segment is "unacceptable". None of its 2993 * sequence space lies within our advertized window. 2994 * 2995 * Adjust seg_len to the original value for tracing. 2996 */ 2997 seg_len -= gap; 2998 #ifdef DEBUG 2999 printf("tcp_rput: unacceptable, gap %d, rgap " 3000 "%d, flags 0x%x, seg_seq %u, seg_ack %u, " 3001 "seg_len %d, rnxt %u, snxt %u, %s", 3002 gap, rgap, flags, seg_seq, seg_ack, 3003 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 3004 tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 3005 #endif 3006 3007 /* 3008 * Arrange to send an ACK in response to the 3009 * unacceptable segment per RFC 793 page 69. There 3010 * is only one small difference between ours and the 3011 * acceptability test in the RFC - we accept ACK-only 3012 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 3013 * will be generated. 3014 * 3015 * Note that we have to ACK an ACK-only packet at least 3016 * for stacks that send 0-length keep-alives with 3017 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 3018 * section 4.2.3.6. As long as we don't ever generate 3019 * an unacceptable packet in response to an incoming 3020 * packet that is unacceptable, it should not cause 3021 * "ACK wars". 3022 */ 3023 flags |= TH_ACK_NEEDED; 3024 3025 /* 3026 * Continue processing this segment in order to use the 3027 * ACK information it contains, but skip all other 3028 * sequence-number processing. Processing the ACK 3029 * information is necessary in order to 3030 * re-synchronize connections that may have lost 3031 * synchronization. 3032 * 3033 * We clear seg_len and flag fields related to 3034 * sequence number processing as they are not 3035 * to be trusted for an unacceptable segment. 3036 */ 3037 seg_len = 0; 3038 flags &= ~(TH_SYN | TH_FIN | TH_URG); 3039 goto process_ack; 3040 } 3041 3042 /* Fix seg_seq, and chew the gap off the front. */ 3043 seg_seq = tcp->tcp_rnxt; 3044 do { 3045 mblk_t *mp2; 3046 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 3047 (uintptr_t)UINT_MAX); 3048 gap += (uint_t)(mp->b_wptr - mp->b_rptr); 3049 if (gap > 0) { 3050 mp->b_rptr = mp->b_wptr - gap; 3051 break; 3052 } 3053 mp2 = mp; 3054 mp = mp->b_cont; 3055 freeb(mp2); 3056 } while (gap < 0); 3057 } 3058 /* 3059 * rgap is the amount of stuff received out of window. A negative 3060 * value is the amount out of window. 3061 */ 3062 if (rgap < 0) { 3063 mblk_t *mp2; 3064 3065 if (tcp->tcp_rwnd == 0) 3066 BUMP_MIB(tcp_mib.tcpInWinProbe); 3067 else { 3068 BUMP_MIB(tcp_mib.tcpInDataPastWinSegs); 3069 UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap); 3070 } 3071 3072 /* 3073 * seg_len does not include the FIN, so if more than 3074 * just the FIN is out of window, we act like we don't 3075 * see it. (If just the FIN is out of window, rgap 3076 * will be zero and we will go ahead and acknowledge 3077 * the FIN.) 3078 */ 3079 flags &= ~TH_FIN; 3080 3081 /* Fix seg_len and make sure there is something left. */ 3082 seg_len += rgap; 3083 if (seg_len <= 0) { 3084 /* 3085 * Resets are only valid if they lie within our offered 3086 * window. If the RST bit is set, we just ignore this 3087 * segment. 3088 */ 3089 if (flags & TH_RST) { 3090 freemsg(mp); 3091 return; 3092 } 3093 3094 /* Per RFC 793, we need to send back an ACK. */ 3095 flags |= TH_ACK_NEEDED; 3096 3097 /* 3098 * If this is a zero window probe, continue to 3099 * process the ACK part. But we need to set seg_len 3100 * to 0 to avoid data processing. Otherwise just 3101 * drop the segment and send back an ACK. 3102 */ 3103 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 3104 flags &= ~(TH_SYN | TH_URG); 3105 seg_len = 0; 3106 /* Let's see if we can update our rwnd */ 3107 tcp_rcv_drain(sock_id, tcp); 3108 goto process_ack; 3109 } else { 3110 freemsg(mp); 3111 goto ack_check; 3112 } 3113 } 3114 /* Pitch out of window stuff off the end. */ 3115 rgap = seg_len; 3116 mp2 = mp; 3117 do { 3118 assert((uintptr_t)(mp2->b_wptr - 3119 mp2->b_rptr) <= (uintptr_t)INT_MAX); 3120 rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 3121 if (rgap < 0) { 3122 mp2->b_wptr += rgap; 3123 if ((mp1 = mp2->b_cont) != NULL) { 3124 mp2->b_cont = NULL; 3125 freemsg(mp1); 3126 } 3127 break; 3128 } 3129 } while ((mp2 = mp2->b_cont) != NULL); 3130 } 3131 ok:; 3132 /* 3133 * TCP should check ECN info for segments inside the window only. 3134 * Therefore the check should be done here. 3135 */ 3136 if (tcp->tcp_ecn_ok) { 3137 uchar_t tos = ((struct ip *)rptr)->ip_tos; 3138 3139 if (flags & TH_CWR) { 3140 tcp->tcp_ecn_echo_on = B_FALSE; 3141 } 3142 /* 3143 * Note that both ECN_CE and CWR can be set in the 3144 * same segment. In this case, we once again turn 3145 * on ECN_ECHO. 3146 */ 3147 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 3148 tcp->tcp_ecn_echo_on = B_TRUE; 3149 } 3150 } 3151 3152 /* 3153 * Check whether we can update tcp_ts_recent. This test is 3154 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 3155 * Extensions for High Performance: An Update", Internet Draft. 3156 */ 3157 if (tcp->tcp_snd_ts_ok && 3158 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 3159 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 3160 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 3161 tcp->tcp_last_rcv_lbolt = prom_gettime(); 3162 } 3163 3164 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 3165 /* 3166 * FIN in an out of order segment. We record this in 3167 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. 3168 * Clear the FIN so that any check on FIN flag will fail. 3169 * Remember that FIN also counts in the sequence number 3170 * space. So we need to ack out of order FIN only segments. 3171 */ 3172 if (flags & TH_FIN) { 3173 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 3174 tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 3175 flags &= ~TH_FIN; 3176 flags |= TH_ACK_NEEDED; 3177 } 3178 if (seg_len > 0) { 3179 /* Fill in the SACK blk list. */ 3180 if (tcp->tcp_snd_sack_ok) { 3181 assert(tcp->tcp_sack_info != NULL); 3182 tcp_sack_insert(tcp->tcp_sack_list, 3183 seg_seq, seg_seq + seg_len, 3184 &(tcp->tcp_num_sack_blk)); 3185 } 3186 3187 /* 3188 * Attempt reassembly and see if we have something 3189 * ready to go. 3190 */ 3191 mp = tcp_reass(tcp, mp, seg_seq); 3192 /* Always ack out of order packets */ 3193 flags |= TH_ACK_NEEDED | TH_PUSH; 3194 if (mp != NULL) { 3195 assert((uintptr_t)(mp->b_wptr - 3196 mp->b_rptr) <= (uintptr_t)INT_MAX); 3197 seg_len = mp->b_cont ? msgdsize(mp) : 3198 (int)(mp->b_wptr - mp->b_rptr); 3199 seg_seq = tcp->tcp_rnxt; 3200 /* 3201 * A gap is filled and the seq num and len 3202 * of the gap match that of a previously 3203 * received FIN, put the FIN flag back in. 3204 */ 3205 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3206 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3207 flags |= TH_FIN; 3208 tcp->tcp_valid_bits &= 3209 ~TCP_OFO_FIN_VALID; 3210 } 3211 } else { 3212 /* 3213 * Keep going even with NULL mp. 3214 * There may be a useful ACK or something else 3215 * we don't want to miss. 3216 * 3217 * But TCP should not perform fast retransmit 3218 * because of the ack number. TCP uses 3219 * seg_len == 0 to determine if it is a pure 3220 * ACK. And this is not a pure ACK. 3221 */ 3222 seg_len = 0; 3223 ofo_seg = B_TRUE; 3224 } 3225 } 3226 } else if (seg_len > 0) { 3227 BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 3228 UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 3229 /* 3230 * If an out of order FIN was received before, and the seq 3231 * num and len of the new segment match that of the FIN, 3232 * put the FIN flag back in. 3233 */ 3234 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3235 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3236 flags |= TH_FIN; 3237 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 3238 } 3239 } 3240 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 3241 if (flags & TH_RST) { 3242 freemsg(mp); 3243 switch (tcp->tcp_state) { 3244 case TCPS_SYN_RCVD: 3245 (void) tcp_clean_death(sock_id, tcp, ECONNREFUSED); 3246 break; 3247 case TCPS_ESTABLISHED: 3248 case TCPS_FIN_WAIT_1: 3249 case TCPS_FIN_WAIT_2: 3250 case TCPS_CLOSE_WAIT: 3251 (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3252 break; 3253 case TCPS_CLOSING: 3254 case TCPS_LAST_ACK: 3255 (void) tcp_clean_death(sock_id, tcp, 0); 3256 break; 3257 default: 3258 assert(tcp->tcp_state != TCPS_TIME_WAIT); 3259 (void) tcp_clean_death(sock_id, tcp, ENXIO); 3260 break; 3261 } 3262 return; 3263 } 3264 if (flags & TH_SYN) { 3265 /* 3266 * See RFC 793, Page 71 3267 * 3268 * The seq number must be in the window as it should 3269 * be "fixed" above. If it is outside window, it should 3270 * be already rejected. Note that we allow seg_seq to be 3271 * rnxt + rwnd because we want to accept 0 window probe. 3272 */ 3273 assert(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 3274 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 3275 freemsg(mp); 3276 /* 3277 * If the ACK flag is not set, just use our snxt as the 3278 * seq number of the RST segment. 3279 */ 3280 if (!(flags & TH_ACK)) { 3281 seg_ack = tcp->tcp_snxt; 3282 } 3283 tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, 3284 seg_seq + 1, TH_RST|TH_ACK, 0, sock_id); 3285 assert(tcp->tcp_state != TCPS_TIME_WAIT); 3286 (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3287 return; 3288 } 3289 3290 process_ack: 3291 if (!(flags & TH_ACK)) { 3292 #ifdef DEBUG 3293 printf("No ack in segment, dropped it, seq:%x\n", seg_seq); 3294 #endif 3295 freemsg(mp); 3296 goto xmit_check; 3297 } 3298 } 3299 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 3300 3301 if (tcp->tcp_state == TCPS_SYN_RCVD) { 3302 tcp_t *listener = tcp->tcp_listener; 3303 #ifdef DEBUG 3304 printf("Done with eager 3-way handshake\n"); 3305 #endif 3306 /* 3307 * NOTE: RFC 793 pg. 72 says this should be 'bytes_acked < 0' 3308 * but that would mean we have an ack that ignored our SYN. 3309 */ 3310 if (bytes_acked < 1 || SEQ_GT(seg_ack, tcp->tcp_snxt)) { 3311 freemsg(mp); 3312 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 3313 tcp, NULL, seg_ack, 0, TH_RST, 0, sock_id); 3314 return; 3315 } 3316 3317 /* 3318 * if the conn_req_q is full defer processing 3319 * until space is availabe after accept() 3320 * processing 3321 */ 3322 if (listener->tcp_conn_req_cnt_q < 3323 listener->tcp_conn_req_max) { 3324 tcp_t *tail; 3325 3326 listener->tcp_conn_req_cnt_q0--; 3327 listener->tcp_conn_req_cnt_q++; 3328 3329 /* Move from SYN_RCVD to ESTABLISHED list */ 3330 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 3331 tcp->tcp_eager_prev_q0; 3332 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 3333 tcp->tcp_eager_next_q0; 3334 tcp->tcp_eager_prev_q0 = NULL; 3335 tcp->tcp_eager_next_q0 = NULL; 3336 3337 /* 3338 * Insert at end of the queue because sockfs 3339 * sends down T_CONN_RES in chronological 3340 * order. Leaving the older conn indications 3341 * at front of the queue helps reducing search 3342 * time. 3343 */ 3344 tail = listener->tcp_eager_last_q; 3345 if (tail != NULL) { 3346 tail->tcp_eager_next_q = tcp; 3347 } else { 3348 listener->tcp_eager_next_q = tcp; 3349 } 3350 listener->tcp_eager_last_q = tcp; 3351 tcp->tcp_eager_next_q = NULL; 3352 } else { 3353 /* 3354 * Defer connection on q0 and set deferred 3355 * connection bit true 3356 */ 3357 tcp->tcp_conn_def_q0 = B_TRUE; 3358 3359 /* take tcp out of q0 ... */ 3360 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 3361 tcp->tcp_eager_next_q0; 3362 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 3363 tcp->tcp_eager_prev_q0; 3364 3365 /* ... and place it at the end of q0 */ 3366 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 3367 tcp->tcp_eager_next_q0 = listener; 3368 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 3369 listener->tcp_eager_prev_q0 = tcp; 3370 } 3371 3372 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 3373 bytes_acked--; 3374 3375 /* 3376 * If SYN was retransmitted, need to reset all 3377 * retransmission info as this segment will be 3378 * treated as a dup ACK. 3379 */ 3380 if (tcp->tcp_rexmit) { 3381 tcp->tcp_rexmit = B_FALSE; 3382 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 3383 tcp->tcp_rexmit_max = tcp->tcp_snxt; 3384 tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3385 tcp->tcp_ms_we_have_waited = 0; 3386 tcp->tcp_cwnd = mss; 3387 } 3388 3389 /* 3390 * We set the send window to zero here. 3391 * This is needed if there is data to be 3392 * processed already on the queue. 3393 * Later (at swnd_update label), the 3394 * "new_swnd > tcp_swnd" condition is satisfied 3395 * the XMIT_NEEDED flag is set in the current 3396 * (SYN_RCVD) state. This ensures tcp_wput_data() is 3397 * called if there is already data on queue in 3398 * this state. 3399 */ 3400 tcp->tcp_swnd = 0; 3401 3402 if (new_swnd > tcp->tcp_max_swnd) 3403 tcp->tcp_max_swnd = new_swnd; 3404 tcp->tcp_swl1 = seg_seq; 3405 tcp->tcp_swl2 = seg_ack; 3406 tcp->tcp_state = TCPS_ESTABLISHED; 3407 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 3408 } 3409 /* This code follows 4.4BSD-Lite2 mostly. */ 3410 if (bytes_acked < 0) 3411 goto est; 3412 3413 /* 3414 * If TCP is ECN capable and the congestion experience bit is 3415 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 3416 * done once per window (or more loosely, per RTT). 3417 */ 3418 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 3419 tcp->tcp_cwr = B_FALSE; 3420 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 3421 if (!tcp->tcp_cwr) { 3422 npkt = (MIN(tcp->tcp_cwnd, tcp->tcp_swnd) >> 1) / mss; 3423 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 3424 tcp->tcp_cwnd = npkt * mss; 3425 /* 3426 * If the cwnd is 0, use the timer to clock out 3427 * new segments. This is required by the ECN spec. 3428 */ 3429 if (npkt == 0) { 3430 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3431 /* 3432 * This makes sure that when the ACK comes 3433 * back, we will increase tcp_cwnd by 1 MSS. 3434 */ 3435 tcp->tcp_cwnd_cnt = 0; 3436 } 3437 tcp->tcp_cwr = B_TRUE; 3438 /* 3439 * This marks the end of the current window of in 3440 * flight data. That is why we don't use 3441 * tcp_suna + tcp_swnd. Only data in flight can 3442 * provide ECN info. 3443 */ 3444 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3445 tcp->tcp_ecn_cwr_sent = B_FALSE; 3446 } 3447 } 3448 3449 mp1 = tcp->tcp_xmit_head; 3450 if (bytes_acked == 0) { 3451 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 3452 int dupack_cnt; 3453 3454 BUMP_MIB(tcp_mib.tcpInDupAck); 3455 /* 3456 * Fast retransmit. When we have seen exactly three 3457 * identical ACKs while we have unacked data 3458 * outstanding we take it as a hint that our peer 3459 * dropped something. 3460 * 3461 * If TCP is retransmitting, don't do fast retransmit. 3462 */ 3463 if (mp1 != NULL && tcp->tcp_suna != tcp->tcp_snxt && 3464 ! tcp->tcp_rexmit) { 3465 /* Do Limited Transmit */ 3466 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 3467 tcp_dupack_fast_retransmit) { 3468 /* 3469 * RFC 3042 3470 * 3471 * What we need to do is temporarily 3472 * increase tcp_cwnd so that new 3473 * data can be sent if it is allowed 3474 * by the receive window (tcp_rwnd). 3475 * tcp_wput_data() will take care of 3476 * the rest. 3477 * 3478 * If the connection is SACK capable, 3479 * only do limited xmit when there 3480 * is SACK info. 3481 * 3482 * Note how tcp_cwnd is incremented. 3483 * The first dup ACK will increase 3484 * it by 1 MSS. The second dup ACK 3485 * will increase it by 2 MSS. This 3486 * means that only 1 new segment will 3487 * be sent for each dup ACK. 3488 */ 3489 if (tcp->tcp_unsent > 0 && 3490 (!tcp->tcp_snd_sack_ok || 3491 (tcp->tcp_snd_sack_ok && 3492 tcp->tcp_notsack_list != NULL))) { 3493 tcp->tcp_cwnd += mss << 3494 (tcp->tcp_dupack_cnt - 1); 3495 flags |= TH_LIMIT_XMIT; 3496 } 3497 } else if (dupack_cnt == 3498 tcp_dupack_fast_retransmit) { 3499 3500 BUMP_MIB(tcp_mib.tcpOutFastRetrans); 3501 /* 3502 * If we have reduced tcp_ssthresh 3503 * because of ECN, do not reduce it again 3504 * unless it is already one window of data 3505 * away. After one window of data, tcp_cwr 3506 * should then be cleared. Note that 3507 * for non ECN capable connection, tcp_cwr 3508 * should always be false. 3509 * 3510 * Adjust cwnd since the duplicate 3511 * ack indicates that a packet was 3512 * dropped (due to congestion.) 3513 */ 3514 if (!tcp->tcp_cwr) { 3515 npkt = (MIN(tcp->tcp_cwnd, 3516 tcp->tcp_swnd) >> 1) / mss; 3517 if (npkt < 2) 3518 npkt = 2; 3519 tcp->tcp_cwnd_ssthresh = npkt * mss; 3520 tcp->tcp_cwnd = (npkt + 3521 tcp->tcp_dupack_cnt) * mss; 3522 } 3523 if (tcp->tcp_ecn_ok) { 3524 tcp->tcp_cwr = B_TRUE; 3525 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3526 tcp->tcp_ecn_cwr_sent = B_FALSE; 3527 } 3528 3529 /* 3530 * We do Hoe's algorithm. Refer to her 3531 * paper "Improving the Start-up Behavior 3532 * of a Congestion Control Scheme for TCP," 3533 * appeared in SIGCOMM'96. 3534 * 3535 * Save highest seq no we have sent so far. 3536 * Be careful about the invisible FIN byte. 3537 */ 3538 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 3539 (tcp->tcp_unsent == 0)) { 3540 tcp->tcp_rexmit_max = tcp->tcp_fss; 3541 } else { 3542 tcp->tcp_rexmit_max = tcp->tcp_snxt; 3543 } 3544 3545 /* 3546 * Do not allow bursty traffic during. 3547 * fast recovery. Refer to Fall and Floyd's 3548 * paper "Simulation-based Comparisons of 3549 * Tahoe, Reno and SACK TCP" (in CCR ??) 3550 * This is a best current practise. 3551 */ 3552 tcp->tcp_snd_burst = TCP_CWND_SS; 3553 3554 /* 3555 * For SACK: 3556 * Calculate tcp_pipe, which is the 3557 * estimated number of bytes in 3558 * network. 3559 * 3560 * tcp_fack is the highest sack'ed seq num 3561 * TCP has received. 3562 * 3563 * tcp_pipe is explained in the above quoted 3564 * Fall and Floyd's paper. tcp_fack is 3565 * explained in Mathis and Mahdavi's 3566 * "Forward Acknowledgment: Refining TCP 3567 * Congestion Control" in SIGCOMM '96. 3568 */ 3569 if (tcp->tcp_snd_sack_ok) { 3570 assert(tcp->tcp_sack_info != NULL); 3571 if (tcp->tcp_notsack_list != NULL) { 3572 tcp->tcp_pipe = tcp->tcp_snxt - 3573 tcp->tcp_fack; 3574 tcp->tcp_sack_snxt = seg_ack; 3575 flags |= TH_NEED_SACK_REXMIT; 3576 } else { 3577 /* 3578 * Always initialize tcp_pipe 3579 * even though we don't have 3580 * any SACK info. If later 3581 * we get SACK info and 3582 * tcp_pipe is not initialized, 3583 * funny things will happen. 3584 */ 3585 tcp->tcp_pipe = 3586 tcp->tcp_cwnd_ssthresh; 3587 } 3588 } else { 3589 flags |= TH_REXMIT_NEEDED; 3590 } /* tcp_snd_sack_ok */ 3591 3592 } else { 3593 /* 3594 * Here we perform congestion 3595 * avoidance, but NOT slow start. 3596 * This is known as the Fast 3597 * Recovery Algorithm. 3598 */ 3599 if (tcp->tcp_snd_sack_ok && 3600 tcp->tcp_notsack_list != NULL) { 3601 flags |= TH_NEED_SACK_REXMIT; 3602 tcp->tcp_pipe -= mss; 3603 if (tcp->tcp_pipe < 0) 3604 tcp->tcp_pipe = 0; 3605 } else { 3606 /* 3607 * We know that one more packet has 3608 * left the pipe thus we can update 3609 * cwnd. 3610 */ 3611 cwnd = tcp->tcp_cwnd + mss; 3612 if (cwnd > tcp->tcp_cwnd_max) 3613 cwnd = tcp->tcp_cwnd_max; 3614 tcp->tcp_cwnd = cwnd; 3615 flags |= TH_XMIT_NEEDED; 3616 } 3617 } 3618 } 3619 } else if (tcp->tcp_zero_win_probe) { 3620 /* 3621 * If the window has opened, need to arrange 3622 * to send additional data. 3623 */ 3624 if (new_swnd != 0) { 3625 /* tcp_suna != tcp_snxt */ 3626 /* Packet contains a window update */ 3627 BUMP_MIB(tcp_mib.tcpInWinUpdate); 3628 tcp->tcp_zero_win_probe = 0; 3629 tcp->tcp_timer_backoff = 0; 3630 tcp->tcp_ms_we_have_waited = 0; 3631 3632 /* 3633 * Transmit starting with tcp_suna since 3634 * the one byte probe is not ack'ed. 3635 * If TCP has sent more than one identical 3636 * probe, tcp_rexmit will be set. That means 3637 * tcp_ss_rexmit() will send out the one 3638 * byte along with new data. Otherwise, 3639 * fake the retransmission. 3640 */ 3641 flags |= TH_XMIT_NEEDED; 3642 if (!tcp->tcp_rexmit) { 3643 tcp->tcp_rexmit = B_TRUE; 3644 tcp->tcp_dupack_cnt = 0; 3645 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 3646 tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 3647 } 3648 } 3649 } 3650 goto swnd_update; 3651 } 3652 3653 /* 3654 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. 3655 * If the ACK value acks something that we have not yet sent, it might 3656 * be an old duplicate segment. Send an ACK to re-synchronize the 3657 * other side. 3658 * Note: reset in response to unacceptable ACK in SYN_RECEIVE 3659 * state is handled above, so we can always just drop the segment and 3660 * send an ACK here. 3661 * 3662 * Should we send ACKs in response to ACK only segments? 3663 */ 3664 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 3665 BUMP_MIB(tcp_mib.tcpInAckUnsent); 3666 /* drop the received segment */ 3667 freemsg(mp); 3668 3669 /* Send back an ACK. */ 3670 mp = tcp_ack_mp(tcp); 3671 3672 if (mp == NULL) { 3673 return; 3674 } 3675 BUMP_MIB(tcp_mib.tcpOutAck); 3676 (void) ipv4_tcp_output(sock_id, mp); 3677 freeb(mp); 3678 return; 3679 } 3680 3681 /* 3682 * TCP gets a new ACK, update the notsack'ed list to delete those 3683 * blocks that are covered by this ACK. 3684 */ 3685 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 3686 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, 3687 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); 3688 } 3689 3690 /* 3691 * If we got an ACK after fast retransmit, check to see 3692 * if it is a partial ACK. If it is not and the congestion 3693 * window was inflated to account for the other side's 3694 * cached packets, retract it. If it is, do Hoe's algorithm. 3695 */ 3696 if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) { 3697 assert(tcp->tcp_rexmit == B_FALSE); 3698 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { 3699 tcp->tcp_dupack_cnt = 0; 3700 /* 3701 * Restore the orig tcp_cwnd_ssthresh after 3702 * fast retransmit phase. 3703 */ 3704 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { 3705 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; 3706 } 3707 tcp->tcp_rexmit_max = seg_ack; 3708 tcp->tcp_cwnd_cnt = 0; 3709 tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3710 3711 /* 3712 * Remove all notsack info to avoid confusion with 3713 * the next fast retrasnmit/recovery phase. 3714 */ 3715 if (tcp->tcp_snd_sack_ok && 3716 tcp->tcp_notsack_list != NULL) { 3717 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 3718 } 3719 } else { 3720 if (tcp->tcp_snd_sack_ok && 3721 tcp->tcp_notsack_list != NULL) { 3722 flags |= TH_NEED_SACK_REXMIT; 3723 tcp->tcp_pipe -= mss; 3724 if (tcp->tcp_pipe < 0) 3725 tcp->tcp_pipe = 0; 3726 } else { 3727 /* 3728 * Hoe's algorithm: 3729 * 3730 * Retransmit the unack'ed segment and 3731 * restart fast recovery. Note that we 3732 * need to scale back tcp_cwnd to the 3733 * original value when we started fast 3734 * recovery. This is to prevent overly 3735 * aggressive behaviour in sending new 3736 * segments. 3737 */ 3738 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + 3739 tcp_dupack_fast_retransmit * mss; 3740 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; 3741 BUMP_MIB(tcp_mib.tcpOutFastRetrans); 3742 flags |= TH_REXMIT_NEEDED; 3743 } 3744 } 3745 } else { 3746 tcp->tcp_dupack_cnt = 0; 3747 if (tcp->tcp_rexmit) { 3748 /* 3749 * TCP is retranmitting. If the ACK ack's all 3750 * outstanding data, update tcp_rexmit_max and 3751 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt 3752 * to the correct value. 3753 * 3754 * Note that SEQ_LEQ() is used. This is to avoid 3755 * unnecessary fast retransmit caused by dup ACKs 3756 * received when TCP does slow start retransmission 3757 * after a time out. During this phase, TCP may 3758 * send out segments which are already received. 3759 * This causes dup ACKs to be sent back. 3760 */ 3761 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 3762 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 3763 tcp->tcp_rexmit_nxt = seg_ack; 3764 } 3765 if (seg_ack != tcp->tcp_rexmit_max) { 3766 flags |= TH_XMIT_NEEDED; 3767 } 3768 } else { 3769 tcp->tcp_rexmit = B_FALSE; 3770 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 3771 tcp->tcp_snd_burst = TCP_CWND_NORMAL; 3772 } 3773 tcp->tcp_ms_we_have_waited = 0; 3774 } 3775 } 3776 3777 BUMP_MIB(tcp_mib.tcpInAckSegs); 3778 UPDATE_MIB(tcp_mib.tcpInAckBytes, bytes_acked); 3779 tcp->tcp_suna = seg_ack; 3780 if (tcp->tcp_zero_win_probe != 0) { 3781 tcp->tcp_zero_win_probe = 0; 3782 tcp->tcp_timer_backoff = 0; 3783 } 3784 3785 /* 3786 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 3787 * Note that it cannot be the SYN being ack'ed. The code flow 3788 * will not reach here. 3789 */ 3790 if (mp1 == NULL) { 3791 goto fin_acked; 3792 } 3793 3794 /* 3795 * Update the congestion window. 3796 * 3797 * If TCP is not ECN capable or TCP is ECN capable but the 3798 * congestion experience bit is not set, increase the tcp_cwnd as 3799 * usual. 3800 */ 3801 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 3802 cwnd = tcp->tcp_cwnd; 3803 add = mss; 3804 3805 if (cwnd >= tcp->tcp_cwnd_ssthresh) { 3806 /* 3807 * This is to prevent an increase of less than 1 MSS of 3808 * tcp_cwnd. With partial increase, tcp_wput_data() 3809 * may send out tinygrams in order to preserve mblk 3810 * boundaries. 3811 * 3812 * By initializing tcp_cwnd_cnt to new tcp_cwnd and 3813 * decrementing it by 1 MSS for every ACKs, tcp_cwnd is 3814 * increased by 1 MSS for every RTTs. 3815 */ 3816 if (tcp->tcp_cwnd_cnt <= 0) { 3817 tcp->tcp_cwnd_cnt = cwnd + add; 3818 } else { 3819 tcp->tcp_cwnd_cnt -= add; 3820 add = 0; 3821 } 3822 } 3823 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); 3824 } 3825 3826 /* Can we update the RTT estimates? */ 3827 if (tcp->tcp_snd_ts_ok) { 3828 /* Ignore zero timestamp echo-reply. */ 3829 if (tcpopt.tcp_opt_ts_ecr != 0) { 3830 tcp_set_rto(tcp, (int32_t)(prom_gettime() - 3831 tcpopt.tcp_opt_ts_ecr)); 3832 } 3833 3834 /* If needed, restart the timer. */ 3835 if (tcp->tcp_set_timer == 1) { 3836 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3837 tcp->tcp_set_timer = 0; 3838 } 3839 /* 3840 * Update tcp_csuna in case the other side stops sending 3841 * us timestamps. 3842 */ 3843 tcp->tcp_csuna = tcp->tcp_snxt; 3844 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 3845 /* 3846 * An ACK sequence we haven't seen before, so get the RTT 3847 * and update the RTO. 3848 * Note. use uintptr_t to suppress the gcc warning. 3849 */ 3850 tcp_set_rto(tcp, (int32_t)(prom_gettime() - 3851 (uint32_t)(uintptr_t)mp1->b_prev)); 3852 3853 /* Remeber the last sequence to be ACKed */ 3854 tcp->tcp_csuna = seg_ack; 3855 if (tcp->tcp_set_timer == 1) { 3856 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3857 tcp->tcp_set_timer = 0; 3858 } 3859 } else { 3860 BUMP_MIB(tcp_mib.tcpRttNoUpdate); 3861 } 3862 3863 /* Eat acknowledged bytes off the xmit queue. */ 3864 for (;;) { 3865 mblk_t *mp2; 3866 uchar_t *wptr; 3867 3868 wptr = mp1->b_wptr; 3869 assert((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); 3870 bytes_acked -= (int)(wptr - mp1->b_rptr); 3871 if (bytes_acked < 0) { 3872 mp1->b_rptr = wptr + bytes_acked; 3873 break; 3874 } 3875 mp1->b_prev = NULL; 3876 mp2 = mp1; 3877 mp1 = mp1->b_cont; 3878 freeb(mp2); 3879 if (bytes_acked == 0) { 3880 if (mp1 == NULL) { 3881 /* Everything is ack'ed, clear the tail. */ 3882 tcp->tcp_xmit_tail = NULL; 3883 goto pre_swnd_update; 3884 } 3885 if (mp2 != tcp->tcp_xmit_tail) 3886 break; 3887 tcp->tcp_xmit_tail = mp1; 3888 assert((uintptr_t)(mp1->b_wptr - 3889 mp1->b_rptr) <= (uintptr_t)INT_MAX); 3890 tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr - 3891 mp1->b_rptr); 3892 break; 3893 } 3894 if (mp1 == NULL) { 3895 /* 3896 * More was acked but there is nothing more 3897 * outstanding. This means that the FIN was 3898 * just acked or that we're talking to a clown. 3899 */ 3900 fin_acked: 3901 assert(tcp->tcp_fin_sent); 3902 tcp->tcp_xmit_tail = NULL; 3903 if (tcp->tcp_fin_sent) { 3904 tcp->tcp_fin_acked = B_TRUE; 3905 } else { 3906 /* 3907 * We should never got here because 3908 * we have already checked that the 3909 * number of bytes ack'ed should be 3910 * smaller than or equal to what we 3911 * have sent so far (it is the 3912 * acceptability check of the ACK). 3913 * We can only get here if the send 3914 * queue is corrupted. 3915 * 3916 * Terminate the connection and 3917 * panic the system. It is better 3918 * for us to panic instead of 3919 * continuing to avoid other disaster. 3920 */ 3921 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 3922 tcp->tcp_rnxt, TH_RST|TH_ACK, 0, sock_id); 3923 printf("Memory corruption " 3924 "detected for connection %s.\n", 3925 tcp_display(tcp, NULL, 3926 DISP_ADDR_AND_PORT)); 3927 /* We should never get here... */ 3928 prom_panic("tcp_rput_data"); 3929 return; 3930 } 3931 goto pre_swnd_update; 3932 } 3933 assert(mp2 != tcp->tcp_xmit_tail); 3934 } 3935 if (tcp->tcp_unsent) { 3936 flags |= TH_XMIT_NEEDED; 3937 } 3938 pre_swnd_update: 3939 tcp->tcp_xmit_head = mp1; 3940 swnd_update: 3941 /* 3942 * The following check is different from most other implementations. 3943 * For bi-directional transfer, when segments are dropped, the 3944 * "normal" check will not accept a window update in those 3945 * retransmitted segemnts. Failing to do that, TCP may send out 3946 * segments which are outside receiver's window. As TCP accepts 3947 * the ack in those retransmitted segments, if the window update in 3948 * the same segment is not accepted, TCP will incorrectly calculates 3949 * that it can send more segments. This can create a deadlock 3950 * with the receiver if its window becomes zero. 3951 */ 3952 if (SEQ_LT(tcp->tcp_swl2, seg_ack) || 3953 SEQ_LT(tcp->tcp_swl1, seg_seq) || 3954 (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) { 3955 /* 3956 * The criteria for update is: 3957 * 3958 * 1. the segment acknowledges some data. Or 3959 * 2. the segment is new, i.e. it has a higher seq num. Or 3960 * 3. the segment is not old and the advertised window is 3961 * larger than the previous advertised window. 3962 */ 3963 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd) 3964 flags |= TH_XMIT_NEEDED; 3965 tcp->tcp_swnd = new_swnd; 3966 if (new_swnd > tcp->tcp_max_swnd) 3967 tcp->tcp_max_swnd = new_swnd; 3968 tcp->tcp_swl1 = seg_seq; 3969 tcp->tcp_swl2 = seg_ack; 3970 } 3971 est: 3972 if (tcp->tcp_state > TCPS_ESTABLISHED) { 3973 switch (tcp->tcp_state) { 3974 case TCPS_FIN_WAIT_1: 3975 if (tcp->tcp_fin_acked) { 3976 tcp->tcp_state = TCPS_FIN_WAIT_2; 3977 /* 3978 * We implement the non-standard BSD/SunOS 3979 * FIN_WAIT_2 flushing algorithm. 3980 * If there is no user attached to this 3981 * TCP endpoint, then this TCP struct 3982 * could hang around forever in FIN_WAIT_2 3983 * state if the peer forgets to send us 3984 * a FIN. To prevent this, we wait only 3985 * 2*MSL (a convenient time value) for 3986 * the FIN to arrive. If it doesn't show up, 3987 * we flush the TCP endpoint. This algorithm, 3988 * though a violation of RFC-793, has worked 3989 * for over 10 years in BSD systems. 3990 * Note: SunOS 4.x waits 675 seconds before 3991 * flushing the FIN_WAIT_2 connection. 3992 */ 3993 TCP_TIMER_RESTART(tcp, 3994 tcp_fin_wait_2_flush_interval); 3995 } 3996 break; 3997 case TCPS_FIN_WAIT_2: 3998 break; /* Shutdown hook? */ 3999 case TCPS_LAST_ACK: 4000 freemsg(mp); 4001 if (tcp->tcp_fin_acked) { 4002 (void) tcp_clean_death(sock_id, tcp, 0); 4003 return; 4004 } 4005 goto xmit_check; 4006 case TCPS_CLOSING: 4007 if (tcp->tcp_fin_acked) { 4008 tcp->tcp_state = TCPS_TIME_WAIT; 4009 tcp_time_wait_append(tcp); 4010 TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 4011 } 4012 /*FALLTHRU*/ 4013 case TCPS_CLOSE_WAIT: 4014 freemsg(mp); 4015 goto xmit_check; 4016 default: 4017 assert(tcp->tcp_state != TCPS_TIME_WAIT); 4018 break; 4019 } 4020 } 4021 if (flags & TH_FIN) { 4022 /* Make sure we ack the fin */ 4023 flags |= TH_ACK_NEEDED; 4024 if (!tcp->tcp_fin_rcvd) { 4025 tcp->tcp_fin_rcvd = B_TRUE; 4026 tcp->tcp_rnxt++; 4027 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 4028 4029 switch (tcp->tcp_state) { 4030 case TCPS_SYN_RCVD: 4031 case TCPS_ESTABLISHED: 4032 tcp->tcp_state = TCPS_CLOSE_WAIT; 4033 /* Keepalive? */ 4034 break; 4035 case TCPS_FIN_WAIT_1: 4036 if (!tcp->tcp_fin_acked) { 4037 tcp->tcp_state = TCPS_CLOSING; 4038 break; 4039 } 4040 /* FALLTHRU */ 4041 case TCPS_FIN_WAIT_2: 4042 tcp->tcp_state = TCPS_TIME_WAIT; 4043 tcp_time_wait_append(tcp); 4044 TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 4045 if (seg_len) { 4046 /* 4047 * implies data piggybacked on FIN. 4048 * break to handle data. 4049 */ 4050 break; 4051 } 4052 freemsg(mp); 4053 goto ack_check; 4054 } 4055 } 4056 } 4057 if (mp == NULL) 4058 goto xmit_check; 4059 if (seg_len == 0) { 4060 freemsg(mp); 4061 goto xmit_check; 4062 } 4063 if (mp->b_rptr == mp->b_wptr) { 4064 /* 4065 * The header has been consumed, so we remove the 4066 * zero-length mblk here. 4067 */ 4068 mp1 = mp; 4069 mp = mp->b_cont; 4070 freeb(mp1); 4071 } 4072 /* 4073 * ACK every other segments, unless the input queue is empty 4074 * as we don't have a timer available. 4075 */ 4076 if (++tcp->tcp_rack_cnt == 2 || sockets[sock_id].inq == NULL) { 4077 flags |= TH_ACK_NEEDED; 4078 tcp->tcp_rack_cnt = 0; 4079 } 4080 tcp->tcp_rnxt += seg_len; 4081 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 4082 4083 /* Update SACK list */ 4084 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 4085 tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, 4086 &(tcp->tcp_num_sack_blk)); 4087 } 4088 4089 if (tcp->tcp_listener) { 4090 /* 4091 * Side queue inbound data until the accept happens. 4092 * tcp_accept/tcp_rput drains this when the accept happens. 4093 */ 4094 tcp_rcv_enqueue(tcp, mp, seg_len); 4095 } else { 4096 /* Just queue the data until the app calls read. */ 4097 tcp_rcv_enqueue(tcp, mp, seg_len); 4098 /* 4099 * Make sure the timer is running if we have data waiting 4100 * for a push bit. This provides resiliency against 4101 * implementations that do not correctly generate push bits. 4102 */ 4103 if (tcp->tcp_rcv_list != NULL) 4104 flags |= TH_TIMER_NEEDED; 4105 } 4106 4107 xmit_check: 4108 /* Is there anything left to do? */ 4109 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| 4110 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_TIMER_NEEDED)) == 0) 4111 return; 4112 4113 /* Any transmit work to do and a non-zero window? */ 4114 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| 4115 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { 4116 if (flags & TH_REXMIT_NEEDED) { 4117 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; 4118 4119 if (snd_size > mss) 4120 snd_size = mss; 4121 if (snd_size > tcp->tcp_swnd) 4122 snd_size = tcp->tcp_swnd; 4123 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, 4124 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, 4125 B_TRUE); 4126 4127 if (mp1 != NULL) { 4128 /* use uintptr_t to suppress the gcc warning */ 4129 tcp->tcp_xmit_head->b_prev = 4130 (mblk_t *)(uintptr_t)prom_gettime(); 4131 tcp->tcp_csuna = tcp->tcp_snxt; 4132 BUMP_MIB(tcp_mib.tcpRetransSegs); 4133 UPDATE_MIB(tcp_mib.tcpRetransBytes, snd_size); 4134 (void) ipv4_tcp_output(sock_id, mp1); 4135 freeb(mp1); 4136 } 4137 } 4138 if (flags & TH_NEED_SACK_REXMIT) { 4139 if (tcp_sack_rxmit(tcp, sock_id) != 0) { 4140 flags |= TH_XMIT_NEEDED; 4141 } 4142 } 4143 /* 4144 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send 4145 * out new segment. Note that tcp_rexmit should not be 4146 * set, otherwise TH_LIMIT_XMIT should not be set. 4147 */ 4148 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { 4149 if (!tcp->tcp_rexmit) { 4150 tcp_wput_data(tcp, NULL, sock_id); 4151 } else { 4152 tcp_ss_rexmit(tcp, sock_id); 4153 } 4154 /* 4155 * The TCP could be closed in tcp_state_wait via 4156 * tcp_wput_data (tcp_ss_rexmit could call 4157 * tcp_wput_data as well). 4158 */ 4159 if (sockets[sock_id].pcb == NULL) 4160 return; 4161 } 4162 /* 4163 * Adjust tcp_cwnd back to normal value after sending 4164 * new data segments. 4165 */ 4166 if (flags & TH_LIMIT_XMIT) { 4167 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); 4168 } 4169 4170 /* Anything more to do? */ 4171 if ((flags & (TH_ACK_NEEDED|TH_TIMER_NEEDED)) == 0) 4172 return; 4173 } 4174 ack_check: 4175 if (flags & TH_ACK_NEEDED) { 4176 /* 4177 * Time to send an ack for some reason. 4178 */ 4179 if ((mp1 = tcp_ack_mp(tcp)) != NULL) { 4180 TCP_DUMP_PACKET("tcp_rput_data: ack mp", mp1); 4181 (void) ipv4_tcp_output(sock_id, mp1); 4182 BUMP_MIB(tcp_mib.tcpOutAck); 4183 freeb(mp1); 4184 } 4185 } 4186 } 4187 4188 /* 4189 * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start 4190 * retransmission after a timeout. 4191 * 4192 * To limit the number of duplicate segments, we limit the number of segment 4193 * to be sent in one time to tcp_snd_burst, the burst variable. 4194 */ 4195 static void 4196 tcp_ss_rexmit(tcp_t *tcp, int sock_id) 4197 { 4198 uint32_t snxt; 4199 uint32_t smax; 4200 int32_t win; 4201 int32_t mss; 4202 int32_t off; 4203 int32_t burst = tcp->tcp_snd_burst; 4204 mblk_t *snxt_mp; 4205 4206 /* 4207 * Note that tcp_rexmit can be set even though TCP has retransmitted 4208 * all unack'ed segments. 4209 */ 4210 if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) { 4211 smax = tcp->tcp_rexmit_max; 4212 snxt = tcp->tcp_rexmit_nxt; 4213 if (SEQ_LT(snxt, tcp->tcp_suna)) { 4214 snxt = tcp->tcp_suna; 4215 } 4216 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd); 4217 win -= snxt - tcp->tcp_suna; 4218 mss = tcp->tcp_mss; 4219 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off); 4220 4221 while (SEQ_LT(snxt, smax) && (win > 0) && 4222 (burst > 0) && (snxt_mp != NULL)) { 4223 mblk_t *xmit_mp; 4224 mblk_t *old_snxt_mp = snxt_mp; 4225 uint32_t cnt = mss; 4226 4227 if (win < cnt) { 4228 cnt = win; 4229 } 4230 if (SEQ_GT(snxt + cnt, smax)) { 4231 cnt = smax - snxt; 4232 } 4233 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off, 4234 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE); 4235 4236 if (xmit_mp == NULL) 4237 return; 4238 4239 (void) ipv4_tcp_output(sock_id, xmit_mp); 4240 freeb(xmit_mp); 4241 4242 snxt += cnt; 4243 win -= cnt; 4244 /* 4245 * Update the send timestamp to avoid false 4246 * retransmission. 4247 * Note. use uintptr_t to suppress the gcc warning. 4248 */ 4249 old_snxt_mp->b_prev = 4250 (mblk_t *)(uintptr_t)prom_gettime(); 4251 BUMP_MIB(tcp_mib.tcpRetransSegs); 4252 UPDATE_MIB(tcp_mib.tcpRetransBytes, cnt); 4253 4254 tcp->tcp_rexmit_nxt = snxt; 4255 burst--; 4256 } 4257 /* 4258 * If we have transmitted all we have at the time 4259 * we started the retranmission, we can leave 4260 * the rest of the job to tcp_wput_data(). But we 4261 * need to check the send window first. If the 4262 * win is not 0, go on with tcp_wput_data(). 4263 */ 4264 if (SEQ_LT(snxt, smax) || win == 0) { 4265 return; 4266 } 4267 } 4268 /* Only call tcp_wput_data() if there is data to be sent. */ 4269 if (tcp->tcp_unsent) { 4270 tcp_wput_data(tcp, NULL, sock_id); 4271 } 4272 } 4273 4274 /* 4275 * tcp_timer is the timer service routine. It handles all timer events for 4276 * a tcp instance except keepalives. It figures out from the state of the 4277 * tcp instance what kind of action needs to be done at the time it is called. 4278 */ 4279 static void 4280 tcp_timer(tcp_t *tcp, int sock_id) 4281 { 4282 mblk_t *mp; 4283 uint32_t first_threshold; 4284 uint32_t second_threshold; 4285 uint32_t ms; 4286 uint32_t mss; 4287 4288 first_threshold = tcp->tcp_first_timer_threshold; 4289 second_threshold = tcp->tcp_second_timer_threshold; 4290 switch (tcp->tcp_state) { 4291 case TCPS_IDLE: 4292 case TCPS_BOUND: 4293 case TCPS_LISTEN: 4294 return; 4295 case TCPS_SYN_RCVD: 4296 case TCPS_SYN_SENT: 4297 first_threshold = tcp->tcp_first_ctimer_threshold; 4298 second_threshold = tcp->tcp_second_ctimer_threshold; 4299 break; 4300 case TCPS_ESTABLISHED: 4301 case TCPS_FIN_WAIT_1: 4302 case TCPS_CLOSING: 4303 case TCPS_CLOSE_WAIT: 4304 case TCPS_LAST_ACK: 4305 /* If we have data to rexmit */ 4306 if (tcp->tcp_suna != tcp->tcp_snxt) { 4307 int32_t time_to_wait; 4308 4309 BUMP_MIB(tcp_mib.tcpTimRetrans); 4310 if (tcp->tcp_xmit_head == NULL) 4311 break; 4312 /* use uintptr_t to suppress the gcc warning */ 4313 time_to_wait = (int32_t)(prom_gettime() - 4314 (uint32_t)(uintptr_t)tcp->tcp_xmit_head->b_prev); 4315 time_to_wait = tcp->tcp_rto - time_to_wait; 4316 if (time_to_wait > 0) { 4317 /* 4318 * Timer fired too early, so restart it. 4319 */ 4320 TCP_TIMER_RESTART(tcp, time_to_wait); 4321 return; 4322 } 4323 /* 4324 * When we probe zero windows, we force the swnd open. 4325 * If our peer acks with a closed window swnd will be 4326 * set to zero by tcp_rput(). As long as we are 4327 * receiving acks tcp_rput will 4328 * reset 'tcp_ms_we_have_waited' so as not to trip the 4329 * first and second interval actions. NOTE: the timer 4330 * interval is allowed to continue its exponential 4331 * backoff. 4332 */ 4333 if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { 4334 DEBUG_1("tcp_timer (%d): zero win", sock_id); 4335 break; 4336 } else { 4337 /* 4338 * After retransmission, we need to do 4339 * slow start. Set the ssthresh to one 4340 * half of current effective window and 4341 * cwnd to one MSS. Also reset 4342 * tcp_cwnd_cnt. 4343 * 4344 * Note that if tcp_ssthresh is reduced because 4345 * of ECN, do not reduce it again unless it is 4346 * already one window of data away (tcp_cwr 4347 * should then be cleared) or this is a 4348 * timeout for a retransmitted segment. 4349 */ 4350 uint32_t npkt; 4351 4352 if (!tcp->tcp_cwr || tcp->tcp_rexmit) { 4353 npkt = (MIN((tcp->tcp_timer_backoff ? 4354 tcp->tcp_cwnd_ssthresh : 4355 tcp->tcp_cwnd), 4356 tcp->tcp_swnd) >> 1) / 4357 tcp->tcp_mss; 4358 if (npkt < 2) 4359 npkt = 2; 4360 tcp->tcp_cwnd_ssthresh = npkt * 4361 tcp->tcp_mss; 4362 } 4363 tcp->tcp_cwnd = tcp->tcp_mss; 4364 tcp->tcp_cwnd_cnt = 0; 4365 if (tcp->tcp_ecn_ok) { 4366 tcp->tcp_cwr = B_TRUE; 4367 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 4368 tcp->tcp_ecn_cwr_sent = B_FALSE; 4369 } 4370 } 4371 break; 4372 } 4373 /* 4374 * We have something to send yet we cannot send. The 4375 * reason can be: 4376 * 4377 * 1. Zero send window: we need to do zero window probe. 4378 * 2. Zero cwnd: because of ECN, we need to "clock out 4379 * segments. 4380 * 3. SWS avoidance: receiver may have shrunk window, 4381 * reset our knowledge. 4382 * 4383 * Note that condition 2 can happen with either 1 or 4384 * 3. But 1 and 3 are exclusive. 4385 */ 4386 if (tcp->tcp_unsent != 0) { 4387 if (tcp->tcp_cwnd == 0) { 4388 /* 4389 * Set tcp_cwnd to 1 MSS so that a 4390 * new segment can be sent out. We 4391 * are "clocking out" new data when 4392 * the network is really congested. 4393 */ 4394 assert(tcp->tcp_ecn_ok); 4395 tcp->tcp_cwnd = tcp->tcp_mss; 4396 } 4397 if (tcp->tcp_swnd == 0) { 4398 /* Extend window for zero window probe */ 4399 tcp->tcp_swnd++; 4400 tcp->tcp_zero_win_probe = B_TRUE; 4401 BUMP_MIB(tcp_mib.tcpOutWinProbe); 4402 } else { 4403 /* 4404 * Handle timeout from sender SWS avoidance. 4405 * Reset our knowledge of the max send window 4406 * since the receiver might have reduced its 4407 * receive buffer. Avoid setting tcp_max_swnd 4408 * to one since that will essentially disable 4409 * the SWS checks. 4410 * 4411 * Note that since we don't have a SWS 4412 * state variable, if the timeout is set 4413 * for ECN but not for SWS, this 4414 * code will also be executed. This is 4415 * fine as tcp_max_swnd is updated 4416 * constantly and it will not affect 4417 * anything. 4418 */ 4419 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2); 4420 } 4421 tcp_wput_data(tcp, NULL, sock_id); 4422 return; 4423 } 4424 /* Is there a FIN that needs to be to re retransmitted? */ 4425 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 4426 !tcp->tcp_fin_acked) 4427 break; 4428 /* Nothing to do, return without restarting timer. */ 4429 return; 4430 case TCPS_FIN_WAIT_2: 4431 /* 4432 * User closed the TCP endpoint and peer ACK'ed our FIN. 4433 * We waited some time for for peer's FIN, but it hasn't 4434 * arrived. We flush the connection now to avoid 4435 * case where the peer has rebooted. 4436 */ 4437 /* FALLTHRU */ 4438 case TCPS_TIME_WAIT: 4439 (void) tcp_clean_death(sock_id, tcp, 0); 4440 return; 4441 default: 4442 DEBUG_3("tcp_timer (%d): strange state (%d) %s", sock_id, 4443 tcp->tcp_state, tcp_display(tcp, NULL, 4444 DISP_PORT_ONLY)); 4445 return; 4446 } 4447 if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { 4448 /* 4449 * For zero window probe, we need to send indefinitely, 4450 * unless we have not heard from the other side for some 4451 * time... 4452 */ 4453 if ((tcp->tcp_zero_win_probe == 0) || 4454 ((prom_gettime() - tcp->tcp_last_recv_time) > 4455 second_threshold)) { 4456 BUMP_MIB(tcp_mib.tcpTimRetransDrop); 4457 /* 4458 * If TCP is in SYN_RCVD state, send back a 4459 * RST|ACK as BSD does. Note that tcp_zero_win_probe 4460 * should be zero in TCPS_SYN_RCVD state. 4461 */ 4462 if (tcp->tcp_state == TCPS_SYN_RCVD) { 4463 tcp_xmit_ctl("tcp_timer: RST sent on timeout " 4464 "in SYN_RCVD", 4465 tcp, NULL, tcp->tcp_snxt, 4466 tcp->tcp_rnxt, TH_RST | TH_ACK, 0, sock_id); 4467 } 4468 (void) tcp_clean_death(sock_id, tcp, 4469 tcp->tcp_client_errno ? 4470 tcp->tcp_client_errno : ETIMEDOUT); 4471 return; 4472 } else { 4473 /* 4474 * Set tcp_ms_we_have_waited to second_threshold 4475 * so that in next timeout, we will do the above 4476 * check (lbolt - tcp_last_recv_time). This is 4477 * also to avoid overflow. 4478 * 4479 * We don't need to decrement tcp_timer_backoff 4480 * to avoid overflow because it will be decremented 4481 * later if new timeout value is greater than 4482 * tcp_rexmit_interval_max. In the case when 4483 * tcp_rexmit_interval_max is greater than 4484 * second_threshold, it means that we will wait 4485 * longer than second_threshold to send the next 4486 * window probe. 4487 */ 4488 tcp->tcp_ms_we_have_waited = second_threshold; 4489 } 4490 } else if (ms > first_threshold && tcp->tcp_rtt_sa != 0) { 4491 /* 4492 * We have been retransmitting for too long... The RTT 4493 * we calculated is probably incorrect. Reinitialize it. 4494 * Need to compensate for 0 tcp_rtt_sa. Reset 4495 * tcp_rtt_update so that we won't accidentally cache a 4496 * bad value. But only do this if this is not a zero 4497 * window probe. 4498 */ 4499 if (tcp->tcp_zero_win_probe == 0) { 4500 tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + 4501 (tcp->tcp_rtt_sa >> 5); 4502 tcp->tcp_rtt_sa = 0; 4503 tcp->tcp_rtt_update = 0; 4504 } 4505 } 4506 tcp->tcp_timer_backoff++; 4507 if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 4508 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < 4509 tcp_rexmit_interval_min) { 4510 /* 4511 * This means the original RTO is tcp_rexmit_interval_min. 4512 * So we will use tcp_rexmit_interval_min as the RTO value 4513 * and do the backoff. 4514 */ 4515 ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff; 4516 } else { 4517 ms <<= tcp->tcp_timer_backoff; 4518 } 4519 if (ms > tcp_rexmit_interval_max) { 4520 ms = tcp_rexmit_interval_max; 4521 /* 4522 * ms is at max, decrement tcp_timer_backoff to avoid 4523 * overflow. 4524 */ 4525 tcp->tcp_timer_backoff--; 4526 } 4527 tcp->tcp_ms_we_have_waited += ms; 4528 if (tcp->tcp_zero_win_probe == 0) { 4529 tcp->tcp_rto = ms; 4530 } 4531 TCP_TIMER_RESTART(tcp, ms); 4532 /* 4533 * This is after a timeout and tcp_rto is backed off. Set 4534 * tcp_set_timer to 1 so that next time RTO is updated, we will 4535 * restart the timer with a correct value. 4536 */ 4537 tcp->tcp_set_timer = 1; 4538 mss = tcp->tcp_snxt - tcp->tcp_suna; 4539 if (mss > tcp->tcp_mss) 4540 mss = tcp->tcp_mss; 4541 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) 4542 mss = tcp->tcp_swnd; 4543 4544 if ((mp = tcp->tcp_xmit_head) != NULL) { 4545 /* use uintptr_t to suppress the gcc warning */ 4546 mp->b_prev = (mblk_t *)(uintptr_t)prom_gettime(); 4547 } 4548 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, 4549 B_TRUE); 4550 if (mp == NULL) 4551 return; 4552 tcp->tcp_csuna = tcp->tcp_snxt; 4553 BUMP_MIB(tcp_mib.tcpRetransSegs); 4554 UPDATE_MIB(tcp_mib.tcpRetransBytes, mss); 4555 /* Dump the packet when debugging. */ 4556 TCP_DUMP_PACKET("tcp_timer", mp); 4557 4558 (void) ipv4_tcp_output(sock_id, mp); 4559 freeb(mp); 4560 4561 /* 4562 * When slow start after retransmission begins, start with 4563 * this seq no. tcp_rexmit_max marks the end of special slow 4564 * start phase. tcp_snd_burst controls how many segments 4565 * can be sent because of an ack. 4566 */ 4567 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 4568 tcp->tcp_snd_burst = TCP_CWND_SS; 4569 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 4570 (tcp->tcp_unsent == 0)) { 4571 tcp->tcp_rexmit_max = tcp->tcp_fss; 4572 } else { 4573 tcp->tcp_rexmit_max = tcp->tcp_snxt; 4574 } 4575 tcp->tcp_rexmit = B_TRUE; 4576 tcp->tcp_dupack_cnt = 0; 4577 4578 /* 4579 * Remove all rexmit SACK blk to start from fresh. 4580 */ 4581 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 4582 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 4583 tcp->tcp_num_notsack_blk = 0; 4584 tcp->tcp_cnt_notsack_list = 0; 4585 } 4586 } 4587 4588 /* 4589 * The TCP normal data output path. 4590 * NOTE: the logic of the fast path is duplicated from this function. 4591 */ 4592 static void 4593 tcp_wput_data(tcp_t *tcp, mblk_t *mp, int sock_id) 4594 { 4595 int len; 4596 mblk_t *local_time; 4597 mblk_t *mp1; 4598 uchar_t *rptr; 4599 uint32_t snxt; 4600 int tail_unsent; 4601 int tcpstate; 4602 int usable = 0; 4603 mblk_t *xmit_tail; 4604 int32_t num_burst_seg; 4605 int32_t mss; 4606 int32_t num_sack_blk = 0; 4607 int32_t tcp_hdr_len; 4608 ipaddr_t *dst; 4609 ipaddr_t *src; 4610 4611 #ifdef DEBUG 4612 printf("tcp_wput_data(%d) ##############################\n", sock_id); 4613 #endif 4614 tcpstate = tcp->tcp_state; 4615 if (mp == NULL) { 4616 /* Really tacky... but we need this for detached closes. */ 4617 len = tcp->tcp_unsent; 4618 goto data_null; 4619 } 4620 4621 /* 4622 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 4623 * or before a connection attempt has begun. 4624 * 4625 * The following should not happen in inetboot.... 4626 */ 4627 if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 4628 (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4629 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4630 printf("tcp_wput_data: data after ordrel, %s\n", 4631 tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 4632 } 4633 freemsg(mp); 4634 return; 4635 } 4636 4637 /* Strip empties */ 4638 for (;;) { 4639 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 4640 (uintptr_t)INT_MAX); 4641 len = (int)(mp->b_wptr - mp->b_rptr); 4642 if (len > 0) 4643 break; 4644 mp1 = mp; 4645 mp = mp->b_cont; 4646 freeb(mp1); 4647 if (mp == NULL) { 4648 return; 4649 } 4650 } 4651 4652 /* If we are the first on the list ... */ 4653 if (tcp->tcp_xmit_head == NULL) { 4654 tcp->tcp_xmit_head = mp; 4655 tcp->tcp_xmit_tail = mp; 4656 tcp->tcp_xmit_tail_unsent = len; 4657 } else { 4658 tcp->tcp_xmit_last->b_cont = mp; 4659 len += tcp->tcp_unsent; 4660 } 4661 4662 /* Tack on however many more positive length mblks we have */ 4663 if ((mp1 = mp->b_cont) != NULL) { 4664 do { 4665 int tlen; 4666 assert((uintptr_t)(mp1->b_wptr - 4667 mp1->b_rptr) <= (uintptr_t)INT_MAX); 4668 tlen = (int)(mp1->b_wptr - mp1->b_rptr); 4669 if (tlen <= 0) { 4670 mp->b_cont = mp1->b_cont; 4671 freeb(mp1); 4672 } else { 4673 len += tlen; 4674 mp = mp1; 4675 } 4676 } while ((mp1 = mp->b_cont) != NULL); 4677 } 4678 tcp->tcp_xmit_last = mp; 4679 tcp->tcp_unsent = len; 4680 4681 data_null: 4682 snxt = tcp->tcp_snxt; 4683 xmit_tail = tcp->tcp_xmit_tail; 4684 tail_unsent = tcp->tcp_xmit_tail_unsent; 4685 4686 /* 4687 * Note that tcp_mss has been adjusted to take into account the 4688 * timestamp option if applicable. Because SACK options do not 4689 * appear in every TCP segments and they are of variable lengths, 4690 * they cannot be included in tcp_mss. Thus we need to calculate 4691 * the actual segment length when we need to send a segment which 4692 * includes SACK options. 4693 */ 4694 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 4695 int32_t opt_len; 4696 4697 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 4698 tcp->tcp_num_sack_blk); 4699 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 4700 2 + TCPOPT_HEADER_LEN; 4701 mss = tcp->tcp_mss - opt_len; 4702 tcp_hdr_len = tcp->tcp_hdr_len + opt_len; 4703 } else { 4704 mss = tcp->tcp_mss; 4705 tcp_hdr_len = tcp->tcp_hdr_len; 4706 } 4707 4708 if ((tcp->tcp_suna == snxt) && 4709 (prom_gettime() - tcp->tcp_last_recv_time) >= tcp->tcp_rto) { 4710 tcp->tcp_cwnd = MIN(tcp_slow_start_after_idle * mss, 4711 MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))); 4712 } 4713 if (tcpstate == TCPS_SYN_RCVD) { 4714 /* 4715 * The three-way connection establishment handshake is not 4716 * complete yet. We want to queue the data for transmission 4717 * after entering ESTABLISHED state (RFC793). Setting usable to 4718 * zero cause a jump to "done" label effectively leaving data 4719 * on the queue. 4720 */ 4721 4722 usable = 0; 4723 } else { 4724 int usable_r = tcp->tcp_swnd; 4725 4726 /* 4727 * In the special case when cwnd is zero, which can only 4728 * happen if the connection is ECN capable, return now. 4729 * New segments is sent using tcp_timer(). The timer 4730 * is set in tcp_rput_data(). 4731 */ 4732 if (tcp->tcp_cwnd == 0) { 4733 /* 4734 * Note that tcp_cwnd is 0 before 3-way handshake is 4735 * finished. 4736 */ 4737 assert(tcp->tcp_ecn_ok || 4738 tcp->tcp_state < TCPS_ESTABLISHED); 4739 return; 4740 } 4741 4742 /* usable = MIN(swnd, cwnd) - unacked_bytes */ 4743 if (usable_r > tcp->tcp_cwnd) 4744 usable_r = tcp->tcp_cwnd; 4745 4746 /* NOTE: trouble if xmitting while SYN not acked? */ 4747 usable_r -= snxt; 4748 usable_r += tcp->tcp_suna; 4749 4750 /* usable = MIN(usable, unsent) */ 4751 if (usable_r > len) 4752 usable_r = len; 4753 4754 /* usable = MAX(usable, {1 for urgent, 0 for data}) */ 4755 if (usable_r != 0) 4756 usable = usable_r; 4757 } 4758 4759 /* use uintptr_t to suppress the gcc warning */ 4760 local_time = (mblk_t *)(uintptr_t)prom_gettime(); 4761 4762 /* 4763 * "Our" Nagle Algorithm. This is not the same as in the old 4764 * BSD. This is more in line with the true intent of Nagle. 4765 * 4766 * The conditions are: 4767 * 1. The amount of unsent data (or amount of data which can be 4768 * sent, whichever is smaller) is less than Nagle limit. 4769 * 2. The last sent size is also less than Nagle limit. 4770 * 3. There is unack'ed data. 4771 * 4. Urgent pointer is not set. Send urgent data ignoring the 4772 * Nagle algorithm. This reduces the probability that urgent 4773 * bytes get "merged" together. 4774 * 5. The app has not closed the connection. This eliminates the 4775 * wait time of the receiving side waiting for the last piece of 4776 * (small) data. 4777 * 4778 * If all are satisified, exit without sending anything. Note 4779 * that Nagle limit can be smaller than 1 MSS. Nagle limit is 4780 * the smaller of 1 MSS and global tcp_naglim_def (default to be 4781 * 4095). 4782 */ 4783 if (usable < (int)tcp->tcp_naglim && 4784 tcp->tcp_naglim > tcp->tcp_last_sent_len && 4785 snxt != tcp->tcp_suna && 4786 !(tcp->tcp_valid_bits & TCP_URG_VALID)) 4787 goto done; 4788 4789 num_burst_seg = tcp->tcp_snd_burst; 4790 for (;;) { 4791 tcph_t *tcph; 4792 mblk_t *new_mp; 4793 4794 if (num_burst_seg-- == 0) 4795 goto done; 4796 4797 len = mss; 4798 if (len > usable) { 4799 len = usable; 4800 if (len <= 0) { 4801 /* Terminate the loop */ 4802 goto done; 4803 } 4804 /* 4805 * Sender silly-window avoidance. 4806 * Ignore this if we are going to send a 4807 * zero window probe out. 4808 * 4809 * TODO: force data into microscopic window ?? 4810 * ==> (!pushed || (unsent > usable)) 4811 */ 4812 if (len < (tcp->tcp_max_swnd >> 1) && 4813 (tcp->tcp_unsent - (snxt - tcp->tcp_snxt)) > len && 4814 !((tcp->tcp_valid_bits & TCP_URG_VALID) && 4815 len == 1) && (! tcp->tcp_zero_win_probe)) { 4816 /* 4817 * If the retransmit timer is not running 4818 * we start it so that we will retransmit 4819 * in the case when the the receiver has 4820 * decremented the window. 4821 */ 4822 if (snxt == tcp->tcp_snxt && 4823 snxt == tcp->tcp_suna) { 4824 /* 4825 * We are not supposed to send 4826 * anything. So let's wait a little 4827 * bit longer before breaking SWS 4828 * avoidance. 4829 * 4830 * What should the value be? 4831 * Suggestion: MAX(init rexmit time, 4832 * tcp->tcp_rto) 4833 */ 4834 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 4835 } 4836 goto done; 4837 } 4838 } 4839 4840 tcph = tcp->tcp_tcph; 4841 4842 usable -= len; /* Approximate - can be adjusted later */ 4843 if (usable > 0) 4844 tcph->th_flags[0] = TH_ACK; 4845 else 4846 tcph->th_flags[0] = (TH_ACK | TH_PUSH); 4847 4848 U32_TO_ABE32(snxt, tcph->th_seq); 4849 4850 if (tcp->tcp_valid_bits) { 4851 uchar_t *prev_rptr = xmit_tail->b_rptr; 4852 uint32_t prev_snxt = tcp->tcp_snxt; 4853 4854 if (tail_unsent == 0) { 4855 assert(xmit_tail->b_cont != NULL); 4856 xmit_tail = xmit_tail->b_cont; 4857 prev_rptr = xmit_tail->b_rptr; 4858 tail_unsent = (int)(xmit_tail->b_wptr - 4859 xmit_tail->b_rptr); 4860 } else { 4861 xmit_tail->b_rptr = xmit_tail->b_wptr - 4862 tail_unsent; 4863 } 4864 mp = tcp_xmit_mp(tcp, xmit_tail, len, NULL, NULL, 4865 snxt, B_FALSE, (uint32_t *)&len, B_FALSE); 4866 /* Restore tcp_snxt so we get amount sent right. */ 4867 tcp->tcp_snxt = prev_snxt; 4868 if (prev_rptr == xmit_tail->b_rptr) 4869 xmit_tail->b_prev = local_time; 4870 else 4871 xmit_tail->b_rptr = prev_rptr; 4872 4873 if (mp == NULL) 4874 break; 4875 4876 mp1 = mp->b_cont; 4877 4878 snxt += len; 4879 tcp->tcp_last_sent_len = (ushort_t)len; 4880 while (mp1->b_cont) { 4881 xmit_tail = xmit_tail->b_cont; 4882 xmit_tail->b_prev = local_time; 4883 mp1 = mp1->b_cont; 4884 } 4885 tail_unsent = xmit_tail->b_wptr - mp1->b_wptr; 4886 BUMP_MIB(tcp_mib.tcpOutDataSegs); 4887 UPDATE_MIB(tcp_mib.tcpOutDataBytes, len); 4888 /* Dump the packet when debugging. */ 4889 TCP_DUMP_PACKET("tcp_wput_data (valid bits)", mp); 4890 (void) ipv4_tcp_output(sock_id, mp); 4891 freeb(mp); 4892 continue; 4893 } 4894 4895 snxt += len; /* Adjust later if we don't send all of len */ 4896 BUMP_MIB(tcp_mib.tcpOutDataSegs); 4897 UPDATE_MIB(tcp_mib.tcpOutDataBytes, len); 4898 4899 if (tail_unsent) { 4900 /* Are the bytes above us in flight? */ 4901 rptr = xmit_tail->b_wptr - tail_unsent; 4902 if (rptr != xmit_tail->b_rptr) { 4903 tail_unsent -= len; 4904 len += tcp_hdr_len; 4905 tcp->tcp_ipha->ip_len = htons(len); 4906 mp = dupb(xmit_tail); 4907 if (!mp) 4908 break; 4909 mp->b_rptr = rptr; 4910 goto must_alloc; 4911 } 4912 } else { 4913 xmit_tail = xmit_tail->b_cont; 4914 assert((uintptr_t)(xmit_tail->b_wptr - 4915 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 4916 tail_unsent = (int)(xmit_tail->b_wptr - 4917 xmit_tail->b_rptr); 4918 } 4919 4920 tail_unsent -= len; 4921 tcp->tcp_last_sent_len = (ushort_t)len; 4922 4923 len += tcp_hdr_len; 4924 if (tcp->tcp_ipversion == IPV4_VERSION) 4925 tcp->tcp_ipha->ip_len = htons(len); 4926 4927 xmit_tail->b_prev = local_time; 4928 4929 mp = dupb(xmit_tail); 4930 if (mp == NULL) 4931 goto out_of_mem; 4932 4933 len = tcp_hdr_len; 4934 /* 4935 * There are four reasons to allocate a new hdr mblk: 4936 * 1) The bytes above us are in use by another packet 4937 * 2) We don't have good alignment 4938 * 3) The mblk is being shared 4939 * 4) We don't have enough room for a header 4940 */ 4941 rptr = mp->b_rptr - len; 4942 if (!OK_32PTR(rptr) || 4943 rptr < mp->b_datap) { 4944 /* NOTE: we assume allocb returns an OK_32PTR */ 4945 4946 must_alloc:; 4947 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 4948 tcp_wroff_xtra, 0); 4949 if (mp1 == NULL) { 4950 freemsg(mp); 4951 goto out_of_mem; 4952 } 4953 mp1->b_cont = mp; 4954 mp = mp1; 4955 /* Leave room for Link Level header */ 4956 len = tcp_hdr_len; 4957 rptr = &mp->b_rptr[tcp_wroff_xtra]; 4958 mp->b_wptr = &rptr[len]; 4959 } 4960 4961 if (tcp->tcp_snd_ts_ok) { 4962 /* use uintptr_t to suppress the gcc warning */ 4963 U32_TO_BE32((uint32_t)(uintptr_t)local_time, 4964 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 4965 U32_TO_BE32(tcp->tcp_ts_recent, 4966 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 4967 } else { 4968 assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 4969 } 4970 4971 mp->b_rptr = rptr; 4972 4973 /* Copy the template header. */ 4974 dst = (ipaddr_t *)rptr; 4975 src = (ipaddr_t *)tcp->tcp_iphc; 4976 dst[0] = src[0]; 4977 dst[1] = src[1]; 4978 dst[2] = src[2]; 4979 dst[3] = src[3]; 4980 dst[4] = src[4]; 4981 dst[5] = src[5]; 4982 dst[6] = src[6]; 4983 dst[7] = src[7]; 4984 dst[8] = src[8]; 4985 dst[9] = src[9]; 4986 len = tcp->tcp_hdr_len; 4987 if (len -= 40) { 4988 len >>= 2; 4989 dst += 10; 4990 src += 10; 4991 do { 4992 *dst++ = *src++; 4993 } while (--len); 4994 } 4995 4996 /* 4997 * Set tcph to point to the header of the outgoing packet, 4998 * not to the template header. 4999 */ 5000 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 5001 5002 /* 5003 * Set the ECN info in the TCP header if it is not a zero 5004 * window probe. Zero window probe is only sent in 5005 * tcp_wput_data() and tcp_timer(). 5006 */ 5007 if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 5008 SET_ECT(tcp, rptr); 5009 5010 if (tcp->tcp_ecn_echo_on) 5011 tcph->th_flags[0] |= TH_ECE; 5012 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 5013 tcph->th_flags[0] |= TH_CWR; 5014 tcp->tcp_ecn_cwr_sent = B_TRUE; 5015 } 5016 } 5017 5018 /* Fill in SACK options */ 5019 if (num_sack_blk > 0) { 5020 uchar_t *wptr = rptr + tcp->tcp_hdr_len; 5021 sack_blk_t *tmp; 5022 int32_t i; 5023 5024 wptr[0] = TCPOPT_NOP; 5025 wptr[1] = TCPOPT_NOP; 5026 wptr[2] = TCPOPT_SACK; 5027 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5028 sizeof (sack_blk_t); 5029 wptr += TCPOPT_REAL_SACK_LEN; 5030 5031 tmp = tcp->tcp_sack_list; 5032 for (i = 0; i < num_sack_blk; i++) { 5033 U32_TO_BE32(tmp[i].begin, wptr); 5034 wptr += sizeof (tcp_seq); 5035 U32_TO_BE32(tmp[i].end, wptr); 5036 wptr += sizeof (tcp_seq); 5037 } 5038 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 5039 << 4); 5040 } 5041 5042 if (tail_unsent) { 5043 mp1 = mp->b_cont; 5044 if (mp1 == NULL) 5045 mp1 = mp; 5046 /* 5047 * If we're a little short, tack on more mblks 5048 * as long as we don't need to split an mblk. 5049 */ 5050 while (tail_unsent < 0 && 5051 tail_unsent + (int)(xmit_tail->b_cont->b_wptr - 5052 xmit_tail->b_cont->b_rptr) <= 0) { 5053 xmit_tail = xmit_tail->b_cont; 5054 /* Stash for rtt use later */ 5055 xmit_tail->b_prev = local_time; 5056 mp1->b_cont = dupb(xmit_tail); 5057 mp1 = mp1->b_cont; 5058 assert((uintptr_t)(xmit_tail->b_wptr - 5059 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 5060 tail_unsent += (int)(xmit_tail->b_wptr - 5061 xmit_tail->b_rptr); 5062 if (mp1 == NULL) { 5063 freemsg(mp); 5064 goto out_of_mem; 5065 } 5066 } 5067 /* Trim back any surplus on the last mblk */ 5068 if (tail_unsent > 0) 5069 mp1->b_wptr -= tail_unsent; 5070 if (tail_unsent < 0) { 5071 uint32_t ip_len; 5072 5073 /* 5074 * We did not send everything we could in 5075 * order to preserve mblk boundaries. 5076 */ 5077 usable -= tail_unsent; 5078 snxt += tail_unsent; 5079 tcp->tcp_last_sent_len += tail_unsent; 5080 UPDATE_MIB(tcp_mib.tcpOutDataBytes, 5081 tail_unsent); 5082 /* Adjust the IP length field. */ 5083 ip_len = ntohs(((struct ip *)rptr)->ip_len) + 5084 tail_unsent; 5085 ((struct ip *)rptr)->ip_len = htons(ip_len); 5086 tail_unsent = 0; 5087 } 5088 } 5089 5090 if (mp == NULL) 5091 goto out_of_mem; 5092 5093 /* 5094 * Performance hit! We need to pullup the whole message 5095 * in order to do checksum and for the MAC output routine. 5096 */ 5097 if (mp->b_cont != NULL) { 5098 int mp_size; 5099 #ifdef DEBUG 5100 printf("Multiple mblk %d\n", msgdsize(mp)); 5101 #endif 5102 new_mp = allocb(msgdsize(mp) + tcp_wroff_xtra, 0); 5103 new_mp->b_rptr += tcp_wroff_xtra; 5104 new_mp->b_wptr = new_mp->b_rptr; 5105 while (mp != NULL) { 5106 mp_size = mp->b_wptr - mp->b_rptr; 5107 bcopy(mp->b_rptr, new_mp->b_wptr, mp_size); 5108 new_mp->b_wptr += mp_size; 5109 mp = mp->b_cont; 5110 } 5111 freemsg(mp); 5112 mp = new_mp; 5113 } 5114 tcp_set_cksum(mp); 5115 ((struct ip *)mp->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5116 TCP_DUMP_PACKET("tcp_wput_data", mp); 5117 (void) ipv4_tcp_output(sock_id, mp); 5118 freemsg(mp); 5119 } 5120 out_of_mem:; 5121 /* Pretend that all we were trying to send really got sent */ 5122 if (tail_unsent < 0) { 5123 do { 5124 xmit_tail = xmit_tail->b_cont; 5125 xmit_tail->b_prev = local_time; 5126 assert((uintptr_t)(xmit_tail->b_wptr - 5127 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 5128 tail_unsent += (int)(xmit_tail->b_wptr - 5129 xmit_tail->b_rptr); 5130 } while (tail_unsent < 0); 5131 } 5132 done:; 5133 tcp->tcp_xmit_tail = xmit_tail; 5134 tcp->tcp_xmit_tail_unsent = tail_unsent; 5135 len = tcp->tcp_snxt - snxt; 5136 if (len) { 5137 /* 5138 * If new data was sent, need to update the notsack 5139 * list, which is, afterall, data blocks that have 5140 * not been sack'ed by the receiver. New data is 5141 * not sack'ed. 5142 */ 5143 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 5144 /* len is a negative value. */ 5145 tcp->tcp_pipe -= len; 5146 tcp_notsack_update(&(tcp->tcp_notsack_list), 5147 tcp->tcp_snxt, snxt, 5148 &(tcp->tcp_num_notsack_blk), 5149 &(tcp->tcp_cnt_notsack_list)); 5150 } 5151 tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; 5152 tcp->tcp_rack = tcp->tcp_rnxt; 5153 tcp->tcp_rack_cnt = 0; 5154 if ((snxt + len) == tcp->tcp_suna) { 5155 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5156 } 5157 /* 5158 * Note that len is the amount we just sent but with a negative 5159 * sign. We update tcp_unsent here since we may come back to 5160 * tcp_wput_data from tcp_state_wait. 5161 */ 5162 len += tcp->tcp_unsent; 5163 tcp->tcp_unsent = len; 5164 5165 /* 5166 * Let's wait till all the segments have been acked, since we 5167 * don't have a timer. 5168 */ 5169 (void) tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED); 5170 return; 5171 } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { 5172 /* 5173 * Didn't send anything. Make sure the timer is running 5174 * so that we will probe a zero window. 5175 */ 5176 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5177 } 5178 5179 /* Note that len is the amount we just sent but with a negative sign */ 5180 len += tcp->tcp_unsent; 5181 tcp->tcp_unsent = len; 5182 5183 } 5184 5185 static void 5186 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, 5187 uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcph_t *tcph, 5188 int sock_id) 5189 { 5190 int32_t bytes_acked; 5191 int32_t gap; 5192 int32_t rgap; 5193 tcp_opt_t tcpopt; 5194 uint_t flags; 5195 uint32_t new_swnd = 0; 5196 5197 #ifdef DEBUG 5198 printf("Time wait processing called ###############3\n"); 5199 #endif 5200 5201 /* Just make sure we send the right sock_id to tcp_clean_death */ 5202 if ((sockets[sock_id].pcb == NULL) || (sockets[sock_id].pcb != tcp)) 5203 sock_id = -1; 5204 5205 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 5206 new_swnd = BE16_TO_U16(tcph->th_win) << 5207 ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); 5208 if (tcp->tcp_snd_ts_ok) { 5209 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 5210 freemsg(mp); 5211 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5212 tcp->tcp_rnxt, TH_ACK, 0, -1); 5213 return; 5214 } 5215 } 5216 gap = seg_seq - tcp->tcp_rnxt; 5217 rgap = tcp->tcp_rwnd - (gap + seg_len); 5218 if (gap < 0) { 5219 BUMP_MIB(tcp_mib.tcpInDataDupSegs); 5220 UPDATE_MIB(tcp_mib.tcpInDataDupBytes, 5221 (seg_len > -gap ? -gap : seg_len)); 5222 seg_len += gap; 5223 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 5224 if (flags & TH_RST) { 5225 freemsg(mp); 5226 return; 5227 } 5228 if ((flags & TH_FIN) && seg_len == -1) { 5229 /* 5230 * When TCP receives a duplicate FIN in 5231 * TIME_WAIT state, restart the 2 MSL timer. 5232 * See page 73 in RFC 793. Make sure this TCP 5233 * is already on the TIME_WAIT list. If not, 5234 * just restart the timer. 5235 */ 5236 tcp_time_wait_remove(tcp); 5237 tcp_time_wait_append(tcp); 5238 TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 5239 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5240 tcp->tcp_rnxt, TH_ACK, 0, -1); 5241 freemsg(mp); 5242 return; 5243 } 5244 flags |= TH_ACK_NEEDED; 5245 seg_len = 0; 5246 goto process_ack; 5247 } 5248 5249 /* Fix seg_seq, and chew the gap off the front. */ 5250 seg_seq = tcp->tcp_rnxt; 5251 } 5252 5253 if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 5254 /* 5255 * Make sure that when we accept the connection, pick 5256 * an ISS greater than (tcp_snxt + ISS_INCR/2) for the 5257 * old connection. 5258 * 5259 * The next ISS generated is equal to tcp_iss_incr_extra 5260 * + ISS_INCR/2 + other components depending on the 5261 * value of tcp_strong_iss. We pre-calculate the new 5262 * ISS here and compare with tcp_snxt to determine if 5263 * we need to make adjustment to tcp_iss_incr_extra. 5264 * 5265 * Note that since we are now in the global queue 5266 * perimeter and need to do a lateral_put() to the 5267 * listener queue, there can be other connection requests/ 5268 * attempts while the lateral_put() is going on. That 5269 * means what we calculate here may not be correct. This 5270 * is extremely difficult to solve unless TCP and IP 5271 * modules are merged and there is no perimeter, but just 5272 * locks. The above calculation is ugly and is a 5273 * waste of CPU cycles... 5274 */ 5275 uint32_t new_iss = tcp_iss_incr_extra; 5276 int32_t adj; 5277 5278 /* Add time component and min random (i.e. 1). */ 5279 new_iss += (prom_gettime() >> ISS_NSEC_SHT) + 1; 5280 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 5281 /* 5282 * New ISS not guaranteed to be ISS_INCR/2 5283 * ahead of the current tcp_snxt, so add the 5284 * difference to tcp_iss_incr_extra. 5285 */ 5286 tcp_iss_incr_extra += adj; 5287 } 5288 tcp_clean_death(sock_id, tcp, 0); 5289 5290 /* 5291 * This is a passive open. Right now we do not 5292 * do anything... 5293 */ 5294 freemsg(mp); 5295 return; 5296 } 5297 5298 /* 5299 * rgap is the amount of stuff received out of window. A negative 5300 * value is the amount out of window. 5301 */ 5302 if (rgap < 0) { 5303 BUMP_MIB(tcp_mib.tcpInDataPastWinSegs); 5304 UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap); 5305 /* Fix seg_len and make sure there is something left. */ 5306 seg_len += rgap; 5307 if (seg_len <= 0) { 5308 if (flags & TH_RST) { 5309 freemsg(mp); 5310 return; 5311 } 5312 flags |= TH_ACK_NEEDED; 5313 seg_len = 0; 5314 goto process_ack; 5315 } 5316 } 5317 /* 5318 * Check whether we can update tcp_ts_recent. This test is 5319 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 5320 * Extensions for High Performance: An Update", Internet Draft. 5321 */ 5322 if (tcp->tcp_snd_ts_ok && 5323 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 5324 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 5325 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 5326 tcp->tcp_last_rcv_lbolt = prom_gettime(); 5327 } 5328 5329 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 5330 /* Always ack out of order packets */ 5331 flags |= TH_ACK_NEEDED; 5332 seg_len = 0; 5333 } else if (seg_len > 0) { 5334 BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 5335 UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 5336 } 5337 if (flags & TH_RST) { 5338 freemsg(mp); 5339 (void) tcp_clean_death(sock_id, tcp, 0); 5340 return; 5341 } 5342 if (flags & TH_SYN) { 5343 freemsg(mp); 5344 tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, seg_seq + 1, 5345 TH_RST|TH_ACK, 0, -1); 5346 /* 5347 * Do not delete the TCP structure if it is in 5348 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 5349 */ 5350 return; 5351 } 5352 process_ack: 5353 if (flags & TH_ACK) { 5354 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 5355 if (bytes_acked <= 0) { 5356 if (bytes_acked == 0 && seg_len == 0 && 5357 new_swnd == tcp->tcp_swnd) 5358 BUMP_MIB(tcp_mib.tcpInDupAck); 5359 } else { 5360 /* Acks something not sent */ 5361 flags |= TH_ACK_NEEDED; 5362 } 5363 } 5364 freemsg(mp); 5365 if (flags & TH_ACK_NEEDED) { 5366 /* 5367 * Time to send an ack for some reason. 5368 */ 5369 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5370 tcp->tcp_rnxt, TH_ACK, 0, -1); 5371 } 5372 } 5373 5374 static int 5375 tcp_init_values(tcp_t *tcp, struct inetboot_socket *isp) 5376 { 5377 int err; 5378 5379 tcp->tcp_family = AF_INET; 5380 tcp->tcp_ipversion = IPV4_VERSION; 5381 5382 /* 5383 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 5384 * will be close to tcp_rexmit_interval_initial. By doing this, we 5385 * allow the algorithm to adjust slowly to large fluctuations of RTT 5386 * during first few transmissions of a connection as seen in slow 5387 * links. 5388 */ 5389 tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2; 5390 tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1; 5391 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 5392 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 5393 tcp_conn_grace_period; 5394 if (tcp->tcp_rto < tcp_rexmit_interval_min) 5395 tcp->tcp_rto = tcp_rexmit_interval_min; 5396 tcp->tcp_timer_backoff = 0; 5397 tcp->tcp_ms_we_have_waited = 0; 5398 tcp->tcp_last_recv_time = prom_gettime(); 5399 tcp->tcp_cwnd_max = tcp_cwnd_max_; 5400 tcp->tcp_snd_burst = TCP_CWND_INFINITE; 5401 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 5402 /* For Ethernet, the mtu returned is actually 1550... */ 5403 if (mac_get_type() == IFT_ETHER) { 5404 tcp->tcp_if_mtu = mac_get_mtu() - 50; 5405 } else { 5406 tcp->tcp_if_mtu = mac_get_mtu(); 5407 } 5408 tcp->tcp_mss = tcp->tcp_if_mtu; 5409 5410 tcp->tcp_first_timer_threshold = tcp_ip_notify_interval; 5411 tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval; 5412 tcp->tcp_second_timer_threshold = tcp_ip_abort_interval; 5413 /* 5414 * Fix it to tcp_ip_abort_linterval later if it turns out to be a 5415 * passive open. 5416 */ 5417 tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval; 5418 5419 tcp->tcp_naglim = tcp_naglim_def; 5420 5421 /* NOTE: ISS is now set in tcp_adapt_ire(). */ 5422 5423 /* Initialize the header template */ 5424 if (tcp->tcp_ipversion == IPV4_VERSION) { 5425 err = tcp_header_init_ipv4(tcp); 5426 } 5427 if (err) 5428 return (err); 5429 5430 /* 5431 * Init the window scale to the max so tcp_rwnd_set() won't pare 5432 * down tcp_rwnd. tcp_adapt_ire() will set the right value later. 5433 */ 5434 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 5435 tcp->tcp_xmit_lowater = tcp_xmit_lowat; 5436 if (isp != NULL) { 5437 tcp->tcp_xmit_hiwater = isp->so_sndbuf; 5438 tcp->tcp_rwnd = isp->so_rcvbuf; 5439 tcp->tcp_rwnd_max = isp->so_rcvbuf; 5440 } 5441 tcp->tcp_state = TCPS_IDLE; 5442 return (0); 5443 } 5444 5445 /* 5446 * Initialize the IPv4 header. Loses any record of any IP options. 5447 */ 5448 static int 5449 tcp_header_init_ipv4(tcp_t *tcp) 5450 { 5451 tcph_t *tcph; 5452 5453 /* 5454 * This is a simple initialization. If there's 5455 * already a template, it should never be too small, 5456 * so reuse it. Otherwise, allocate space for the new one. 5457 */ 5458 if (tcp->tcp_iphc != NULL) { 5459 assert(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 5460 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 5461 } else { 5462 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 5463 tcp->tcp_iphc = bkmem_zalloc(tcp->tcp_iphc_len); 5464 if (tcp->tcp_iphc == NULL) { 5465 tcp->tcp_iphc_len = 0; 5466 return (ENOMEM); 5467 } 5468 } 5469 tcp->tcp_ipha = (struct ip *)tcp->tcp_iphc; 5470 tcp->tcp_ipversion = IPV4_VERSION; 5471 5472 /* 5473 * Note that it does not include TCP options yet. It will 5474 * after the connection is established. 5475 */ 5476 tcp->tcp_hdr_len = sizeof (struct ip) + sizeof (tcph_t); 5477 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 5478 tcp->tcp_ip_hdr_len = sizeof (struct ip); 5479 tcp->tcp_ipha->ip_v = IP_VERSION; 5480 /* We don't support IP options... */ 5481 tcp->tcp_ipha->ip_hl = IP_SIMPLE_HDR_LENGTH_IN_WORDS; 5482 tcp->tcp_ipha->ip_p = IPPROTO_TCP; 5483 /* We are not supposed to do PMTU discovery... */ 5484 tcp->tcp_ipha->ip_sum = 0; 5485 5486 tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (struct ip)); 5487 tcp->tcp_tcph = tcph; 5488 tcph->th_offset_and_rsrvd[0] = (5 << 4); 5489 return (0); 5490 } 5491 5492 /* 5493 * Send out a control packet on the tcp connection specified. This routine 5494 * is typically called where we need a simple ACK or RST generated. 5495 * 5496 * This function is called with or without a mp. 5497 */ 5498 static void 5499 tcp_xmit_ctl(char *str, tcp_t *tcp, mblk_t *mp, uint32_t seq, 5500 uint32_t ack, int ctl, uint_t ip_hdr_len, int sock_id) 5501 { 5502 uchar_t *rptr; 5503 tcph_t *tcph; 5504 struct ip *iph = NULL; 5505 int tcp_hdr_len; 5506 int tcp_ip_hdr_len; 5507 5508 tcp_hdr_len = tcp->tcp_hdr_len; 5509 tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; 5510 5511 if (mp) { 5512 assert(ip_hdr_len != 0); 5513 rptr = mp->b_rptr; 5514 tcph = (tcph_t *)(rptr + ip_hdr_len); 5515 /* Don't reply to a RST segment. */ 5516 if (tcph->th_flags[0] & TH_RST) { 5517 freeb(mp); 5518 return; 5519 } 5520 freemsg(mp); 5521 rptr = NULL; 5522 } else { 5523 assert(ip_hdr_len == 0); 5524 } 5525 /* If a text string is passed in with the request, print it out. */ 5526 if (str != NULL) { 5527 dprintf("tcp_xmit_ctl(%d): '%s', seq 0x%x, ack 0x%x, " 5528 "ctl 0x%x\n", sock_id, str, seq, ack, ctl); 5529 } 5530 mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0); 5531 if (mp == NULL) { 5532 dprintf("tcp_xmit_ctl(%d): Cannot allocate memory\n", sock_id); 5533 return; 5534 } 5535 rptr = &mp->b_rptr[tcp_wroff_xtra]; 5536 mp->b_rptr = rptr; 5537 mp->b_wptr = &rptr[tcp_hdr_len]; 5538 bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); 5539 5540 iph = (struct ip *)rptr; 5541 iph->ip_len = htons(tcp_hdr_len); 5542 5543 tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; 5544 tcph->th_flags[0] = (uint8_t)ctl; 5545 if (ctl & TH_RST) { 5546 BUMP_MIB(tcp_mib.tcpOutRsts); 5547 BUMP_MIB(tcp_mib.tcpOutControl); 5548 /* 5549 * Don't send TSopt w/ TH_RST packets per RFC 1323. 5550 */ 5551 if (tcp->tcp_snd_ts_ok && tcp->tcp_state > TCPS_SYN_SENT) { 5552 mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; 5553 *(mp->b_wptr) = TCPOPT_EOL; 5554 iph->ip_len = htons(tcp_hdr_len - 5555 TCPOPT_REAL_TS_LEN); 5556 tcph->th_offset_and_rsrvd[0] -= (3 << 4); 5557 } 5558 } 5559 if (ctl & TH_ACK) { 5560 uint32_t now = prom_gettime(); 5561 5562 if (tcp->tcp_snd_ts_ok) { 5563 U32_TO_BE32(now, 5564 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5565 U32_TO_BE32(tcp->tcp_ts_recent, 5566 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5567 } 5568 tcp->tcp_rack = ack; 5569 tcp->tcp_rack_cnt = 0; 5570 BUMP_MIB(tcp_mib.tcpOutAck); 5571 } 5572 BUMP_MIB(tcp_mib.tcpOutSegs); 5573 U32_TO_BE32(seq, tcph->th_seq); 5574 U32_TO_BE32(ack, tcph->th_ack); 5575 5576 tcp_set_cksum(mp); 5577 iph->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5578 TCP_DUMP_PACKET("tcp_xmit_ctl", mp); 5579 (void) ipv4_tcp_output(sock_id, mp); 5580 freeb(mp); 5581 } 5582 5583 /* Generate an ACK-only (no data) segment for a TCP endpoint */ 5584 static mblk_t * 5585 tcp_ack_mp(tcp_t *tcp) 5586 { 5587 if (tcp->tcp_valid_bits) { 5588 /* 5589 * For the complex case where we have to send some 5590 * controls (FIN or SYN), let tcp_xmit_mp do it. 5591 * When sending an ACK-only segment (no data) 5592 * into a zero window, always set the seq number to 5593 * suna, since snxt will be extended past the window. 5594 * If we used snxt, the receiver might consider the ACK 5595 * unacceptable. 5596 */ 5597 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 5598 (tcp->tcp_zero_win_probe) ? 5599 tcp->tcp_suna : 5600 tcp->tcp_snxt, B_FALSE, NULL, B_FALSE)); 5601 } else { 5602 /* Generate a simple ACK */ 5603 uchar_t *rptr; 5604 tcph_t *tcph; 5605 mblk_t *mp1; 5606 int32_t tcp_hdr_len; 5607 int32_t num_sack_blk = 0; 5608 int32_t sack_opt_len; 5609 5610 /* 5611 * Allocate space for TCP + IP headers 5612 * and link-level header 5613 */ 5614 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 5615 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 5616 tcp->tcp_num_sack_blk); 5617 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 5618 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 5619 tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; 5620 } else { 5621 tcp_hdr_len = tcp->tcp_hdr_len; 5622 } 5623 mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, 0); 5624 if (mp1 == NULL) 5625 return (NULL); 5626 5627 /* copy in prototype TCP + IP header */ 5628 rptr = mp1->b_rptr + tcp_wroff_xtra; 5629 mp1->b_rptr = rptr; 5630 mp1->b_wptr = rptr + tcp_hdr_len; 5631 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 5632 5633 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 5634 5635 /* 5636 * Set the TCP sequence number. 5637 * When sending an ACK-only segment (no data) 5638 * into a zero window, always set the seq number to 5639 * suna, since snxt will be extended past the window. 5640 * If we used snxt, the receiver might consider the ACK 5641 * unacceptable. 5642 */ 5643 U32_TO_ABE32((tcp->tcp_zero_win_probe) ? 5644 tcp->tcp_suna : tcp->tcp_snxt, tcph->th_seq); 5645 5646 /* Set up the TCP flag field. */ 5647 tcph->th_flags[0] = (uchar_t)TH_ACK; 5648 if (tcp->tcp_ecn_echo_on) 5649 tcph->th_flags[0] |= TH_ECE; 5650 5651 tcp->tcp_rack = tcp->tcp_rnxt; 5652 tcp->tcp_rack_cnt = 0; 5653 5654 /* fill in timestamp option if in use */ 5655 if (tcp->tcp_snd_ts_ok) { 5656 uint32_t llbolt = (uint32_t)prom_gettime(); 5657 5658 U32_TO_BE32(llbolt, 5659 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5660 U32_TO_BE32(tcp->tcp_ts_recent, 5661 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5662 } 5663 5664 /* Fill in SACK options */ 5665 if (num_sack_blk > 0) { 5666 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 5667 sack_blk_t *tmp; 5668 int32_t i; 5669 5670 wptr[0] = TCPOPT_NOP; 5671 wptr[1] = TCPOPT_NOP; 5672 wptr[2] = TCPOPT_SACK; 5673 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5674 sizeof (sack_blk_t); 5675 wptr += TCPOPT_REAL_SACK_LEN; 5676 5677 tmp = tcp->tcp_sack_list; 5678 for (i = 0; i < num_sack_blk; i++) { 5679 U32_TO_BE32(tmp[i].begin, wptr); 5680 wptr += sizeof (tcp_seq); 5681 U32_TO_BE32(tmp[i].end, wptr); 5682 wptr += sizeof (tcp_seq); 5683 } 5684 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 5685 << 4); 5686 } 5687 5688 ((struct ip *)rptr)->ip_len = htons(tcp_hdr_len); 5689 tcp_set_cksum(mp1); 5690 ((struct ip *)rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5691 return (mp1); 5692 } 5693 } 5694 5695 /* 5696 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 5697 * ip and tcp header ready to pass down to IP. If the mp passed in is 5698 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 5699 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 5700 * otherwise it will dup partial mblks.) 5701 * Otherwise, an appropriate ACK packet will be generated. This 5702 * routine is not usually called to send new data for the first time. It 5703 * is mostly called out of the timer for retransmits, and to generate ACKs. 5704 * 5705 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 5706 * be adjusted by *offset. And after dupb(), the offset and the ending mblk 5707 * of the original mblk chain will be returned in *offset and *end_mp. 5708 */ 5709 static mblk_t * 5710 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 5711 mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 5712 boolean_t rexmit) 5713 { 5714 int data_length; 5715 int32_t off = 0; 5716 uint_t flags; 5717 mblk_t *mp1; 5718 mblk_t *mp2; 5719 mblk_t *new_mp; 5720 uchar_t *rptr; 5721 tcph_t *tcph; 5722 int32_t num_sack_blk = 0; 5723 int32_t sack_opt_len = 0; 5724 5725 /* Allocate for our maximum TCP header + link-level */ 5726 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 5727 tcp_wroff_xtra, 0); 5728 if (mp1 == NULL) 5729 return (NULL); 5730 data_length = 0; 5731 5732 /* 5733 * Note that tcp_mss has been adjusted to take into account the 5734 * timestamp option if applicable. Because SACK options do not 5735 * appear in every TCP segments and they are of variable lengths, 5736 * they cannot be included in tcp_mss. Thus we need to calculate 5737 * the actual segment length when we need to send a segment which 5738 * includes SACK options. 5739 */ 5740 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 5741 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 5742 tcp->tcp_num_sack_blk); 5743 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 5744 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 5745 if (max_to_send + sack_opt_len > tcp->tcp_mss) 5746 max_to_send -= sack_opt_len; 5747 } 5748 5749 if (offset != NULL) { 5750 off = *offset; 5751 /* We use offset as an indicator that end_mp is not NULL. */ 5752 *end_mp = NULL; 5753 } 5754 for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { 5755 /* This could be faster with cooperation from downstream */ 5756 if (mp2 != mp1 && !sendall && 5757 data_length + (int)(mp->b_wptr - mp->b_rptr) > 5758 max_to_send) 5759 /* 5760 * Don't send the next mblk since the whole mblk 5761 * does not fit. 5762 */ 5763 break; 5764 mp2->b_cont = dupb(mp); 5765 mp2 = mp2->b_cont; 5766 if (mp2 == NULL) { 5767 freemsg(mp1); 5768 return (NULL); 5769 } 5770 mp2->b_rptr += off; 5771 assert((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 5772 (uintptr_t)INT_MAX); 5773 5774 data_length += (int)(mp2->b_wptr - mp2->b_rptr); 5775 if (data_length > max_to_send) { 5776 mp2->b_wptr -= data_length - max_to_send; 5777 data_length = max_to_send; 5778 off = mp2->b_wptr - mp->b_rptr; 5779 break; 5780 } else { 5781 off = 0; 5782 } 5783 } 5784 if (offset != NULL) { 5785 *offset = off; 5786 *end_mp = mp; 5787 } 5788 if (seg_len != NULL) { 5789 *seg_len = data_length; 5790 } 5791 5792 rptr = mp1->b_rptr + tcp_wroff_xtra; 5793 mp1->b_rptr = rptr; 5794 mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; 5795 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 5796 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 5797 U32_TO_ABE32(seq, tcph->th_seq); 5798 5799 /* 5800 * Use tcp_unsent to determine if the PUSH bit should be used assumes 5801 * that this function was called from tcp_wput_data. Thus, when called 5802 * to retransmit data the setting of the PUSH bit may appear some 5803 * what random in that it might get set when it should not. This 5804 * should not pose any performance issues. 5805 */ 5806 if (data_length != 0 && (tcp->tcp_unsent == 0 || 5807 tcp->tcp_unsent == data_length)) { 5808 flags = TH_ACK | TH_PUSH; 5809 } else { 5810 flags = TH_ACK; 5811 } 5812 5813 if (tcp->tcp_ecn_ok) { 5814 if (tcp->tcp_ecn_echo_on) 5815 flags |= TH_ECE; 5816 5817 /* 5818 * Only set ECT bit and ECN_CWR if a segment contains new data. 5819 * There is no TCP flow control for non-data segments, and 5820 * only data segment is transmitted reliably. 5821 */ 5822 if (data_length > 0 && !rexmit) { 5823 SET_ECT(tcp, rptr); 5824 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 5825 flags |= TH_CWR; 5826 tcp->tcp_ecn_cwr_sent = B_TRUE; 5827 } 5828 } 5829 } 5830 5831 if (tcp->tcp_valid_bits) { 5832 uint32_t u1; 5833 5834 if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && 5835 seq == tcp->tcp_iss) { 5836 uchar_t *wptr; 5837 5838 /* 5839 * Tack on the MSS option. It is always needed 5840 * for both active and passive open. 5841 */ 5842 wptr = mp1->b_wptr; 5843 wptr[0] = TCPOPT_MAXSEG; 5844 wptr[1] = TCPOPT_MAXSEG_LEN; 5845 wptr += 2; 5846 /* 5847 * MSS option value should be interface MTU - MIN 5848 * TCP/IP header. 5849 */ 5850 u1 = tcp->tcp_if_mtu - IP_SIMPLE_HDR_LENGTH - 5851 TCP_MIN_HEADER_LENGTH; 5852 U16_TO_BE16(u1, wptr); 5853 mp1->b_wptr = wptr + 2; 5854 /* Update the offset to cover the additional word */ 5855 tcph->th_offset_and_rsrvd[0] += (1 << 4); 5856 5857 /* 5858 * Note that the following way of filling in 5859 * TCP options are not optimal. Some NOPs can 5860 * be saved. But there is no need at this time 5861 * to optimize it. When it is needed, we will 5862 * do it. 5863 */ 5864 switch (tcp->tcp_state) { 5865 case TCPS_SYN_SENT: 5866 flags = TH_SYN; 5867 5868 if (tcp->tcp_snd_ws_ok) { 5869 wptr = mp1->b_wptr; 5870 wptr[0] = TCPOPT_NOP; 5871 wptr[1] = TCPOPT_WSCALE; 5872 wptr[2] = TCPOPT_WS_LEN; 5873 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 5874 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 5875 tcph->th_offset_and_rsrvd[0] += 5876 (1 << 4); 5877 } 5878 5879 if (tcp->tcp_snd_ts_ok) { 5880 uint32_t llbolt; 5881 5882 llbolt = prom_gettime(); 5883 wptr = mp1->b_wptr; 5884 wptr[0] = TCPOPT_NOP; 5885 wptr[1] = TCPOPT_NOP; 5886 wptr[2] = TCPOPT_TSTAMP; 5887 wptr[3] = TCPOPT_TSTAMP_LEN; 5888 wptr += 4; 5889 U32_TO_BE32(llbolt, wptr); 5890 wptr += 4; 5891 assert(tcp->tcp_ts_recent == 0); 5892 U32_TO_BE32(0L, wptr); 5893 mp1->b_wptr += TCPOPT_REAL_TS_LEN; 5894 tcph->th_offset_and_rsrvd[0] += 5895 (3 << 4); 5896 } 5897 5898 if (tcp->tcp_snd_sack_ok) { 5899 wptr = mp1->b_wptr; 5900 wptr[0] = TCPOPT_NOP; 5901 wptr[1] = TCPOPT_NOP; 5902 wptr[2] = TCPOPT_SACK_PERMITTED; 5903 wptr[3] = TCPOPT_SACK_OK_LEN; 5904 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 5905 tcph->th_offset_and_rsrvd[0] += 5906 (1 << 4); 5907 } 5908 5909 /* 5910 * Set up all the bits to tell other side 5911 * we are ECN capable. 5912 */ 5913 if (tcp->tcp_ecn_ok) { 5914 flags |= (TH_ECE | TH_CWR); 5915 } 5916 break; 5917 case TCPS_SYN_RCVD: 5918 flags |= TH_SYN; 5919 5920 if (tcp->tcp_snd_ws_ok) { 5921 wptr = mp1->b_wptr; 5922 wptr[0] = TCPOPT_NOP; 5923 wptr[1] = TCPOPT_WSCALE; 5924 wptr[2] = TCPOPT_WS_LEN; 5925 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 5926 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 5927 tcph->th_offset_and_rsrvd[0] += (1 << 4); 5928 } 5929 5930 if (tcp->tcp_snd_sack_ok) { 5931 wptr = mp1->b_wptr; 5932 wptr[0] = TCPOPT_NOP; 5933 wptr[1] = TCPOPT_NOP; 5934 wptr[2] = TCPOPT_SACK_PERMITTED; 5935 wptr[3] = TCPOPT_SACK_OK_LEN; 5936 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 5937 tcph->th_offset_and_rsrvd[0] += 5938 (1 << 4); 5939 } 5940 5941 /* 5942 * If the other side is ECN capable, reply 5943 * that we are also ECN capable. 5944 */ 5945 if (tcp->tcp_ecn_ok) { 5946 flags |= TH_ECE; 5947 } 5948 break; 5949 default: 5950 break; 5951 } 5952 /* allocb() of adequate mblk assures space */ 5953 assert((uintptr_t)(mp1->b_wptr - 5954 mp1->b_rptr) <= (uintptr_t)INT_MAX); 5955 if (flags & TH_SYN) 5956 BUMP_MIB(tcp_mib.tcpOutControl); 5957 } 5958 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 5959 (seq + data_length) == tcp->tcp_fss) { 5960 if (!tcp->tcp_fin_acked) { 5961 flags |= TH_FIN; 5962 BUMP_MIB(tcp_mib.tcpOutControl); 5963 } 5964 if (!tcp->tcp_fin_sent) { 5965 tcp->tcp_fin_sent = B_TRUE; 5966 switch (tcp->tcp_state) { 5967 case TCPS_SYN_RCVD: 5968 case TCPS_ESTABLISHED: 5969 tcp->tcp_state = TCPS_FIN_WAIT_1; 5970 break; 5971 case TCPS_CLOSE_WAIT: 5972 tcp->tcp_state = TCPS_LAST_ACK; 5973 break; 5974 } 5975 if (tcp->tcp_suna == tcp->tcp_snxt) 5976 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5977 tcp->tcp_snxt = tcp->tcp_fss + 1; 5978 } 5979 } 5980 } 5981 tcph->th_flags[0] = (uchar_t)flags; 5982 tcp->tcp_rack = tcp->tcp_rnxt; 5983 tcp->tcp_rack_cnt = 0; 5984 5985 if (tcp->tcp_snd_ts_ok) { 5986 if (tcp->tcp_state != TCPS_SYN_SENT) { 5987 uint32_t llbolt = prom_gettime(); 5988 5989 U32_TO_BE32(llbolt, 5990 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5991 U32_TO_BE32(tcp->tcp_ts_recent, 5992 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5993 } 5994 } 5995 5996 if (num_sack_blk > 0) { 5997 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 5998 sack_blk_t *tmp; 5999 int32_t i; 6000 6001 wptr[0] = TCPOPT_NOP; 6002 wptr[1] = TCPOPT_NOP; 6003 wptr[2] = TCPOPT_SACK; 6004 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 6005 sizeof (sack_blk_t); 6006 wptr += TCPOPT_REAL_SACK_LEN; 6007 6008 tmp = tcp->tcp_sack_list; 6009 for (i = 0; i < num_sack_blk; i++) { 6010 U32_TO_BE32(tmp[i].begin, wptr); 6011 wptr += sizeof (tcp_seq); 6012 U32_TO_BE32(tmp[i].end, wptr); 6013 wptr += sizeof (tcp_seq); 6014 } 6015 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); 6016 } 6017 assert((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 6018 data_length += (int)(mp1->b_wptr - rptr); 6019 if (tcp->tcp_ipversion == IPV4_VERSION) 6020 ((struct ip *)rptr)->ip_len = htons(data_length); 6021 6022 /* 6023 * Performance hit! We need to pullup the whole message 6024 * in order to do checksum and for the MAC output routine. 6025 */ 6026 if (mp1->b_cont != NULL) { 6027 int mp_size; 6028 #ifdef DEBUG 6029 printf("Multiple mblk %d\n", msgdsize(mp1)); 6030 #endif 6031 new_mp = allocb(msgdsize(mp1) + tcp_wroff_xtra, 0); 6032 new_mp->b_rptr += tcp_wroff_xtra; 6033 new_mp->b_wptr = new_mp->b_rptr; 6034 while (mp1 != NULL) { 6035 mp_size = mp1->b_wptr - mp1->b_rptr; 6036 bcopy(mp1->b_rptr, new_mp->b_wptr, mp_size); 6037 new_mp->b_wptr += mp_size; 6038 mp1 = mp1->b_cont; 6039 } 6040 freemsg(mp1); 6041 mp1 = new_mp; 6042 } 6043 tcp_set_cksum(mp1); 6044 /* Fill in the TTL field as it is 0 in the header template. */ 6045 ((struct ip *)mp1->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 6046 6047 return (mp1); 6048 } 6049 6050 /* 6051 * Generate a "no listener here" reset in response to the 6052 * connection request contained within 'mp' 6053 */ 6054 static void 6055 tcp_xmit_listeners_reset(int sock_id, mblk_t *mp, uint_t ip_hdr_len) 6056 { 6057 uchar_t *rptr; 6058 uint32_t seg_len; 6059 tcph_t *tcph; 6060 uint32_t seg_seq; 6061 uint32_t seg_ack; 6062 uint_t flags; 6063 6064 rptr = mp->b_rptr; 6065 6066 tcph = (tcph_t *)&rptr[ip_hdr_len]; 6067 seg_seq = BE32_TO_U32(tcph->th_seq); 6068 seg_ack = BE32_TO_U32(tcph->th_ack); 6069 flags = tcph->th_flags[0]; 6070 6071 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); 6072 if (flags & TH_RST) { 6073 freeb(mp); 6074 } else if (flags & TH_ACK) { 6075 tcp_xmit_early_reset("no tcp, reset", 6076 sock_id, mp, seg_ack, 0, TH_RST, ip_hdr_len); 6077 } else { 6078 if (flags & TH_SYN) 6079 seg_len++; 6080 tcp_xmit_early_reset("no tcp, reset/ack", sock_id, 6081 mp, 0, seg_seq + seg_len, 6082 TH_RST | TH_ACK, ip_hdr_len); 6083 } 6084 } 6085 6086 /* Non overlapping byte exchanger */ 6087 static void 6088 tcp_xchg(uchar_t *a, uchar_t *b, int len) 6089 { 6090 uchar_t uch; 6091 6092 while (len-- > 0) { 6093 uch = a[len]; 6094 a[len] = b[len]; 6095 b[len] = uch; 6096 } 6097 } 6098 6099 /* 6100 * Generate a reset based on an inbound packet for which there is no active 6101 * tcp state that we can find. 6102 */ 6103 static void 6104 tcp_xmit_early_reset(char *str, int sock_id, mblk_t *mp, uint32_t seq, 6105 uint32_t ack, int ctl, uint_t ip_hdr_len) 6106 { 6107 struct ip *iph = NULL; 6108 ushort_t len; 6109 tcph_t *tcph; 6110 int i; 6111 ipaddr_t addr; 6112 mblk_t *new_mp; 6113 6114 if (str != NULL) { 6115 dprintf("tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 6116 "flags 0x%x\n", str, seq, ack, ctl); 6117 } 6118 6119 /* 6120 * We skip reversing source route here. 6121 * (for now we replace all IP options with EOL) 6122 */ 6123 iph = (struct ip *)mp->b_rptr; 6124 for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 6125 mp->b_rptr[i] = IPOPT_EOL; 6126 /* 6127 * Make sure that src address is not a limited broadcast 6128 * address. Not all broadcast address checking for the 6129 * src address is possible, since we don't know the 6130 * netmask of the src addr. 6131 * No check for destination address is done, since 6132 * IP will not pass up a packet with a broadcast dest address 6133 * to TCP. 6134 */ 6135 if (iph->ip_src.s_addr == INADDR_ANY || 6136 iph->ip_src.s_addr == INADDR_BROADCAST) { 6137 freemsg(mp); 6138 return; 6139 } 6140 6141 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6142 if (tcph->th_flags[0] & TH_RST) { 6143 freemsg(mp); 6144 return; 6145 } 6146 /* 6147 * Now copy the original header to a new buffer. The reason 6148 * for doing this is that we need to put extra room before 6149 * the header for the MAC layer address. The original mblk 6150 * does not have this extra head room. 6151 */ 6152 len = ip_hdr_len + sizeof (tcph_t); 6153 if ((new_mp = allocb(len + tcp_wroff_xtra, 0)) == NULL) { 6154 freemsg(mp); 6155 return; 6156 } 6157 new_mp->b_rptr += tcp_wroff_xtra; 6158 bcopy(mp->b_rptr, new_mp->b_rptr, len); 6159 new_mp->b_wptr = new_mp->b_rptr + len; 6160 freemsg(mp); 6161 mp = new_mp; 6162 iph = (struct ip *)mp->b_rptr; 6163 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6164 6165 tcph->th_offset_and_rsrvd[0] = (5 << 4); 6166 tcp_xchg(tcph->th_fport, tcph->th_lport, 2); 6167 U32_TO_BE32(ack, tcph->th_ack); 6168 U32_TO_BE32(seq, tcph->th_seq); 6169 U16_TO_BE16(0, tcph->th_win); 6170 bzero(tcph->th_sum, sizeof (int16_t)); 6171 tcph->th_flags[0] = (uint8_t)ctl; 6172 if (ctl & TH_RST) { 6173 BUMP_MIB(tcp_mib.tcpOutRsts); 6174 BUMP_MIB(tcp_mib.tcpOutControl); 6175 } 6176 6177 iph->ip_len = htons(len); 6178 /* Swap addresses */ 6179 addr = iph->ip_src.s_addr; 6180 iph->ip_src = iph->ip_dst; 6181 iph->ip_dst.s_addr = addr; 6182 iph->ip_id = 0; 6183 iph->ip_ttl = 0; 6184 tcp_set_cksum(mp); 6185 iph->ip_ttl = (uint8_t)tcp_ipv4_ttl; 6186 6187 /* Dump the packet when debugging. */ 6188 TCP_DUMP_PACKET("tcp_xmit_early_reset", mp); 6189 (void) ipv4_tcp_output(sock_id, mp); 6190 freemsg(mp); 6191 } 6192 6193 static void 6194 tcp_set_cksum(mblk_t *mp) 6195 { 6196 struct ip *iph; 6197 tcpha_t *tcph; 6198 int len; 6199 6200 iph = (struct ip *)mp->b_rptr; 6201 tcph = (tcpha_t *)(iph + 1); 6202 len = ntohs(iph->ip_len); 6203 /* 6204 * Calculate the TCP checksum. Need to include the psuedo header, 6205 * which is similar to the real IP header starting at the TTL field. 6206 */ 6207 iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH); 6208 tcph->tha_sum = 0; 6209 tcph->tha_sum = tcp_cksum((uint16_t *)&(iph->ip_ttl), 6210 len - IP_SIMPLE_HDR_LENGTH + 12); 6211 iph->ip_sum = 0; 6212 } 6213 6214 static uint16_t 6215 tcp_cksum(uint16_t *buf, uint32_t len) 6216 { 6217 /* 6218 * Compute Internet Checksum for "count" bytes 6219 * beginning at location "addr". 6220 */ 6221 int32_t sum = 0; 6222 6223 while (len > 1) { 6224 /* This is the inner loop */ 6225 sum += *buf++; 6226 len -= 2; 6227 } 6228 6229 /* Add left-over byte, if any */ 6230 if (len > 0) 6231 sum += *(unsigned char *)buf * 256; 6232 6233 /* Fold 32-bit sum to 16 bits */ 6234 while (sum >> 16) 6235 sum = (sum & 0xffff) + (sum >> 16); 6236 6237 return ((uint16_t)~sum); 6238 } 6239 6240 /* 6241 * Type three generator adapted from the random() function in 4.4 BSD: 6242 */ 6243 6244 /* 6245 * Copyright (c) 1983, 1993 6246 * The Regents of the University of California. All rights reserved. 6247 * 6248 * Redistribution and use in source and binary forms, with or without 6249 * modification, are permitted provided that the following conditions 6250 * are met: 6251 * 1. Redistributions of source code must retain the above copyright 6252 * notice, this list of conditions and the following disclaimer. 6253 * 2. Redistributions in binary form must reproduce the above copyright 6254 * notice, this list of conditions and the following disclaimer in the 6255 * documentation and/or other materials provided with the distribution. 6256 * 3. All advertising materials mentioning features or use of this software 6257 * must display the following acknowledgement: 6258 * This product includes software developed by the University of 6259 * California, Berkeley and its contributors. 6260 * 4. Neither the name of the University nor the names of its contributors 6261 * may be used to endorse or promote products derived from this software 6262 * without specific prior written permission. 6263 * 6264 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 6265 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 6266 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 6267 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 6268 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 6269 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 6270 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 6271 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 6272 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 6273 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 6274 * SUCH DAMAGE. 6275 */ 6276 6277 /* Type 3 -- x**31 + x**3 + 1 */ 6278 #define DEG_3 31 6279 #define SEP_3 3 6280 6281 6282 /* Protected by tcp_random_lock */ 6283 static int tcp_randtbl[DEG_3 + 1]; 6284 6285 static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 6286 static int *tcp_random_rptr = &tcp_randtbl[1]; 6287 6288 static int *tcp_random_state = &tcp_randtbl[1]; 6289 static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 6290 6291 static void 6292 tcp_random_init(void) 6293 { 6294 int i; 6295 uint32_t hrt; 6296 uint32_t wallclock; 6297 uint32_t result; 6298 6299 /* 6300 * 6301 * XXX We don't have high resolution time in standalone... The 6302 * following is just some approximation on the comment below. 6303 * 6304 * Use high-res timer and current time for seed. Gethrtime() returns 6305 * a longlong, which may contain resolution down to nanoseconds. 6306 * The current time will either be a 32-bit or a 64-bit quantity. 6307 * XOR the two together in a 64-bit result variable. 6308 * Convert the result to a 32-bit value by multiplying the high-order 6309 * 32-bits by the low-order 32-bits. 6310 * 6311 * XXX We don't have gethrtime() in prom and the wallclock.... 6312 */ 6313 6314 hrt = prom_gettime(); 6315 wallclock = (uint32_t)time(NULL); 6316 result = wallclock ^ hrt; 6317 tcp_random_state[0] = result; 6318 6319 for (i = 1; i < DEG_3; i++) 6320 tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 6321 + 12345; 6322 tcp_random_fptr = &tcp_random_state[SEP_3]; 6323 tcp_random_rptr = &tcp_random_state[0]; 6324 for (i = 0; i < 10 * DEG_3; i++) 6325 (void) tcp_random(); 6326 } 6327 6328 /* 6329 * tcp_random: Return a random number in the range [1 - (128K + 1)]. 6330 * This range is selected to be approximately centered on TCP_ISS / 2, 6331 * and easy to compute. We get this value by generating a 32-bit random 6332 * number, selecting out the high-order 17 bits, and then adding one so 6333 * that we never return zero. 6334 */ 6335 static int 6336 tcp_random(void) 6337 { 6338 int i; 6339 6340 *tcp_random_fptr += *tcp_random_rptr; 6341 6342 /* 6343 * The high-order bits are more random than the low-order bits, 6344 * so we select out the high-order 17 bits and add one so that 6345 * we never return zero. 6346 */ 6347 i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 6348 if (++tcp_random_fptr >= tcp_random_end_ptr) { 6349 tcp_random_fptr = tcp_random_state; 6350 ++tcp_random_rptr; 6351 } else if (++tcp_random_rptr >= tcp_random_end_ptr) 6352 tcp_random_rptr = tcp_random_state; 6353 6354 return (i); 6355 } 6356 6357 /* 6358 * Generate ISS, taking into account NDD changes may happen halfway through. 6359 * (If the iss is not zero, set it.) 6360 */ 6361 static void 6362 tcp_iss_init(tcp_t *tcp) 6363 { 6364 tcp_iss_incr_extra += (ISS_INCR >> 1); 6365 tcp->tcp_iss = tcp_iss_incr_extra; 6366 tcp->tcp_iss += (prom_gettime() >> ISS_NSEC_SHT) + tcp_random(); 6367 tcp->tcp_valid_bits = TCP_ISS_VALID; 6368 tcp->tcp_fss = tcp->tcp_iss - 1; 6369 tcp->tcp_suna = tcp->tcp_iss; 6370 tcp->tcp_snxt = tcp->tcp_iss + 1; 6371 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 6372 tcp->tcp_csuna = tcp->tcp_snxt; 6373 } 6374 6375 /* 6376 * Diagnostic routine used to return a string associated with the tcp state. 6377 * Note that if the caller does not supply a buffer, it will use an internal 6378 * static string. This means that if multiple threads call this function at 6379 * the same time, output can be corrupted... Note also that this function 6380 * does not check the size of the supplied buffer. The caller has to make 6381 * sure that it is big enough. 6382 */ 6383 static char * 6384 tcp_display(tcp_t *tcp, char *sup_buf, char format) 6385 { 6386 char buf1[30]; 6387 static char priv_buf[INET_ADDRSTRLEN * 2 + 80]; 6388 char *buf; 6389 char *cp; 6390 char local_addrbuf[INET_ADDRSTRLEN]; 6391 char remote_addrbuf[INET_ADDRSTRLEN]; 6392 struct in_addr addr; 6393 6394 if (sup_buf != NULL) 6395 buf = sup_buf; 6396 else 6397 buf = priv_buf; 6398 6399 if (tcp == NULL) 6400 return ("NULL_TCP"); 6401 switch (tcp->tcp_state) { 6402 case TCPS_CLOSED: 6403 cp = "TCP_CLOSED"; 6404 break; 6405 case TCPS_IDLE: 6406 cp = "TCP_IDLE"; 6407 break; 6408 case TCPS_BOUND: 6409 cp = "TCP_BOUND"; 6410 break; 6411 case TCPS_LISTEN: 6412 cp = "TCP_LISTEN"; 6413 break; 6414 case TCPS_SYN_SENT: 6415 cp = "TCP_SYN_SENT"; 6416 break; 6417 case TCPS_SYN_RCVD: 6418 cp = "TCP_SYN_RCVD"; 6419 break; 6420 case TCPS_ESTABLISHED: 6421 cp = "TCP_ESTABLISHED"; 6422 break; 6423 case TCPS_CLOSE_WAIT: 6424 cp = "TCP_CLOSE_WAIT"; 6425 break; 6426 case TCPS_FIN_WAIT_1: 6427 cp = "TCP_FIN_WAIT_1"; 6428 break; 6429 case TCPS_CLOSING: 6430 cp = "TCP_CLOSING"; 6431 break; 6432 case TCPS_LAST_ACK: 6433 cp = "TCP_LAST_ACK"; 6434 break; 6435 case TCPS_FIN_WAIT_2: 6436 cp = "TCP_FIN_WAIT_2"; 6437 break; 6438 case TCPS_TIME_WAIT: 6439 cp = "TCP_TIME_WAIT"; 6440 break; 6441 default: 6442 (void) sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 6443 cp = buf1; 6444 break; 6445 } 6446 switch (format) { 6447 case DISP_ADDR_AND_PORT: 6448 /* 6449 * Note that we use the remote address in the tcp_b 6450 * structure. This means that it will print out 6451 * the real destination address, not the next hop's 6452 * address if source routing is used. 6453 */ 6454 addr.s_addr = tcp->tcp_bound_source; 6455 bcopy(inet_ntoa(addr), local_addrbuf, sizeof (local_addrbuf)); 6456 addr.s_addr = tcp->tcp_remote; 6457 bcopy(inet_ntoa(addr), remote_addrbuf, sizeof (remote_addrbuf)); 6458 (void) snprintf(buf, sizeof (priv_buf), "[%s.%u, %s.%u] %s", 6459 local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, 6460 ntohs(tcp->tcp_fport), cp); 6461 break; 6462 case DISP_PORT_ONLY: 6463 default: 6464 (void) snprintf(buf, sizeof (priv_buf), "[%u, %u] %s", 6465 ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); 6466 break; 6467 } 6468 6469 return (buf); 6470 } 6471 6472 /* 6473 * Add a new piece to the tcp reassembly queue. If the gap at the beginning 6474 * is filled, return as much as we can. The message passed in may be 6475 * multi-part, chained using b_cont. "start" is the starting sequence 6476 * number for this piece. 6477 */ 6478 static mblk_t * 6479 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 6480 { 6481 uint32_t end; 6482 mblk_t *mp1; 6483 mblk_t *mp2; 6484 mblk_t *next_mp; 6485 uint32_t u1; 6486 6487 /* Walk through all the new pieces. */ 6488 do { 6489 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 6490 (uintptr_t)INT_MAX); 6491 end = start + (int)(mp->b_wptr - mp->b_rptr); 6492 next_mp = mp->b_cont; 6493 if (start == end) { 6494 /* Empty. Blast it. */ 6495 freeb(mp); 6496 continue; 6497 } 6498 mp->b_cont = NULL; 6499 TCP_REASS_SET_SEQ(mp, start); 6500 TCP_REASS_SET_END(mp, end); 6501 mp1 = tcp->tcp_reass_tail; 6502 if (!mp1) { 6503 tcp->tcp_reass_tail = mp; 6504 tcp->tcp_reass_head = mp; 6505 BUMP_MIB(tcp_mib.tcpInDataUnorderSegs); 6506 UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start); 6507 continue; 6508 } 6509 /* New stuff completely beyond tail? */ 6510 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 6511 /* Link it on end. */ 6512 mp1->b_cont = mp; 6513 tcp->tcp_reass_tail = mp; 6514 BUMP_MIB(tcp_mib.tcpInDataUnorderSegs); 6515 UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start); 6516 continue; 6517 } 6518 mp1 = tcp->tcp_reass_head; 6519 u1 = TCP_REASS_SEQ(mp1); 6520 /* New stuff at the front? */ 6521 if (SEQ_LT(start, u1)) { 6522 /* Yes... Check for overlap. */ 6523 mp->b_cont = mp1; 6524 tcp->tcp_reass_head = mp; 6525 tcp_reass_elim_overlap(tcp, mp); 6526 continue; 6527 } 6528 /* 6529 * The new piece fits somewhere between the head and tail. 6530 * We find our slot, where mp1 precedes us and mp2 trails. 6531 */ 6532 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 6533 u1 = TCP_REASS_SEQ(mp2); 6534 if (SEQ_LEQ(start, u1)) 6535 break; 6536 } 6537 /* Link ourselves in */ 6538 mp->b_cont = mp2; 6539 mp1->b_cont = mp; 6540 6541 /* Trim overlap with following mblk(s) first */ 6542 tcp_reass_elim_overlap(tcp, mp); 6543 6544 /* Trim overlap with preceding mblk */ 6545 tcp_reass_elim_overlap(tcp, mp1); 6546 6547 } while (start = end, mp = next_mp); 6548 mp1 = tcp->tcp_reass_head; 6549 /* Anything ready to go? */ 6550 if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 6551 return (NULL); 6552 /* Eat what we can off the queue */ 6553 for (;;) { 6554 mp = mp1->b_cont; 6555 end = TCP_REASS_END(mp1); 6556 TCP_REASS_SET_SEQ(mp1, 0); 6557 TCP_REASS_SET_END(mp1, 0); 6558 if (!mp) { 6559 tcp->tcp_reass_tail = NULL; 6560 break; 6561 } 6562 if (end != TCP_REASS_SEQ(mp)) { 6563 mp1->b_cont = NULL; 6564 break; 6565 } 6566 mp1 = mp; 6567 } 6568 mp1 = tcp->tcp_reass_head; 6569 tcp->tcp_reass_head = mp; 6570 return (mp1); 6571 } 6572 6573 /* Eliminate any overlap that mp may have over later mblks */ 6574 static void 6575 tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 6576 { 6577 uint32_t end; 6578 mblk_t *mp1; 6579 uint32_t u1; 6580 6581 end = TCP_REASS_END(mp); 6582 while ((mp1 = mp->b_cont) != NULL) { 6583 u1 = TCP_REASS_SEQ(mp1); 6584 if (!SEQ_GT(end, u1)) 6585 break; 6586 if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 6587 mp->b_wptr -= end - u1; 6588 TCP_REASS_SET_END(mp, u1); 6589 BUMP_MIB(tcp_mib.tcpInDataPartDupSegs); 6590 UPDATE_MIB(tcp_mib.tcpInDataPartDupBytes, end - u1); 6591 break; 6592 } 6593 mp->b_cont = mp1->b_cont; 6594 freeb(mp1); 6595 BUMP_MIB(tcp_mib.tcpInDataDupSegs); 6596 UPDATE_MIB(tcp_mib.tcpInDataDupBytes, end - u1); 6597 } 6598 if (!mp1) 6599 tcp->tcp_reass_tail = mp; 6600 } 6601 6602 /* 6603 * Remove a connection from the list of detached TIME_WAIT connections. 6604 */ 6605 static void 6606 tcp_time_wait_remove(tcp_t *tcp) 6607 { 6608 if (tcp->tcp_time_wait_expire == 0) { 6609 assert(tcp->tcp_time_wait_next == NULL); 6610 assert(tcp->tcp_time_wait_prev == NULL); 6611 return; 6612 } 6613 assert(tcp->tcp_state == TCPS_TIME_WAIT); 6614 if (tcp == tcp_time_wait_head) { 6615 assert(tcp->tcp_time_wait_prev == NULL); 6616 tcp_time_wait_head = tcp->tcp_time_wait_next; 6617 if (tcp_time_wait_head != NULL) { 6618 tcp_time_wait_head->tcp_time_wait_prev = NULL; 6619 } else { 6620 tcp_time_wait_tail = NULL; 6621 } 6622 } else if (tcp == tcp_time_wait_tail) { 6623 assert(tcp != tcp_time_wait_head); 6624 assert(tcp->tcp_time_wait_next == NULL); 6625 tcp_time_wait_tail = tcp->tcp_time_wait_prev; 6626 assert(tcp_time_wait_tail != NULL); 6627 tcp_time_wait_tail->tcp_time_wait_next = NULL; 6628 } else { 6629 assert(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 6630 assert(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 6631 tcp->tcp_time_wait_prev->tcp_time_wait_next = 6632 tcp->tcp_time_wait_next; 6633 tcp->tcp_time_wait_next->tcp_time_wait_prev = 6634 tcp->tcp_time_wait_prev; 6635 } 6636 tcp->tcp_time_wait_next = NULL; 6637 tcp->tcp_time_wait_prev = NULL; 6638 tcp->tcp_time_wait_expire = 0; 6639 } 6640 6641 /* 6642 * Add a connection to the list of detached TIME_WAIT connections 6643 * and set its time to expire ... 6644 */ 6645 static void 6646 tcp_time_wait_append(tcp_t *tcp) 6647 { 6648 tcp->tcp_time_wait_expire = prom_gettime() + tcp_time_wait_interval; 6649 if (tcp->tcp_time_wait_expire == 0) 6650 tcp->tcp_time_wait_expire = 1; 6651 6652 if (tcp_time_wait_head == NULL) { 6653 assert(tcp_time_wait_tail == NULL); 6654 tcp_time_wait_head = tcp; 6655 } else { 6656 assert(tcp_time_wait_tail != NULL); 6657 assert(tcp_time_wait_tail->tcp_state == TCPS_TIME_WAIT); 6658 tcp_time_wait_tail->tcp_time_wait_next = tcp; 6659 tcp->tcp_time_wait_prev = tcp_time_wait_tail; 6660 } 6661 tcp_time_wait_tail = tcp; 6662 6663 /* for ndd stats about compression */ 6664 tcp_cum_timewait++; 6665 } 6666 6667 /* 6668 * Periodic qtimeout routine run on the default queue. 6669 * Performs 2 functions. 6670 * 1. Does TIME_WAIT compression on all recently added tcps. List 6671 * traversal is done backwards from the tail. 6672 * 2. Blows away all tcps whose TIME_WAIT has expired. List traversal 6673 * is done forwards from the head. 6674 */ 6675 void 6676 tcp_time_wait_collector(void) 6677 { 6678 tcp_t *tcp; 6679 uint32_t now; 6680 6681 /* 6682 * In order to reap time waits reliably, we should use a 6683 * source of time that is not adjustable by the user 6684 */ 6685 now = prom_gettime(); 6686 while ((tcp = tcp_time_wait_head) != NULL) { 6687 /* 6688 * Compare times using modular arithmetic, since 6689 * lbolt can wrapover. 6690 */ 6691 if ((int32_t)(now - tcp->tcp_time_wait_expire) < 0) { 6692 break; 6693 } 6694 /* 6695 * Note that the err must be 0 as there is no socket 6696 * associated with this TCP... 6697 */ 6698 (void) tcp_clean_death(-1, tcp, 0); 6699 } 6700 /* Schedule next run time. */ 6701 tcp_time_wait_runtime = prom_gettime() + 10000; 6702 } 6703 6704 void 6705 tcp_time_wait_report(void) 6706 { 6707 tcp_t *tcp; 6708 6709 printf("Current time %u\n", prom_gettime()); 6710 for (tcp = tcp_time_wait_head; tcp != NULL; 6711 tcp = tcp->tcp_time_wait_next) { 6712 printf("%s expires at %u\n", tcp_display(tcp, NULL, 6713 DISP_ADDR_AND_PORT), tcp->tcp_time_wait_expire); 6714 } 6715 } 6716 6717 /* 6718 * Send up all messages queued on tcp_rcv_list. 6719 * Have to set tcp_co_norm since we use putnext. 6720 */ 6721 static void 6722 tcp_rcv_drain(int sock_id, tcp_t *tcp) 6723 { 6724 mblk_t *mp; 6725 struct inetgram *in_gram; 6726 mblk_t *in_mp; 6727 int len; 6728 6729 /* Don't drain if the app has not finished reading all the data. */ 6730 if (sockets[sock_id].so_rcvbuf <= 0) 6731 return; 6732 6733 /* We might have come here just to updated the rwnd */ 6734 if (tcp->tcp_rcv_list == NULL) 6735 goto win_update; 6736 6737 if ((in_gram = (struct inetgram *)bkmem_zalloc( 6738 sizeof (struct inetgram))) == NULL) { 6739 return; 6740 } 6741 if ((in_mp = allocb(tcp->tcp_rcv_cnt, 0)) == NULL) { 6742 bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 6743 return; 6744 } 6745 in_gram->igm_level = APP_LVL; 6746 in_gram->igm_mp = in_mp; 6747 in_gram->igm_id = 0; 6748 6749 while ((mp = tcp->tcp_rcv_list) != NULL) { 6750 tcp->tcp_rcv_list = mp->b_cont; 6751 len = mp->b_wptr - mp->b_rptr; 6752 bcopy(mp->b_rptr, in_mp->b_wptr, len); 6753 in_mp->b_wptr += len; 6754 freeb(mp); 6755 } 6756 6757 tcp->tcp_rcv_last_tail = NULL; 6758 tcp->tcp_rcv_cnt = 0; 6759 add_grams(&sockets[sock_id].inq, in_gram); 6760 6761 /* This means that so_rcvbuf can be less than 0. */ 6762 sockets[sock_id].so_rcvbuf -= in_mp->b_wptr - in_mp->b_rptr; 6763 win_update: 6764 /* 6765 * Increase the receive window to max. But we need to do receiver 6766 * SWS avoidance. This means that we need to check the increase of 6767 * of receive window is at least 1 MSS. 6768 */ 6769 if (sockets[sock_id].so_rcvbuf > 0 && 6770 (tcp->tcp_rwnd_max - tcp->tcp_rwnd >= tcp->tcp_mss)) { 6771 tcp->tcp_rwnd = tcp->tcp_rwnd_max; 6772 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 6773 tcp->tcp_tcph->th_win); 6774 } 6775 } 6776 6777 /* 6778 * Wrapper for recvfrom to call 6779 */ 6780 void 6781 tcp_rcv_drain_sock(int sock_id) 6782 { 6783 tcp_t *tcp; 6784 if ((tcp = sockets[sock_id].pcb) == NULL) 6785 return; 6786 tcp_rcv_drain(sock_id, tcp); 6787 } 6788 6789 /* 6790 * If the inq == NULL and the tcp_rcv_list != NULL, we have data that 6791 * recvfrom could read. Place a magic message in the inq to let recvfrom 6792 * know that it needs to call tcp_rcv_drain_sock to pullup the data. 6793 */ 6794 static void 6795 tcp_drain_needed(int sock_id, tcp_t *tcp) 6796 { 6797 struct inetgram *in_gram; 6798 #ifdef DEBUG 6799 printf("tcp_drain_needed: inq %x, tcp_rcv_list %x\n", 6800 sockets[sock_id].inq, tcp->tcp_rcv_list); 6801 #endif 6802 if ((sockets[sock_id].inq != NULL) || 6803 (tcp->tcp_rcv_list == NULL)) 6804 return; 6805 6806 if ((in_gram = (struct inetgram *)bkmem_zalloc( 6807 sizeof (struct inetgram))) == NULL) 6808 return; 6809 6810 in_gram->igm_level = APP_LVL; 6811 in_gram->igm_mp = NULL; 6812 in_gram->igm_id = TCP_CALLB_MAGIC_ID; 6813 6814 add_grams(&sockets[sock_id].inq, in_gram); 6815 } 6816 6817 /* 6818 * Queue data on tcp_rcv_list which is a b_next chain. 6819 * Each element of the chain is a b_cont chain. 6820 * 6821 * M_DATA messages are added to the current element. 6822 * Other messages are added as new (b_next) elements. 6823 */ 6824 static void 6825 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) 6826 { 6827 assert(seg_len == msgdsize(mp)); 6828 if (tcp->tcp_rcv_list == NULL) { 6829 tcp->tcp_rcv_list = mp; 6830 } else { 6831 tcp->tcp_rcv_last_tail->b_cont = mp; 6832 } 6833 while (mp->b_cont) 6834 mp = mp->b_cont; 6835 tcp->tcp_rcv_last_tail = mp; 6836 tcp->tcp_rcv_cnt += seg_len; 6837 tcp->tcp_rwnd -= seg_len; 6838 #ifdef DEBUG 6839 printf("tcp_rcv_enqueue rwnd %d\n", tcp->tcp_rwnd); 6840 #endif 6841 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 6842 } 6843 6844 /* The minimum of smoothed mean deviation in RTO calculation. */ 6845 #define TCP_SD_MIN 400 6846 6847 /* 6848 * Set RTO for this connection. The formula is from Jacobson and Karels' 6849 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 6850 * are the same as those in Appendix A.2 of that paper. 6851 * 6852 * m = new measurement 6853 * sa = smoothed RTT average (8 * average estimates). 6854 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). 6855 */ 6856 static void 6857 tcp_set_rto(tcp_t *tcp, int32_t rtt) 6858 { 6859 int32_t m = rtt; 6860 uint32_t sa = tcp->tcp_rtt_sa; 6861 uint32_t sv = tcp->tcp_rtt_sd; 6862 uint32_t rto; 6863 6864 BUMP_MIB(tcp_mib.tcpRttUpdate); 6865 tcp->tcp_rtt_update++; 6866 6867 /* tcp_rtt_sa is not 0 means this is a new sample. */ 6868 if (sa != 0) { 6869 /* 6870 * Update average estimator: 6871 * new rtt = 7/8 old rtt + 1/8 Error 6872 */ 6873 6874 /* m is now Error in estimate. */ 6875 m -= sa >> 3; 6876 if ((int32_t)(sa += m) <= 0) { 6877 /* 6878 * Don't allow the smoothed average to be negative. 6879 * We use 0 to denote reinitialization of the 6880 * variables. 6881 */ 6882 sa = 1; 6883 } 6884 6885 /* 6886 * Update deviation estimator: 6887 * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev) 6888 */ 6889 if (m < 0) 6890 m = -m; 6891 m -= sv >> 2; 6892 sv += m; 6893 } else { 6894 /* 6895 * This follows BSD's implementation. So the reinitialized 6896 * RTO is 3 * m. We cannot go less than 2 because if the 6897 * link is bandwidth dominated, doubling the window size 6898 * during slow start means doubling the RTT. We want to be 6899 * more conservative when we reinitialize our estimates. 3 6900 * is just a convenient number. 6901 */ 6902 sa = m << 3; 6903 sv = m << 1; 6904 } 6905 if (sv < TCP_SD_MIN) { 6906 /* 6907 * We do not know that if sa captures the delay ACK 6908 * effect as in a long train of segments, a receiver 6909 * does not delay its ACKs. So set the minimum of sv 6910 * to be TCP_SD_MIN, which is default to 400 ms, twice 6911 * of BSD DATO. That means the minimum of mean 6912 * deviation is 100 ms. 6913 * 6914 */ 6915 sv = TCP_SD_MIN; 6916 } 6917 tcp->tcp_rtt_sa = sa; 6918 tcp->tcp_rtt_sd = sv; 6919 /* 6920 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv) 6921 * 6922 * Add tcp_rexmit_interval extra in case of extreme environment 6923 * where the algorithm fails to work. The default value of 6924 * tcp_rexmit_interval_extra should be 0. 6925 * 6926 * As we use a finer grained clock than BSD and update 6927 * RTO for every ACKs, add in another .25 of RTT to the 6928 * deviation of RTO to accomodate burstiness of 1/4 of 6929 * window size. 6930 */ 6931 rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5); 6932 6933 if (rto > tcp_rexmit_interval_max) { 6934 tcp->tcp_rto = tcp_rexmit_interval_max; 6935 } else if (rto < tcp_rexmit_interval_min) { 6936 tcp->tcp_rto = tcp_rexmit_interval_min; 6937 } else { 6938 tcp->tcp_rto = rto; 6939 } 6940 6941 /* Now, we can reset tcp_timer_backoff to use the new RTO... */ 6942 tcp->tcp_timer_backoff = 0; 6943 } 6944 6945 /* 6946 * Initiate closedown sequence on an active connection. 6947 * Return value zero for OK return, non-zero for error return. 6948 */ 6949 static int 6950 tcp_xmit_end(tcp_t *tcp, int sock_id) 6951 { 6952 mblk_t *mp; 6953 6954 if (tcp->tcp_state < TCPS_SYN_RCVD || 6955 tcp->tcp_state > TCPS_CLOSE_WAIT) { 6956 /* 6957 * Invalid state, only states TCPS_SYN_RCVD, 6958 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid 6959 */ 6960 return (-1); 6961 } 6962 6963 tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 6964 tcp->tcp_valid_bits |= TCP_FSS_VALID; 6965 /* 6966 * If there is nothing more unsent, send the FIN now. 6967 * Otherwise, it will go out with the last segment. 6968 */ 6969 if (tcp->tcp_unsent == 0) { 6970 mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 6971 tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 6972 6973 if (mp != NULL) { 6974 /* Dump the packet when debugging. */ 6975 TCP_DUMP_PACKET("tcp_xmit_end", mp); 6976 (void) ipv4_tcp_output(sock_id, mp); 6977 freeb(mp); 6978 } else { 6979 /* 6980 * Couldn't allocate msg. Pretend we got it out. 6981 * Wait for rexmit timeout. 6982 */ 6983 tcp->tcp_snxt = tcp->tcp_fss + 1; 6984 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 6985 } 6986 6987 /* 6988 * If needed, update tcp_rexmit_snxt as tcp_snxt is 6989 * changed. 6990 */ 6991 if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 6992 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 6993 } 6994 } else { 6995 tcp_wput_data(tcp, NULL, B_FALSE); 6996 } 6997 6998 return (0); 6999 } 7000 7001 int 7002 tcp_opt_set(tcp_t *tcp, int level, int option, const void *optval, 7003 socklen_t optlen) 7004 { 7005 switch (level) { 7006 case SOL_SOCKET: { 7007 switch (option) { 7008 case SO_RCVBUF: 7009 if (optlen == sizeof (int)) { 7010 int val = *(int *)optval; 7011 7012 if (val > tcp_max_buf) { 7013 errno = ENOBUFS; 7014 break; 7015 } 7016 /* Silently ignore zero */ 7017 if (val != 0) { 7018 val = MSS_ROUNDUP(val, tcp->tcp_mss); 7019 (void) tcp_rwnd_set(tcp, val); 7020 } 7021 } else { 7022 errno = EINVAL; 7023 } 7024 break; 7025 case SO_SNDBUF: 7026 if (optlen == sizeof (int)) { 7027 tcp->tcp_xmit_hiwater = *(int *)optval; 7028 if (tcp->tcp_xmit_hiwater > tcp_max_buf) 7029 tcp->tcp_xmit_hiwater = tcp_max_buf; 7030 } else { 7031 errno = EINVAL; 7032 } 7033 break; 7034 case SO_LINGER: 7035 if (optlen == sizeof (struct linger)) { 7036 struct linger *lgr = (struct linger *)optval; 7037 7038 if (lgr->l_onoff) { 7039 tcp->tcp_linger = 1; 7040 tcp->tcp_lingertime = lgr->l_linger; 7041 } else { 7042 tcp->tcp_linger = 0; 7043 tcp->tcp_lingertime = 0; 7044 } 7045 } else { 7046 errno = EINVAL; 7047 } 7048 break; 7049 default: 7050 errno = ENOPROTOOPT; 7051 break; 7052 } 7053 break; 7054 } /* case SOL_SOCKET */ 7055 case IPPROTO_TCP: { 7056 switch (option) { 7057 default: 7058 errno = ENOPROTOOPT; 7059 break; 7060 } 7061 break; 7062 } /* case IPPROTO_TCP */ 7063 case IPPROTO_IP: { 7064 switch (option) { 7065 default: 7066 errno = ENOPROTOOPT; 7067 break; 7068 } 7069 break; 7070 } /* case IPPROTO_IP */ 7071 default: 7072 errno = ENOPROTOOPT; 7073 break; 7074 } /* switch (level) */ 7075 7076 if (errno != 0) 7077 return (-1); 7078 else 7079 return (0); 7080 } 7081