/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * tcp.c, Code implementing the TCP protocol.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <socket_impl.h>
#include <socket_inet.h>
#include <sys/sysmacros.h>
#include <sys/promif.h>
#include <sys/socket.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <net/if_types.h>
#include <sys/salib.h>

#include "ipv4.h"
#include "ipv4_impl.h"
#include "mac.h"
#include "mac_impl.h"
#include "v4_sum_impl.h"
#include <sys/bootdebug.h>
#include "tcp_inet.h"
#include "tcp_sack.h"
#include <inet/common.h>
#include <inet/mib2.h>

/*
 * We need to redefine BUMP_MIB/UPDATE_MIB to not have DTrace probes.
 */
#undef BUMP_MIB
#define	BUMP_MIB(x)	(x)++

#undef UPDATE_MIB
#define	UPDATE_MIB(x, y)	x += y

/*
 * MIB-2 stuff for SNMP
 */
mib2_tcp_t	tcp_mib;	/* SNMP fixed size info */

/* The TCP mib does not include the following errors. */
static uint_t tcp_cksum_errors;
static uint_t tcp_drops;

/* Macros for timestamp comparisons */
#define	TSTMP_GEQ(a, b)	((int32_t)((a)-(b)) >= 0)
#define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)

/*
 * Parameters for TCP Initial Send Sequence number (ISS) generation.
 * The ISS is calculated by adding three components: a time component
 * which grows by 1 every 4096 nanoseconds (versus every 4 microseconds
 * suggested by RFC 793, page 27);
 * a per-connection component which grows by 125000 for every new connection;
 * and an "extra" component that grows by a random amount centered
 * approximately on 64000.  This causes the ISS generator to cycle every
 * 4.89 hours if no TCP connections are made, and faster if connections are
 * made.
 */
#define	ISS_INCR	250000
#define	ISS_NSEC_SHT	0

static uint32_t tcp_iss_incr_extra;	/* Incremented for each connection */

#define	TCP_XMIT_LOWATER	4096
#define	TCP_XMIT_HIWATER	49152
#define	TCP_RECV_LOWATER	2048
#define	TCP_RECV_HIWATER	49152

/*
 * PAWS needs a timer for 24 days.  This is the number of ms in 24 days.
 */
#define	PAWS_TIMEOUT	((uint32_t)(24*24*60*60*1000))
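/*
 * Worked out: 24 days x 24 hours x 60 minutes x 60 seconds x 1000 ms
 * = 2,073,600,000 ms, which still fits comfortably in a uint32_t.
 */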

/*
 * TCP options struct returned from tcp_parse_options.
 */
typedef struct tcp_opt_s {
	uint32_t	tcp_opt_mss;
	uint32_t	tcp_opt_wscale;
	uint32_t	tcp_opt_ts_val;
	uint32_t	tcp_opt_ts_ecr;
	tcp_t		*tcp;
} tcp_opt_t;

/*
 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
 */

#ifdef _BIG_ENDIAN
#define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
	(TCPOPT_TSTAMP << 8) | 10)
#else
#define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
	(TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif

/*
 * Flags returned from tcp_parse_options.
 */
#define	TCP_OPT_MSS_PRESENT	1
#define	TCP_OPT_WSCALE_PRESENT	2
#define	TCP_OPT_TSTAMP_PRESENT	4
#define	TCP_OPT_SACK_OK_PRESENT	8
#define	TCP_OPT_SACK_PRESENT	16

/* TCP option length */
#define	TCPOPT_NOP_LEN		1
#define	TCPOPT_MAXSEG_LEN	4
#define	TCPOPT_WS_LEN		3
#define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
#define	TCPOPT_TSTAMP_LEN	10
#define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
#define	TCPOPT_SACK_OK_LEN	2
#define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
#define	TCPOPT_REAL_SACK_LEN	4
#define	TCPOPT_MAX_SACK_LEN	36
#define	TCPOPT_HEADER_LEN	2

/* TCP cwnd burst factor. */
#define	TCP_CWND_INFINITE	65535
#define	TCP_CWND_SS		3
#define	TCP_CWND_NORMAL		5

/* Named Dispatch Parameter Management Structure */
typedef struct tcpparam_s {
	uint32_t	tcp_param_min;
	uint32_t	tcp_param_max;
	uint32_t	tcp_param_val;
	char		*tcp_param_name;
} tcpparam_t;

/* Max size IP datagram is 64k - 1 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (struct ip) + \
	sizeof (tcph_t)))

/* Max of the above */
#define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4

/* Largest TCP port number */
#define	TCP_MAX_PORT	(64 * 1024 - 1)

/* Round up the value to the nearest mss. */
#define	MSS_ROUNDUP(value, mss)	((((value) - 1) / (mss) + 1) * (mss))

#define	MS	1L
#define	SECONDS	(1000 * MS)
#define	MINUTES	(60 * SECONDS)
#define	HOURS	(60 * MINUTES)
#define	DAYS	(24 * HOURS)

/* All NDD params in the core TCP became static variables. */
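/*
 * All time-valued defaults below are therefore in milliseconds; e.g.
 * tcp_rexmit_interval_initial = 3 * SECONDS works out to 3000 ms.
 */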
static int	tcp_time_wait_interval = 1 * MINUTES;
static int	tcp_conn_req_max_q = 128;
static int	tcp_conn_req_max_q0 = 1024;
static int	tcp_conn_req_min = 1;
static int	tcp_conn_grace_period = 0 * SECONDS;
static int	tcp_cwnd_max_ = 1024 * 1024;
static int	tcp_smallest_nonpriv_port = 1024;
static int	tcp_ip_abort_cinterval = 3 * MINUTES;
static int	tcp_ip_abort_linterval = 3 * MINUTES;
static int	tcp_ip_abort_interval = 8 * MINUTES;
static int	tcp_ip_notify_cinterval = 10 * SECONDS;
static int	tcp_ip_notify_interval = 10 * SECONDS;
static int	tcp_ipv4_ttl = 64;
static int	tcp_mss_def_ipv4 = 536;
static int	tcp_mss_max_ipv4 = TCP_MSS_MAX_IPV4;
static int	tcp_mss_min = 108;
static int	tcp_naglim_def = (4*1024)-1;
static int	tcp_rexmit_interval_initial = 3 * SECONDS;
static int	tcp_rexmit_interval_max = 60 * SECONDS;
static int	tcp_rexmit_interval_min = 400 * MS;
static int	tcp_dupack_fast_retransmit = 3;
static int	tcp_smallest_anon_port = 32 * 1024;
static int	tcp_largest_anon_port = TCP_MAX_PORT;
static int	tcp_xmit_lowat = TCP_XMIT_LOWATER;
static int	tcp_recv_hiwat_minmss = 4;
static int	tcp_fin_wait_2_flush_interval = 1 * MINUTES;
static int	tcp_max_buf = 1024 * 1024;
static int	tcp_wscale_always = 1;
static int	tcp_tstamp_always = 1;
static int	tcp_tstamp_if_wscale = 1;
static int	tcp_rexmit_interval_extra = 0;
static int	tcp_slow_start_after_idle = 2;
static int	tcp_slow_start_initial = 2;
static int	tcp_sack_permitted = 2;
static int	tcp_ecn_permitted = 2;

/* Extra room to fit in headers. */
static uint_t	tcp_wroff_xtra;

/* Hint for next port to try. */
static in_port_t	tcp_next_port_to_try = 32*1024;

/*
 * Figure out the value of the window scale option.  Note that the rwnd is
 * ASSUMED to be rounded up to the nearest MSS before the calculation.
 * We cannot find the scale value and then do a round up of tcp_rwnd
 * because the scale value may not be correct after that.
 */
#define	SET_WS_VALUE(tcp) \
{ \
	int i; \
	uint32_t rwnd = (tcp)->tcp_rwnd; \
	for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; \
	    i++, rwnd >>= 1) \
		; \
	(tcp)->tcp_rcv_ws = i; \
}

/*
 * Set ECN capable transport (ECT) code point in IP header.
 *
 * Note that there are 2 ECT code points '01' and '10', which are called
 * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 * point ECT(0) for TCP as described in RFC 2481.
 */
#define	SET_ECT(tcp, iph) \
	if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
		/* We need to clear the code point first. */ \
		((struct ip *)(iph))->ip_tos &= 0xFC; \
		((struct ip *)(iph))->ip_tos |= IPH_ECN_ECT0; \
	}

/*
 * The format argument to pass to tcp_display().
 * DISP_PORT_ONLY means that the returned string has only port info.
 * DISP_ADDR_AND_PORT means that the returned string also contains the
 * remote and local IP address.
 */
#define	DISP_PORT_ONLY		1
#define	DISP_ADDR_AND_PORT	2

/*
 * TCP reassembly macros.  We hide starting and ending sequence numbers in
 * b_next and b_prev of messages on the reassembly queue.  The messages are
 * chained using b_cont.  These macros are used in tcp_reass() so we don't
 * have to see the ugly casts and assignments.
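 *
 * For example, a segment starting at sequence number 1000 carrying 100
 * bytes of payload sits on the queue with TCP_REASS_SEQ(mp) == 1000 and
 * TCP_REASS_END(mp) == 1100 (one past the last byte).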
 */
#define	TCP_REASS_SEQ(mp)		((uint32_t)((mp)->b_next))
#define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = (mblk_t *)(u))
#define	TCP_REASS_END(mp)		((uint32_t)((mp)->b_prev))
#define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = (mblk_t *)(u))

#define	TCP_TIMER_RESTART(tcp, intvl) \
	(tcp)->tcp_rto_timeout = prom_gettime() + intvl; \
	(tcp)->tcp_timer_running = B_TRUE;

static int	tcp_accept_comm(tcp_t *, tcp_t *, mblk_t *, uint_t);
static mblk_t	*tcp_ack_mp(tcp_t *);
static in_port_t	tcp_bindi(in_port_t, in_addr_t *, boolean_t, boolean_t);
static uint16_t	tcp_cksum(uint16_t *, uint32_t);
static void	tcp_clean_death(int, tcp_t *, int err);
static tcp_t	*tcp_conn_request(tcp_t *, mblk_t *mp, uint_t, uint_t);
static char	*tcp_display(tcp_t *, char *, char);
static int	tcp_drain_input(tcp_t *, int, int);
static void	tcp_drain_needed(int, tcp_t *);
static boolean_t	tcp_drop_q0(tcp_t *);
static mblk_t	*tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *);
static int	tcp_header_len(struct inetgram *);
static in_port_t	tcp_report_ports(uint16_t *, enum Ports);
static int	tcp_input(int);
static void	tcp_iss_init(tcp_t *);
static tcp_t	*tcp_lookup_ipv4(struct ip *, tcpha_t *, int, int *);
static tcp_t	*tcp_lookup_listener_ipv4(in_addr_t, in_port_t, int *);
static int	tcp_conn_check(tcp_t *);
static int	tcp_close(int);
static void	tcp_close_detached(tcp_t *);
static void	tcp_eager_cleanup(tcp_t *, boolean_t, int);
static void	tcp_eager_unlink(tcp_t *);
static void	tcp_free(tcp_t *);
static int	tcp_header_init_ipv4(tcp_t *);
static void	tcp_mss_set(tcp_t *, uint32_t);
static int	tcp_parse_options(tcph_t *, tcp_opt_t *);
static boolean_t	tcp_paws_check(tcp_t *, tcph_t *, tcp_opt_t *);
static void	tcp_process_options(tcp_t *, tcph_t *);
static int	tcp_random(void);
static void	tcp_random_init(void);
static mblk_t	*tcp_reass(tcp_t *, mblk_t *, uint32_t);
static void	tcp_reass_elim_overlap(tcp_t *, mblk_t *);
static void	tcp_rcv_drain(int sock_id, tcp_t *);
static void	tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t);
static void	tcp_rput_data(tcp_t *, mblk_t *, int);
static int	tcp_rwnd_set(tcp_t *, uint32_t);
static int32_t	tcp_sack_rxmit(tcp_t *, int);
static void	tcp_set_cksum(mblk_t *);
static void	tcp_set_rto(tcp_t *, int32_t);
static void	tcp_ss_rexmit(tcp_t *, int);
static int	tcp_state_wait(int, tcp_t *, int);
static void	tcp_timer(tcp_t *, int);
static void	tcp_time_wait_append(tcp_t *);
static void	tcp_time_wait_collector(void);
static void	tcp_time_wait_processing(tcp_t *, mblk_t *, uint32_t,
    uint32_t, int, tcph_t *, int sock_id);
static void	tcp_time_wait_remove(tcp_t *);
static in_port_t	tcp_update_next_port(in_port_t);
static int	tcp_verify_cksum(mblk_t *);
static void	tcp_wput_data(tcp_t *, mblk_t *, int);
static void	tcp_xmit_ctl(char *, tcp_t *, mblk_t *, uint32_t, uint32_t,
    int, uint_t, int);
static void	tcp_xmit_early_reset(char *, int, mblk_t *, uint32_t, uint32_t,
    int, uint_t);
static int	tcp_xmit_end(tcp_t *, int);
static void	tcp_xmit_listeners_reset(int, mblk_t *, uint_t);
static mblk_t	*tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *,
    mblk_t **, uint32_t, boolean_t, uint32_t *, boolean_t);
static int	tcp_init_values(tcp_t *, struct inetboot_socket *);

#if DEBUG > 1
#define	TCP_DUMP_PACKET(str, mp) \
{ \
	int	len = (mp)->b_wptr - (mp)->b_rptr; \
\
	printf("%s: dump TCP(%d): \n", (str), len); \
	hexdump((char *)(mp)->b_rptr, len); \
}
#else
#define	TCP_DUMP_PACKET(str, mp)
#endif

#ifdef DEBUG
#define	DEBUG_1(str, arg)		printf(str, (arg))
#define	DEBUG_2(str, arg1, arg2)	printf(str, (arg1), (arg2))
#define	DEBUG_3(str, arg1, arg2, arg3)	printf(str, (arg1), (arg2), (arg3))
#else
#define	DEBUG_1(str, arg)
#define	DEBUG_2(str, arg1, arg2)
#define	DEBUG_3(str, arg1, arg2, arg3)
#endif

/* Whether it is the first time TCP is used. */
static boolean_t tcp_initialized = B_FALSE;

/* TCP time wait list. */
static tcp_t *tcp_time_wait_head;
static tcp_t *tcp_time_wait_tail;
static uint32_t tcp_cum_timewait;
/* When the tcp_time_wait_collector is run. */
static uint32_t tcp_time_wait_runtime;

#define	TCP_RUN_TIME_WAIT_COLLECTOR() \
	if (prom_gettime() > tcp_time_wait_runtime) \
		tcp_time_wait_collector();

/*
 * Accept will return with an error if there is no connection coming in
 * after this (in ms).
 */
static int tcp_accept_timeout = 60000;

/*
 * Initialize the TCP-specific parts of a socket.
 */
void
tcp_socket_init(struct inetboot_socket *isp)
{
	/* Do some initializations. */
	if (!tcp_initialized) {
		tcp_random_init();
		/* Extra head room for the MAC layer address. */
		if ((tcp_wroff_xtra = mac_get_hdr_len()) & 0x3) {
			tcp_wroff_xtra = (tcp_wroff_xtra & ~0x3) + 0x4;
		}
		/* Schedule the first time wait cleanup time */
		tcp_time_wait_runtime = prom_gettime() +
		    tcp_time_wait_interval;
		tcp_initialized = B_TRUE;
	}
	TCP_RUN_TIME_WAIT_COLLECTOR();

	isp->proto = IPPROTO_TCP;
	isp->input[TRANSPORT_LVL] = tcp_input;
	/* Socket layer should call tcp_send() directly. */
	isp->output[TRANSPORT_LVL] = NULL;
	isp->close[TRANSPORT_LVL] = tcp_close;
	isp->headerlen[TRANSPORT_LVL] = tcp_header_len;
	isp->ports = tcp_report_ports;
	if ((isp->pcb = bkmem_alloc(sizeof (tcp_t))) == NULL) {
		errno = ENOBUFS;
		return;
	}
	if ((errno = tcp_init_values((tcp_t *)isp->pcb, isp)) != 0) {
		bkmem_free(isp->pcb, sizeof (tcp_t));
		return;
	}
	/*
	 * This is set last because this field is used to determine if
	 * a socket is in use or not.
	 */
	isp->type = INETBOOT_STREAM;
}

/*
 * Return the size of a TCP header including TCP options.
 */
static int
tcp_header_len(struct inetgram *igm)
{
	mblk_t *pkt;
	int ipvers;

	/* Just returns the standard TCP header without options */
	if (igm == NULL)
		return (sizeof (tcph_t));

	if ((pkt = igm->igm_mp) == NULL)
		return (0);

	ipvers = ((struct ip *)pkt->b_rptr)->ip_v;
	if (ipvers == IPV4_VERSION) {
		return (TCP_HDR_LENGTH((tcph_t *)(pkt->b_rptr +
		    IPH_HDR_LENGTH(pkt->b_rptr))));
	} else {
		dprintf("tcp_header_len: non-IPv4 packet.\n");
		return (0);
	}
}

/*
 * Return the requested port number in network order.
 */
static in_port_t
tcp_report_ports(uint16_t *tcphp, enum Ports request)
{
	if (request == SOURCE)
		return (*(uint16_t *)(((tcph_t *)tcphp)->th_lport));
	return (*(uint16_t *)(((tcph_t *)tcphp)->th_fport));
}

/*
 * Because inetboot is not interrupt driven, TCP can only poll.  This
 * means that there can be packets stuck in the NIC buffer waiting to
 * be processed.
 * Thus we need to drain them before, for example, sending
 * anything because an ACK may actually be stuck there.
 *
 * The timeout argument determines how long we should wait for draining.
 */
static int
tcp_drain_input(tcp_t *tcp, int sock_id, int timeout)
{
	struct inetgram *in_gram;
	struct inetgram *old_in_gram;
	int old_timeout;
	mblk_t *mp;
	int i;

	dprintf("tcp_drain_input(%d): %s\n", sock_id,
	    tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));

	/*
	 * Since the driver uses the in_timeout value in the socket
	 * structure to determine the timeout value, we need to save
	 * the original one so that we can restore that after draining.
	 */
	old_timeout = sockets[sock_id].in_timeout;
	sockets[sock_id].in_timeout = timeout;

	/*
	 * We do this because the input queue may have some user
	 * data already.
	 */
	old_in_gram = sockets[sock_id].inq;
	sockets[sock_id].inq = NULL;

	/* Go out and check the wire */
	for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) {
		if (sockets[sock_id].input[i] != NULL) {
			if (sockets[sock_id].input[i](sock_id) < 0) {
				sockets[sock_id].in_timeout = old_timeout;
				if (sockets[sock_id].inq != NULL)
					nuke_grams(&sockets[sock_id].inq);
				sockets[sock_id].inq = old_in_gram;
				return (-1);
			}
		}
	}
#if DEBUG
	printf("tcp_drain_input: done with checking packets\n");
#endif
	while ((in_gram = sockets[sock_id].inq) != NULL) {
		/* Remove unknown inetgrams from the head of inq. */
		if (in_gram->igm_level != TRANSPORT_LVL) {
#if DEBUG
			printf("tcp_drain_input: unexpected packet "
			    "level %d frame found\n", in_gram->igm_level);
#endif
			del_gram(&sockets[sock_id].inq, in_gram, B_TRUE);
			continue;
		}
		mp = in_gram->igm_mp;
		del_gram(&sockets[sock_id].inq, in_gram, B_FALSE);
		bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
		tcp_rput_data(tcp, mp, sock_id);
		sockets[sock_id].in_timeout = old_timeout;

		/*
		 * The other side may have closed this connection or
		 * RST us.  But we need to continue to process other
		 * packets in the socket's queue because they may
		 * belong to other TCP connections.
		 */
		if (sockets[sock_id].pcb == NULL)
			tcp = NULL;
	}

	if (tcp == NULL || sockets[sock_id].pcb == NULL) {
		if (sockets[sock_id].so_error != 0)
			return (-1);
		else
			return (0);
	}
#if DEBUG
	printf("tcp_drain_input: done with processing packets\n");
#endif
	sockets[sock_id].in_timeout = old_timeout;
	sockets[sock_id].inq = old_in_gram;

	/*
	 * Data may have been received so indicate it is available
	 */
	tcp_drain_needed(sock_id, tcp);
	return (0);
}

/*
 * The receive entry point for upper layer to call to get data.  Note
 * that this follows the current architecture that lower layer receive
 * routines have been called already.  Thus if the inq of socket is
 * not NULL, the packets must be for us.
 */
static int
tcp_input(int sock_id)
{
	struct inetgram *in_gram;
	mblk_t *mp;
	tcp_t *tcp;

	TCP_RUN_TIME_WAIT_COLLECTOR();

	if ((tcp = sockets[sock_id].pcb) == NULL)
		return (-1);

	while ((in_gram = sockets[sock_id].inq) != NULL) {
		/* Remove unknown inetgrams from the head of inq. */
		if (in_gram->igm_level != TRANSPORT_LVL) {
#ifdef DEBUG
			printf("tcp_input: unexpected packet "
			    "level %d frame found\n", in_gram->igm_level);
#endif
			del_gram(&sockets[sock_id].inq, in_gram, B_TRUE);
			continue;
		}
		mp = in_gram->igm_mp;
		del_gram(&sockets[sock_id].inq, in_gram, B_FALSE);
		bkmem_free((caddr_t)in_gram, sizeof (struct inetgram));
		tcp_rput_data(tcp, mp, sock_id);
		/* The TCP may be gone because it got a RST. */
		if (sockets[sock_id].pcb == NULL)
			return (-1);
	}

	/* Flush the receive list. */
	if (tcp->tcp_rcv_list != NULL) {
		tcp_rcv_drain(sock_id, tcp);
	} else {
		/* The other side has closed the connection, report this up. */
		if (tcp->tcp_state == TCPS_CLOSE_WAIT) {
			sockets[sock_id].so_state |= SS_CANTRCVMORE;
			return (0);
		}
	}
	return (0);
}

/*
 * The send entry point for upper layer to call to send data.  In order
 * to minimize changes to the core TCP code, we need to put the
 * data into mblks.
 */
int
tcp_send(int sock_id, tcp_t *tcp, const void *msg, int len)
{
	mblk_t *mp;
	mblk_t *head = NULL;
	mblk_t *tail;
	int mss = tcp->tcp_mss;
	int cnt = 0;
	int win_size;
	char *buf = (char *)msg;

	TCP_RUN_TIME_WAIT_COLLECTOR();

	/* We don't want to append a 0 size mblk. */
	if (len == 0)
		return (0);
	while (len > 0) {
		if (len < mss) {
			mss = len;
		}
		/*
		 * If we cannot allocate more buffer, stop here and
		 * the number of bytes buffered will be returned.
		 *
		 * Note that we follow the core TCP optimization that
		 * each mblk contains only MSS bytes of data.
		 */
		if ((mp = allocb(mss + tcp->tcp_ip_hdr_len +
		    TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0)) == NULL) {
			break;
		}
		mp->b_rptr += tcp->tcp_hdr_len + tcp_wroff_xtra;
		bcopy(buf, mp->b_rptr, mss);
		mp->b_wptr = mp->b_rptr + mss;
		buf += mss;
		cnt += mss;
		len -= mss;

		if (head == NULL) {
			head = mp;
			tail = mp;
		} else {
			tail->b_cont = mp;
			tail = mp;
		}
	}

	/*
	 * Since inetboot is not interrupt driven, there may be
	 * some ACKs in the MAC's buffer.  Drain them first,
	 * otherwise, we may not be able to send.
	 *
	 * We expect an ACK in two cases:
	 *
	 * 1) We have un-ACK'ed data.
	 *
	 * 2) All ACK's have been received and the sender's window has been
	 * closed.  We need an ACK back to open the window so that we can
	 * send.  In this case, call tcp_drain_input() if the window size is
	 * less than 2 * MSS.
	 */

	/* window size = MIN(swnd, cwnd) - unacked bytes */
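	/*
	 * Unacked bytes are tcp_snxt - tcp_suna, so the lines below
	 * compute win_size = MIN(tcp_swnd, tcp_cwnd) - (tcp_snxt - tcp_suna).
	 */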
	win_size = (tcp->tcp_swnd > tcp->tcp_cwnd) ? tcp->tcp_cwnd :
	    tcp->tcp_swnd;
	win_size -= tcp->tcp_snxt;
	win_size += tcp->tcp_suna;
	if (win_size < (2 * tcp->tcp_mss))
		if (tcp_drain_input(tcp, sock_id, 5) < 0)
			return (-1);

	tcp_wput_data(tcp, head, sock_id);
	return (cnt);
}

/* Free up all TCP related stuff */
static void
tcp_free(tcp_t *tcp)
{
	if (tcp->tcp_iphc != NULL) {
		bkmem_free((caddr_t)tcp->tcp_iphc, tcp->tcp_iphc_len);
		tcp->tcp_iphc = NULL;
	}
	if (tcp->tcp_xmit_head != NULL) {
		freemsg(tcp->tcp_xmit_head);
		tcp->tcp_xmit_head = NULL;
	}
	if (tcp->tcp_rcv_list != NULL) {
		freemsg(tcp->tcp_rcv_list);
		tcp->tcp_rcv_list = NULL;
	}
	if (tcp->tcp_reass_head != NULL) {
		freemsg(tcp->tcp_reass_head);
		tcp->tcp_reass_head = NULL;
	}
	if (tcp->tcp_sack_info != NULL) {
		bkmem_free((caddr_t)tcp->tcp_sack_info,
		    sizeof (tcp_sack_info_t));
		tcp->tcp_sack_info = NULL;
	}
}

static void
tcp_close_detached(tcp_t *tcp)
{
	if (tcp->tcp_listener != NULL)
		tcp_eager_unlink(tcp);
	tcp_free(tcp);
	bkmem_free((caddr_t)tcp, sizeof (tcp_t));
}

/*
 * If we are an eager connection hanging off a listener that hasn't
 * formally accepted the connection yet, get off its list and blow off
 * any data that we have accumulated.
 */
static void
tcp_eager_unlink(tcp_t *tcp)
{
	tcp_t *listener = tcp->tcp_listener;

	assert(listener != NULL);
	if (tcp->tcp_eager_next_q0 != NULL) {
		assert(tcp->tcp_eager_prev_q0 != NULL);

		/* Remove the eager tcp from q0 */
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		listener->tcp_conn_req_cnt_q0--;
	} else {
		tcp_t **tcpp = &listener->tcp_eager_next_q;
		tcp_t *prev = NULL;

		for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
			if (tcpp[0] == tcp) {
				if (listener->tcp_eager_last_q == tcp) {
					/*
					 * If we are unlinking the last
					 * element on the list, adjust
					 * tail pointer.  Set tail pointer
					 * to nil when list is empty.
					 */
					assert(tcp->tcp_eager_next_q == NULL);
					if (listener->tcp_eager_last_q ==
					    listener->tcp_eager_next_q) {
						listener->tcp_eager_last_q =
						    NULL;
					} else {
						/*
						 * We won't get here if there
						 * is only one eager in the
						 * list.
						 */
						assert(prev != NULL);
						listener->tcp_eager_last_q =
						    prev;
					}
				}
				tcpp[0] = tcp->tcp_eager_next_q;
				tcp->tcp_eager_next_q = NULL;
				tcp->tcp_eager_last_q = NULL;
				listener->tcp_conn_req_cnt_q--;
				break;
			}
			prev = tcpp[0];
		}
	}
	tcp->tcp_listener = NULL;
}

/*
 * Reset any eager connection hanging off this listener
 * and then reclaim its resources.
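 *
 * The q list holds fully established connections waiting in accept();
 * q0 holds embryonic (half-open) connections still in the handshake.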
781 */ 782 static void 783 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only, int sock_id) 784 { 785 tcp_t *eager; 786 787 if (!q0_only) { 788 /* First cleanup q */ 789 while ((eager = listener->tcp_eager_next_q) != NULL) { 790 assert(listener->tcp_eager_last_q != NULL); 791 tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 792 eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, 793 sock_id); 794 tcp_close_detached(eager); 795 } 796 assert(listener->tcp_eager_last_q == NULL); 797 } 798 /* Then cleanup q0 */ 799 while ((eager = listener->tcp_eager_next_q0) != listener) { 800 tcp_xmit_ctl("tcp_eager_cleanup, can't wait", 801 eager, NULL, eager->tcp_snxt, 0, TH_RST, 0, sock_id); 802 tcp_close_detached(eager); 803 } 804 } 805 806 /* 807 * To handle the shutdown request. Called from shutdown() 808 */ 809 int 810 tcp_shutdown(int sock_id) 811 { 812 tcp_t *tcp; 813 814 DEBUG_1("tcp_shutdown: sock_id %x\n", sock_id); 815 816 if ((tcp = sockets[sock_id].pcb) == NULL) { 817 return (-1); 818 } 819 820 /* 821 * Since inetboot is not interrupt driven, there may be 822 * some ACKs in the MAC's buffer. Drain them first, 823 * otherwise, we may not be able to send. 824 */ 825 if (tcp_drain_input(tcp, sock_id, 5) < 0) { 826 /* 827 * If we return now without freeing TCP, there will be 828 * a memory leak. 829 */ 830 if (sockets[sock_id].pcb != NULL) 831 tcp_clean_death(sock_id, tcp, 0); 832 return (-1); 833 } 834 835 DEBUG_1("tcp_shutdown: tcp_state %x\n", tcp->tcp_state); 836 switch (tcp->tcp_state) { 837 838 case TCPS_SYN_RCVD: 839 /* 840 * Shutdown during the connect 3-way handshake 841 */ 842 case TCPS_ESTABLISHED: 843 /* 844 * Transmit the FIN 845 * wait for the FIN to be ACKed, 846 * then remain in FIN_WAIT_2 847 */ 848 dprintf("tcp_shutdown: sending fin\n"); 849 if (tcp_xmit_end(tcp, sock_id) == 0 && 850 tcp_state_wait(sock_id, tcp, TCPS_FIN_WAIT_2) < 0) { 851 /* During the wait, TCP may be gone... */ 852 if (sockets[sock_id].pcb == NULL) 853 return (-1); 854 } 855 dprintf("tcp_shutdown: done\n"); 856 break; 857 858 default: 859 break; 860 861 } 862 return (0); 863 } 864 865 /* To handle closing of the socket */ 866 static int 867 tcp_close(int sock_id) 868 { 869 char *msg; 870 tcp_t *tcp; 871 int error = 0; 872 873 if ((tcp = sockets[sock_id].pcb) == NULL) { 874 return (-1); 875 } 876 877 TCP_RUN_TIME_WAIT_COLLECTOR(); 878 879 /* 880 * Since inetboot is not interrupt driven, there may be 881 * some ACKs in the MAC's buffer. Drain them first, 882 * otherwise, we may not be able to send. 883 */ 884 if (tcp_drain_input(tcp, sock_id, 5) < 0) { 885 /* 886 * If we return now without freeing TCP, there will be 887 * a memory leak. 888 */ 889 if (sockets[sock_id].pcb != NULL) 890 tcp_clean_death(sock_id, tcp, 0); 891 return (-1); 892 } 893 894 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 895 /* Cleanup for listener */ 896 tcp_eager_cleanup(tcp, 0, sock_id); 897 } 898 899 msg = NULL; 900 switch (tcp->tcp_state) { 901 case TCPS_CLOSED: 902 case TCPS_IDLE: 903 case TCPS_BOUND: 904 case TCPS_LISTEN: 905 break; 906 case TCPS_SYN_SENT: 907 msg = "tcp_close, during connect"; 908 break; 909 case TCPS_SYN_RCVD: 910 /* 911 * Close during the connect 3-way handshake 912 * but here there may or may not be pending data 913 * already on queue. Process almost same as in 914 * the ESTABLISHED state. 915 */ 916 /* FALLTHRU */ 917 default: 918 /* 919 * If SO_LINGER has set a zero linger time, abort the 920 * connection with a reset. 
		 */
		if (tcp->tcp_linger && tcp->tcp_lingertime == 0) {
			msg = "tcp_close, zero lingertime";
			break;
		}

		/*
		 * Abort connection if there is unread data queued.
		 */
		if (tcp->tcp_rcv_list != NULL ||
		    tcp->tcp_reass_head != NULL) {
			msg = "tcp_close, unread data";
			break;
		}
		if (tcp->tcp_state <= TCPS_LISTEN)
			break;

		/*
		 * Transmit the FIN before detaching the tcp_t.
		 * After tcp_detach returns this queue/perimeter
		 * no longer owns the tcp_t thus others can modify it.
		 * The TCP could be closed in tcp_state_wait called by
		 * tcp_wput_data called by tcp_xmit_end.
		 */
		(void) tcp_xmit_end(tcp, sock_id);
		if (sockets[sock_id].pcb == NULL)
			return (0);

		/*
		 * If lingering on close then wait until the fin is acked,
		 * the SO_LINGER time passes, or a reset is sent/received.
		 */
		if (tcp->tcp_linger && tcp->tcp_lingertime > 0 &&
		    !(tcp->tcp_fin_acked) &&
		    tcp->tcp_state >= TCPS_ESTABLISHED) {
			uint32_t stoptime;	/* in ms */

			tcp->tcp_client_errno = 0;
			stoptime = prom_gettime() +
			    (tcp->tcp_lingertime * 1000);
			while (!(tcp->tcp_fin_acked) &&
			    tcp->tcp_state >= TCPS_ESTABLISHED &&
			    tcp->tcp_client_errno == 0 &&
			    ((int32_t)(stoptime - prom_gettime()) > 0)) {
				if (tcp_drain_input(tcp, sock_id, 5) < 0) {
					if (sockets[sock_id].pcb != NULL) {
						tcp_clean_death(sock_id,
						    tcp, 0);
					}
					return (-1);
				}
			}
			tcp->tcp_client_errno = 0;
		}
		if (tcp_state_wait(sock_id, tcp, TCPS_TIME_WAIT) < 0) {
			/* During the wait, TCP may be gone... */
			if (sockets[sock_id].pcb == NULL)
				return (0);
			msg = "tcp_close, couldn't detach";
		} else {
			return (0);
		}
		break;
	}

	/* Something went wrong...  Send a RST and report the error */
	if (msg != NULL) {
		if (tcp->tcp_state == TCPS_ESTABLISHED ||
		    tcp->tcp_state == TCPS_CLOSE_WAIT)
			BUMP_MIB(tcp_mib.tcpEstabResets);
		if (tcp->tcp_state == TCPS_SYN_SENT ||
		    tcp->tcp_state == TCPS_SYN_RCVD)
			BUMP_MIB(tcp_mib.tcpAttemptFails);
		tcp_xmit_ctl(msg, tcp, NULL, tcp->tcp_snxt, 0, TH_RST, 0,
		    sock_id);
	}

	tcp_free(tcp);
	bkmem_free((caddr_t)tcp, sizeof (tcp_t));
	sockets[sock_id].pcb = NULL;
	return (error);
}

/* To make an endpoint a listener. */
int
tcp_listen(int sock_id, int backlog)
{
	tcp_t *tcp;

	if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) {
		errno = EINVAL;
		return (-1);
	}
	/* We allow calling listen() multiple times to change the backlog. */
	if (tcp->tcp_state > TCPS_LISTEN || tcp->tcp_state < TCPS_BOUND) {
		errno = EOPNOTSUPP;
		return (-1);
	}
	/* The following initialization should only be done once. */
	if (tcp->tcp_state != TCPS_LISTEN) {
		tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
		tcp->tcp_eager_next_q = NULL;
		tcp->tcp_state = TCPS_LISTEN;
		tcp->tcp_second_ctimer_threshold = tcp_ip_abort_linterval;
	}
	if ((tcp->tcp_conn_req_max = backlog) > tcp_conn_req_max_q) {
		tcp->tcp_conn_req_max = tcp_conn_req_max_q;
	}
	if (tcp->tcp_conn_req_max < tcp_conn_req_min) {
		tcp->tcp_conn_req_max = tcp_conn_req_min;
	}
	return (0);
}

/*
 * To accept connections.
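 * Since inetboot cannot block in the kernel, this polls the wire via
 * tcp_drain_input() until an eager appears on the listener's queue or
 * tcp_accept_timeout milliseconds have passed.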
 */
int
tcp_accept(int sock_id, struct sockaddr *addr, socklen_t *addr_len)
{
	tcp_t *listener;
	tcp_t *eager;
	int sd, new_sock_id;
	struct sockaddr_in *new_addr = (struct sockaddr_in *)addr;
	int timeout;

	/* Sanity check. */
	if ((listener = (tcp_t *)(sockets[sock_id].pcb)) == NULL ||
	    new_addr == NULL || addr_len == NULL ||
	    *addr_len < sizeof (struct sockaddr_in) ||
	    listener->tcp_state != TCPS_LISTEN) {
		errno = EINVAL;
		return (-1);
	}

	if (sockets[sock_id].in_timeout > tcp_accept_timeout)
		timeout = prom_gettime() + sockets[sock_id].in_timeout;
	else
		timeout = prom_gettime() + tcp_accept_timeout;
	while (listener->tcp_eager_next_q == NULL &&
	    timeout > prom_gettime()) {
#if DEBUG
		printf("tcp_accept: Waiting in tcp_accept()\n");
#endif
		if (tcp_drain_input(listener, sock_id, 5) < 0) {
			return (-1);
		}
	}
	/* If there is an eager, don't timeout... */
	if (timeout <= prom_gettime() && listener->tcp_eager_next_q == NULL) {
#if DEBUG
		printf("tcp_accept: timeout\n");
#endif
		errno = ETIMEDOUT;
		return (-1);
	}
#if DEBUG
	printf("tcp_accept: got a connection\n");
#endif

	/* Now create the socket for this new TCP. */
	if ((sd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
		return (-1);
	}
	if ((new_sock_id = so_check_fd(sd, &errno)) == -1)
		/* This should not happen! */
		prom_panic("so_check_fd() fails in tcp_accept()");
	/* Free the TCP PCB in the original socket. */
	bkmem_free((caddr_t)(sockets[new_sock_id].pcb), sizeof (tcp_t));
	/* Dequeue the eager and attach it to the socket. */
	eager = listener->tcp_eager_next_q;
	listener->tcp_eager_next_q = eager->tcp_eager_next_q;
	if (listener->tcp_eager_last_q == eager)
		listener->tcp_eager_last_q = NULL;
	eager->tcp_eager_next_q = NULL;
	sockets[new_sock_id].pcb = eager;
	listener->tcp_conn_req_cnt_q--;

	/* Copy in the address info. */
	bcopy(&eager->tcp_remote, &new_addr->sin_addr.s_addr,
	    sizeof (in_addr_t));
	bcopy(&eager->tcp_fport, &new_addr->sin_port, sizeof (in_port_t));
	new_addr->sin_family = AF_INET;

#ifdef DEBUG
	printf("tcp_accept(), new sock_id: %d\n", sd);
#endif
	return (sd);
}

/* Update the next anonymous port to use. */
static in_port_t
tcp_update_next_port(in_port_t port)
{
	/* Don't allow the port to fall out of the anonymous port range. */
	if (port < tcp_smallest_anon_port || port > tcp_largest_anon_port)
		port = (in_port_t)tcp_smallest_anon_port;

	if (port < tcp_smallest_nonpriv_port)
		port = (in_port_t)tcp_smallest_nonpriv_port;
	return (port);
}

/* To check whether a bind to a port is allowed. */
static in_port_t
tcp_bindi(in_port_t port, in_addr_t *addr, boolean_t reuseaddr,
    boolean_t bind_to_req_port_only)
{
	int i, count;
	tcp_t *tcp;

	count = tcp_largest_anon_port - tcp_smallest_anon_port;
try_again:
	for (i = 0; i < MAXSOCKET; i++) {
		if (sockets[i].type != INETBOOT_STREAM ||
		    ((tcp = (tcp_t *)sockets[i].pcb) == NULL) ||
		    ntohs(tcp->tcp_lport) != port) {
			continue;
		}
		/*
		 * Both TCPs have the same port.  If SO_REUSEADDR is
		 * set and the bound TCP has a state greater than
		 * TCPS_LISTEN, it is fine.
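		 * (Such a connection is fully specified by its 4-tuple,
		 * so reusing the local port cannot be ambiguous.)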
		 */
		if (reuseaddr && tcp->tcp_state > TCPS_LISTEN) {
			continue;
		}
		if (tcp->tcp_bound_source != INADDR_ANY &&
		    *addr != INADDR_ANY &&
		    tcp->tcp_bound_source != *addr) {
			continue;
		}
		if (bind_to_req_port_only) {
			return (0);
		}
		if (--count > 0) {
			port = tcp_update_next_port(++port);
			goto try_again;
		} else {
			return (0);
		}
	}
	return (port);
}

/* To handle the bind request. */
int
tcp_bind(int sock_id)
{
	tcp_t *tcp;
	in_port_t requested_port, allocated_port;
	boolean_t bind_to_req_port_only;
	boolean_t reuseaddr;

	if ((tcp = (tcp_t *)sockets[sock_id].pcb) == NULL) {
		errno = EINVAL;
		return (-1);
	}

	if (tcp->tcp_state >= TCPS_BOUND) {
		/* We don't allow multiple bind(). */
		errno = EPROTO;
		return (-1);
	}

	requested_port = ntohs(sockets[sock_id].bind.sin_port);

	/* The bound source can be INADDR_ANY. */
	tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr;

	tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source;

	/* Verify the port is available. */
	if (requested_port == 0)
		bind_to_req_port_only = B_FALSE;
	else	/* T_BIND_REQ and requested_port != 0 */
		bind_to_req_port_only = B_TRUE;

	if (requested_port == 0) {
		requested_port = tcp_update_next_port(++tcp_next_port_to_try);
	}
	reuseaddr = sockets[sock_id].so_opt & SO_REUSEADDR;
	allocated_port = tcp_bindi(requested_port, &(tcp->tcp_bound_source),
	    reuseaddr, bind_to_req_port_only);

	if (allocated_port == 0) {
		errno = EADDRINUSE;
		return (-1);
	}
	tcp->tcp_lport = htons(allocated_port);
	*(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
	sockets[sock_id].bind.sin_port = tcp->tcp_lport;
	tcp->tcp_state = TCPS_BOUND;
	return (0);
}

/*
 * Check for duplicate TCP connections.
 */
static int
tcp_conn_check(tcp_t *tcp)
{
	int i;
	tcp_t *tmp_tcp;

	for (i = 0; i < MAXSOCKET; i++) {
		if (sockets[i].type != INETBOOT_STREAM)
			continue;
		/* Socket may not be closed but the TCP can be gone. */
		if ((tmp_tcp = (tcp_t *)sockets[i].pcb) == NULL)
			continue;
		/* We only care about TCP in states later than SYN_SENT. */
		if (tmp_tcp->tcp_state < TCPS_SYN_SENT)
			continue;
		if (tmp_tcp->tcp_lport != tcp->tcp_lport ||
		    tmp_tcp->tcp_fport != tcp->tcp_fport ||
		    tmp_tcp->tcp_bound_source != tcp->tcp_bound_source ||
		    tmp_tcp->tcp_remote != tcp->tcp_remote) {
			continue;
		} else {
			return (-1);
		}
	}
	return (0);
}

/* To handle a connect request. */
int
tcp_connect(int sock_id)
{
	tcp_t *tcp;
	in_addr_t dstaddr;
	in_port_t dstport;
	tcph_t *tcph;
	int mss;
	mblk_t *syn_mp;

	if ((tcp = (tcp_t *)(sockets[sock_id].pcb)) == NULL) {
		errno = EINVAL;
		return (-1);
	}

	TCP_RUN_TIME_WAIT_COLLECTOR();

	dstaddr = sockets[sock_id].remote.sin_addr.s_addr;
	dstport = sockets[sock_id].remote.sin_port;

	/*
	 * Check for attempt to connect to INADDR_ANY or non-unicast address.
	 * We don't have enough info to check for broadcast addr, except
	 * for the all 1 broadcast.
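	 * ("All 1" is INADDR_BROADCAST, 255.255.255.255; subnet-directed
	 * broadcast addresses cannot be recognized without netmask info.)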
1270 */ 1271 if (dstaddr == INADDR_ANY || IN_CLASSD(ntohl(dstaddr)) || 1272 dstaddr == INADDR_BROADCAST) { 1273 /* 1274 * SunOS 4.x and 4.3 BSD allow an application 1275 * to connect a TCP socket to INADDR_ANY. 1276 * When they do this, the kernel picks the 1277 * address of one interface and uses it 1278 * instead. The kernel usually ends up 1279 * picking the address of the loopback 1280 * interface. This is an undocumented feature. 1281 * However, we provide the same thing here 1282 * in order to have source and binary 1283 * compatibility with SunOS 4.x. 1284 * Update the T_CONN_REQ (sin/sin6) since it is used to 1285 * generate the T_CONN_CON. 1286 * 1287 * Fail this for inetboot TCP. 1288 */ 1289 errno = EINVAL; 1290 return (-1); 1291 } 1292 1293 /* It is not bound to any address yet... */ 1294 if (tcp->tcp_bound_source == INADDR_ANY) { 1295 ipv4_getipaddr(&(sockets[sock_id].bind.sin_addr)); 1296 /* We don't have an address! */ 1297 if (ntohl(sockets[sock_id].bind.sin_addr.s_addr) == 1298 INADDR_ANY) { 1299 errno = EPROTO; 1300 return (-1); 1301 } 1302 tcp->tcp_bound_source = sockets[sock_id].bind.sin_addr.s_addr; 1303 tcp->tcp_ipha->ip_src.s_addr = tcp->tcp_bound_source; 1304 } 1305 1306 /* 1307 * Don't let an endpoint connect to itself. 1308 */ 1309 if (dstaddr == tcp->tcp_ipha->ip_src.s_addr && 1310 dstport == tcp->tcp_lport) { 1311 errno = EINVAL; 1312 return (-1); 1313 } 1314 1315 tcp->tcp_ipha->ip_dst.s_addr = dstaddr; 1316 tcp->tcp_remote = dstaddr; 1317 tcph = tcp->tcp_tcph; 1318 *(uint16_t *)tcph->th_fport = dstport; 1319 tcp->tcp_fport = dstport; 1320 1321 /* 1322 * Don't allow this connection to completely duplicate 1323 * an existing connection. 1324 */ 1325 if (tcp_conn_check(tcp) < 0) { 1326 errno = EADDRINUSE; 1327 return (-1); 1328 } 1329 1330 /* 1331 * Just make sure our rwnd is at 1332 * least tcp_recv_hiwat_mss * MSS 1333 * large, and round up to the nearest 1334 * MSS. 1335 * 1336 * We do the round up here because 1337 * we need to get the interface 1338 * MTU first before we can do the 1339 * round up. 1340 */ 1341 mss = tcp->tcp_mss - tcp->tcp_hdr_len; 1342 tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 1343 tcp_recv_hiwat_minmss * mss); 1344 tcp->tcp_rwnd_max = tcp->tcp_rwnd; 1345 SET_WS_VALUE(tcp); 1346 U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), 1347 tcp->tcp_tcph->th_win); 1348 if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always) 1349 tcp->tcp_snd_ws_ok = B_TRUE; 1350 1351 /* 1352 * Set tcp_snd_ts_ok to true 1353 * so that tcp_xmit_mp will 1354 * include the timestamp 1355 * option in the SYN segment. 1356 */ 1357 if (tcp_tstamp_always || 1358 (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) { 1359 tcp->tcp_snd_ts_ok = B_TRUE; 1360 } 1361 1362 if (tcp_sack_permitted == 2 || 1363 tcp->tcp_snd_sack_ok) { 1364 assert(tcp->tcp_sack_info == NULL); 1365 if ((tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc( 1366 sizeof (tcp_sack_info_t))) == NULL) { 1367 tcp->tcp_snd_sack_ok = B_FALSE; 1368 } else { 1369 tcp->tcp_snd_sack_ok = B_TRUE; 1370 } 1371 } 1372 /* 1373 * Should we use ECN? Note that the current 1374 * default value (SunOS 5.9) of tcp_ecn_permitted 1375 * is 2. The reason for doing this is that there 1376 * are equipments out there that will drop ECN 1377 * enabled IP packets. Setting it to 1 avoids 1378 * compatibility problems. 
1379 */ 1380 if (tcp_ecn_permitted == 2) 1381 tcp->tcp_ecn_ok = B_TRUE; 1382 1383 tcp_iss_init(tcp); 1384 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 1385 tcp->tcp_active_open = B_TRUE; 1386 1387 tcp->tcp_state = TCPS_SYN_SENT; 1388 syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, tcp->tcp_iss, B_FALSE, 1389 NULL, B_FALSE); 1390 if (syn_mp != NULL) { 1391 int ret; 1392 1393 /* Dump the packet when debugging. */ 1394 TCP_DUMP_PACKET("tcp_connect", syn_mp); 1395 /* Send out the SYN packet. */ 1396 ret = ipv4_tcp_output(sock_id, syn_mp); 1397 freeb(syn_mp); 1398 if (ret < 0) { 1399 return (-1); 1400 } 1401 /* tcp_state_wait() will finish the 3 way handshake. */ 1402 return (tcp_state_wait(sock_id, tcp, TCPS_ESTABLISHED)); 1403 } else { 1404 errno = ENOBUFS; 1405 return (-1); 1406 } 1407 } 1408 1409 /* 1410 * Common accept code. Called by tcp_conn_request. 1411 * cr_pkt is the SYN packet. 1412 */ 1413 static int 1414 tcp_accept_comm(tcp_t *listener, tcp_t *acceptor, mblk_t *cr_pkt, 1415 uint_t ip_hdr_len) 1416 { 1417 tcph_t *tcph; 1418 1419 #ifdef DEBUG 1420 printf("tcp_accept_comm #######################\n"); 1421 #endif 1422 1423 /* 1424 * When we get here, we know that the acceptor header template 1425 * has already been initialized. 1426 * However, it may not match the listener if the listener 1427 * includes options... 1428 * It may also not match the listener if the listener is v6 and 1429 * and the acceptor is v4 1430 */ 1431 acceptor->tcp_lport = listener->tcp_lport; 1432 1433 if (listener->tcp_ipversion == acceptor->tcp_ipversion) { 1434 if (acceptor->tcp_iphc_len != listener->tcp_iphc_len) { 1435 /* 1436 * Listener had options of some sort; acceptor inherits. 1437 * Free up the acceptor template and allocate one 1438 * of the right size. 1439 */ 1440 bkmem_free(acceptor->tcp_iphc, acceptor->tcp_iphc_len); 1441 acceptor->tcp_iphc = bkmem_zalloc( 1442 listener->tcp_iphc_len); 1443 if (acceptor->tcp_iphc == NULL) { 1444 acceptor->tcp_iphc_len = 0; 1445 return (ENOMEM); 1446 } 1447 acceptor->tcp_iphc_len = listener->tcp_iphc_len; 1448 } 1449 acceptor->tcp_hdr_len = listener->tcp_hdr_len; 1450 acceptor->tcp_ip_hdr_len = listener->tcp_ip_hdr_len; 1451 acceptor->tcp_tcp_hdr_len = listener->tcp_tcp_hdr_len; 1452 1453 /* 1454 * Copy the IP+TCP header template from listener to acceptor 1455 */ 1456 bcopy(listener->tcp_iphc, acceptor->tcp_iphc, 1457 listener->tcp_hdr_len); 1458 acceptor->tcp_ipha = (struct ip *)acceptor->tcp_iphc; 1459 acceptor->tcp_tcph = (tcph_t *)(acceptor->tcp_iphc + 1460 acceptor->tcp_ip_hdr_len); 1461 } else { 1462 prom_panic("tcp_accept_comm: version not equal"); 1463 } 1464 1465 /* Copy our new dest and fport from the connection request packet */ 1466 if (acceptor->tcp_ipversion == IPV4_VERSION) { 1467 struct ip *ipha; 1468 1469 ipha = (struct ip *)cr_pkt->b_rptr; 1470 acceptor->tcp_ipha->ip_dst = ipha->ip_src; 1471 acceptor->tcp_remote = ipha->ip_src.s_addr; 1472 acceptor->tcp_ipha->ip_src = ipha->ip_dst; 1473 acceptor->tcp_bound_source = ipha->ip_dst.s_addr; 1474 tcph = (tcph_t *)&cr_pkt->b_rptr[ip_hdr_len]; 1475 } else { 1476 prom_panic("tcp_accept_comm: not IPv4"); 1477 } 1478 bcopy(tcph->th_lport, acceptor->tcp_tcph->th_fport, sizeof (in_port_t)); 1479 bcopy(acceptor->tcp_tcph->th_fport, &acceptor->tcp_fport, 1480 sizeof (in_port_t)); 1481 /* 1482 * For an all-port proxy listener, the local port is determined by 1483 * the port number field in the SYN packet. 
1484 */ 1485 if (listener->tcp_lport == 0) { 1486 acceptor->tcp_lport = *(in_port_t *)tcph->th_fport; 1487 bcopy(tcph->th_fport, acceptor->tcp_tcph->th_lport, 1488 sizeof (in_port_t)); 1489 } 1490 /* Inherit various TCP parameters from the listener */ 1491 acceptor->tcp_naglim = listener->tcp_naglim; 1492 acceptor->tcp_first_timer_threshold = 1493 listener->tcp_first_timer_threshold; 1494 acceptor->tcp_second_timer_threshold = 1495 listener->tcp_second_timer_threshold; 1496 1497 acceptor->tcp_first_ctimer_threshold = 1498 listener->tcp_first_ctimer_threshold; 1499 acceptor->tcp_second_ctimer_threshold = 1500 listener->tcp_second_ctimer_threshold; 1501 1502 acceptor->tcp_xmit_hiwater = listener->tcp_xmit_hiwater; 1503 1504 acceptor->tcp_state = TCPS_LISTEN; 1505 tcp_iss_init(acceptor); 1506 1507 /* Process all TCP options. */ 1508 tcp_process_options(acceptor, tcph); 1509 1510 /* Is the other end ECN capable? */ 1511 if (tcp_ecn_permitted >= 1 && 1512 (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 1513 acceptor->tcp_ecn_ok = B_TRUE; 1514 } 1515 1516 /* 1517 * listener->tcp_rq->q_hiwat should be the default window size or a 1518 * window size changed via SO_RCVBUF option. First round up the 1519 * acceptor's tcp_rwnd to the nearest MSS. Then find out the window 1520 * scale option value if needed. Call tcp_rwnd_set() to finish the 1521 * setting. 1522 * 1523 * Note if there is a rpipe metric associated with the remote host, 1524 * we should not inherit receive window size from listener. 1525 */ 1526 acceptor->tcp_rwnd = MSS_ROUNDUP( 1527 (acceptor->tcp_rwnd == 0 ? listener->tcp_rwnd_max : 1528 acceptor->tcp_rwnd), acceptor->tcp_mss); 1529 if (acceptor->tcp_snd_ws_ok) 1530 SET_WS_VALUE(acceptor); 1531 /* 1532 * Note that this is the only place tcp_rwnd_set() is called for 1533 * accepting a connection. We need to call it here instead of 1534 * after the 3-way handshake because we need to tell the other 1535 * side our rwnd in the SYN-ACK segment. 1536 */ 1537 (void) tcp_rwnd_set(acceptor, acceptor->tcp_rwnd); 1538 1539 return (0); 1540 } 1541 1542 /* 1543 * Defense for the SYN attack - 1544 * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest 1545 * one that doesn't have the dontdrop bit set. 1546 * 2. Don't drop a SYN request before its first timeout. This gives every 1547 * request at least til the first timeout to complete its 3-way handshake. 1548 * 3. 
 * 3. The current threshold is - # of timeouts > q0len/4 => SYN alert on;
 *    # of timeouts drops back to <= q0len/32 => SYN alert off.
 */
static boolean_t
tcp_drop_q0(tcp_t *tcp)
{
	tcp_t *eager;

	assert(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
	/*
	 * New one is added after next_q0 so prev_q0 points to the oldest.
	 * Also do not drop any established connections that are deferred
	 * on q0 due to q being full.
	 */

	eager = tcp->tcp_eager_prev_q0;
	while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) {
		/* XXX should move the eager to the head */
		eager = eager->tcp_eager_prev_q0;
		if (eager == tcp) {
			eager = tcp->tcp_eager_prev_q0;
			break;
		}
	}
	dprintf("tcp_drop_q0: listen half-open queue (max=%d) overflow"
	    " (%d pending) on %s, drop one", tcp_conn_req_max_q0,
	    tcp->tcp_conn_req_cnt_q0,
	    tcp_display(tcp, NULL, DISP_PORT_ONLY));

	BUMP_MIB(tcp_mib.tcpHalfOpenDrop);
	bkmem_free((caddr_t)eager, sizeof (tcp_t));
	return (B_TRUE);
}

/* ARGSUSED */
static tcp_t *
tcp_conn_request(tcp_t *tcp, mblk_t *mp, uint_t sock_id, uint_t ip_hdr_len)
{
	tcp_t *eager;
	struct ip *ipha;
	int err;

#ifdef DEBUG
	printf("tcp_conn_request ###################\n");
#endif

	if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) {
		BUMP_MIB(tcp_mib.tcpListenDrop);
		dprintf("tcp_conn_request: listen backlog (max=%d) "
		    "overflow (%d pending) on %s",
		    tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q,
		    tcp_display(tcp, NULL, DISP_PORT_ONLY));
		return (NULL);
	}

	assert(OK_32PTR(mp->b_rptr));

	if (tcp->tcp_conn_req_cnt_q0 >=
	    tcp->tcp_conn_req_max + tcp_conn_req_max_q0) {
		/*
		 * Q0 is full.  Drop a pending half-open req from the queue
		 * to make room for the new SYN req.  Also mark the time we
		 * drop a SYN.
		 */
		tcp->tcp_last_rcv_lbolt = prom_gettime();
		if (!tcp_drop_q0(tcp)) {
			freemsg(mp);
			BUMP_MIB(tcp_mib.tcpListenDropQ0);
			dprintf("tcp_conn_request: listen half-open queue "
			    "(max=%d) full (%d pending) on %s",
			    tcp_conn_req_max_q0,
			    tcp->tcp_conn_req_cnt_q0,
			    tcp_display(tcp, NULL, DISP_PORT_ONLY));
			return (NULL);
		}
	}

	ipha = (struct ip *)mp->b_rptr;
	if (IN_CLASSD(ntohl(ipha->ip_src.s_addr)) ||
	    ipha->ip_src.s_addr == INADDR_BROADCAST ||
	    ipha->ip_src.s_addr == INADDR_ANY ||
	    ipha->ip_dst.s_addr == INADDR_BROADCAST) {
		freemsg(mp);
		return (NULL);
	}
	/*
	 * We allow the connection to proceed by generating a detached
	 * tcp state vector and putting it in the eager queue.  When an
	 * accept happens, it will be dequeued sequentially.
	 */
	if ((eager = (tcp_t *)bkmem_alloc(sizeof (tcp_t))) == NULL) {
		freemsg(mp);
		errno = ENOBUFS;
		return (NULL);
	}
	if ((errno = tcp_init_values(eager, NULL)) != 0) {
		freemsg(mp);
		bkmem_free((caddr_t)eager, sizeof (tcp_t));
		return (NULL);
	}

	/*
	 * Eager connection inherits address form from its listener,
	 * but its packet form comes from the version of the received
	 * SYN segment.
1654 */ 1655 eager->tcp_family = tcp->tcp_family; 1656 1657 err = tcp_accept_comm(tcp, eager, mp, ip_hdr_len); 1658 if (err) { 1659 bkmem_free((caddr_t)eager, sizeof (tcp_t)); 1660 return (NULL); 1661 } 1662 1663 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 1664 eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 1665 tcp->tcp_eager_next_q0 = eager; 1666 eager->tcp_eager_prev_q0 = tcp; 1667 1668 /* Set tcp_listener before adding it to tcp_conn_fanout */ 1669 eager->tcp_listener = tcp; 1670 tcp->tcp_conn_req_cnt_q0++; 1671 1672 return (eager); 1673 } 1674 1675 /* 1676 * To get around the non-interrupt problem of inetboot. 1677 * Keep on processing packets until a certain state is reached or the 1678 * TCP is destroyed because of getting a RST packet. 1679 */ 1680 static int 1681 tcp_state_wait(int sock_id, tcp_t *tcp, int state) 1682 { 1683 int i; 1684 struct inetgram *in_gram; 1685 mblk_t *mp; 1686 int timeout; 1687 boolean_t changed = B_FALSE; 1688 1689 /* 1690 * We need to make sure that the MAC does not wait longer 1691 * than RTO for any packet so that TCP can do retransmission. 1692 * But if the MAC timeout is less than tcp_rto, we are fine 1693 * and do not need to change it. 1694 */ 1695 timeout = sockets[sock_id].in_timeout; 1696 if (timeout > tcp->tcp_rto) { 1697 sockets[sock_id].in_timeout = tcp->tcp_rto; 1698 changed = B_TRUE; 1699 } 1700 retry: 1701 if (sockets[sock_id].inq == NULL) { 1702 /* Go out and check the wire */ 1703 for (i = MEDIA_LVL; i < TRANSPORT_LVL; i++) { 1704 if (sockets[sock_id].input[i] != NULL) { 1705 if (sockets[sock_id].input[i](sock_id) < 0) { 1706 if (changed) { 1707 sockets[sock_id].in_timeout = 1708 timeout; 1709 } 1710 return (-1); 1711 } 1712 } 1713 } 1714 } 1715 1716 while ((in_gram = sockets[sock_id].inq) != NULL) { 1717 if (tcp != NULL && tcp->tcp_state == state) 1718 break; 1719 1720 /* Remove unknown inetgrams from the head of inq. */ 1721 if (in_gram->igm_level != TRANSPORT_LVL) { 1722 #ifdef DEBUG 1723 printf("tcp_state_wait for state %d: unexpected " 1724 "packet level %d frame found\n", state, 1725 in_gram->igm_level); 1726 #endif 1727 del_gram(&sockets[sock_id].inq, in_gram, B_TRUE); 1728 continue; 1729 } 1730 mp = in_gram->igm_mp; 1731 del_gram(&sockets[sock_id].inq, in_gram, B_FALSE); 1732 bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 1733 tcp_rput_data(tcp, mp, sock_id); 1734 1735 /* 1736 * The other side may have closed this connection or 1737 * RST us. But we need to continue to process other 1738 * packets in the socket's queue because they may be 1739 * belong to another TCP connections. 1740 */ 1741 if (sockets[sock_id].pcb == NULL) { 1742 tcp = NULL; 1743 } 1744 } 1745 1746 /* If the other side has closed the connection, just return. */ 1747 if (tcp == NULL || sockets[sock_id].pcb == NULL) { 1748 #ifdef DEBUG 1749 printf("tcp_state_wait other side dead: state %d " 1750 "error %d\n", state, sockets[sock_id].so_error); 1751 #endif 1752 if (sockets[sock_id].so_error != 0) 1753 return (-1); 1754 else 1755 return (0); 1756 } 1757 /* 1758 * TCPS_ALL_ACKED is not a valid TCP state, it is just used as an 1759 * indicator to tcp_state_wait to mean that it is being called 1760 * to wait till we have received acks for all the new segments sent. 
1761 */ 1762 if ((state == TCPS_ALL_ACKED) && (tcp->tcp_suna == tcp->tcp_snxt)) { 1763 goto done; 1764 } 1765 if (tcp->tcp_state != state) { 1766 if (prom_gettime() > tcp->tcp_rto_timeout) 1767 tcp_timer(tcp, sock_id); 1768 goto retry; 1769 } 1770 done: 1771 if (changed) 1772 sockets[sock_id].in_timeout = timeout; 1773 1774 tcp_drain_needed(sock_id, tcp); 1775 return (0); 1776 } 1777 1778 /* Verify the checksum of a segment. */ 1779 static int 1780 tcp_verify_cksum(mblk_t *mp) 1781 { 1782 struct ip *iph; 1783 tcpha_t *tcph; 1784 int len; 1785 uint16_t old_sum; 1786 1787 iph = (struct ip *)mp->b_rptr; 1788 tcph = (tcpha_t *)(iph + 1); 1789 len = ntohs(iph->ip_len); 1790 1791 /* 1792 * Calculate the TCP checksum. Need to include the psuedo header, 1793 * which is similar to the real IP header starting at the TTL field. 1794 */ 1795 iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH); 1796 old_sum = tcph->tha_sum; 1797 tcph->tha_sum = 0; 1798 iph->ip_ttl = 0; 1799 if (old_sum == tcp_cksum((uint16_t *)&(iph->ip_ttl), 1800 len - IP_SIMPLE_HDR_LENGTH + 12)) { 1801 return (0); 1802 } else { 1803 tcp_cksum_errors++; 1804 return (-1); 1805 } 1806 } 1807 1808 /* To find a TCP connection matching the incoming segment. */ 1809 static tcp_t * 1810 tcp_lookup_ipv4(struct ip *iph, tcpha_t *tcph, int min_state, int *sock_id) 1811 { 1812 int i; 1813 tcp_t *tcp; 1814 1815 for (i = 0; i < MAXSOCKET; i++) { 1816 if (sockets[i].type == INETBOOT_STREAM && 1817 (tcp = (tcp_t *)sockets[i].pcb) != NULL) { 1818 if (tcph->tha_lport == tcp->tcp_fport && 1819 tcph->tha_fport == tcp->tcp_lport && 1820 iph->ip_src.s_addr == tcp->tcp_remote && 1821 iph->ip_dst.s_addr == tcp->tcp_bound_source && 1822 tcp->tcp_state >= min_state) { 1823 *sock_id = i; 1824 return (tcp); 1825 } 1826 } 1827 } 1828 /* Find it in the time wait list. */ 1829 for (tcp = tcp_time_wait_head; tcp != NULL; 1830 tcp = tcp->tcp_time_wait_next) { 1831 if (tcph->tha_lport == tcp->tcp_fport && 1832 tcph->tha_fport == tcp->tcp_lport && 1833 iph->ip_src.s_addr == tcp->tcp_remote && 1834 iph->ip_dst.s_addr == tcp->tcp_bound_source && 1835 tcp->tcp_state >= min_state) { 1836 *sock_id = -1; 1837 return (tcp); 1838 } 1839 } 1840 return (NULL); 1841 } 1842 1843 /* To find a TCP listening connection matching the incoming segment. */ 1844 static tcp_t * 1845 tcp_lookup_listener_ipv4(in_addr_t addr, in_port_t port, int *sock_id) 1846 { 1847 int i; 1848 tcp_t *tcp; 1849 1850 for (i = 0; i < MAXSOCKET; i++) { 1851 if (sockets[i].type == INETBOOT_STREAM && 1852 (tcp = (tcp_t *)sockets[i].pcb) != NULL) { 1853 if (tcp->tcp_lport == port && 1854 (tcp->tcp_bound_source == addr || 1855 tcp->tcp_bound_source == INADDR_ANY)) { 1856 *sock_id = i; 1857 return (tcp); 1858 } 1859 } 1860 } 1861 1862 return (NULL); 1863 } 1864 1865 /* To find a TCP eager matching the incoming segment. 
 */
static tcp_t *
tcp_lookup_eager_ipv4(tcp_t *listener, struct ip *iph, tcpha_t *tcph)
{
	tcp_t *tcp;

#ifdef DEBUG
	printf("tcp_lookup_eager_ipv4 ###############\n");
#endif
	for (tcp = listener->tcp_eager_next_q; tcp != NULL;
	    tcp = tcp->tcp_eager_next_q) {
		if (tcph->tha_lport == tcp->tcp_fport &&
		    tcph->tha_fport == tcp->tcp_lport &&
		    iph->ip_src.s_addr == tcp->tcp_remote &&
		    iph->ip_dst.s_addr == tcp->tcp_bound_source) {
			return (tcp);
		}
	}

	for (tcp = listener->tcp_eager_next_q0; tcp != listener;
	    tcp = tcp->tcp_eager_next_q0) {
		if (tcph->tha_lport == tcp->tcp_fport &&
		    tcph->tha_fport == tcp->tcp_lport &&
		    iph->ip_src.s_addr == tcp->tcp_remote &&
		    iph->ip_dst.s_addr == tcp->tcp_bound_source) {
			return (tcp);
		}
	}
#ifdef DEBUG
	printf("No eager found\n");
#endif
	return (NULL);
}

/* To destroy a TCP control block. */
static void
tcp_clean_death(int sock_id, tcp_t *tcp, int err)
{
	tcp_free(tcp);
	if (tcp->tcp_state == TCPS_TIME_WAIT)
		tcp_time_wait_remove(tcp);

	if (sock_id >= 0) {
		sockets[sock_id].pcb = NULL;
		if (err != 0)
			sockets[sock_id].so_error = err;
	}
	bkmem_free((caddr_t)tcp, sizeof (tcp_t));
}

/*
 * tcp_rwnd_set() is called to adjust the receive window to a desired value.
 * We do not allow the receive window to shrink.  After setting rwnd,
 * set the flow control hiwat of the stream.
 *
 * This function is called in 2 cases:
 *
 * 1) Before data transfer begins, in tcp_accept_comm() for accepting a
 *    connection (passive open) and in tcp_rput_data() for active connect.
 *    This is called after tcp_mss_set() when the desired MSS value is known.
 *    This makes sure that our window size is a multiple of the other side's
 *    MSS.
 * 2) Handling SO_RCVBUF option.
 *
 * It is ASSUMED that the requested size is a multiple of the current MSS.
 *
 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the
 * user requests so.
 */
static int
tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
{
	uint32_t mss = tcp->tcp_mss;
	uint32_t old_max_rwnd;
	uint32_t max_transmittable_rwnd;

	if (tcp->tcp_rwnd_max != 0)
		old_max_rwnd = tcp->tcp_rwnd_max;
	else
		old_max_rwnd = tcp->tcp_rwnd;

	/*
	 * Insist on a receive window that is at least
	 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid
	 * funny TCP interactions of Nagle algorithm, SWS avoidance
	 * and delayed acknowledgement.
	 */
	rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss);

	/*
	 * If window size info has already been exchanged, TCP should not
	 * shrink the window.  Shrinking window is doable if done carefully.
	 * We may add that support later.  But so far there is not a real
	 * need to do that.
	 */
	if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) {
		/* MSS may have changed, do a round up again. */
		rwnd = MSS_ROUNDUP(old_max_rwnd, mss);
	}

	/*
	 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check
	 * can be applied even before the window scale option is decided.
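	 *
	 * Illustrative arithmetic (hypothetical values): with tcp_rcv_ws
	 * == 2 and mss == 1460, max_transmittable_rwnd is 65535 << 2 ==
	 * 262140; a requested rwnd of 300000 exceeds that and is cut back
	 * below to 262140 - (262140 % 1460) == 261340.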
1968 */ 1969 max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws; 1970 if (rwnd > max_transmittable_rwnd) { 1971 rwnd = max_transmittable_rwnd - 1972 (max_transmittable_rwnd % mss); 1973 if (rwnd < mss) 1974 rwnd = max_transmittable_rwnd; 1975 /* 1976 * If we're over the limit we may have to back down tcp_rwnd. 1977 * The increment below won't work for us. So we set all three 1978 * here and the increment below will have no effect. 1979 */ 1980 tcp->tcp_rwnd = old_max_rwnd = rwnd; 1981 } 1982 1983 /* 1984 * Increment the current rwnd by the amount the maximum grew (we 1985 * can not overwrite it since we might be in the middle of a 1986 * connection.) 1987 */ 1988 tcp->tcp_rwnd += rwnd - old_max_rwnd; 1989 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 1990 if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 1991 tcp->tcp_cwnd_max = rwnd; 1992 tcp->tcp_rwnd_max = rwnd; 1993 1994 return (rwnd); 1995 } 1996 1997 /* 1998 * Extract option values from a tcp header. We put any found values into the 1999 * tcpopt struct and return a bitmask saying which options were found. 2000 */ 2001 static int 2002 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) 2003 { 2004 uchar_t *endp; 2005 int len; 2006 uint32_t mss; 2007 uchar_t *up = (uchar_t *)tcph; 2008 int found = 0; 2009 int32_t sack_len; 2010 tcp_seq sack_begin, sack_end; 2011 tcp_t *tcp; 2012 2013 endp = up + TCP_HDR_LENGTH(tcph); 2014 up += TCP_MIN_HEADER_LENGTH; 2015 while (up < endp) { 2016 len = endp - up; 2017 switch (*up) { 2018 case TCPOPT_EOL: 2019 break; 2020 2021 case TCPOPT_NOP: 2022 up++; 2023 continue; 2024 2025 case TCPOPT_MAXSEG: 2026 if (len < TCPOPT_MAXSEG_LEN || 2027 up[1] != TCPOPT_MAXSEG_LEN) 2028 break; 2029 2030 mss = BE16_TO_U16(up+2); 2031 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 2032 tcpopt->tcp_opt_mss = mss; 2033 found |= TCP_OPT_MSS_PRESENT; 2034 2035 up += TCPOPT_MAXSEG_LEN; 2036 continue; 2037 2038 case TCPOPT_WSCALE: 2039 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 2040 break; 2041 2042 if (up[2] > TCP_MAX_WINSHIFT) 2043 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 2044 else 2045 tcpopt->tcp_opt_wscale = up[2]; 2046 found |= TCP_OPT_WSCALE_PRESENT; 2047 2048 up += TCPOPT_WS_LEN; 2049 continue; 2050 2051 case TCPOPT_SACK_PERMITTED: 2052 if (len < TCPOPT_SACK_OK_LEN || 2053 up[1] != TCPOPT_SACK_OK_LEN) 2054 break; 2055 found |= TCP_OPT_SACK_OK_PRESENT; 2056 up += TCPOPT_SACK_OK_LEN; 2057 continue; 2058 2059 case TCPOPT_SACK: 2060 if (len <= 2 || up[1] <= 2 || len < up[1]) 2061 break; 2062 2063 /* If TCP is not interested in SACK blks... */ 2064 if ((tcp = tcpopt->tcp) == NULL) { 2065 up += up[1]; 2066 continue; 2067 } 2068 sack_len = up[1] - TCPOPT_HEADER_LEN; 2069 up += TCPOPT_HEADER_LEN; 2070 2071 /* 2072 * If the list is empty, allocate one and assume 2073 * nothing is sack'ed. 2074 */ 2075 assert(tcp->tcp_sack_info != NULL); 2076 if (tcp->tcp_notsack_list == NULL) { 2077 tcp_notsack_update(&(tcp->tcp_notsack_list), 2078 tcp->tcp_suna, tcp->tcp_snxt, 2079 &(tcp->tcp_num_notsack_blk), 2080 &(tcp->tcp_cnt_notsack_list)); 2081 2082 /* 2083 * Make sure tcp_notsack_list is not NULL. 2084 * This happens when kmem_alloc(KM_NOSLEEP) 2085 * returns NULL. 
				 */
				if (tcp->tcp_notsack_list == NULL) {
					up += sack_len;
					continue;
				}
				tcp->tcp_fack = tcp->tcp_suna;
			}

			while (sack_len > 0) {
				if (up + 8 > endp) {
					up = endp;
					break;
				}
				sack_begin = BE32_TO_U32(up);
				up += 4;
				sack_end = BE32_TO_U32(up);
				up += 4;
				sack_len -= 8;
				/*
				 * Bounds checking.  Make sure the SACK
				 * info is within tcp_suna and tcp_snxt.
				 * If this SACK blk is out of bound, ignore
				 * it but continue to parse the following
				 * blks.
				 */
				if (SEQ_LEQ(sack_end, sack_begin) ||
				    SEQ_LT(sack_begin, tcp->tcp_suna) ||
				    SEQ_GT(sack_end, tcp->tcp_snxt)) {
					continue;
				}
				tcp_notsack_insert(&(tcp->tcp_notsack_list),
				    sack_begin, sack_end,
				    &(tcp->tcp_num_notsack_blk),
				    &(tcp->tcp_cnt_notsack_list));
				if (SEQ_GT(sack_end, tcp->tcp_fack)) {
					tcp->tcp_fack = sack_end;
				}
			}
			found |= TCP_OPT_SACK_PRESENT;
			continue;

		case TCPOPT_TSTAMP:
			if (len < TCPOPT_TSTAMP_LEN ||
			    up[1] != TCPOPT_TSTAMP_LEN)
				break;

			tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
			tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);

			found |= TCP_OPT_TSTAMP_PRESENT;

			up += TCPOPT_TSTAMP_LEN;
			continue;

		default:
			if (len <= 1 || len < (int)up[1] || up[1] == 0)
				break;
			up += up[1];
			continue;
		}
		break;
	}
	return (found);
}

/*
 * Set the mss associated with a particular tcp based on its current value,
 * and a new one passed in.  Observe minimums and maximums, and reset
 * other state variables that we want to view as multiples of mss.
 *
 * This function is called in various places, mainly because:
 * 1) Various things, e.g. tcp_mss, tcp_cwnd, ..., need to be adjusted when
 *    the other side's SYN/SYN-ACK packet arrives.
 * 2) PMTUd may get us a new MSS.
 * 3) If the other side stops sending us timestamp option, we need to
 *    increase the MSS size to use the extra bytes available.
 */
static void
tcp_mss_set(tcp_t *tcp, uint32_t mss)
{
	uint32_t mss_max;

	mss_max = tcp_mss_max_ipv4;

	if (mss < tcp_mss_min)
		mss = tcp_mss_min;
	if (mss > mss_max)
		mss = mss_max;
	/*
	 * Unless naglim has been set by our client to
	 * a non-mss value, force naglim to track mss.
	 * This can help to aggregate small writes.
	 */
	if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
		tcp->tcp_naglim = mss;
	/*
	 * TCP should be able to buffer at least 4 MSS data for obvious
	 * performance reasons.
	 */
	if ((mss << 2) > tcp->tcp_xmit_hiwater)
		tcp->tcp_xmit_hiwater = mss << 2;
	tcp->tcp_mss = mss;
	/*
	 * Initialize cwnd according to draft-floyd-incr-init-win-01.txt.
	 * Previously, we used tcp_slow_start_initial to control the size
	 * of the initial cwnd.  Now, when tcp_slow_start_initial * mss
	 * is smaller than the cwnd calculated from the formula suggested in
	 * the draft, we use tcp_slow_start_initial * mss as the cwnd.
	 * Otherwise, use the cwnd from the draft's formula.  The default
	 * of tcp_slow_start_initial is 2.
	 */
	tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss,
	    MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss)));
	tcp->tcp_cwnd_cnt = 0;
}

/*
 * Process all TCP options in the SYN segment.
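 *
 * A worked example with illustrative numbers: a peer that advertises
 * MSS 1460 while we include the 12-byte timestamp option ends up with
 * tcp_mss 1448, because the real-MSS computation below subtracts our
 * header bytes in excess of the minimum TCP/IP header.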
 *
 * This function sets up the correct tcp_mss value according to the
 * MSS option value and our header size.  It also sets up the window scale
 * and timestamp values, and initializes SACK info blocks.  But it does not
 * change the receive window size after setting the tcp_mss value.  The
 * caller should do the appropriate change.
 */
void
tcp_process_options(tcp_t *tcp, tcph_t *tcph)
{
	int options;
	tcp_opt_t tcpopt;
	uint32_t mss_max = tcp_mss_max_ipv4;	/* inetboot is IPv4 only */
	char *tmp_tcph;

	tcpopt.tcp = NULL;
	options = tcp_parse_options(tcph, &tcpopt);

	/*
	 * Process MSS option.  Note that MSS option value does not account
	 * for IP or TCP options.  This means that it is equal to MTU -
	 * minimum IP+TCP header size, which is 40 bytes for IPv4 and 60
	 * bytes for IPv6.
	 */
	if (!(options & TCP_OPT_MSS_PRESENT)) {
		tcpopt.tcp_opt_mss = tcp_mss_def_ipv4;
	} else {
		if (tcpopt.tcp_opt_mss < tcp_mss_min)
			tcpopt.tcp_opt_mss = tcp_mss_min;
		else if (tcpopt.tcp_opt_mss > mss_max)
			tcpopt.tcp_opt_mss = mss_max;
	}

	/* Process Window Scale option. */
	if (options & TCP_OPT_WSCALE_PRESENT) {
		tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale;
		tcp->tcp_snd_ws_ok = B_TRUE;
	} else {
		tcp->tcp_snd_ws = B_FALSE;
		tcp->tcp_snd_ws_ok = B_FALSE;
		tcp->tcp_rcv_ws = B_FALSE;
	}

	/* Process Timestamp option. */
	if ((options & TCP_OPT_TSTAMP_PRESENT) &&
	    (tcp->tcp_snd_ts_ok || !tcp->tcp_active_open)) {
		tmp_tcph = (char *)tcp->tcp_tcph;

		tcp->tcp_snd_ts_ok = B_TRUE;
		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
		tcp->tcp_last_rcv_lbolt = prom_gettime();
		assert(OK_32PTR(tmp_tcph));
		assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH);

		/* Fill in our template header with basic timestamp option. */
		tmp_tcph += tcp->tcp_tcp_hdr_len;
		tmp_tcph[0] = TCPOPT_NOP;
		tmp_tcph[1] = TCPOPT_NOP;
		tmp_tcph[2] = TCPOPT_TSTAMP;
		tmp_tcph[3] = TCPOPT_TSTAMP_LEN;
		tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN;
		tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN;
		tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4);
	} else {
		tcp->tcp_snd_ts_ok = B_FALSE;
	}

	/*
	 * Process SACK options.  If SACK is enabled for this connection,
	 * then allocate the SACK info structure.
	 */
	if ((options & TCP_OPT_SACK_OK_PRESENT) &&
	    (tcp->tcp_snd_sack_ok ||
	    (tcp_sack_permitted != 0 && !tcp->tcp_active_open))) {
		/* This should be true only in the passive case. */
		if (tcp->tcp_sack_info == NULL) {
			tcp->tcp_sack_info = (tcp_sack_info_t *)bkmem_zalloc(
			    sizeof (tcp_sack_info_t));
		}
		if (tcp->tcp_sack_info == NULL) {
			tcp->tcp_snd_sack_ok = B_FALSE;
		} else {
			tcp->tcp_snd_sack_ok = B_TRUE;
			if (tcp->tcp_snd_ts_ok) {
				tcp->tcp_max_sack_blk = 3;
			} else {
				tcp->tcp_max_sack_blk = 4;
			}
		}
	} else {
		/*
		 * Resetting tcp_snd_sack_ok to B_FALSE so that
		 * no SACK info will be used for this
		 * connection.  This assumes that SACK usage
		 * permission is negotiated.  This may need
		 * to be changed once this is clarified.
		 */
		if (tcp->tcp_sack_info != NULL) {
			bkmem_free((caddr_t)tcp->tcp_sack_info,
			    sizeof (tcp_sack_info_t));
			tcp->tcp_sack_info = NULL;
		}
		tcp->tcp_snd_sack_ok = B_FALSE;
	}

	/*
	 * Now we know the exact TCP/IP header length, subtract
	 * that from tcp_mss to get our side's MSS.
	 */
	tcp->tcp_mss -= tcp->tcp_hdr_len;
	/*
	 * Here we assume that the other side's header size will be equal to
	 * our header size.  We calculate the real MSS accordingly.  Need to
	 * take into account the additional stuff IPsec puts in.
	 *
	 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header)
	 */
	tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len -
	    (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH);

	/*
	 * Set MSS to the smaller one of both ends of the connection.
	 * We should not have called tcp_mss_set() before, but our
	 * side of the MSS should have been set to a proper value
	 * by tcp_adapt_ire().  tcp_mss_set() will also set up the
	 * STREAM head parameters properly.
	 *
	 * If we have a larger-than-16-bit window but the other side
	 * didn't want to do window scale, tcp_rwnd_set() will take
	 * care of that.
	 */
	tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
}

/*
 * This function does PAWS protection check.  Returns B_TRUE if the
 * segment passes the PAWS test, else returns B_FALSE.
 */
boolean_t
tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp)
{
	uint8_t	flags;
	int	options;
	uint8_t *up;

	flags = (unsigned int)tcph->th_flags[0] & 0xFF;
	/*
	 * If timestamp option is aligned nicely, get values inline,
	 * otherwise call general routine to parse.  Only do that
	 * if timestamp is the only option.
	 */
	if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH +
	    TCPOPT_REAL_TS_LEN &&
	    OK_32PTR((up = ((uint8_t *)tcph) +
	    TCP_MIN_HEADER_LENGTH)) &&
	    *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) {
		tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4));
		tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8));

		options = TCP_OPT_TSTAMP_PRESENT;
	} else {
		if (tcp->tcp_snd_sack_ok) {
			tcpoptp->tcp = tcp;
		} else {
			tcpoptp->tcp = NULL;
		}
		options = tcp_parse_options(tcph, tcpoptp);
	}

	if (options & TCP_OPT_TSTAMP_PRESENT) {
		/*
		 * Do PAWS per RFC 1323 section 4.2.  Accept RST
		 * regardless of the timestamp, page 18 RFC 1323.bis.
		 */
		if ((flags & TH_RST) == 0 &&
		    TSTMP_LT(tcpoptp->tcp_opt_ts_val,
		    tcp->tcp_ts_recent)) {
			if (TSTMP_LT(prom_gettime(),
			    tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) {
				/* This segment is not acceptable. */
				return (B_FALSE);
			} else {
				/*
				 * Connection has been idle for
				 * too long.  Reset the timestamp
				 * and assume the segment is valid.
				 */
				tcp->tcp_ts_recent =
				    tcpoptp->tcp_opt_ts_val;
			}
		}
	} else {
		/*
		 * If we don't get a timestamp on every packet, we
		 * figure we can't really trust 'em, so we stop sending
		 * and parsing them.
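		 *
		 * (Dropping the option removes TCPOPT_REAL_TS_LEN == 12
		 * bytes of header, two NOPs plus the 10-byte timestamp,
		 * so the tcp_mss_set() call below grows the effective
		 * MSS by 12, e.g. from 1448 back to 1460.)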
		 */
		tcp->tcp_snd_ts_ok = B_FALSE;

		tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN;
		tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN;
		tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4);
		tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN);
		if (tcp->tcp_snd_sack_ok) {
			assert(tcp->tcp_sack_info != NULL);
			tcp->tcp_max_sack_blk = 4;
		}
	}
	return (B_TRUE);
}

/*
 * tcp_get_seg_mp() is called to get the pointer to a segment in the
 * send queue which starts at the given seq. no.
 *
 * Parameters:
 *	tcp_t *tcp: the tcp instance pointer.
 *	uint32_t seq: the starting seq. no of the requested segment.
 *	int32_t *off: after the execution, *off will be the offset to
 *		the returned mblk which points to the requested seq no.
 *
 * Return:
 *	An mblk_t pointer pointing to the requested segment in send queue.
 */
static mblk_t *
tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off)
{
	int32_t	cnt;
	mblk_t	*mp;

	/* Defensive coding.  Make sure we don't send incorrect data. */
	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt) ||
	    off == NULL) {
		return (NULL);
	}
	cnt = seq - tcp->tcp_suna;
	mp = tcp->tcp_xmit_head;
	while (cnt > 0 && mp) {
		cnt -= mp->b_wptr - mp->b_rptr;
		if (cnt < 0) {
			cnt += mp->b_wptr - mp->b_rptr;
			break;
		}
		mp = mp->b_cont;
	}
	assert(mp != NULL);
	*off = cnt;
	return (mp);
}

/*
 * This function handles all retransmissions if SACK is enabled for this
 * connection.  First it calculates how many segments can be retransmitted
 * based on tcp_pipe.  Then it goes through the notsack list to find
 * eligible segments.  A segment is eligible if sack_cnt for that segment
 * is greater than or equal to tcp_dupack_fast_retransmit.  After it has
 * retransmitted all eligible segments, it checks to see if TCP can send
 * some new segments (fast recovery).  If it can, it returns 1.  Otherwise
 * it returns 0.
 *
 * Parameters:
 *	tcp_t *tcp: the tcp structure of the connection.
 *
 * Return:
 *	1 if the pipe is not full (new data can be sent), 0 otherwise
 */
static int32_t
tcp_sack_rxmit(tcp_t *tcp, int sock_id)
{
	notsack_blk_t	*notsack_blk;
	int32_t		usable_swnd;
	int32_t		mss;
	uint32_t	seg_len;
	mblk_t		*xmit_mp;

	assert(tcp->tcp_sack_info != NULL);
	assert(tcp->tcp_notsack_list != NULL);
	assert(tcp->tcp_rexmit == B_FALSE);

	/* Defensive coding in case there is a bug... */
	if (tcp->tcp_notsack_list == NULL) {
		return (0);
	}
	notsack_blk = tcp->tcp_notsack_list;
	mss = tcp->tcp_mss;

	/*
	 * Limit the num of outstanding data in the network to be
	 * tcp_cwnd_ssthresh, which is half of the original congestion wnd.
	 */
	usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;

	/* At least retransmit 1 MSS of data. */
	if (usable_swnd <= 0) {
		usable_swnd = mss;
	}

	/*
	 * Make sure no new RTT samples will be taken.
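	 *
	 * (Setting tcp_csuna to tcp_snxt below means an ACK covering any
	 * segment sent before this retransmission round will not be used
	 * for an RTT sample, in the spirit of Karn's algorithm: such an
	 * ACK may have been elicited by either the original transmission
	 * or the retransmission.)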
	 */
	tcp->tcp_csuna = tcp->tcp_snxt;

	notsack_blk = tcp->tcp_notsack_list;
	while (usable_swnd > 0) {
		mblk_t		*snxt_mp, *tmp_mp;
		tcp_seq		begin = tcp->tcp_sack_snxt;
		tcp_seq		end;
		int32_t		off;

		for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) {
			if (SEQ_GT(notsack_blk->end, begin) &&
			    (notsack_blk->sack_cnt >=
			    tcp_dupack_fast_retransmit)) {
				end = notsack_blk->end;
				if (SEQ_LT(begin, notsack_blk->begin)) {
					begin = notsack_blk->begin;
				}
				break;
			}
		}
		/*
		 * All holes are filled.  Manipulate tcp_cwnd to send more
		 * if we can.  Note that after the SACK recovery, tcp_cwnd is
		 * set to tcp_cwnd_ssthresh.
		 */
		if (notsack_blk == NULL) {
			usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe;
			if (usable_swnd <= 0) {
				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna;
				assert(tcp->tcp_cwnd > 0);
				return (0);
			} else {
				usable_swnd = usable_swnd / mss;
				tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna +
				    MAX(usable_swnd * mss, mss);
				return (1);
			}
		}

		/*
		 * Note that we may send more than usable_swnd allows here
		 * because of round off, but no more than 1 MSS of data.
		 */
		seg_len = end - begin;
		if (seg_len > mss)
			seg_len = mss;
		snxt_mp = tcp_get_seg_mp(tcp, begin, &off);
		assert(snxt_mp != NULL);
		/* This should not happen.  Defensive coding again... */
		if (snxt_mp == NULL) {
			return (0);
		}

		xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off,
		    &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE);

		if (xmit_mp == NULL)
			return (0);

		usable_swnd -= seg_len;
		tcp->tcp_pipe += seg_len;
		tcp->tcp_sack_snxt = begin + seg_len;
		TCP_DUMP_PACKET("tcp_sack_rxmit", xmit_mp);
		(void) ipv4_tcp_output(sock_id, xmit_mp);
		freeb(xmit_mp);

		/*
		 * Update the send timestamp to avoid false retransmission.
		 */
		snxt_mp->b_prev = (mblk_t *)prom_gettime();

		BUMP_MIB(tcp_mib.tcpRetransSegs);
		UPDATE_MIB(tcp_mib.tcpRetransBytes, seg_len);
		BUMP_MIB(tcp_mib.tcpOutSackRetransSegs);
		/*
		 * Update tcp_rexmit_max to extend this SACK recovery phase.
		 * This happens when new data sent during fast recovery is
		 * also lost.  If TCP retransmits those new data, it needs
		 * to extend the SACK recovery phase to avoid starting
		 * another fast retransmit/recovery unnecessarily.
		 */
		if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) {
			tcp->tcp_rexmit_max = tcp->tcp_sack_snxt;
		}
	}
	return (0);
}

static void
tcp_rput_data(tcp_t *tcp, mblk_t *mp, int sock_id)
{
	uchar_t		*rptr;
	struct ip	*iph;
	tcp_t		*tcp1;
	tcpha_t		*tcph;
	uint32_t	seg_ack;
	int		seg_len;
	uint_t		ip_hdr_len;
	uint32_t	seg_seq;
	mblk_t		*mp1;
	uint_t		flags;
	uint32_t	new_swnd = 0;
	int		mss;
	boolean_t	ofo_seg = B_FALSE;	/* Out of order segment */
	int32_t		gap;
	int32_t		rgap;
	tcp_opt_t	tcpopt;
	int32_t		bytes_acked;
	int		npkt;
	uint32_t	cwnd;
	uint32_t	add;

#ifdef DEBUG
	printf("tcp_rput_data sock %d mp %x mp_datap %x #################\n",
	    sock_id, mp, mp->b_datap);
#endif

	/* Dump the packet when debugging. */
	TCP_DUMP_PACKET("tcp_rput_data", mp);

	assert(OK_32PTR(mp->b_rptr));

	rptr = mp->b_rptr;
	iph = (struct ip *)rptr;
	ip_hdr_len = IPH_HDR_LENGTH(rptr);
	if (ip_hdr_len != IP_SIMPLE_HDR_LENGTH) {
#ifdef DEBUG
		printf("Not simple IP header\n");
#endif
		/* We cannot handle IP options yet... */
		tcp_drops++;
		freeb(mp);
		return;
	}
	/* The TCP header must be aligned. */
	tcph = (tcpha_t *)&rptr[ip_hdr_len];
	seg_seq = ntohl(tcph->tha_seq);
	seg_ack = ntohl(tcph->tha_ack);
	assert((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
	seg_len = (int)(mp->b_wptr - rptr) -
	    (ip_hdr_len + TCP_HDR_LENGTH(((tcph_t *)tcph)));
	/* In inetboot, b_cont should always be NULL. */
	assert(mp->b_cont == NULL);

	/* Verify the checksum. */
	if (tcp_verify_cksum(mp) < 0) {
#ifdef DEBUG
		printf("tcp_rput_data: wrong cksum\n");
#endif
		freemsg(mp);
		return;
	}

	/*
	 * This segment is not for us, try to find its
	 * intended receiver.
	 */
	if (tcp == NULL ||
	    tcph->tha_lport != tcp->tcp_fport ||
	    tcph->tha_fport != tcp->tcp_lport ||
	    iph->ip_src.s_addr != tcp->tcp_remote ||
	    iph->ip_dst.s_addr != tcp->tcp_bound_source) {
#ifdef DEBUG
		printf("tcp_rput_data: not for us, state %d\n",
		    tcp == NULL ? -1 : tcp->tcp_state);
#endif
		/*
		 * First try to find an established connection.  If none
		 * is found, look for a listener.
		 *
		 * If a listener is found, we need to check to see if the
		 * incoming segment is for one of its eagers.  If it is,
		 * give it to the eager.  If not, listener should take care
		 * of it.
		 */
		if ((tcp1 = tcp_lookup_ipv4(iph, tcph, TCPS_SYN_SENT,
		    &sock_id)) != NULL ||
		    (tcp1 = tcp_lookup_listener_ipv4(iph->ip_dst.s_addr,
		    tcph->tha_fport, &sock_id)) != NULL) {
			if (tcp1->tcp_state == TCPS_LISTEN) {
				if ((tcp = tcp_lookup_eager_ipv4(tcp1,
				    iph, tcph)) == NULL) {
					/* No eager... send to listener */
#ifdef DEBUG
					printf("found the listener: %s\n",
					    tcp_display(tcp1, NULL,
					    DISP_ADDR_AND_PORT));
#endif
					tcp = tcp1;
				}
#ifdef DEBUG
				else {
					printf("found the eager: %s\n",
					    tcp_display(tcp, NULL,
					    DISP_ADDR_AND_PORT));
				}
#endif
			} else {
				/* Non listener found... */
#ifdef DEBUG
				printf("found the connection: %s\n",
				    tcp_display(tcp1, NULL,
				    DISP_ADDR_AND_PORT));
#endif
				tcp = tcp1;
			}
		} else {
			/*
			 * No connection for this segment...
			 * Send a RST to the other side.
			 */
			tcp_xmit_listeners_reset(sock_id, mp, ip_hdr_len);
			return;
		}
	}

	flags = tcph->tha_flags & 0xFF;
	BUMP_MIB(tcp_mib.tcpInSegs);
	if (tcp->tcp_state == TCPS_TIME_WAIT) {
		tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack,
		    seg_len, (tcph_t *)tcph, sock_id);
		return;
	}
	/*
	 * From this point we can assume that the tcp is not compressed,
	 * since we would have branched off to tcp_time_wait_processing()
	 * in such a case.
	 */
	assert(tcp != NULL && tcp->tcp_state != TCPS_TIME_WAIT);

	/*
	 * After this point, we know we have the correct TCP, so update
	 * the receive time.
	 */
	tcp->tcp_last_recv_time = prom_gettime();

	/*
	 * In inetboot, we do not handle the urgent pointer...
*/ 2740 if (flags & TH_URG) { 2741 freemsg(mp); 2742 DEBUG_1("tcp_rput_data(%d): received segment with urgent " 2743 "pointer\n", sock_id); 2744 tcp_drops++; 2745 return; 2746 } 2747 2748 switch (tcp->tcp_state) { 2749 case TCPS_LISTEN: 2750 if ((flags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) { 2751 if (flags & TH_RST) { 2752 freemsg(mp); 2753 return; 2754 } 2755 if (flags & TH_ACK) { 2756 tcp_xmit_early_reset("TCPS_LISTEN-TH_ACK", 2757 sock_id, mp, seg_ack, 0, TH_RST, 2758 ip_hdr_len); 2759 return; 2760 } 2761 if (!(flags & TH_SYN)) { 2762 freemsg(mp); 2763 return; 2764 } 2765 printf("tcp_rput_data: %d\n", __LINE__); 2766 prom_panic("inetboot"); 2767 } 2768 if (tcp->tcp_conn_req_max > 0) { 2769 tcp = tcp_conn_request(tcp, mp, sock_id, ip_hdr_len); 2770 if (tcp == NULL) { 2771 freemsg(mp); 2772 return; 2773 } 2774 #ifdef DEBUG 2775 printf("tcp_rput_data: new tcp created\n"); 2776 #endif 2777 } 2778 tcp->tcp_irs = seg_seq; 2779 tcp->tcp_rack = seg_seq; 2780 tcp->tcp_rnxt = seg_seq + 1; 2781 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 2782 BUMP_MIB(tcp_mib.tcpPassiveOpens); 2783 goto syn_rcvd; 2784 case TCPS_SYN_SENT: 2785 if (flags & TH_ACK) { 2786 /* 2787 * Note that our stack cannot send data before a 2788 * connection is established, therefore the 2789 * following check is valid. Otherwise, it has 2790 * to be changed. 2791 */ 2792 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 2793 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 2794 if (flags & TH_RST) { 2795 freemsg(mp); 2796 return; 2797 } 2798 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 2799 tcp, mp, seg_ack, 0, TH_RST, 2800 ip_hdr_len, sock_id); 2801 return; 2802 } 2803 assert(tcp->tcp_suna + 1 == seg_ack); 2804 } 2805 if (flags & TH_RST) { 2806 freemsg(mp); 2807 if (flags & TH_ACK) { 2808 tcp_clean_death(sock_id, tcp, ECONNREFUSED); 2809 } 2810 return; 2811 } 2812 if (!(flags & TH_SYN)) { 2813 freemsg(mp); 2814 return; 2815 } 2816 2817 /* Process all TCP options. */ 2818 tcp_process_options(tcp, (tcph_t *)tcph); 2819 /* 2820 * The following changes our rwnd to be a multiple of the 2821 * MIN(peer MSS, our MSS) for performance reason. 2822 */ 2823 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rwnd, 2824 tcp->tcp_mss)); 2825 2826 /* Is the other end ECN capable? */ 2827 if (tcp->tcp_ecn_ok) { 2828 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 2829 tcp->tcp_ecn_ok = B_FALSE; 2830 } 2831 } 2832 /* 2833 * Clear ECN flags because it may interfere with later 2834 * processing. 2835 */ 2836 flags &= ~(TH_ECE|TH_CWR); 2837 2838 tcp->tcp_irs = seg_seq; 2839 tcp->tcp_rack = seg_seq; 2840 tcp->tcp_rnxt = seg_seq + 1; 2841 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 2842 2843 if (flags & TH_ACK) { 2844 /* One for the SYN */ 2845 tcp->tcp_suna = tcp->tcp_iss + 1; 2846 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 2847 tcp->tcp_state = TCPS_ESTABLISHED; 2848 2849 /* 2850 * If SYN was retransmitted, need to reset all 2851 * retransmission info. This is because this 2852 * segment will be treated as a dup ACK. 2853 */ 2854 if (tcp->tcp_rexmit) { 2855 tcp->tcp_rexmit = B_FALSE; 2856 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 2857 tcp->tcp_rexmit_max = tcp->tcp_snxt; 2858 tcp->tcp_snd_burst = TCP_CWND_NORMAL; 2859 2860 /* 2861 * Set tcp_cwnd back to 1 MSS, per 2862 * recommendation from 2863 * draft-floyd-incr-init-win-01.txt, 2864 * Increasing TCP's Initial Window. 
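				 * (A retransmitted SYN implies loss, so
				 * we restart from a single segment rather
				 * than the larger initial cwnd that
				 * tcp_mss_set() computed.)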
				 */
				tcp->tcp_cwnd = tcp->tcp_mss;
			}

			tcp->tcp_swl1 = seg_seq;
			tcp->tcp_swl2 = seg_ack;

			new_swnd = BE16_TO_U16(((tcph_t *)tcph)->th_win);
			tcp->tcp_swnd = new_swnd;
			if (new_swnd > tcp->tcp_max_swnd)
				tcp->tcp_max_swnd = new_swnd;

			/*
			 * Always send the three-way handshake ack immediately
			 * in order to make the connection complete as soon as
			 * possible on the accepting host.
			 */
			flags |= TH_ACK_NEEDED;
			/*
			 * Check to see if there is data to be sent.  If
			 * yes, set the transmit flag.  Then check to see
			 * if received data processing needs to be done.
			 * If not, go straight to xmit_check.  This short
			 * cut is OK as we don't support T/TCP.
			 */
			if (tcp->tcp_unsent)
				flags |= TH_XMIT_NEEDED;

			if (seg_len == 0) {
				freemsg(mp);
				goto xmit_check;
			}

			flags &= ~TH_SYN;
			seg_seq++;
			break;
		}
	syn_rcvd:
		tcp->tcp_state = TCPS_SYN_RCVD;
		mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss,
		    NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
		if (mp1 != NULL) {
			TCP_DUMP_PACKET("tcp_rput_data replying SYN", mp1);
			(void) ipv4_tcp_output(sock_id, mp1);
			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
			freeb(mp1);
			/*
			 * Let's wait till our SYN has been ACKED since we
			 * don't have a timer.
			 */
			if (tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED) < 0) {
				freemsg(mp);
				return;
			}
		}
		freemsg(mp);
		return;
	default:
		break;
	}
	mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH((tcph_t *)tcph);
	new_swnd = ntohs(tcph->tha_win) <<
	    ((flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
	mss = tcp->tcp_mss;

	if (tcp->tcp_snd_ts_ok) {
		if (!tcp_paws_check(tcp, (tcph_t *)tcph, &tcpopt)) {
			/*
			 * This segment is not acceptable.
			 * Drop it and send back an ACK.
			 */
			freemsg(mp);
			flags |= TH_ACK_NEEDED;
			goto ack_check;
		}
	} else if (tcp->tcp_snd_sack_ok) {
		assert(tcp->tcp_sack_info != NULL);
		tcpopt.tcp = tcp;
		/*
		 * SACK info is already updated in tcp_parse_options.  Ignore
		 * all other TCP options...
		 */
		(void) tcp_parse_options((tcph_t *)tcph, &tcpopt);
	}
try_again:;
	gap = seg_seq - tcp->tcp_rnxt;
	rgap = tcp->tcp_rwnd - (gap + seg_len);
	/*
	 * gap is the amount of sequence space between what we expect to see
	 * and what we got for seg_seq.  A positive value for gap means
	 * something got lost.  A negative value means we got some old stuff.
	 */
	if (gap < 0) {
		/* Old stuff present.  Is the SYN in there? */
		if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) &&
		    (seg_len != 0)) {
			flags &= ~TH_SYN;
			seg_seq++;
			/* Recompute the gaps after noting the SYN. */
			goto try_again;
		}
		BUMP_MIB(tcp_mib.tcpInDataDupSegs);
		UPDATE_MIB(tcp_mib.tcpInDataDupBytes,
		    (seg_len > -gap ? -gap : seg_len));
		/* Remove the old stuff from seg_len. */
		seg_len += gap;
		/*
		 * Anything left?
		 * Make sure to check for unack'd FIN when rest of data
		 * has been previously ack'd.
		 */
		if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) {
			/*
			 * Resets are only valid if they lie within our offered
			 * window.  If the RST bit is set, we just ignore this
			 * segment.
			 */
			if (flags & TH_RST) {
				freemsg(mp);
				return;
			}

			/*
			 * This segment is "unacceptable".
None of its 2989 * sequence space lies within our advertized window. 2990 * 2991 * Adjust seg_len to the original value for tracing. 2992 */ 2993 seg_len -= gap; 2994 #ifdef DEBUG 2995 printf("tcp_rput: unacceptable, gap %d, rgap " 2996 "%d, flags 0x%x, seg_seq %u, seg_ack %u, " 2997 "seg_len %d, rnxt %u, snxt %u, %s", 2998 gap, rgap, flags, seg_seq, seg_ack, 2999 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 3000 tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 3001 #endif 3002 3003 /* 3004 * Arrange to send an ACK in response to the 3005 * unacceptable segment per RFC 793 page 69. There 3006 * is only one small difference between ours and the 3007 * acceptability test in the RFC - we accept ACK-only 3008 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 3009 * will be generated. 3010 * 3011 * Note that we have to ACK an ACK-only packet at least 3012 * for stacks that send 0-length keep-alives with 3013 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 3014 * section 4.2.3.6. As long as we don't ever generate 3015 * an unacceptable packet in response to an incoming 3016 * packet that is unacceptable, it should not cause 3017 * "ACK wars". 3018 */ 3019 flags |= TH_ACK_NEEDED; 3020 3021 /* 3022 * Continue processing this segment in order to use the 3023 * ACK information it contains, but skip all other 3024 * sequence-number processing. Processing the ACK 3025 * information is necessary in order to 3026 * re-synchronize connections that may have lost 3027 * synchronization. 3028 * 3029 * We clear seg_len and flag fields related to 3030 * sequence number processing as they are not 3031 * to be trusted for an unacceptable segment. 3032 */ 3033 seg_len = 0; 3034 flags &= ~(TH_SYN | TH_FIN | TH_URG); 3035 goto process_ack; 3036 } 3037 3038 /* Fix seg_seq, and chew the gap off the front. */ 3039 seg_seq = tcp->tcp_rnxt; 3040 do { 3041 mblk_t *mp2; 3042 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 3043 (uintptr_t)UINT_MAX); 3044 gap += (uint_t)(mp->b_wptr - mp->b_rptr); 3045 if (gap > 0) { 3046 mp->b_rptr = mp->b_wptr - gap; 3047 break; 3048 } 3049 mp2 = mp; 3050 mp = mp->b_cont; 3051 freeb(mp2); 3052 } while (gap < 0); 3053 } 3054 /* 3055 * rgap is the amount of stuff received out of window. A negative 3056 * value is the amount out of window. 3057 */ 3058 if (rgap < 0) { 3059 mblk_t *mp2; 3060 3061 if (tcp->tcp_rwnd == 0) 3062 BUMP_MIB(tcp_mib.tcpInWinProbe); 3063 else { 3064 BUMP_MIB(tcp_mib.tcpInDataPastWinSegs); 3065 UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap); 3066 } 3067 3068 /* 3069 * seg_len does not include the FIN, so if more than 3070 * just the FIN is out of window, we act like we don't 3071 * see it. (If just the FIN is out of window, rgap 3072 * will be zero and we will go ahead and acknowledge 3073 * the FIN.) 3074 */ 3075 flags &= ~TH_FIN; 3076 3077 /* Fix seg_len and make sure there is something left. */ 3078 seg_len += rgap; 3079 if (seg_len <= 0) { 3080 /* 3081 * Resets are only valid if they lie within our offered 3082 * window. If the RST bit is set, we just ignore this 3083 * segment. 3084 */ 3085 if (flags & TH_RST) { 3086 freemsg(mp); 3087 return; 3088 } 3089 3090 /* Per RFC 793, we need to send back an ACK. */ 3091 flags |= TH_ACK_NEEDED; 3092 3093 /* 3094 * If this is a zero window probe, continue to 3095 * process the ACK part. But we need to set seg_len 3096 * to 0 to avoid data processing. Otherwise just 3097 * drop the segment and send back an ACK. 
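			 * (A zero window probe is typically a one-byte
			 * segment sent at seg_seq == tcp_rnxt while our
			 * advertised window is zero; its ACK field may
			 * still open our send window, so it cannot simply
			 * be dropped.)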
			 */
			if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) {
				flags &= ~(TH_SYN | TH_URG);
				seg_len = 0;
				/* Let's see if we can update our rwnd */
				tcp_rcv_drain(sock_id, tcp);
				goto process_ack;
			} else {
				freemsg(mp);
				goto ack_check;
			}
		}
		/* Pitch out of window stuff off the end. */
		rgap = seg_len;
		mp2 = mp;
		do {
			assert((uintptr_t)(mp2->b_wptr -
			    mp2->b_rptr) <= (uintptr_t)INT_MAX);
			rgap -= (int)(mp2->b_wptr - mp2->b_rptr);
			if (rgap < 0) {
				mp2->b_wptr += rgap;
				if ((mp1 = mp2->b_cont) != NULL) {
					mp2->b_cont = NULL;
					freemsg(mp1);
				}
				break;
			}
		} while ((mp2 = mp2->b_cont) != NULL);
	}
ok:;
	/*
	 * TCP should check ECN info for segments inside the window only.
	 * Therefore the check should be done here.
	 */
	if (tcp->tcp_ecn_ok) {
		uchar_t tos = ((struct ip *)rptr)->ip_tos;

		if (flags & TH_CWR) {
			tcp->tcp_ecn_echo_on = B_FALSE;
		}
		/*
		 * Note that both ECN_CE and CWR can be set in the
		 * same segment.  In this case, we once again turn
		 * on ECN_ECHO.
		 */
		if ((tos & IPH_ECN_CE) == IPH_ECN_CE) {
			tcp->tcp_ecn_echo_on = B_TRUE;
		}
	}

	/*
	 * Check whether we can update tcp_ts_recent.  This test is
	 * NOT the one in RFC 1323 3.4.  It is from Braden, 1993, "TCP
	 * Extensions for High Performance: An Update", Internet Draft.
	 */
	if (tcp->tcp_snd_ts_ok &&
	    TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
	    SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
		tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
		tcp->tcp_last_rcv_lbolt = prom_gettime();
	}

	if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) {
		/*
		 * FIN in an out of order segment.  We record this in
		 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq.
		 * Clear the FIN so that any check on FIN flag will fail.
		 * Remember that FIN also counts in the sequence number
		 * space.  So we need to ack out of order FIN only segments.
		 */
		if (flags & TH_FIN) {
			tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID;
			tcp->tcp_ofo_fin_seq = seg_seq + seg_len;
			flags &= ~TH_FIN;
			flags |= TH_ACK_NEEDED;
		}
		if (seg_len > 0) {
			/* Fill in the SACK blk list. */
			if (tcp->tcp_snd_sack_ok) {
				assert(tcp->tcp_sack_info != NULL);
				tcp_sack_insert(tcp->tcp_sack_list,
				    seg_seq, seg_seq + seg_len,
				    &(tcp->tcp_num_sack_blk));
			}

			/*
			 * Attempt reassembly and see if we have something
			 * ready to go.
			 */
			mp = tcp_reass(tcp, mp, seg_seq);
			/* Always ack out of order packets */
			flags |= TH_ACK_NEEDED | TH_PUSH;
			if (mp != NULL) {
				assert((uintptr_t)(mp->b_wptr -
				    mp->b_rptr) <= (uintptr_t)INT_MAX);
				seg_len = mp->b_cont ? msgdsize(mp) :
				    (int)(mp->b_wptr - mp->b_rptr);
				seg_seq = tcp->tcp_rnxt;
				/*
				 * If a gap is filled and the seq num and len
				 * of the gap match that of a previously
				 * received FIN, put the FIN flag back in.
				 */
				if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) &&
				    seg_seq + seg_len == tcp->tcp_ofo_fin_seq) {
					flags |= TH_FIN;
					tcp->tcp_valid_bits &=
					    ~TCP_OFO_FIN_VALID;
				}
			} else {
				/*
				 * Keep going even with NULL mp.
				 * There may be a useful ACK or something else
				 * we don't want to miss.
				 *
				 * But TCP should not perform fast retransmit
				 * because of the ack number.
TCP uses 3215 * seg_len == 0 to determine if it is a pure 3216 * ACK. And this is not a pure ACK. 3217 */ 3218 seg_len = 0; 3219 ofo_seg = B_TRUE; 3220 } 3221 } 3222 } else if (seg_len > 0) { 3223 BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 3224 UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 3225 /* 3226 * If an out of order FIN was received before, and the seq 3227 * num and len of the new segment match that of the FIN, 3228 * put the FIN flag back in. 3229 */ 3230 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 3231 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 3232 flags |= TH_FIN; 3233 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 3234 } 3235 } 3236 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 3237 if (flags & TH_RST) { 3238 freemsg(mp); 3239 switch (tcp->tcp_state) { 3240 case TCPS_SYN_RCVD: 3241 (void) tcp_clean_death(sock_id, tcp, ECONNREFUSED); 3242 break; 3243 case TCPS_ESTABLISHED: 3244 case TCPS_FIN_WAIT_1: 3245 case TCPS_FIN_WAIT_2: 3246 case TCPS_CLOSE_WAIT: 3247 (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3248 break; 3249 case TCPS_CLOSING: 3250 case TCPS_LAST_ACK: 3251 (void) tcp_clean_death(sock_id, tcp, 0); 3252 break; 3253 default: 3254 assert(tcp->tcp_state != TCPS_TIME_WAIT); 3255 (void) tcp_clean_death(sock_id, tcp, ENXIO); 3256 break; 3257 } 3258 return; 3259 } 3260 if (flags & TH_SYN) { 3261 /* 3262 * See RFC 793, Page 71 3263 * 3264 * The seq number must be in the window as it should 3265 * be "fixed" above. If it is outside window, it should 3266 * be already rejected. Note that we allow seg_seq to be 3267 * rnxt + rwnd because we want to accept 0 window probe. 3268 */ 3269 assert(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 3270 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 3271 freemsg(mp); 3272 /* 3273 * If the ACK flag is not set, just use our snxt as the 3274 * seq number of the RST segment. 3275 */ 3276 if (!(flags & TH_ACK)) { 3277 seg_ack = tcp->tcp_snxt; 3278 } 3279 tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, 3280 seg_seq + 1, TH_RST|TH_ACK, 0, sock_id); 3281 assert(tcp->tcp_state != TCPS_TIME_WAIT); 3282 (void) tcp_clean_death(sock_id, tcp, ECONNRESET); 3283 return; 3284 } 3285 3286 process_ack: 3287 if (!(flags & TH_ACK)) { 3288 #ifdef DEBUG 3289 printf("No ack in segment, dropped it, seq:%x\n", seg_seq); 3290 #endif 3291 freemsg(mp); 3292 goto xmit_check; 3293 } 3294 } 3295 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 3296 3297 if (tcp->tcp_state == TCPS_SYN_RCVD) { 3298 tcp_t *listener = tcp->tcp_listener; 3299 #ifdef DEBUG 3300 printf("Done with eager 3-way handshake\n"); 3301 #endif 3302 /* 3303 * NOTE: RFC 793 pg. 72 says this should be 'bytes_acked < 0' 3304 * but that would mean we have an ack that ignored our SYN. 
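		 * (Worked example, with illustrative numbers: if tcp_iss
		 * is 100, our SYN occupies sequence 100 and tcp_suna is
		 * still 100 here, so a valid ACK of 101 gives bytes_acked
		 * == 1.  An ACK of exactly 100 gives bytes_acked == 0 yet
		 * ignored our SYN, hence the 'bytes_acked < 1' test below.)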
		 */
		if (bytes_acked < 1 || SEQ_GT(seg_ack, tcp->tcp_snxt)) {
			freemsg(mp);
			tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack",
			    tcp, NULL, seg_ack, 0, TH_RST, 0, sock_id);
			return;
		}

		/*
		 * If the conn_req_q is full, defer processing
		 * until space is available after accept()
		 * processing.
		 */
		if (listener->tcp_conn_req_cnt_q <
		    listener->tcp_conn_req_max) {
			tcp_t *tail;

			listener->tcp_conn_req_cnt_q0--;
			listener->tcp_conn_req_cnt_q++;

			/* Move from SYN_RCVD to ESTABLISHED list */
			tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
			    tcp->tcp_eager_prev_q0;
			tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
			    tcp->tcp_eager_next_q0;
			tcp->tcp_eager_prev_q0 = NULL;
			tcp->tcp_eager_next_q0 = NULL;

			/*
			 * Insert at end of the queue because sockfs
			 * sends down T_CONN_RES in chronological
			 * order.  Leaving the older conn indications
			 * at front of the queue helps reduce search
			 * time.
			 */
			tail = listener->tcp_eager_last_q;
			if (tail != NULL) {
				tail->tcp_eager_next_q = tcp;
			} else {
				listener->tcp_eager_next_q = tcp;
			}
			listener->tcp_eager_last_q = tcp;
			tcp->tcp_eager_next_q = NULL;
		} else {
			/*
			 * Defer connection on q0 and set deferred
			 * connection bit true
			 */
			tcp->tcp_conn_def_q0 = B_TRUE;

			/* take tcp out of q0 ... */
			tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
			    tcp->tcp_eager_next_q0;
			tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
			    tcp->tcp_eager_prev_q0;

			/* ... and place it at the end of q0 */
			tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
			tcp->tcp_eager_next_q0 = listener;
			listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
			listener->tcp_eager_prev_q0 = tcp;
		}

		tcp->tcp_suna = tcp->tcp_iss + 1;	/* One for the SYN */
		bytes_acked--;

		/*
		 * If SYN was retransmitted, need to reset all
		 * retransmission info as this segment will be
		 * treated as a dup ACK.
		 */
		if (tcp->tcp_rexmit) {
			tcp->tcp_rexmit = B_FALSE;
			tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
			tcp->tcp_rexmit_max = tcp->tcp_snxt;
			tcp->tcp_snd_burst = TCP_CWND_NORMAL;
			tcp->tcp_ms_we_have_waited = 0;
			tcp->tcp_cwnd = mss;
		}

		/*
		 * We set the send window to zero here.
		 * This is needed if there is data to be
		 * processed already on the queue.
		 * Later (at swnd_update label), the
		 * "new_swnd > tcp_swnd" condition is satisfied and
		 * the XMIT_NEEDED flag is set in the current
		 * (SYN_RCVD) state.  This ensures tcp_wput_data() is
		 * called if there is already data on queue in
		 * this state.
		 */
		tcp->tcp_swnd = 0;

		if (new_swnd > tcp->tcp_max_swnd)
			tcp->tcp_max_swnd = new_swnd;
		tcp->tcp_swl1 = seg_seq;
		tcp->tcp_swl2 = seg_ack;
		tcp->tcp_state = TCPS_ESTABLISHED;
		tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
	}
	/* This code follows 4.4BSD-Lite2 mostly. */
	if (bytes_acked < 0)
		goto est;

	/*
	 * If TCP is ECN capable and the congestion experience bit is
	 * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
	 * done once per window (or more loosely, per RTT).
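	 *
	 * (Illustrative arithmetic: with mss == 1460 and tcp_cwnd ==
	 * 14600, i.e. ten segments, npkt below is (14600 >> 1) / 1460 ==
	 * 5, so tcp_cwnd_ssthresh becomes 7300 and tcp_cwnd is likewise
	 * clocked down to five segments' worth of data.)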
3413 */ 3414 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 3415 tcp->tcp_cwr = B_FALSE; 3416 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 3417 if (!tcp->tcp_cwr) { 3418 npkt = (MIN(tcp->tcp_cwnd, tcp->tcp_swnd) >> 1) / mss; 3419 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 3420 tcp->tcp_cwnd = npkt * mss; 3421 /* 3422 * If the cwnd is 0, use the timer to clock out 3423 * new segments. This is required by the ECN spec. 3424 */ 3425 if (npkt == 0) { 3426 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 3427 /* 3428 * This makes sure that when the ACK comes 3429 * back, we will increase tcp_cwnd by 1 MSS. 3430 */ 3431 tcp->tcp_cwnd_cnt = 0; 3432 } 3433 tcp->tcp_cwr = B_TRUE; 3434 /* 3435 * This marks the end of the current window of in 3436 * flight data. That is why we don't use 3437 * tcp_suna + tcp_swnd. Only data in flight can 3438 * provide ECN info. 3439 */ 3440 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3441 tcp->tcp_ecn_cwr_sent = B_FALSE; 3442 } 3443 } 3444 3445 mp1 = tcp->tcp_xmit_head; 3446 if (bytes_acked == 0) { 3447 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 3448 int dupack_cnt; 3449 3450 BUMP_MIB(tcp_mib.tcpInDupAck); 3451 /* 3452 * Fast retransmit. When we have seen exactly three 3453 * identical ACKs while we have unacked data 3454 * outstanding we take it as a hint that our peer 3455 * dropped something. 3456 * 3457 * If TCP is retransmitting, don't do fast retransmit. 3458 */ 3459 if (mp1 != NULL && tcp->tcp_suna != tcp->tcp_snxt && 3460 ! tcp->tcp_rexmit) { 3461 /* Do Limited Transmit */ 3462 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 3463 tcp_dupack_fast_retransmit) { 3464 /* 3465 * RFC 3042 3466 * 3467 * What we need to do is temporarily 3468 * increase tcp_cwnd so that new 3469 * data can be sent if it is allowed 3470 * by the receive window (tcp_rwnd). 3471 * tcp_wput_data() will take care of 3472 * the rest. 3473 * 3474 * If the connection is SACK capable, 3475 * only do limited xmit when there 3476 * is SACK info. 3477 * 3478 * Note how tcp_cwnd is incremented. 3479 * The first dup ACK will increase 3480 * it by 1 MSS. The second dup ACK 3481 * will increase it by 2 MSS. This 3482 * means that only 1 new segment will 3483 * be sent for each dup ACK. 3484 */ 3485 if (tcp->tcp_unsent > 0 && 3486 (!tcp->tcp_snd_sack_ok || 3487 (tcp->tcp_snd_sack_ok && 3488 tcp->tcp_notsack_list != NULL))) { 3489 tcp->tcp_cwnd += mss << 3490 (tcp->tcp_dupack_cnt - 1); 3491 flags |= TH_LIMIT_XMIT; 3492 } 3493 } else if (dupack_cnt == 3494 tcp_dupack_fast_retransmit) { 3495 3496 BUMP_MIB(tcp_mib.tcpOutFastRetrans); 3497 /* 3498 * If we have reduced tcp_ssthresh 3499 * because of ECN, do not reduce it again 3500 * unless it is already one window of data 3501 * away. After one window of data, tcp_cwr 3502 * should then be cleared. Note that 3503 * for non ECN capable connection, tcp_cwr 3504 * should always be false. 3505 * 3506 * Adjust cwnd since the duplicate 3507 * ack indicates that a packet was 3508 * dropped (due to congestion.) 3509 */ 3510 if (!tcp->tcp_cwr) { 3511 npkt = (MIN(tcp->tcp_cwnd, 3512 tcp->tcp_swnd) >> 1) / mss; 3513 if (npkt < 2) 3514 npkt = 2; 3515 tcp->tcp_cwnd_ssthresh = npkt * mss; 3516 tcp->tcp_cwnd = (npkt + 3517 tcp->tcp_dupack_cnt) * mss; 3518 } 3519 if (tcp->tcp_ecn_ok) { 3520 tcp->tcp_cwr = B_TRUE; 3521 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 3522 tcp->tcp_ecn_cwr_sent = B_FALSE; 3523 } 3524 3525 /* 3526 * We do Hoe's algorithm. 
					 * Refer to her paper "Improving the
					 * Start-up Behavior of a Congestion
					 * Control Scheme for TCP", which
					 * appeared in SIGCOMM '96.
					 *
					 * Save the highest seq no we have
					 * sent so far.  Be careful about the
					 * invisible FIN byte.
					 */
					if ((tcp->tcp_valid_bits &
					    TCP_FSS_VALID) &&
					    (tcp->tcp_unsent == 0)) {
						tcp->tcp_rexmit_max =
						    tcp->tcp_fss;
					} else {
						tcp->tcp_rexmit_max =
						    tcp->tcp_snxt;
					}

					/*
					 * Do not allow bursty traffic during
					 * fast recovery.  Refer to Fall and
					 * Floyd's paper "Simulation-based
					 * Comparisons of Tahoe, Reno and
					 * SACK TCP" (in CCR ??).  This is a
					 * best current practice.
					 */
					tcp->tcp_snd_burst = TCP_CWND_SS;

					/*
					 * For SACK:
					 * Calculate tcp_pipe, which is the
					 * estimated number of bytes in the
					 * network.
					 *
					 * tcp_fack is the highest sack'ed
					 * seq num TCP has received.
					 *
					 * tcp_pipe is explained in the above
					 * quoted Fall and Floyd's paper.
					 * tcp_fack is explained in Mathis
					 * and Mahdavi's "Forward
					 * Acknowledgment: Refining TCP
					 * Congestion Control" in SIGCOMM '96.
					 */
					if (tcp->tcp_snd_sack_ok) {
						assert(tcp->tcp_sack_info !=
						    NULL);
						if (tcp->tcp_notsack_list !=
						    NULL) {
							tcp->tcp_pipe =
							    tcp->tcp_snxt -
							    tcp->tcp_fack;
							tcp->tcp_sack_snxt =
							    seg_ack;
							flags |=
							    TH_NEED_SACK_REXMIT;
						} else {
							/*
							 * Always initialize
							 * tcp_pipe even
							 * though we don't
							 * have any SACK
							 * info.  If later we
							 * get SACK info and
							 * tcp_pipe is not
							 * initialized, funny
							 * things will happen.
							 */
							tcp->tcp_pipe =
							    tcp->tcp_cwnd_ssthresh;
						}
					} else {
						flags |= TH_REXMIT_NEEDED;
					} /* tcp_snd_sack_ok */

				} else {
					/*
					 * Here we perform congestion
					 * avoidance, but NOT slow start.
					 * This is known as the Fast
					 * Recovery Algorithm.
					 */
					if (tcp->tcp_snd_sack_ok &&
					    tcp->tcp_notsack_list != NULL) {
						flags |= TH_NEED_SACK_REXMIT;
						tcp->tcp_pipe -= mss;
						if (tcp->tcp_pipe < 0)
							tcp->tcp_pipe = 0;
					} else {
						/*
						 * We know that one more packet
						 * has left the pipe, thus we
						 * can update cwnd.
						 */
						cwnd = tcp->tcp_cwnd + mss;
						if (cwnd > tcp->tcp_cwnd_max)
							cwnd = tcp->tcp_cwnd_max;
						tcp->tcp_cwnd = cwnd;
						flags |= TH_XMIT_NEEDED;
					}
				}
			}
		} else if (tcp->tcp_zero_win_probe) {
			/*
			 * If the window has opened, need to arrange
			 * to send additional data.
			 */
			if (new_swnd != 0) {
				/* tcp_suna != tcp_snxt */
				/* Packet contains a window update */
				BUMP_MIB(tcp_mib.tcpInWinUpdate);
				tcp->tcp_zero_win_probe = 0;
				tcp->tcp_timer_backoff = 0;
				tcp->tcp_ms_we_have_waited = 0;

				/*
				 * Transmit starting with tcp_suna since
				 * the one byte probe is not ack'ed.
				 * If TCP has sent more than one identical
				 * probe, tcp_rexmit will be set.  That means
				 * tcp_ss_rexmit() will send out the one
				 * byte along with new data.  Otherwise,
				 * fake the retransmission.
				 */
				flags |= TH_XMIT_NEEDED;
				if (!tcp->tcp_rexmit) {
					tcp->tcp_rexmit = B_TRUE;
					tcp->tcp_dupack_cnt = 0;
					tcp->tcp_rexmit_nxt = tcp->tcp_suna;
					tcp->tcp_rexmit_max = tcp->tcp_suna + 1;
				}
			}
		}
		goto swnd_update;
	}

	/*
	 * Check for "acceptability" of ACK value per RFC 793,
	 * pages 72 - 73.
	 * If the ACK value acks something that we have not yet sent, it might
	 * be an old duplicate segment.  Send an ACK to re-synchronize the
	 * other side.
	 * Note: reset in response to an unacceptable ACK in SYN_RCVD
	 * state is handled above, so we can always just drop the segment and
	 * send an ACK here.
	 *
	 * Should we send ACKs in response to ACK only segments?
	 */
	if (SEQ_GT(seg_ack, tcp->tcp_snxt)) {
		BUMP_MIB(tcp_mib.tcpInAckUnsent);
		/* drop the received segment */
		freemsg(mp);

		/* Send back an ACK. */
		mp = tcp_ack_mp(tcp);

		if (mp == NULL) {
			return;
		}
		BUMP_MIB(tcp_mib.tcpOutAck);
		(void) ipv4_tcp_output(sock_id, mp);
		freeb(mp);
		return;
	}

	/*
	 * TCP gets a new ACK, update the notsack'ed list to delete those
	 * blocks that are covered by this ACK.
	 */
	if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
		tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
		    &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
	}

	/*
	 * If we got an ACK after fast retransmit, check to see
	 * if it is a partial ACK.  If it is not and the congestion
	 * window was inflated to account for the other side's
	 * cached packets, retract it.  If it is, do Hoe's algorithm.
	 */
	if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) {
		assert(tcp->tcp_rexmit == B_FALSE);
		if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
			tcp->tcp_dupack_cnt = 0;
			/*
			 * Restore the orig tcp_cwnd_ssthresh after
			 * fast retransmit phase.
			 */
			if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
				tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
			}
			tcp->tcp_rexmit_max = seg_ack;
			tcp->tcp_cwnd_cnt = 0;
			tcp->tcp_snd_burst = TCP_CWND_NORMAL;

			/*
			 * Remove all notsack info to avoid confusion with
			 * the next fast retransmit/recovery phase.
			 */
			if (tcp->tcp_snd_sack_ok &&
			    tcp->tcp_notsack_list != NULL) {
				TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
			}
		} else {
			if (tcp->tcp_snd_sack_ok &&
			    tcp->tcp_notsack_list != NULL) {
				flags |= TH_NEED_SACK_REXMIT;
				tcp->tcp_pipe -= mss;
				if (tcp->tcp_pipe < 0)
					tcp->tcp_pipe = 0;
			} else {
				/*
				 * Hoe's algorithm:
				 *
				 * Retransmit the unack'ed segment and
				 * restart fast recovery.  Note that we
				 * need to scale back tcp_cwnd to the
				 * original value when we started fast
				 * recovery.  This is to prevent overly
				 * aggressive behaviour in sending new
				 * segments.
				 */
				tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
				    tcp_dupack_fast_retransmit * mss;
				tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
				BUMP_MIB(tcp_mib.tcpOutFastRetrans);
				flags |= TH_REXMIT_NEEDED;
			}
		}
	} else {
		tcp->tcp_dupack_cnt = 0;
		if (tcp->tcp_rexmit) {
			/*
			 * TCP is retransmitting.  If the ACK ack's all
			 * outstanding data, update tcp_rexmit_max and
			 * tcp_rexmit_nxt.  Otherwise, update tcp_rexmit_nxt
			 * to the correct value.
			 *
			 * Note that SEQ_LEQ() is used.  This is to avoid
			 * unnecessary fast retransmit caused by dup ACKs
			 * received when TCP does slow start retransmission
			 * after a time out.  During this phase, TCP may
			 * send out segments which are already received.
			 * This causes dup ACKs to be sent back.
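			 * (Concretely: while seg_ack is still at or below
			 * tcp_rexmit_max we remain in the retransmission
			 * phase, so the ACK merely advances tcp_rexmit_nxt
			 * below instead of feeding the dup-ACK counter.)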
			 */
			if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) {
				if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) {
					tcp->tcp_rexmit_nxt = seg_ack;
				}
				if (seg_ack != tcp->tcp_rexmit_max) {
					flags |= TH_XMIT_NEEDED;
				}
			} else {
				tcp->tcp_rexmit = B_FALSE;
				tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
				tcp->tcp_snd_burst = TCP_CWND_NORMAL;
			}
			tcp->tcp_ms_we_have_waited = 0;
		}
	}

	BUMP_MIB(tcp_mib.tcpInAckSegs);
	UPDATE_MIB(tcp_mib.tcpInAckBytes, bytes_acked);
	tcp->tcp_suna = seg_ack;
	if (tcp->tcp_zero_win_probe != 0) {
		tcp->tcp_zero_win_probe = 0;
		tcp->tcp_timer_backoff = 0;
	}

	/*
	 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed.
	 * Note that it cannot be the SYN being ack'ed.  The code flow
	 * will not reach here.
	 */
	if (mp1 == NULL) {
		goto fin_acked;
	}

	/*
	 * Update the congestion window.
	 *
	 * If TCP is not ECN capable or TCP is ECN capable but the
	 * congestion experience bit is not set, increase the tcp_cwnd as
	 * usual.
	 */
	if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) {
		cwnd = tcp->tcp_cwnd;
		add = mss;

		if (cwnd >= tcp->tcp_cwnd_ssthresh) {
			/*
			 * This is to prevent an increase of less than 1 MSS of
			 * tcp_cwnd.  With partial increase, tcp_wput_data()
			 * may send out tinygrams in order to preserve mblk
			 * boundaries.
			 *
			 * By initializing tcp_cwnd_cnt to new tcp_cwnd and
			 * decrementing it by 1 MSS for every ACK, tcp_cwnd is
			 * increased by 1 MSS for every RTT.
			 */
			if (tcp->tcp_cwnd_cnt <= 0) {
				tcp->tcp_cwnd_cnt = cwnd + add;
			} else {
				tcp->tcp_cwnd_cnt -= add;
				add = 0;
			}
		}
		tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max);
	}

	/* Can we update the RTT estimates? */
	if (tcp->tcp_snd_ts_ok) {
		/* Ignore zero timestamp echo-reply. */
		if (tcpopt.tcp_opt_ts_ecr != 0) {
			tcp_set_rto(tcp, (int32_t)(prom_gettime() -
			    tcpopt.tcp_opt_ts_ecr));
		}

		/* If needed, restart the timer. */
		if (tcp->tcp_set_timer == 1) {
			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
			tcp->tcp_set_timer = 0;
		}
		/*
		 * Update tcp_csuna in case the other side stops sending
		 * us timestamps.
		 */
		tcp->tcp_csuna = tcp->tcp_snxt;
	} else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
		/*
		 * An ACK sequence we haven't seen before, so get the RTT
		 * and update the RTO.
		 */
		tcp_set_rto(tcp, (int32_t)(prom_gettime() -
		    (uint32_t)mp1->b_prev));

		/* Remember the last sequence to be ACKed */
		tcp->tcp_csuna = seg_ack;
		if (tcp->tcp_set_timer == 1) {
			TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
			tcp->tcp_set_timer = 0;
		}
	} else {
		BUMP_MIB(tcp_mib.tcpRttNoUpdate);
	}

	/* Eat acknowledged bytes off the xmit queue. */
	for (;;) {
		mblk_t	*mp2;
		uchar_t	*wptr;

		wptr = mp1->b_wptr;
		assert((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
		bytes_acked -= (int)(wptr - mp1->b_rptr);
		if (bytes_acked < 0) {
			mp1->b_rptr = wptr + bytes_acked;
			break;
		}
		mp1->b_prev = NULL;
		mp2 = mp1;
		mp1 = mp1->b_cont;
		freeb(mp2);
		if (bytes_acked == 0) {
			if (mp1 == NULL) {
				/* Everything is ack'ed, clear the tail. */
*/
3877 tcp->tcp_xmit_tail = NULL;
3878 goto pre_swnd_update;
3879 }
3880 if (mp2 != tcp->tcp_xmit_tail)
3881 break;
3882 tcp->tcp_xmit_tail = mp1;
3883 assert((uintptr_t)(mp1->b_wptr -
3884 mp1->b_rptr) <= (uintptr_t)INT_MAX);
3885 tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr -
3886 mp1->b_rptr);
3887 break;
3888 }
3889 if (mp1 == NULL) {
3890 /*
3891 * More was acked but there is nothing more
3892 * outstanding. This means that the FIN was
3893 * just acked or that we're talking to a clown.
3894 */
3895 fin_acked:
3896 assert(tcp->tcp_fin_sent);
3897 tcp->tcp_xmit_tail = NULL;
3898 if (tcp->tcp_fin_sent) {
3899 tcp->tcp_fin_acked = B_TRUE;
3900 } else {
3901 /*
3902 * We should never get here because
3903 * we have already checked that the
3904 * number of bytes ack'ed should be
3905 * smaller than or equal to what we
3906 * have sent so far (it is the
3907 * acceptability check of the ACK).
3908 * We can only get here if the send
3909 * queue is corrupted.
3910 *
3911 * Terminate the connection and
3912 * panic the system. It is better
3913 * for us to panic instead of
3914 * continuing, to avoid further disaster.
3915 */
3916 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt,
3917 tcp->tcp_rnxt, TH_RST|TH_ACK, 0, sock_id);
3918 printf("Memory corruption "
3919 "detected for connection %s.\n",
3920 tcp_display(tcp, NULL,
3921 DISP_ADDR_AND_PORT));
3922 /* We should never get here... */
3923 prom_panic("tcp_rput_data");
3924 return;
3925 }
3926 goto pre_swnd_update;
3927 }
3928 assert(mp2 != tcp->tcp_xmit_tail);
3929 }
3930 if (tcp->tcp_unsent) {
3931 flags |= TH_XMIT_NEEDED;
3932 }
3933 pre_swnd_update:
3934 tcp->tcp_xmit_head = mp1;
3935 swnd_update:
3936 /*
3937 * The following check is different from most other implementations.
3938 * For bi-directional transfer, when segments are dropped, the
3939 * "normal" check will not accept a window update in those
3940 * retransmitted segments. Failing to do that, TCP may send out
3941 * segments which are outside the receiver's window. As TCP accepts
3942 * the ack in those retransmitted segments, if the window update in
3943 * the same segment is not accepted, TCP will incorrectly calculate
3944 * that it can send more segments. This can create a deadlock
3945 * with the receiver if its window becomes zero.
3946 */
3947 if (SEQ_LT(tcp->tcp_swl2, seg_ack) ||
3948 SEQ_LT(tcp->tcp_swl1, seg_seq) ||
3949 (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) {
3950 /*
3951 * The criteria for update are:
3952 *
3953 * 1. the segment acknowledges some data. Or
3954 * 2. the segment is new, i.e. it has a higher seq num. Or
3955 * 3. the segment is not old and the advertised window is
3956 * larger than the previous advertised window.
3957 */
3958 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
3959 flags |= TH_XMIT_NEEDED;
3960 tcp->tcp_swnd = new_swnd;
3961 if (new_swnd > tcp->tcp_max_swnd)
3962 tcp->tcp_max_swnd = new_swnd;
3963 tcp->tcp_swl1 = seg_seq;
3964 tcp->tcp_swl2 = seg_ack;
3965 }
3966 est:
3967 if (tcp->tcp_state > TCPS_ESTABLISHED) {
3968 switch (tcp->tcp_state) {
3969 case TCPS_FIN_WAIT_1:
3970 if (tcp->tcp_fin_acked) {
3971 tcp->tcp_state = TCPS_FIN_WAIT_2;
3972 /*
3973 * We implement the non-standard BSD/SunOS
3974 * FIN_WAIT_2 flushing algorithm.
3975 * If there is no user attached to this
3976 * TCP endpoint, then this TCP struct
3977 * could hang around forever in FIN_WAIT_2
3978 * state if the peer forgets to send us
3979 * a FIN. To prevent this, we wait only
3980 * 2*MSL (a convenient time value) for
3981 * the FIN to arrive.
If it doesn't show up,
3982 * we flush the TCP endpoint. This algorithm,
3983 * though a violation of RFC-793, has worked
3984 * for over 10 years in BSD systems.
3985 * Note: SunOS 4.x waits 675 seconds before
3986 * flushing the FIN_WAIT_2 connection.
3987 */
3988 TCP_TIMER_RESTART(tcp,
3989 tcp_fin_wait_2_flush_interval);
3990 }
3991 break;
3992 case TCPS_FIN_WAIT_2:
3993 break; /* Shutdown hook? */
3994 case TCPS_LAST_ACK:
3995 freemsg(mp);
3996 if (tcp->tcp_fin_acked) {
3997 (void) tcp_clean_death(sock_id, tcp, 0);
3998 return;
3999 }
4000 goto xmit_check;
4001 case TCPS_CLOSING:
4002 if (tcp->tcp_fin_acked) {
4003 tcp->tcp_state = TCPS_TIME_WAIT;
4004 tcp_time_wait_append(tcp);
4005 TCP_TIMER_RESTART(tcp, tcp_time_wait_interval);
4006 }
4007 /*FALLTHRU*/
4008 case TCPS_CLOSE_WAIT:
4009 freemsg(mp);
4010 goto xmit_check;
4011 default:
4012 assert(tcp->tcp_state != TCPS_TIME_WAIT);
4013 break;
4014 }
4015 }
4016 if (flags & TH_FIN) {
4017 /* Make sure we ack the fin */
4018 flags |= TH_ACK_NEEDED;
4019 if (!tcp->tcp_fin_rcvd) {
4020 tcp->tcp_fin_rcvd = B_TRUE;
4021 tcp->tcp_rnxt++;
4022 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
4023
4024 switch (tcp->tcp_state) {
4025 case TCPS_SYN_RCVD:
4026 case TCPS_ESTABLISHED:
4027 tcp->tcp_state = TCPS_CLOSE_WAIT;
4028 /* Keepalive? */
4029 break;
4030 case TCPS_FIN_WAIT_1:
4031 if (!tcp->tcp_fin_acked) {
4032 tcp->tcp_state = TCPS_CLOSING;
4033 break;
4034 }
4035 /* FALLTHRU */
4036 case TCPS_FIN_WAIT_2:
4037 tcp->tcp_state = TCPS_TIME_WAIT;
4038 tcp_time_wait_append(tcp);
4039 TCP_TIMER_RESTART(tcp, tcp_time_wait_interval);
4040 if (seg_len) {
4041 /*
4042 * Implies data piggybacked on the FIN;
4043 * break to handle the data.
4044 */
4045 break;
4046 }
4047 freemsg(mp);
4048 goto ack_check;
4049 }
4050 }
4051 }
4052 if (mp == NULL)
4053 goto xmit_check;
4054 if (seg_len == 0) {
4055 freemsg(mp);
4056 goto xmit_check;
4057 }
4058 if (mp->b_rptr == mp->b_wptr) {
4059 /*
4060 * The header has been consumed, so we remove the
4061 * zero-length mblk here.
4062 */
4063 mp1 = mp;
4064 mp = mp->b_cont;
4065 freeb(mp1);
4066 }
4067 /*
4068 * ACK every other segment, unless the input queue is empty
4069 * as we don't have a timer available.
4070 */
4071 if (++tcp->tcp_rack_cnt == 2 || sockets[sock_id].inq == NULL) {
4072 flags |= TH_ACK_NEEDED;
4073 tcp->tcp_rack_cnt = 0;
4074 }
4075 tcp->tcp_rnxt += seg_len;
4076 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack);
4077
4078 /* Update SACK list */
4079 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
4080 tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt,
4081 &(tcp->tcp_num_sack_blk));
4082 }
4083
4084 if (tcp->tcp_listener) {
4085 /*
4086 * Side queue inbound data until the accept happens.
4087 * tcp_accept/tcp_rput drains this when the accept happens.
4088 */
4089 tcp_rcv_enqueue(tcp, mp, seg_len);
4090 } else {
4091 /* Just queue the data until the app calls read. */
4092 tcp_rcv_enqueue(tcp, mp, seg_len);
4093 /*
4094 * Make sure the timer is running if we have data waiting
4095 * for a push bit. This provides resiliency against
4096 * implementations that do not correctly generate push bits.
4097 */
4098 if (tcp->tcp_rcv_list != NULL)
4099 flags |= TH_TIMER_NEEDED;
4100 }
4101
4102 xmit_check:
4103 /* Is there anything left to do? */
4104 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED|
4105 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_TIMER_NEEDED)) == 0)
4106 return;
4107
4108 /* Any transmit work to do and a non-zero window?
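 * ("Transmit work" means any of TH_REXMIT_NEEDED, TH_XMIT_NEEDED,
 * TH_NEED_SACK_REXMIT or TH_LIMIT_XMIT, tested below.)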
*/
4109 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
4110 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
4111 if (flags & TH_REXMIT_NEEDED) {
4112 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
4113
4114 if (snd_size > mss)
4115 snd_size = mss;
4116 if (snd_size > tcp->tcp_swnd)
4117 snd_size = tcp->tcp_swnd;
4118 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
4119 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
4120 B_TRUE);
4121
4122 if (mp1 != NULL) {
4123 tcp->tcp_xmit_head->b_prev =
4124 (mblk_t *)prom_gettime();
4125 tcp->tcp_csuna = tcp->tcp_snxt;
4126 BUMP_MIB(tcp_mib.tcpRetransSegs);
4127 UPDATE_MIB(tcp_mib.tcpRetransBytes, snd_size);
4128 (void) ipv4_tcp_output(sock_id, mp1);
4129 freeb(mp1);
4130 }
4131 }
4132 if (flags & TH_NEED_SACK_REXMIT) {
4133 if (tcp_sack_rxmit(tcp, sock_id) != 0) {
4134 flags |= TH_XMIT_NEEDED;
4135 }
4136 }
4137 /*
4138 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
4139 * out new segments. Note that tcp_rexmit should not be
4140 * set, otherwise TH_LIMIT_XMIT should not be set.
4141 */
4142 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
4143 if (!tcp->tcp_rexmit) {
4144 tcp_wput_data(tcp, NULL, sock_id);
4145 } else {
4146 tcp_ss_rexmit(tcp, sock_id);
4147 }
4148 /*
4149 * The TCP could be closed in tcp_state_wait via
4150 * tcp_wput_data (tcp_ss_rexmit could call
4151 * tcp_wput_data as well).
4152 */
4153 if (sockets[sock_id].pcb == NULL)
4154 return;
4155 }
4156 /*
4157 * Adjust tcp_cwnd back to normal value after sending
4158 * new data segments.
4159 */
4160 if (flags & TH_LIMIT_XMIT) {
4161 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
4162 }
4163
4164 /* Anything more to do? */
4165 if ((flags & (TH_ACK_NEEDED|TH_TIMER_NEEDED)) == 0)
4166 return;
4167 }
4168 ack_check:
4169 if (flags & TH_ACK_NEEDED) {
4170 /*
4171 * Time to send an ack for some reason.
4172 */
4173 if ((mp1 = tcp_ack_mp(tcp)) != NULL) {
4174 TCP_DUMP_PACKET("tcp_rput_data: ack mp", mp1);
4175 (void) ipv4_tcp_output(sock_id, mp1);
4176 BUMP_MIB(tcp_mib.tcpOutAck);
4177 freeb(mp1);
4178 }
4179 }
4180 }
4181
4182 /*
4183 * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start
4184 * retransmission after a timeout.
4185 *
4186 * To limit the number of duplicate segments, we limit the number of
4187 * segments sent at one time to tcp_snd_burst, the burst variable.
4188 */
4189 static void
4190 tcp_ss_rexmit(tcp_t *tcp, int sock_id)
4191 {
4192 uint32_t snxt;
4193 uint32_t smax;
4194 int32_t win;
4195 int32_t mss;
4196 int32_t off;
4197 int32_t burst = tcp->tcp_snd_burst;
4198 mblk_t *snxt_mp;
4199
4200 /*
4201 * Note that tcp_rexmit can be set even though TCP has retransmitted
4202 * all unack'ed segments.
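 *
 * For illustration (assumed values): with tcp_snd_burst at
 * TCP_CWND_SS (3) and an MSS of 1460, each call retransmits at
 * most three segments, about 4380 bytes; whatever remains waits
 * for the next ACK or timeout.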
4203 */
4204 if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) {
4205 smax = tcp->tcp_rexmit_max;
4206 snxt = tcp->tcp_rexmit_nxt;
4207 if (SEQ_LT(snxt, tcp->tcp_suna)) {
4208 snxt = tcp->tcp_suna;
4209 }
4210 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd);
4211 win -= snxt - tcp->tcp_suna;
4212 mss = tcp->tcp_mss;
4213 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off);
4214
4215 while (SEQ_LT(snxt, smax) && (win > 0) &&
4216 (burst > 0) && (snxt_mp != NULL)) {
4217 mblk_t *xmit_mp;
4218 mblk_t *old_snxt_mp = snxt_mp;
4219 uint32_t cnt = mss;
4220
4221 if (win < cnt) {
4222 cnt = win;
4223 }
4224 if (SEQ_GT(snxt + cnt, smax)) {
4225 cnt = smax - snxt;
4226 }
4227 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off,
4228 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE);
4229
4230 if (xmit_mp == NULL)
4231 return;
4232
4233 (void) ipv4_tcp_output(sock_id, xmit_mp);
4234 freeb(xmit_mp);
4235
4236 snxt += cnt;
4237 win -= cnt;
4238 /*
4239 * Update the send timestamp to avoid false
4240 * retransmission.
4241 */
4242 old_snxt_mp->b_prev = (mblk_t *)prom_gettime();
4243 BUMP_MIB(tcp_mib.tcpRetransSegs);
4244 UPDATE_MIB(tcp_mib.tcpRetransBytes, cnt);
4245
4246 tcp->tcp_rexmit_nxt = snxt;
4247 burst--;
4248 }
4249 /*
4250 * If we have transmitted all we have at the time
4251 * we started the retransmission, we can leave
4252 * the rest of the job to tcp_wput_data(). But we
4253 * need to check the send window first. If the
4254 * win is not 0, go on with tcp_wput_data().
4255 */
4256 if (SEQ_LT(snxt, smax) || win == 0) {
4257 return;
4258 }
4259 }
4260 /* Only call tcp_wput_data() if there is data to be sent. */
4261 if (tcp->tcp_unsent) {
4262 tcp_wput_data(tcp, NULL, sock_id);
4263 }
4264 }
4265
4266 /*
4267 * tcp_timer is the timer service routine. It handles all timer events for
4268 * a tcp instance except keepalives. It figures out from the state of the
4269 * tcp instance what kind of action needs to be done at the time it is called.
4270 */
4271 static void
4272 tcp_timer(tcp_t *tcp, int sock_id)
4273 {
4274 mblk_t *mp;
4275 uint32_t first_threshold;
4276 uint32_t second_threshold;
4277 uint32_t ms;
4278 uint32_t mss;
4279
4280 first_threshold = tcp->tcp_first_timer_threshold;
4281 second_threshold = tcp->tcp_second_timer_threshold;
4282 switch (tcp->tcp_state) {
4283 case TCPS_IDLE:
4284 case TCPS_BOUND:
4285 case TCPS_LISTEN:
4286 return;
4287 case TCPS_SYN_RCVD:
4288 case TCPS_SYN_SENT:
4289 first_threshold = tcp->tcp_first_ctimer_threshold;
4290 second_threshold = tcp->tcp_second_ctimer_threshold;
4291 break;
4292 case TCPS_ESTABLISHED:
4293 case TCPS_FIN_WAIT_1:
4294 case TCPS_CLOSING:
4295 case TCPS_CLOSE_WAIT:
4296 case TCPS_LAST_ACK:
4297 /* If we have data to rexmit */
4298 if (tcp->tcp_suna != tcp->tcp_snxt) {
4299 int32_t time_to_wait;
4300
4301 BUMP_MIB(tcp_mib.tcpTimRetrans);
4302 if (tcp->tcp_xmit_head == NULL)
4303 break;
4304 time_to_wait = (int32_t)(prom_gettime() -
4305 (uint32_t)tcp->tcp_xmit_head->b_prev);
4306 time_to_wait = tcp->tcp_rto - time_to_wait;
4307 if (time_to_wait > 0) {
4308 /*
4309 * Timer fired too early, so restart it.
4310 */
4311 TCP_TIMER_RESTART(tcp, time_to_wait);
4312 return;
4313 }
4314 /*
4315 * When we probe zero windows, we force the swnd open.
4316 * If our peer acks with a closed window swnd will be
4317 * set to zero by tcp_rput(). As long as we are
4318 * receiving acks tcp_rput will
4319 * reset 'tcp_ms_we_have_waited' so as not to trip the
4320 * first and second interval actions. NOTE: the timer
4321 * interval is allowed to continue its exponential
4322 * backoff.
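 *
 * For example (illustrative timing): with a 500 ms RTO, probes of
 * a persistently zero window go out after roughly 1, 2, 4, ...
 * seconds as the backoff doubles, capped by
 * tcp_rexmit_interval_max.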
4323 */
4324 if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
4325 DEBUG_1("tcp_timer (%d): zero win", sock_id);
4326 break;
4327 } else {
4328 /*
4329 * After retransmission, we need to do
4330 * slow start. Set the ssthresh to one
4331 * half of current effective window and
4332 * cwnd to one MSS. Also reset
4333 * tcp_cwnd_cnt.
4334 *
4335 * Note that if tcp_ssthresh is reduced because
4336 * of ECN, do not reduce it again unless it is
4337 * already one window of data away (tcp_cwr
4338 * should then be cleared) or this is a
4339 * timeout for a retransmitted segment.
4340 */
4341 uint32_t npkt;
4342
4343 if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
4344 npkt = (MIN((tcp->tcp_timer_backoff ?
4345 tcp->tcp_cwnd_ssthresh :
4346 tcp->tcp_cwnd),
4347 tcp->tcp_swnd) >> 1) /
4348 tcp->tcp_mss;
4349 if (npkt < 2)
4350 npkt = 2;
4351 tcp->tcp_cwnd_ssthresh = npkt *
4352 tcp->tcp_mss;
4353 }
4354 tcp->tcp_cwnd = tcp->tcp_mss;
4355 tcp->tcp_cwnd_cnt = 0;
4356 if (tcp->tcp_ecn_ok) {
4357 tcp->tcp_cwr = B_TRUE;
4358 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
4359 tcp->tcp_ecn_cwr_sent = B_FALSE;
4360 }
4361 }
4362 break;
4363 }
4364 /*
4365 * We have something to send yet we cannot send. The
4366 * reason can be:
4367 *
4368 * 1. Zero send window: we need to do zero window probe.
4369 * 2. Zero cwnd: because of ECN, we need to "clock out"
4370 * segments.
4371 * 3. SWS avoidance: receiver may have shrunk its window,
4372 * so reset our knowledge of it.
4373 *
4374 * Note that condition 2 can happen with either 1 or
4375 * 3. But 1 and 3 are exclusive.
4376 */
4377 if (tcp->tcp_unsent != 0) {
4378 if (tcp->tcp_cwnd == 0) {
4379 /*
4380 * Set tcp_cwnd to 1 MSS so that a
4381 * new segment can be sent out. We
4382 * are "clocking out" new data when
4383 * the network is really congested.
4384 */
4385 assert(tcp->tcp_ecn_ok);
4386 tcp->tcp_cwnd = tcp->tcp_mss;
4387 }
4388 if (tcp->tcp_swnd == 0) {
4389 /* Extend window for zero window probe */
4390 tcp->tcp_swnd++;
4391 tcp->tcp_zero_win_probe = B_TRUE;
4392 BUMP_MIB(tcp_mib.tcpOutWinProbe);
4393 } else {
4394 /*
4395 * Handle timeout from sender SWS avoidance.
4396 * Reset our knowledge of the max send window
4397 * since the receiver might have reduced its
4398 * receive buffer. Avoid setting tcp_max_swnd
4399 * to one since that will essentially disable
4400 * the SWS checks.
4401 *
4402 * Note that since we don't have a SWS
4403 * state variable, if the timeout is set
4404 * for ECN but not for SWS, this
4405 * code will also be executed. This is
4406 * fine as tcp_max_swnd is updated
4407 * constantly and it will not affect
4408 * anything.
4409 */
4410 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
4411 }
4412 tcp_wput_data(tcp, NULL, sock_id);
4413 return;
4414 }
4415 /* Is there a FIN that needs to be retransmitted? */
4416 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
4417 !tcp->tcp_fin_acked)
4418 break;
4419 /* Nothing to do, return without restarting timer. */
4420 return;
4421 case TCPS_FIN_WAIT_2:
4422 /*
4423 * User closed the TCP endpoint and peer ACK'ed our FIN.
4424 * We waited some time for the peer's FIN, but it hasn't
4425 * arrived. We flush the connection now to avoid
4426 * the case where the peer has rebooted.
4427 */ 4428 /* FALLTHRU */ 4429 case TCPS_TIME_WAIT: 4430 (void) tcp_clean_death(sock_id, tcp, 0); 4431 return; 4432 default: 4433 DEBUG_3("tcp_timer (%d): strange state (%d) %s", sock_id, 4434 tcp->tcp_state, tcp_display(tcp, NULL, 4435 DISP_PORT_ONLY)); 4436 return; 4437 } 4438 if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { 4439 /* 4440 * For zero window probe, we need to send indefinitely, 4441 * unless we have not heard from the other side for some 4442 * time... 4443 */ 4444 if ((tcp->tcp_zero_win_probe == 0) || 4445 ((prom_gettime() - tcp->tcp_last_recv_time) > 4446 second_threshold)) { 4447 BUMP_MIB(tcp_mib.tcpTimRetransDrop); 4448 /* 4449 * If TCP is in SYN_RCVD state, send back a 4450 * RST|ACK as BSD does. Note that tcp_zero_win_probe 4451 * should be zero in TCPS_SYN_RCVD state. 4452 */ 4453 if (tcp->tcp_state == TCPS_SYN_RCVD) { 4454 tcp_xmit_ctl("tcp_timer: RST sent on timeout " 4455 "in SYN_RCVD", 4456 tcp, NULL, tcp->tcp_snxt, 4457 tcp->tcp_rnxt, TH_RST | TH_ACK, 0, sock_id); 4458 } 4459 (void) tcp_clean_death(sock_id, tcp, 4460 tcp->tcp_client_errno ? 4461 tcp->tcp_client_errno : ETIMEDOUT); 4462 return; 4463 } else { 4464 /* 4465 * Set tcp_ms_we_have_waited to second_threshold 4466 * so that in next timeout, we will do the above 4467 * check (lbolt - tcp_last_recv_time). This is 4468 * also to avoid overflow. 4469 * 4470 * We don't need to decrement tcp_timer_backoff 4471 * to avoid overflow because it will be decremented 4472 * later if new timeout value is greater than 4473 * tcp_rexmit_interval_max. In the case when 4474 * tcp_rexmit_interval_max is greater than 4475 * second_threshold, it means that we will wait 4476 * longer than second_threshold to send the next 4477 * window probe. 4478 */ 4479 tcp->tcp_ms_we_have_waited = second_threshold; 4480 } 4481 } else if (ms > first_threshold && tcp->tcp_rtt_sa != 0) { 4482 /* 4483 * We have been retransmitting for too long... The RTT 4484 * we calculated is probably incorrect. Reinitialize it. 4485 * Need to compensate for 0 tcp_rtt_sa. Reset 4486 * tcp_rtt_update so that we won't accidentally cache a 4487 * bad value. But only do this if this is not a zero 4488 * window probe. 4489 */ 4490 if (tcp->tcp_zero_win_probe == 0) { 4491 tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + 4492 (tcp->tcp_rtt_sa >> 5); 4493 tcp->tcp_rtt_sa = 0; 4494 tcp->tcp_rtt_update = 0; 4495 } 4496 } 4497 tcp->tcp_timer_backoff++; 4498 if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 4499 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < 4500 tcp_rexmit_interval_min) { 4501 /* 4502 * This means the original RTO is tcp_rexmit_interval_min. 4503 * So we will use tcp_rexmit_interval_min as the RTO value 4504 * and do the backoff. 4505 */ 4506 ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff; 4507 } else { 4508 ms <<= tcp->tcp_timer_backoff; 4509 } 4510 if (ms > tcp_rexmit_interval_max) { 4511 ms = tcp_rexmit_interval_max; 4512 /* 4513 * ms is at max, decrement tcp_timer_backoff to avoid 4514 * overflow. 4515 */ 4516 tcp->tcp_timer_backoff--; 4517 } 4518 tcp->tcp_ms_we_have_waited += ms; 4519 if (tcp->tcp_zero_win_probe == 0) { 4520 tcp->tcp_rto = ms; 4521 } 4522 TCP_TIMER_RESTART(tcp, ms); 4523 /* 4524 * This is after a timeout and tcp_rto is backed off. Set 4525 * tcp_set_timer to 1 so that next time RTO is updated, we will 4526 * restart the timer with a correct value. 
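 *
 * As an illustration of the backoff arithmetic above (made-up
 * numbers): the new interval is ((rtt_sa >> 3) + rtt_sd +
 * tcp_rexmit_interval_extra + (rtt_sa >> 5)) << tcp_timer_backoff,
 * clamped to [tcp_rexmit_interval_min, tcp_rexmit_interval_max],
 * so a 750 ms base interval grows to 1.5, 3, 6, ... seconds on
 * successive timeouts.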
4527 */ 4528 tcp->tcp_set_timer = 1; 4529 mss = tcp->tcp_snxt - tcp->tcp_suna; 4530 if (mss > tcp->tcp_mss) 4531 mss = tcp->tcp_mss; 4532 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) 4533 mss = tcp->tcp_swnd; 4534 4535 if ((mp = tcp->tcp_xmit_head) != NULL) 4536 mp->b_prev = (mblk_t *)prom_gettime(); 4537 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, 4538 B_TRUE); 4539 if (mp == NULL) 4540 return; 4541 tcp->tcp_csuna = tcp->tcp_snxt; 4542 BUMP_MIB(tcp_mib.tcpRetransSegs); 4543 UPDATE_MIB(tcp_mib.tcpRetransBytes, mss); 4544 /* Dump the packet when debugging. */ 4545 TCP_DUMP_PACKET("tcp_timer", mp); 4546 4547 (void) ipv4_tcp_output(sock_id, mp); 4548 freeb(mp); 4549 4550 /* 4551 * When slow start after retransmission begins, start with 4552 * this seq no. tcp_rexmit_max marks the end of special slow 4553 * start phase. tcp_snd_burst controls how many segments 4554 * can be sent because of an ack. 4555 */ 4556 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 4557 tcp->tcp_snd_burst = TCP_CWND_SS; 4558 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 4559 (tcp->tcp_unsent == 0)) { 4560 tcp->tcp_rexmit_max = tcp->tcp_fss; 4561 } else { 4562 tcp->tcp_rexmit_max = tcp->tcp_snxt; 4563 } 4564 tcp->tcp_rexmit = B_TRUE; 4565 tcp->tcp_dupack_cnt = 0; 4566 4567 /* 4568 * Remove all rexmit SACK blk to start from fresh. 4569 */ 4570 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 4571 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 4572 tcp->tcp_num_notsack_blk = 0; 4573 tcp->tcp_cnt_notsack_list = 0; 4574 } 4575 } 4576 4577 /* 4578 * The TCP normal data output path. 4579 * NOTE: the logic of the fast path is duplicated from this function. 4580 */ 4581 static void 4582 tcp_wput_data(tcp_t *tcp, mblk_t *mp, int sock_id) 4583 { 4584 int len; 4585 mblk_t *local_time; 4586 mblk_t *mp1; 4587 uchar_t *rptr; 4588 uint32_t snxt; 4589 int tail_unsent; 4590 int tcpstate; 4591 int usable = 0; 4592 mblk_t *xmit_tail; 4593 int32_t num_burst_seg; 4594 int32_t mss; 4595 int32_t num_sack_blk = 0; 4596 int32_t tcp_hdr_len; 4597 ipaddr_t *dst; 4598 ipaddr_t *src; 4599 4600 #ifdef DEBUG 4601 printf("tcp_wput_data(%d) ##############################\n", sock_id); 4602 #endif 4603 tcpstate = tcp->tcp_state; 4604 if (mp == NULL) { 4605 /* Really tacky... but we need this for detached closes. */ 4606 len = tcp->tcp_unsent; 4607 goto data_null; 4608 } 4609 4610 /* 4611 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 4612 * or before a connection attempt has begun. 4613 * 4614 * The following should not happen in inetboot.... 4615 */ 4616 if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 4617 (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4618 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 4619 printf("tcp_wput_data: data after ordrel, %s\n", 4620 tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 4621 } 4622 freemsg(mp); 4623 return; 4624 } 4625 4626 /* Strip empties */ 4627 for (;;) { 4628 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 4629 (uintptr_t)INT_MAX); 4630 len = (int)(mp->b_wptr - mp->b_rptr); 4631 if (len > 0) 4632 break; 4633 mp1 = mp; 4634 mp = mp->b_cont; 4635 freeb(mp1); 4636 if (mp == NULL) { 4637 return; 4638 } 4639 } 4640 4641 /* If we are the first on the list ... 
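 * (i.e. nothing is queued yet, so this chain becomes both
 * tcp_xmit_head and tcp_xmit_tail)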
*/
4642 if (tcp->tcp_xmit_head == NULL) {
4643 tcp->tcp_xmit_head = mp;
4644 tcp->tcp_xmit_tail = mp;
4645 tcp->tcp_xmit_tail_unsent = len;
4646 } else {
4647 tcp->tcp_xmit_last->b_cont = mp;
4648 len += tcp->tcp_unsent;
4649 }
4650
4651 /* Tack on however many more positive length mblks we have */
4652 if ((mp1 = mp->b_cont) != NULL) {
4653 do {
4654 int tlen;
4655 assert((uintptr_t)(mp1->b_wptr -
4656 mp1->b_rptr) <= (uintptr_t)INT_MAX);
4657 tlen = (int)(mp1->b_wptr - mp1->b_rptr);
4658 if (tlen <= 0) {
4659 mp->b_cont = mp1->b_cont;
4660 freeb(mp1);
4661 } else {
4662 len += tlen;
4663 mp = mp1;
4664 }
4665 } while ((mp1 = mp->b_cont) != NULL);
4666 }
4667 tcp->tcp_xmit_last = mp;
4668 tcp->tcp_unsent = len;
4669
4670 data_null:
4671 snxt = tcp->tcp_snxt;
4672 xmit_tail = tcp->tcp_xmit_tail;
4673 tail_unsent = tcp->tcp_xmit_tail_unsent;
4674
4675 /*
4676 * Note that tcp_mss has been adjusted to take into account the
4677 * timestamp option if applicable. Because SACK options do not
4678 * appear in every TCP segment and they are of variable lengths,
4679 * they cannot be included in tcp_mss. Thus we need to calculate
4680 * the actual segment length when we need to send a segment which
4681 * includes SACK options.
4682 */
4683 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
4684 int32_t opt_len;
4685
4686 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
4687 tcp->tcp_num_sack_blk);
4688 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
4689 2 + TCPOPT_HEADER_LEN;
4690 mss = tcp->tcp_mss - opt_len;
4691 tcp_hdr_len = tcp->tcp_hdr_len + opt_len;
4692 } else {
4693 mss = tcp->tcp_mss;
4694 tcp_hdr_len = tcp->tcp_hdr_len;
4695 }
4696
4697 if ((tcp->tcp_suna == snxt) &&
4698 (prom_gettime() - tcp->tcp_last_recv_time) >= tcp->tcp_rto) {
4699 tcp->tcp_cwnd = MIN(tcp_slow_start_after_idle * mss,
4700 MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss)));
4701 }
4702 if (tcpstate == TCPS_SYN_RCVD) {
4703 /*
4704 * The three-way connection establishment handshake is not
4705 * complete yet. We want to queue the data for transmission
4706 * after entering ESTABLISHED state (RFC 793). Setting usable
4707 * to zero causes a jump to the "done" label, effectively
4708 * leaving the data on the queue.
4709 */
4710
4711 usable = 0;
4712 } else {
4713 int usable_r = tcp->tcp_swnd;
4714
4715 /*
4716 * In the special case when cwnd is zero, which can only
4717 * happen if the connection is ECN capable, return now.
4718 * New segments are sent using tcp_timer(). The timer
4719 * is set in tcp_rput_data().
4720 */
4721 if (tcp->tcp_cwnd == 0) {
4722 /*
4723 * Note that tcp_cwnd is 0 before 3-way handshake is
4724 * finished.
4725 */
4726 assert(tcp->tcp_ecn_ok ||
4727 tcp->tcp_state < TCPS_ESTABLISHED);
4728 return;
4729 }
4730
4731 /* usable = MIN(swnd, cwnd) - unacked_bytes */
4732 if (usable_r > tcp->tcp_cwnd)
4733 usable_r = tcp->tcp_cwnd;
4734
4735 /* NOTE: trouble if xmitting while SYN not acked? */
4736 usable_r -= snxt;
4737 usable_r += tcp->tcp_suna;
4738
4739 /* usable = MIN(usable, unsent) */
4740 if (usable_r > len)
4741 usable_r = len;
4742
4743 /* usable = MAX(usable, {1 for urgent, 0 for data}) */
4744 if (usable_r != 0)
4745 usable = usable_r;
4746 }
4747
4748 local_time = (mblk_t *)prom_gettime();
4749
4750 /*
4751 * "Our" Nagle Algorithm. This is not the same as in the old
4752 * BSD. This is more in line with the true intent of Nagle.
4753 *
4754 * The conditions are:
4755 * 1.
The amount of unsent data (or amount of data which can be
4756 * sent, whichever is smaller) is less than the Nagle limit.
4757 * 2. The last sent size is also less than the Nagle limit.
4758 * 3. There is unack'ed data.
4759 * 4. Urgent pointer is not set. Send urgent data ignoring the
4760 * Nagle algorithm. This reduces the probability that urgent
4761 * bytes get "merged" together.
4762 * 5. The app has not closed the connection. This eliminates the
4763 * wait time of the receiving side waiting for the last piece of
4764 * (small) data.
4765 *
4766 * If all are satisfied, exit without sending anything. Note
4767 * that Nagle limit can be smaller than 1 MSS. Nagle limit is
4768 * the smaller of 1 MSS and global tcp_naglim_def (default to be
4769 * 4095).
4770 */
4771 if (usable < (int)tcp->tcp_naglim &&
4772 tcp->tcp_naglim > tcp->tcp_last_sent_len &&
4773 snxt != tcp->tcp_suna &&
4774 !(tcp->tcp_valid_bits & TCP_URG_VALID))
4775 goto done;
4776
4777 num_burst_seg = tcp->tcp_snd_burst;
4778 for (;;) {
4779 tcph_t *tcph;
4780 mblk_t *new_mp;
4781
4782 if (num_burst_seg-- == 0)
4783 goto done;
4784
4785 len = mss;
4786 if (len > usable) {
4787 len = usable;
4788 if (len <= 0) {
4789 /* Terminate the loop */
4790 goto done;
4791 }
4792 /*
4793 * Sender silly-window avoidance.
4794 * Ignore this if we are going to send a
4795 * zero window probe out.
4796 *
4797 * TODO: force data into microscopic window ??
4798 * ==> (!pushed || (unsent > usable))
4799 */
4800 if (len < (tcp->tcp_max_swnd >> 1) &&
4801 (tcp->tcp_unsent - (snxt - tcp->tcp_snxt)) > len &&
4802 !((tcp->tcp_valid_bits & TCP_URG_VALID) &&
4803 len == 1) && (! tcp->tcp_zero_win_probe)) {
4804 /*
4805 * If the retransmit timer is not running
4806 * we start it so that we will retransmit
4807 * in the case when the receiver has
4808 * decremented the window.
4809 */
4810 if (snxt == tcp->tcp_snxt &&
4811 snxt == tcp->tcp_suna) {
4812 /*
4813 * We are not supposed to send
4814 * anything. So let's wait a little
4815 * bit longer before breaking SWS
4816 * avoidance.
4817 *
4818 * What should the value be?
4819 * Suggestion: MAX(init rexmit time,
4820 * tcp->tcp_rto)
4821 */
4822 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
4823 }
4824 goto done;
4825 }
4826 }
4827
4828 tcph = tcp->tcp_tcph;
4829
4830 usable -= len; /* Approximate - can be adjusted later */
4831 if (usable > 0)
4832 tcph->th_flags[0] = TH_ACK;
4833 else
4834 tcph->th_flags[0] = (TH_ACK | TH_PUSH);
4835
4836 U32_TO_ABE32(snxt, tcph->th_seq);
4837
4838 if (tcp->tcp_valid_bits) {
4839 uchar_t *prev_rptr = xmit_tail->b_rptr;
4840 uint32_t prev_snxt = tcp->tcp_snxt;
4841
4842 if (tail_unsent == 0) {
4843 assert(xmit_tail->b_cont != NULL);
4844 xmit_tail = xmit_tail->b_cont;
4845 prev_rptr = xmit_tail->b_rptr;
4846 tail_unsent = (int)(xmit_tail->b_wptr -
4847 xmit_tail->b_rptr);
4848 } else {
4849 xmit_tail->b_rptr = xmit_tail->b_wptr -
4850 tail_unsent;
4851 }
4852 mp = tcp_xmit_mp(tcp, xmit_tail, len, NULL, NULL,
4853 snxt, B_FALSE, (uint32_t *)&len, B_FALSE);
4854 /* Restore tcp_snxt so we get amount sent right.
*/ 4855 tcp->tcp_snxt = prev_snxt; 4856 if (prev_rptr == xmit_tail->b_rptr) 4857 xmit_tail->b_prev = local_time; 4858 else 4859 xmit_tail->b_rptr = prev_rptr; 4860 4861 if (mp == NULL) 4862 break; 4863 4864 mp1 = mp->b_cont; 4865 4866 snxt += len; 4867 tcp->tcp_last_sent_len = (ushort_t)len; 4868 while (mp1->b_cont) { 4869 xmit_tail = xmit_tail->b_cont; 4870 xmit_tail->b_prev = local_time; 4871 mp1 = mp1->b_cont; 4872 } 4873 tail_unsent = xmit_tail->b_wptr - mp1->b_wptr; 4874 BUMP_MIB(tcp_mib.tcpOutDataSegs); 4875 UPDATE_MIB(tcp_mib.tcpOutDataBytes, len); 4876 /* Dump the packet when debugging. */ 4877 TCP_DUMP_PACKET("tcp_wput_data (valid bits)", mp); 4878 (void) ipv4_tcp_output(sock_id, mp); 4879 freeb(mp); 4880 continue; 4881 } 4882 4883 snxt += len; /* Adjust later if we don't send all of len */ 4884 BUMP_MIB(tcp_mib.tcpOutDataSegs); 4885 UPDATE_MIB(tcp_mib.tcpOutDataBytes, len); 4886 4887 if (tail_unsent) { 4888 /* Are the bytes above us in flight? */ 4889 rptr = xmit_tail->b_wptr - tail_unsent; 4890 if (rptr != xmit_tail->b_rptr) { 4891 tail_unsent -= len; 4892 len += tcp_hdr_len; 4893 tcp->tcp_ipha->ip_len = htons(len); 4894 mp = dupb(xmit_tail); 4895 if (!mp) 4896 break; 4897 mp->b_rptr = rptr; 4898 goto must_alloc; 4899 } 4900 } else { 4901 xmit_tail = xmit_tail->b_cont; 4902 assert((uintptr_t)(xmit_tail->b_wptr - 4903 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 4904 tail_unsent = (int)(xmit_tail->b_wptr - 4905 xmit_tail->b_rptr); 4906 } 4907 4908 tail_unsent -= len; 4909 tcp->tcp_last_sent_len = (ushort_t)len; 4910 4911 len += tcp_hdr_len; 4912 if (tcp->tcp_ipversion == IPV4_VERSION) 4913 tcp->tcp_ipha->ip_len = htons(len); 4914 4915 xmit_tail->b_prev = local_time; 4916 4917 mp = dupb(xmit_tail); 4918 if (mp == NULL) 4919 goto out_of_mem; 4920 4921 len = tcp_hdr_len; 4922 /* 4923 * There are four reasons to allocate a new hdr mblk: 4924 * 1) The bytes above us are in use by another packet 4925 * 2) We don't have good alignment 4926 * 3) The mblk is being shared 4927 * 4) We don't have enough room for a header 4928 */ 4929 rptr = mp->b_rptr - len; 4930 if (!OK_32PTR(rptr) || 4931 rptr < mp->b_datap) { 4932 /* NOTE: we assume allocb returns an OK_32PTR */ 4933 4934 must_alloc:; 4935 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 4936 tcp_wroff_xtra, 0); 4937 if (mp1 == NULL) { 4938 freemsg(mp); 4939 goto out_of_mem; 4940 } 4941 mp1->b_cont = mp; 4942 mp = mp1; 4943 /* Leave room for Link Level header */ 4944 len = tcp_hdr_len; 4945 rptr = &mp->b_rptr[tcp_wroff_xtra]; 4946 mp->b_wptr = &rptr[len]; 4947 } 4948 4949 if (tcp->tcp_snd_ts_ok) { 4950 U32_TO_BE32((uint32_t)local_time, 4951 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 4952 U32_TO_BE32(tcp->tcp_ts_recent, 4953 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 4954 } else { 4955 assert(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 4956 } 4957 4958 mp->b_rptr = rptr; 4959 4960 /* Copy the template header. */ 4961 dst = (ipaddr_t *)rptr; 4962 src = (ipaddr_t *)tcp->tcp_iphc; 4963 dst[0] = src[0]; 4964 dst[1] = src[1]; 4965 dst[2] = src[2]; 4966 dst[3] = src[3]; 4967 dst[4] = src[4]; 4968 dst[5] = src[5]; 4969 dst[6] = src[6]; 4970 dst[7] = src[7]; 4971 dst[8] = src[8]; 4972 dst[9] = src[9]; 4973 len = tcp->tcp_hdr_len; 4974 if (len -= 40) { 4975 len >>= 2; 4976 dst += 10; 4977 src += 10; 4978 do { 4979 *dst++ = *src++; 4980 } while (--len); 4981 } 4982 4983 /* 4984 * Set tcph to point to the header of the outgoing packet, 4985 * not to the template header. 
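 *
 * (The unrolled word copy above moved the 40-byte base IP + TCP
 * headers, that is, ten 32-bit words; the loop after it copied
 * any remaining header option words.)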
4986 */ 4987 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 4988 4989 /* 4990 * Set the ECN info in the TCP header if it is not a zero 4991 * window probe. Zero window probe is only sent in 4992 * tcp_wput_data() and tcp_timer(). 4993 */ 4994 if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 4995 SET_ECT(tcp, rptr); 4996 4997 if (tcp->tcp_ecn_echo_on) 4998 tcph->th_flags[0] |= TH_ECE; 4999 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 5000 tcph->th_flags[0] |= TH_CWR; 5001 tcp->tcp_ecn_cwr_sent = B_TRUE; 5002 } 5003 } 5004 5005 /* Fill in SACK options */ 5006 if (num_sack_blk > 0) { 5007 uchar_t *wptr = rptr + tcp->tcp_hdr_len; 5008 sack_blk_t *tmp; 5009 int32_t i; 5010 5011 wptr[0] = TCPOPT_NOP; 5012 wptr[1] = TCPOPT_NOP; 5013 wptr[2] = TCPOPT_SACK; 5014 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5015 sizeof (sack_blk_t); 5016 wptr += TCPOPT_REAL_SACK_LEN; 5017 5018 tmp = tcp->tcp_sack_list; 5019 for (i = 0; i < num_sack_blk; i++) { 5020 U32_TO_BE32(tmp[i].begin, wptr); 5021 wptr += sizeof (tcp_seq); 5022 U32_TO_BE32(tmp[i].end, wptr); 5023 wptr += sizeof (tcp_seq); 5024 } 5025 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 5026 << 4); 5027 } 5028 5029 if (tail_unsent) { 5030 mp1 = mp->b_cont; 5031 if (mp1 == NULL) 5032 mp1 = mp; 5033 /* 5034 * If we're a little short, tack on more mblks 5035 * as long as we don't need to split an mblk. 5036 */ 5037 while (tail_unsent < 0 && 5038 tail_unsent + (int)(xmit_tail->b_cont->b_wptr - 5039 xmit_tail->b_cont->b_rptr) <= 0) { 5040 xmit_tail = xmit_tail->b_cont; 5041 /* Stash for rtt use later */ 5042 xmit_tail->b_prev = local_time; 5043 mp1->b_cont = dupb(xmit_tail); 5044 mp1 = mp1->b_cont; 5045 assert((uintptr_t)(xmit_tail->b_wptr - 5046 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 5047 tail_unsent += (int)(xmit_tail->b_wptr - 5048 xmit_tail->b_rptr); 5049 if (mp1 == NULL) { 5050 freemsg(mp); 5051 goto out_of_mem; 5052 } 5053 } 5054 /* Trim back any surplus on the last mblk */ 5055 if (tail_unsent > 0) 5056 mp1->b_wptr -= tail_unsent; 5057 if (tail_unsent < 0) { 5058 uint32_t ip_len; 5059 5060 /* 5061 * We did not send everything we could in 5062 * order to preserve mblk boundaries. 5063 */ 5064 usable -= tail_unsent; 5065 snxt += tail_unsent; 5066 tcp->tcp_last_sent_len += tail_unsent; 5067 UPDATE_MIB(tcp_mib.tcpOutDataBytes, 5068 tail_unsent); 5069 /* Adjust the IP length field. */ 5070 ip_len = ntohs(((struct ip *)rptr)->ip_len) + 5071 tail_unsent; 5072 ((struct ip *)rptr)->ip_len = htons(ip_len); 5073 tail_unsent = 0; 5074 } 5075 } 5076 5077 if (mp == NULL) 5078 goto out_of_mem; 5079 5080 /* 5081 * Performance hit! We need to pullup the whole message 5082 * in order to do checksum and for the MAC output routine. 
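 *
 * (Below, a single buffer of msgdsize() bytes is allocated and
 * every fragment is bcopy'ed into it, so the chain is flattened
 * once per transmitted segment.)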
5083 */
5084 if (mp->b_cont != NULL) {
5085 int mp_size;
5086 #ifdef DEBUG
5087 printf("Multiple mblk %d\n", msgdsize(mp));
5088 #endif
5089 new_mp = allocb(msgdsize(mp) + tcp_wroff_xtra, 0);
if (new_mp == NULL) { freemsg(mp); goto out_of_mem; }
5090 new_mp->b_rptr += tcp_wroff_xtra;
5091 new_mp->b_wptr = new_mp->b_rptr;
mp1 = mp; /* keep the head so the old chain can be freed */
5092 while (mp != NULL) {
5093 mp_size = mp->b_wptr - mp->b_rptr;
5094 bcopy(mp->b_rptr, new_mp->b_wptr, mp_size);
5095 new_mp->b_wptr += mp_size;
5096 mp = mp->b_cont;
5097 }
5098 freemsg(mp1); /* free the original chain, now fully copied */
5099 mp = new_mp;
5100 }
5101 tcp_set_cksum(mp);
5102 ((struct ip *)mp->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl;
5103 TCP_DUMP_PACKET("tcp_wput_data", mp);
5104 (void) ipv4_tcp_output(sock_id, mp);
5105 freemsg(mp);
5106 }
5107 out_of_mem:;
5108 /* Pretend that all we were trying to send really got sent */
5109 if (tail_unsent < 0) {
5110 do {
5111 xmit_tail = xmit_tail->b_cont;
5112 xmit_tail->b_prev = local_time;
5113 assert((uintptr_t)(xmit_tail->b_wptr -
5114 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX);
5115 tail_unsent += (int)(xmit_tail->b_wptr -
5116 xmit_tail->b_rptr);
5117 } while (tail_unsent < 0);
5118 }
5119 done:;
5120 tcp->tcp_xmit_tail = xmit_tail;
5121 tcp->tcp_xmit_tail_unsent = tail_unsent;
5122 len = tcp->tcp_snxt - snxt;
5123 if (len) {
5124 /*
5125 * If new data was sent, need to update the notsack
5126 * list, which is, after all, data blocks that have
5127 * not been sack'ed by the receiver. New data is
5128 * not sack'ed.
5129 */
5130 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
5131 /* len is a negative value. */
5132 tcp->tcp_pipe -= len;
5133 tcp_notsack_update(&(tcp->tcp_notsack_list),
5134 tcp->tcp_snxt, snxt,
5135 &(tcp->tcp_num_notsack_blk),
5136 &(tcp->tcp_cnt_notsack_list));
5137 }
5138 tcp->tcp_snxt = snxt + tcp->tcp_fin_sent;
5139 tcp->tcp_rack = tcp->tcp_rnxt;
5140 tcp->tcp_rack_cnt = 0;
5141 if ((snxt + len) == tcp->tcp_suna) {
5142 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
5143 }
5144 /*
5145 * Note that len is the amount we just sent but with a negative
5146 * sign. We update tcp_unsent here since we may come back to
5147 * tcp_wput_data from tcp_state_wait.
5148 */
5149 len += tcp->tcp_unsent;
5150 tcp->tcp_unsent = len;
5151
5152 /*
5153 * Let's wait till all the segments have been acked, since we
5154 * don't have a timer.
5155 */
5156 (void) tcp_state_wait(sock_id, tcp, TCPS_ALL_ACKED);
5157 return;
5158 } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) {
5159 /*
5160 * Didn't send anything. Make sure the timer is running
5161 * so that we will probe a zero window.
5162 */
5163 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
5164 }
5165
5166 /* Note that len is the amount we just sent but with a negative sign */
5167 len += tcp->tcp_unsent;
5168 tcp->tcp_unsent = len;
5169
5170 }
5171
5172 static void
5173 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
5174 uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcph_t *tcph,
5175 int sock_id)
5176 {
5177 int32_t bytes_acked;
5178 int32_t gap;
5179 int32_t rgap;
5180 tcp_opt_t tcpopt;
5181 uint_t flags;
5182 uint32_t new_swnd = 0;
5183
5184 #ifdef DEBUG
5185 printf("Time wait processing called ###############3\n");
5186 #endif
5187
5188 /* Just make sure we send the right sock_id to tcp_clean_death */
5189 if ((sockets[sock_id].pcb == NULL) || (sockets[sock_id].pcb != tcp))
5190 sock_id = -1;
5191
5192 flags = (unsigned int)tcph->th_flags[0] & 0xFF;
5193 new_swnd = BE16_TO_U16(tcph->th_win) <<
5194 ((tcph->th_flags[0] & TH_SYN) ?
0 : tcp->tcp_snd_ws); 5195 if (tcp->tcp_snd_ts_ok) { 5196 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 5197 freemsg(mp); 5198 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5199 tcp->tcp_rnxt, TH_ACK, 0, -1); 5200 return; 5201 } 5202 } 5203 gap = seg_seq - tcp->tcp_rnxt; 5204 rgap = tcp->tcp_rwnd - (gap + seg_len); 5205 if (gap < 0) { 5206 BUMP_MIB(tcp_mib.tcpInDataDupSegs); 5207 UPDATE_MIB(tcp_mib.tcpInDataDupBytes, 5208 (seg_len > -gap ? -gap : seg_len)); 5209 seg_len += gap; 5210 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 5211 if (flags & TH_RST) { 5212 freemsg(mp); 5213 return; 5214 } 5215 if ((flags & TH_FIN) && seg_len == -1) { 5216 /* 5217 * When TCP receives a duplicate FIN in 5218 * TIME_WAIT state, restart the 2 MSL timer. 5219 * See page 73 in RFC 793. Make sure this TCP 5220 * is already on the TIME_WAIT list. If not, 5221 * just restart the timer. 5222 */ 5223 tcp_time_wait_remove(tcp); 5224 tcp_time_wait_append(tcp); 5225 TCP_TIMER_RESTART(tcp, tcp_time_wait_interval); 5226 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5227 tcp->tcp_rnxt, TH_ACK, 0, -1); 5228 freemsg(mp); 5229 return; 5230 } 5231 flags |= TH_ACK_NEEDED; 5232 seg_len = 0; 5233 goto process_ack; 5234 } 5235 5236 /* Fix seg_seq, and chew the gap off the front. */ 5237 seg_seq = tcp->tcp_rnxt; 5238 } 5239 5240 if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 5241 /* 5242 * Make sure that when we accept the connection, pick 5243 * an ISS greater than (tcp_snxt + ISS_INCR/2) for the 5244 * old connection. 5245 * 5246 * The next ISS generated is equal to tcp_iss_incr_extra 5247 * + ISS_INCR/2 + other components depending on the 5248 * value of tcp_strong_iss. We pre-calculate the new 5249 * ISS here and compare with tcp_snxt to determine if 5250 * we need to make adjustment to tcp_iss_incr_extra. 5251 * 5252 * Note that since we are now in the global queue 5253 * perimeter and need to do a lateral_put() to the 5254 * listener queue, there can be other connection requests/ 5255 * attempts while the lateral_put() is going on. That 5256 * means what we calculate here may not be correct. This 5257 * is extremely difficult to solve unless TCP and IP 5258 * modules are merged and there is no perimeter, but just 5259 * locks. The above calculation is ugly and is a 5260 * waste of CPU cycles... 5261 */ 5262 uint32_t new_iss = tcp_iss_incr_extra; 5263 int32_t adj; 5264 5265 /* Add time component and min random (i.e. 1). */ 5266 new_iss += (prom_gettime() >> ISS_NSEC_SHT) + 1; 5267 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 5268 /* 5269 * New ISS not guaranteed to be ISS_INCR/2 5270 * ahead of the current tcp_snxt, so add the 5271 * difference to tcp_iss_incr_extra. 5272 */ 5273 tcp_iss_incr_extra += adj; 5274 } 5275 tcp_clean_death(sock_id, tcp, 0); 5276 5277 /* 5278 * This is a passive open. Right now we do not 5279 * do anything... 5280 */ 5281 freemsg(mp); 5282 return; 5283 } 5284 5285 /* 5286 * rgap is the amount of stuff received out of window. A negative 5287 * value is the amount out of window. 5288 */ 5289 if (rgap < 0) { 5290 BUMP_MIB(tcp_mib.tcpInDataPastWinSegs); 5291 UPDATE_MIB(tcp_mib.tcpInDataPastWinBytes, -rgap); 5292 /* Fix seg_len and make sure there is something left. */ 5293 seg_len += rgap; 5294 if (seg_len <= 0) { 5295 if (flags & TH_RST) { 5296 freemsg(mp); 5297 return; 5298 } 5299 flags |= TH_ACK_NEEDED; 5300 seg_len = 0; 5301 goto process_ack; 5302 } 5303 } 5304 /* 5305 * Check whether we can update tcp_ts_recent. This test is 5306 * NOT the one in RFC 1323 3.4. 
It is from Braden, 1993, "TCP 5307 * Extensions for High Performance: An Update", Internet Draft. 5308 */ 5309 if (tcp->tcp_snd_ts_ok && 5310 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 5311 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 5312 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 5313 tcp->tcp_last_rcv_lbolt = prom_gettime(); 5314 } 5315 5316 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 5317 /* Always ack out of order packets */ 5318 flags |= TH_ACK_NEEDED; 5319 seg_len = 0; 5320 } else if (seg_len > 0) { 5321 BUMP_MIB(tcp_mib.tcpInDataInorderSegs); 5322 UPDATE_MIB(tcp_mib.tcpInDataInorderBytes, seg_len); 5323 } 5324 if (flags & TH_RST) { 5325 freemsg(mp); 5326 (void) tcp_clean_death(sock_id, tcp, 0); 5327 return; 5328 } 5329 if (flags & TH_SYN) { 5330 freemsg(mp); 5331 tcp_xmit_ctl("TH_SYN", tcp, NULL, seg_ack, seg_seq + 1, 5332 TH_RST|TH_ACK, 0, -1); 5333 /* 5334 * Do not delete the TCP structure if it is in 5335 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 5336 */ 5337 return; 5338 } 5339 process_ack: 5340 if (flags & TH_ACK) { 5341 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 5342 if (bytes_acked <= 0) { 5343 if (bytes_acked == 0 && seg_len == 0 && 5344 new_swnd == tcp->tcp_swnd) 5345 BUMP_MIB(tcp_mib.tcpInDupAck); 5346 } else { 5347 /* Acks something not sent */ 5348 flags |= TH_ACK_NEEDED; 5349 } 5350 } 5351 freemsg(mp); 5352 if (flags & TH_ACK_NEEDED) { 5353 /* 5354 * Time to send an ack for some reason. 5355 */ 5356 tcp_xmit_ctl(NULL, tcp, NULL, tcp->tcp_snxt, 5357 tcp->tcp_rnxt, TH_ACK, 0, -1); 5358 } 5359 } 5360 5361 static int 5362 tcp_init_values(tcp_t *tcp, struct inetboot_socket *isp) 5363 { 5364 int err; 5365 5366 tcp->tcp_family = AF_INET; 5367 tcp->tcp_ipversion = IPV4_VERSION; 5368 5369 /* 5370 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 5371 * will be close to tcp_rexmit_interval_initial. By doing this, we 5372 * allow the algorithm to adjust slowly to large fluctuations of RTT 5373 * during first few transmissions of a connection as seen in slow 5374 * links. 5375 */ 5376 tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2; 5377 tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1; 5378 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 5379 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 5380 tcp_conn_grace_period; 5381 if (tcp->tcp_rto < tcp_rexmit_interval_min) 5382 tcp->tcp_rto = tcp_rexmit_interval_min; 5383 tcp->tcp_timer_backoff = 0; 5384 tcp->tcp_ms_we_have_waited = 0; 5385 tcp->tcp_last_recv_time = prom_gettime(); 5386 tcp->tcp_cwnd_max = tcp_cwnd_max_; 5387 tcp->tcp_snd_burst = TCP_CWND_INFINITE; 5388 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 5389 /* For Ethernet, the mtu returned is actually 1550... */ 5390 if (mac_get_type() == IFT_ETHER) { 5391 tcp->tcp_if_mtu = mac_get_mtu() - 50; 5392 } else { 5393 tcp->tcp_if_mtu = mac_get_mtu(); 5394 } 5395 tcp->tcp_mss = tcp->tcp_if_mtu; 5396 5397 tcp->tcp_first_timer_threshold = tcp_ip_notify_interval; 5398 tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval; 5399 tcp->tcp_second_timer_threshold = tcp_ip_abort_interval; 5400 /* 5401 * Fix it to tcp_ip_abort_linterval later if it turns out to be a 5402 * passive open. 5403 */ 5404 tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval; 5405 5406 tcp->tcp_naglim = tcp_naglim_def; 5407 5408 /* NOTE: ISS is now set in tcp_adapt_ire(). 
*/ 5409 5410 /* Initialize the header template */ 5411 if (tcp->tcp_ipversion == IPV4_VERSION) { 5412 err = tcp_header_init_ipv4(tcp); 5413 } 5414 if (err) 5415 return (err); 5416 5417 /* 5418 * Init the window scale to the max so tcp_rwnd_set() won't pare 5419 * down tcp_rwnd. tcp_adapt_ire() will set the right value later. 5420 */ 5421 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 5422 tcp->tcp_xmit_lowater = tcp_xmit_lowat; 5423 if (isp != NULL) { 5424 tcp->tcp_xmit_hiwater = isp->so_sndbuf; 5425 tcp->tcp_rwnd = isp->so_rcvbuf; 5426 tcp->tcp_rwnd_max = isp->so_rcvbuf; 5427 } 5428 tcp->tcp_state = TCPS_IDLE; 5429 return (0); 5430 } 5431 5432 /* 5433 * Initialize the IPv4 header. Loses any record of any IP options. 5434 */ 5435 static int 5436 tcp_header_init_ipv4(tcp_t *tcp) 5437 { 5438 tcph_t *tcph; 5439 5440 /* 5441 * This is a simple initialization. If there's 5442 * already a template, it should never be too small, 5443 * so reuse it. Otherwise, allocate space for the new one. 5444 */ 5445 if (tcp->tcp_iphc != NULL) { 5446 assert(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 5447 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 5448 } else { 5449 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 5450 tcp->tcp_iphc = bkmem_zalloc(tcp->tcp_iphc_len); 5451 if (tcp->tcp_iphc == NULL) { 5452 tcp->tcp_iphc_len = 0; 5453 return (ENOMEM); 5454 } 5455 } 5456 tcp->tcp_ipha = (struct ip *)tcp->tcp_iphc; 5457 tcp->tcp_ipversion = IPV4_VERSION; 5458 5459 /* 5460 * Note that it does not include TCP options yet. It will 5461 * after the connection is established. 5462 */ 5463 tcp->tcp_hdr_len = sizeof (struct ip) + sizeof (tcph_t); 5464 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 5465 tcp->tcp_ip_hdr_len = sizeof (struct ip); 5466 tcp->tcp_ipha->ip_v = IP_VERSION; 5467 /* We don't support IP options... */ 5468 tcp->tcp_ipha->ip_hl = IP_SIMPLE_HDR_LENGTH_IN_WORDS; 5469 tcp->tcp_ipha->ip_p = IPPROTO_TCP; 5470 /* We are not supposed to do PMTU discovery... */ 5471 tcp->tcp_ipha->ip_sum = 0; 5472 5473 tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (struct ip)); 5474 tcp->tcp_tcph = tcph; 5475 tcph->th_offset_and_rsrvd[0] = (5 << 4); 5476 return (0); 5477 } 5478 5479 /* 5480 * Send out a control packet on the tcp connection specified. This routine 5481 * is typically called where we need a simple ACK or RST generated. 5482 * 5483 * This function is called with or without a mp. 5484 */ 5485 static void 5486 tcp_xmit_ctl(char *str, tcp_t *tcp, mblk_t *mp, uint32_t seq, 5487 uint32_t ack, int ctl, uint_t ip_hdr_len, int sock_id) 5488 { 5489 uchar_t *rptr; 5490 tcph_t *tcph; 5491 struct ip *iph = NULL; 5492 int tcp_hdr_len; 5493 int tcp_ip_hdr_len; 5494 5495 tcp_hdr_len = tcp->tcp_hdr_len; 5496 tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; 5497 5498 if (mp) { 5499 assert(ip_hdr_len != 0); 5500 rptr = mp->b_rptr; 5501 tcph = (tcph_t *)(rptr + ip_hdr_len); 5502 /* Don't reply to a RST segment. */ 5503 if (tcph->th_flags[0] & TH_RST) { 5504 freeb(mp); 5505 return; 5506 } 5507 freemsg(mp); 5508 rptr = NULL; 5509 } else { 5510 assert(ip_hdr_len == 0); 5511 } 5512 /* If a text string is passed in with the request, print it out. 
*/ 5513 if (str != NULL) { 5514 dprintf("tcp_xmit_ctl(%d): '%s', seq 0x%x, ack 0x%x, " 5515 "ctl 0x%x\n", sock_id, str, seq, ack, ctl); 5516 } 5517 mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 0); 5518 if (mp == NULL) { 5519 dprintf("tcp_xmit_ctl(%d): Cannot allocate memory\n", sock_id); 5520 return; 5521 } 5522 rptr = &mp->b_rptr[tcp_wroff_xtra]; 5523 mp->b_rptr = rptr; 5524 mp->b_wptr = &rptr[tcp_hdr_len]; 5525 bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); 5526 5527 iph = (struct ip *)rptr; 5528 iph->ip_len = htons(tcp_hdr_len); 5529 5530 tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; 5531 tcph->th_flags[0] = (uint8_t)ctl; 5532 if (ctl & TH_RST) { 5533 BUMP_MIB(tcp_mib.tcpOutRsts); 5534 BUMP_MIB(tcp_mib.tcpOutControl); 5535 /* 5536 * Don't send TSopt w/ TH_RST packets per RFC 1323. 5537 */ 5538 if (tcp->tcp_snd_ts_ok && tcp->tcp_state > TCPS_SYN_SENT) { 5539 mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; 5540 *(mp->b_wptr) = TCPOPT_EOL; 5541 iph->ip_len = htons(tcp_hdr_len - 5542 TCPOPT_REAL_TS_LEN); 5543 tcph->th_offset_and_rsrvd[0] -= (3 << 4); 5544 } 5545 } 5546 if (ctl & TH_ACK) { 5547 uint32_t now = prom_gettime(); 5548 5549 if (tcp->tcp_snd_ts_ok) { 5550 U32_TO_BE32(now, 5551 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5552 U32_TO_BE32(tcp->tcp_ts_recent, 5553 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5554 } 5555 tcp->tcp_rack = ack; 5556 tcp->tcp_rack_cnt = 0; 5557 BUMP_MIB(tcp_mib.tcpOutAck); 5558 } 5559 BUMP_MIB(tcp_mib.tcpOutSegs); 5560 U32_TO_BE32(seq, tcph->th_seq); 5561 U32_TO_BE32(ack, tcph->th_ack); 5562 5563 tcp_set_cksum(mp); 5564 iph->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5565 TCP_DUMP_PACKET("tcp_xmit_ctl", mp); 5566 (void) ipv4_tcp_output(sock_id, mp); 5567 freeb(mp); 5568 } 5569 5570 /* Generate an ACK-only (no data) segment for a TCP endpoint */ 5571 static mblk_t * 5572 tcp_ack_mp(tcp_t *tcp) 5573 { 5574 if (tcp->tcp_valid_bits) { 5575 /* 5576 * For the complex case where we have to send some 5577 * controls (FIN or SYN), let tcp_xmit_mp do it. 5578 * When sending an ACK-only segment (no data) 5579 * into a zero window, always set the seq number to 5580 * suna, since snxt will be extended past the window. 5581 * If we used snxt, the receiver might consider the ACK 5582 * unacceptable. 5583 */ 5584 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 5585 (tcp->tcp_zero_win_probe) ? 5586 tcp->tcp_suna : 5587 tcp->tcp_snxt, B_FALSE, NULL, B_FALSE)); 5588 } else { 5589 /* Generate a simple ACK */ 5590 uchar_t *rptr; 5591 tcph_t *tcph; 5592 mblk_t *mp1; 5593 int32_t tcp_hdr_len; 5594 int32_t num_sack_blk = 0; 5595 int32_t sack_opt_len; 5596 5597 /* 5598 * Allocate space for TCP + IP headers 5599 * and link-level header 5600 */ 5601 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 5602 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 5603 tcp->tcp_num_sack_blk); 5604 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 5605 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 5606 tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; 5607 } else { 5608 tcp_hdr_len = tcp->tcp_hdr_len; 5609 } 5610 mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, 0); 5611 if (mp1 == NULL) 5612 return (NULL); 5613 5614 /* copy in prototype TCP + IP header */ 5615 rptr = mp1->b_rptr + tcp_wroff_xtra; 5616 mp1->b_rptr = rptr; 5617 mp1->b_wptr = rptr + tcp_hdr_len; 5618 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 5619 5620 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 5621 5622 /* 5623 * Set the TCP sequence number. 
5624 * When sending an ACK-only segment (no data) 5625 * into a zero window, always set the seq number to 5626 * suna, since snxt will be extended past the window. 5627 * If we used snxt, the receiver might consider the ACK 5628 * unacceptable. 5629 */ 5630 U32_TO_ABE32((tcp->tcp_zero_win_probe) ? 5631 tcp->tcp_suna : tcp->tcp_snxt, tcph->th_seq); 5632 5633 /* Set up the TCP flag field. */ 5634 tcph->th_flags[0] = (uchar_t)TH_ACK; 5635 if (tcp->tcp_ecn_echo_on) 5636 tcph->th_flags[0] |= TH_ECE; 5637 5638 tcp->tcp_rack = tcp->tcp_rnxt; 5639 tcp->tcp_rack_cnt = 0; 5640 5641 /* fill in timestamp option if in use */ 5642 if (tcp->tcp_snd_ts_ok) { 5643 uint32_t llbolt = (uint32_t)prom_gettime(); 5644 5645 U32_TO_BE32(llbolt, 5646 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5647 U32_TO_BE32(tcp->tcp_ts_recent, 5648 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5649 } 5650 5651 /* Fill in SACK options */ 5652 if (num_sack_blk > 0) { 5653 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 5654 sack_blk_t *tmp; 5655 int32_t i; 5656 5657 wptr[0] = TCPOPT_NOP; 5658 wptr[1] = TCPOPT_NOP; 5659 wptr[2] = TCPOPT_SACK; 5660 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5661 sizeof (sack_blk_t); 5662 wptr += TCPOPT_REAL_SACK_LEN; 5663 5664 tmp = tcp->tcp_sack_list; 5665 for (i = 0; i < num_sack_blk; i++) { 5666 U32_TO_BE32(tmp[i].begin, wptr); 5667 wptr += sizeof (tcp_seq); 5668 U32_TO_BE32(tmp[i].end, wptr); 5669 wptr += sizeof (tcp_seq); 5670 } 5671 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 5672 << 4); 5673 } 5674 5675 ((struct ip *)rptr)->ip_len = htons(tcp_hdr_len); 5676 tcp_set_cksum(mp1); 5677 ((struct ip *)rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 5678 return (mp1); 5679 } 5680 } 5681 5682 /* 5683 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 5684 * ip and tcp header ready to pass down to IP. If the mp passed in is 5685 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 5686 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 5687 * otherwise it will dup partial mblks.) 5688 * Otherwise, an appropriate ACK packet will be generated. This 5689 * routine is not usually called to send new data for the first time. It 5690 * is mostly called out of the timer for retransmits, and to generate ACKs. 5691 * 5692 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 5693 * be adjusted by *offset. And after dupb(), the offset and the ending mblk 5694 * of the original mblk chain will be returned in *offset and *end_mp. 5695 */ 5696 static mblk_t * 5697 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 5698 mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 5699 boolean_t rexmit) 5700 { 5701 int data_length; 5702 int32_t off = 0; 5703 uint_t flags; 5704 mblk_t *mp1; 5705 mblk_t *mp2; 5706 mblk_t *new_mp; 5707 uchar_t *rptr; 5708 tcph_t *tcph; 5709 int32_t num_sack_blk = 0; 5710 int32_t sack_opt_len = 0; 5711 5712 /* Allocate for our maximum TCP header + link-level */ 5713 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 5714 tcp_wroff_xtra, 0); 5715 if (mp1 == NULL) 5716 return (NULL); 5717 data_length = 0; 5718 5719 /* 5720 * Note that tcp_mss has been adjusted to take into account the 5721 * timestamp option if applicable. Because SACK options do not 5722 * appear in every TCP segments and they are of variable lengths, 5723 * they cannot be included in tcp_mss. 
Thus we need to calculate
5724 * the actual segment length when we need to send a segment which
5725 * includes SACK options.
5726 */
5727 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
5728 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
5729 tcp->tcp_num_sack_blk);
5730 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) +
5731 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
5732 if (max_to_send + sack_opt_len > tcp->tcp_mss)
5733 max_to_send -= sack_opt_len;
5734 }
5735
5736 if (offset != NULL) {
5737 off = *offset;
5738 /* We use offset as an indicator that end_mp is not NULL. */
5739 *end_mp = NULL;
5740 }
5741 for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) {
5742 /* This could be faster with cooperation from downstream */
5743 if (mp2 != mp1 && !sendall &&
5744 data_length + (int)(mp->b_wptr - mp->b_rptr) >
5745 max_to_send)
5746 /*
5747 * Don't send the next mblk since the whole mblk
5748 * does not fit.
5749 */
5750 break;
5751 mp2->b_cont = dupb(mp);
5752 mp2 = mp2->b_cont;
5753 if (mp2 == NULL) {
5754 freemsg(mp1);
5755 return (NULL);
5756 }
5757 mp2->b_rptr += off;
5758 assert((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <=
5759 (uintptr_t)INT_MAX);
5760
5761 data_length += (int)(mp2->b_wptr - mp2->b_rptr);
5762 if (data_length > max_to_send) {
5763 mp2->b_wptr -= data_length - max_to_send;
5764 data_length = max_to_send;
5765 off = mp2->b_wptr - mp->b_rptr;
5766 break;
5767 } else {
5768 off = 0;
5769 }
5770 }
5771 if (offset != NULL) {
5772 *offset = off;
5773 *end_mp = mp;
5774 }
5775 if (seg_len != NULL) {
5776 *seg_len = data_length;
5777 }
5778
5779 rptr = mp1->b_rptr + tcp_wroff_xtra;
5780 mp1->b_rptr = rptr;
5781 mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len;
5782 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len);
5783 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len];
5784 U32_TO_ABE32(seq, tcph->th_seq);
5785
5786 /*
5787 * Using tcp_unsent to determine whether to set the PUSH bit assumes
5788 * that this function was called from tcp_wput_data. Thus, when called
5789 * to retransmit data the setting of the PUSH bit may appear somewhat
5790 * random in that it might get set when it should not. This
5791 * should not pose any performance issues.
5792 */
5793 if (data_length != 0 && (tcp->tcp_unsent == 0 ||
5794 tcp->tcp_unsent == data_length)) {
5795 flags = TH_ACK | TH_PUSH;
5796 } else {
5797 flags = TH_ACK;
5798 }
5799
5800 if (tcp->tcp_ecn_ok) {
5801 if (tcp->tcp_ecn_echo_on)
5802 flags |= TH_ECE;
5803
5804 /*
5805 * Only set ECT bit and ECN_CWR if a segment contains new data.
5806 * There is no TCP flow control for non-data segments, and
5807 * only data segments are transmitted reliably.
5808 */
5809 if (data_length > 0 && !rexmit) {
5810 SET_ECT(tcp, rptr);
5811 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) {
5812 flags |= TH_CWR;
5813 tcp->tcp_ecn_cwr_sent = B_TRUE;
5814 }
5815 }
5816 }
5817
5818 if (tcp->tcp_valid_bits) {
5819 uint32_t u1;
5820
5821 if ((tcp->tcp_valid_bits & TCP_ISS_VALID) &&
5822 seq == tcp->tcp_iss) {
5823 uchar_t *wptr;
5824
5825 /*
5826 * Tack on the MSS option. It is always needed
5827 * for both active and passive open.
5828 */
5829 wptr = mp1->b_wptr;
5830 wptr[0] = TCPOPT_MAXSEG;
5831 wptr[1] = TCPOPT_MAXSEG_LEN;
5832 wptr += 2;
5833 /*
5834 * MSS option value should be interface MTU - MIN
5835 * TCP/IP header.
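 *
 * For example, with an effective interface MTU of 1500 bytes this
 * works out to 1500 - 20 - 20 = 1460, the familiar Ethernet MSS.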
5836 */ 5837 u1 = tcp->tcp_if_mtu - IP_SIMPLE_HDR_LENGTH - 5838 TCP_MIN_HEADER_LENGTH; 5839 U16_TO_BE16(u1, wptr); 5840 mp1->b_wptr = wptr + 2; 5841 /* Update the offset to cover the additional word */ 5842 tcph->th_offset_and_rsrvd[0] += (1 << 4); 5843 5844 /* 5845 * Note that the following way of filling in 5846 * TCP options is not optimal. Some NOPs can 5847 * be saved. But there is no need at this time 5848 * to optimize it. When it is needed, we will 5849 * do it. 5850 */ 5851 switch (tcp->tcp_state) { 5852 case TCPS_SYN_SENT: 5853 flags = TH_SYN; 5854 5855 if (tcp->tcp_snd_ws_ok) { 5856 wptr = mp1->b_wptr; 5857 wptr[0] = TCPOPT_NOP; 5858 wptr[1] = TCPOPT_WSCALE; 5859 wptr[2] = TCPOPT_WS_LEN; 5860 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 5861 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 5862 tcph->th_offset_and_rsrvd[0] += 5863 (1 << 4); 5864 } 5865 5866 if (tcp->tcp_snd_ts_ok) { 5867 uint32_t llbolt; 5868 5869 llbolt = prom_gettime(); 5870 wptr = mp1->b_wptr; 5871 wptr[0] = TCPOPT_NOP; 5872 wptr[1] = TCPOPT_NOP; 5873 wptr[2] = TCPOPT_TSTAMP; 5874 wptr[3] = TCPOPT_TSTAMP_LEN; 5875 wptr += 4; 5876 U32_TO_BE32(llbolt, wptr); 5877 wptr += 4; 5878 assert(tcp->tcp_ts_recent == 0); 5879 U32_TO_BE32(0L, wptr); 5880 mp1->b_wptr += TCPOPT_REAL_TS_LEN; 5881 tcph->th_offset_and_rsrvd[0] += 5882 (3 << 4); 5883 } 5884 5885 if (tcp->tcp_snd_sack_ok) { 5886 wptr = mp1->b_wptr; 5887 wptr[0] = TCPOPT_NOP; 5888 wptr[1] = TCPOPT_NOP; 5889 wptr[2] = TCPOPT_SACK_PERMITTED; 5890 wptr[3] = TCPOPT_SACK_OK_LEN; 5891 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 5892 tcph->th_offset_and_rsrvd[0] += 5893 (1 << 4); 5894 } 5895 5896 /* 5897 * Set up all the bits to tell the other side 5898 * we are ECN capable. 5899 */ 5900 if (tcp->tcp_ecn_ok) { 5901 flags |= (TH_ECE | TH_CWR); 5902 } 5903 break; 5904 case TCPS_SYN_RCVD: 5905 flags |= TH_SYN; 5906 5907 if (tcp->tcp_snd_ws_ok) { 5908 wptr = mp1->b_wptr; 5909 wptr[0] = TCPOPT_NOP; 5910 wptr[1] = TCPOPT_WSCALE; 5911 wptr[2] = TCPOPT_WS_LEN; 5912 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 5913 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 5914 tcph->th_offset_and_rsrvd[0] += (1 << 4); 5915 } 5916 5917 if (tcp->tcp_snd_sack_ok) { 5918 wptr = mp1->b_wptr; 5919 wptr[0] = TCPOPT_NOP; 5920 wptr[1] = TCPOPT_NOP; 5921 wptr[2] = TCPOPT_SACK_PERMITTED; 5922 wptr[3] = TCPOPT_SACK_OK_LEN; 5923 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 5924 tcph->th_offset_and_rsrvd[0] += 5925 (1 << 4); 5926 } 5927 5928 /* 5929 * If the other side is ECN capable, reply 5930 * that we are also ECN capable.
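 * Note the asymmetry with the TCPS_SYN_SENT case above: the
 * active opener advertises with both TH_ECE and TH_CWR in its
 * SYN, while the passive side replies with TH_ECE alone.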
5931 */ 5932 if (tcp->tcp_ecn_ok) { 5933 flags |= TH_ECE; 5934 } 5935 break; 5936 default: 5937 break; 5938 } 5939 /* allocb() of an adequate mblk assures space */ 5940 assert((uintptr_t)(mp1->b_wptr - 5941 mp1->b_rptr) <= (uintptr_t)INT_MAX); 5942 if (flags & TH_SYN) 5943 BUMP_MIB(tcp_mib.tcpOutControl); 5944 } 5945 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 5946 (seq + data_length) == tcp->tcp_fss) { 5947 if (!tcp->tcp_fin_acked) { 5948 flags |= TH_FIN; 5949 BUMP_MIB(tcp_mib.tcpOutControl); 5950 } 5951 if (!tcp->tcp_fin_sent) { 5952 tcp->tcp_fin_sent = B_TRUE; 5953 switch (tcp->tcp_state) { 5954 case TCPS_SYN_RCVD: 5955 case TCPS_ESTABLISHED: 5956 tcp->tcp_state = TCPS_FIN_WAIT_1; 5957 break; 5958 case TCPS_CLOSE_WAIT: 5959 tcp->tcp_state = TCPS_LAST_ACK; 5960 break; 5961 } 5962 if (tcp->tcp_suna == tcp->tcp_snxt) 5963 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 5964 tcp->tcp_snxt = tcp->tcp_fss + 1; 5965 } 5966 } 5967 } 5968 tcph->th_flags[0] = (uchar_t)flags; 5969 tcp->tcp_rack = tcp->tcp_rnxt; 5970 tcp->tcp_rack_cnt = 0; 5971 5972 if (tcp->tcp_snd_ts_ok) { 5973 if (tcp->tcp_state != TCPS_SYN_SENT) { 5974 uint32_t llbolt = prom_gettime(); 5975 5976 U32_TO_BE32(llbolt, 5977 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 5978 U32_TO_BE32(tcp->tcp_ts_recent, 5979 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 5980 } 5981 } 5982 5983 if (num_sack_blk > 0) { 5984 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 5985 sack_blk_t *tmp; 5986 int32_t i; 5987 5988 wptr[0] = TCPOPT_NOP; 5989 wptr[1] = TCPOPT_NOP; 5990 wptr[2] = TCPOPT_SACK; 5991 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 5992 sizeof (sack_blk_t); 5993 wptr += TCPOPT_REAL_SACK_LEN; 5994 5995 tmp = tcp->tcp_sack_list; 5996 for (i = 0; i < num_sack_blk; i++) { 5997 U32_TO_BE32(tmp[i].begin, wptr); 5998 wptr += sizeof (tcp_seq); 5999 U32_TO_BE32(tmp[i].end, wptr); 6000 wptr += sizeof (tcp_seq); 6001 } 6002 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); 6003 } 6004 assert((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 6005 data_length += (int)(mp1->b_wptr - rptr); 6006 if (tcp->tcp_ipversion == IPV4_VERSION) 6007 ((struct ip *)rptr)->ip_len = htons(data_length); 6008 6009 /* 6010 * Performance hit! We need to pull up the whole message 6011 * in order to compute the checksum and for the MAC output routine. 6012 */ 6013 if (mp1->b_cont != NULL) { 6014 int mp_size; 6015 #ifdef DEBUG 6016 printf("Multiple mblk %d\n", msgdsize(mp1)); 6017 #endif 6018 if ((new_mp = allocb(msgdsize(mp1) + tcp_wroff_xtra, 0)) == NULL) { freemsg(mp1); return (NULL); } 6019 new_mp->b_rptr += tcp_wroff_xtra; 6020 new_mp->b_wptr = new_mp->b_rptr; 6021 /* Walk the chain via mp2 so the original can be freed afterwards. */ mp2 = mp1; while (mp2 != NULL) { 6022 mp_size = mp2->b_wptr - mp2->b_rptr; 6023 bcopy(mp2->b_rptr, new_mp->b_wptr, mp_size); 6024 new_mp->b_wptr += mp_size; 6025 mp2 = mp2->b_cont; 6026 } 6027 freemsg(mp1); 6028 mp1 = new_mp; 6029 } 6030 tcp_set_cksum(mp1); 6031 /* Fill in the TTL field as it is 0 in the header template.
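 * tcp_set_cksum() sums a pseudo-header that overlays the IP header
 * starting at the TTL byte and relies on the TTL still being zero,
 * so the real TTL can only be filled in afterwards.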
*/ 6032 ((struct ip *)mp1->b_rptr)->ip_ttl = (uint8_t)tcp_ipv4_ttl; 6033 6034 return (mp1); 6035 } 6036 6037 /* 6038 * Generate a "no listener here" reset in response to the 6039 * connection request contained within 'mp'. 6040 */ 6041 static void 6042 tcp_xmit_listeners_reset(int sock_id, mblk_t *mp, uint_t ip_hdr_len) 6043 { 6044 uchar_t *rptr; 6045 uint32_t seg_len; 6046 tcph_t *tcph; 6047 uint32_t seg_seq; 6048 uint32_t seg_ack; 6049 uint_t flags; 6050 6051 rptr = mp->b_rptr; 6052 6053 tcph = (tcph_t *)&rptr[ip_hdr_len]; 6054 seg_seq = BE32_TO_U32(tcph->th_seq); 6055 seg_ack = BE32_TO_U32(tcph->th_ack); 6056 flags = tcph->th_flags[0]; 6057 6058 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); 6059 if (flags & TH_RST) { 6060 freeb(mp); 6061 } else if (flags & TH_ACK) { 6062 tcp_xmit_early_reset("no tcp, reset", 6063 sock_id, mp, seg_ack, 0, TH_RST, ip_hdr_len); 6064 } else { 6065 if (flags & TH_SYN) 6066 seg_len++; 6067 tcp_xmit_early_reset("no tcp, reset/ack", sock_id, 6068 mp, 0, seg_seq + seg_len, 6069 TH_RST | TH_ACK, ip_hdr_len); 6070 } 6071 } 6072 6073 /* Non-overlapping byte exchanger. */ 6074 static void 6075 tcp_xchg(uchar_t *a, uchar_t *b, int len) 6076 { 6077 uchar_t uch; 6078 6079 while (len-- > 0) { 6080 uch = a[len]; 6081 a[len] = b[len]; 6082 b[len] = uch; 6083 } 6084 } 6085 6086 /* 6087 * Generate a reset based on an inbound packet for which there is no active 6088 * tcp state that we can find. 6089 */ 6090 static void 6091 tcp_xmit_early_reset(char *str, int sock_id, mblk_t *mp, uint32_t seq, 6092 uint32_t ack, int ctl, uint_t ip_hdr_len) 6093 { 6094 struct ip *iph = NULL; 6095 ushort_t len; 6096 tcph_t *tcph; 6097 int i; 6098 ipaddr_t addr; 6099 mblk_t *new_mp; 6100 6101 if (str != NULL) { 6102 dprintf("tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 6103 "flags 0x%x\n", str, seq, ack, ctl); 6104 } 6105 6106 /* 6107 * We skip reversing the source route here. 6108 * (For now we replace all IP options with EOL.) 6109 */ 6110 iph = (struct ip *)mp->b_rptr; 6111 for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 6112 mp->b_rptr[i] = IPOPT_EOL; 6113 /* 6114 * Make sure that the src address is not a limited broadcast 6115 * address. Complete broadcast address checking for the 6116 * src address is not possible, since we don't know the 6117 * netmask of the src addr. 6118 * No check for the destination address is done, since 6119 * IP will not pass up a packet with a broadcast dest address 6120 * to TCP. 6121 */ 6122 if (iph->ip_src.s_addr == INADDR_ANY || 6123 iph->ip_src.s_addr == INADDR_BROADCAST) { 6124 freemsg(mp); 6125 return; 6126 } 6127 6128 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6129 if (tcph->th_flags[0] & TH_RST) { 6130 freemsg(mp); 6131 return; 6132 } 6133 /* 6134 * Now copy the original header to a new buffer. The reason 6135 * for doing this is that we need to put extra room before 6136 * the header for the MAC layer address. The original mblk 6137 * does not have this extra headroom.
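 * The tcp_wroff_xtra bytes reserved below leave room for the MAC
 * header to be prepended later without yet another copy.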
6138 */ 6139 len = ip_hdr_len + sizeof (tcph_t); 6140 if ((new_mp = allocb(len + tcp_wroff_xtra, 0)) == NULL) { 6141 freemsg(mp); 6142 return; 6143 } 6144 new_mp->b_rptr += tcp_wroff_xtra; 6145 bcopy(mp->b_rptr, new_mp->b_rptr, len); 6146 new_mp->b_wptr = new_mp->b_rptr + len; 6147 freemsg(mp); 6148 mp = new_mp; 6149 iph = (struct ip *)mp->b_rptr; 6150 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 6151 6152 tcph->th_offset_and_rsrvd[0] = (5 << 4); 6153 tcp_xchg(tcph->th_fport, tcph->th_lport, 2); 6154 U32_TO_BE32(ack, tcph->th_ack); 6155 U32_TO_BE32(seq, tcph->th_seq); 6156 U16_TO_BE16(0, tcph->th_win); 6157 bzero(tcph->th_sum, sizeof (int16_t)); 6158 tcph->th_flags[0] = (uint8_t)ctl; 6159 if (ctl & TH_RST) { 6160 BUMP_MIB(tcp_mib.tcpOutRsts); 6161 BUMP_MIB(tcp_mib.tcpOutControl); 6162 } 6163 6164 iph->ip_len = htons(len); 6165 /* Swap addresses */ 6166 addr = iph->ip_src.s_addr; 6167 iph->ip_src = iph->ip_dst; 6168 iph->ip_dst.s_addr = addr; 6169 iph->ip_id = 0; 6170 iph->ip_ttl = 0; 6171 tcp_set_cksum(mp); 6172 iph->ip_ttl = (uint8_t)tcp_ipv4_ttl; 6173 6174 /* Dump the packet when debugging. */ 6175 TCP_DUMP_PACKET("tcp_xmit_early_reset", mp); 6176 (void) ipv4_tcp_output(sock_id, mp); 6177 freemsg(mp); 6178 } 6179 6180 static void 6181 tcp_set_cksum(mblk_t *mp) 6182 { 6183 struct ip *iph; 6184 tcpha_t *tcph; 6185 int len; 6186 6187 iph = (struct ip *)mp->b_rptr; 6188 tcph = (tcpha_t *)(iph + 1); 6189 len = ntohs(iph->ip_len); 6190 /* 6191 * Calculate the TCP checksum. We need to include the pseudo-header, 6192 * which is formed in place over the real IP header starting at the TTL field: the TTL byte must still be zero, and the IP checksum field temporarily holds the TCP segment length below. 6193 */ 6194 iph->ip_sum = htons(len - IP_SIMPLE_HDR_LENGTH); 6195 tcph->tha_sum = 0; 6196 tcph->tha_sum = tcp_cksum((uint16_t *)&(iph->ip_ttl), 6197 len - IP_SIMPLE_HDR_LENGTH + 12); 6198 iph->ip_sum = 0; 6199 } 6200 6201 static uint16_t 6202 tcp_cksum(uint16_t *buf, uint32_t len) 6203 { 6204 /* 6205 * Compute the Internet checksum for "len" bytes 6206 * beginning at location "buf". 6207 */ 6208 int32_t sum = 0; 6209 6210 while (len > 1) { 6211 /* This is the inner loop */ 6212 sum += *buf++; 6213 len -= 2; 6214 } 6215 6216 /* Add left-over byte, if any */ 6217 if (len > 0) 6218 sum += *(unsigned char *)buf * 256; 6219 6220 /* Fold 32-bit sum to 16 bits */ 6221 while (sum >> 16) 6222 sum = (sum & 0xffff) + (sum >> 16); 6223 6224 return ((uint16_t)~sum); 6225 } 6226 6227 /* 6228 * Type three generator adapted from the random() function in 4.4 BSD: 6229 */ 6230 6231 /* 6232 * Copyright (c) 1983, 1993 6233 * The Regents of the University of California. All rights reserved. 6234 * 6235 * Redistribution and use in source and binary forms, with or without 6236 * modification, are permitted provided that the following conditions 6237 * are met: 6238 * 1. Redistributions of source code must retain the above copyright 6239 * notice, this list of conditions and the following disclaimer. 6240 * 2. Redistributions in binary form must reproduce the above copyright 6241 * notice, this list of conditions and the following disclaimer in the 6242 * documentation and/or other materials provided with the distribution. 6243 * 3. All advertising materials mentioning features or use of this software 6244 * must display the following acknowledgement: 6245 * This product includes software developed by the University of 6246 * California, Berkeley and its contributors. 6247 * 4.
Neither the name of the University nor the names of its contributors 6248 * may be used to endorse or promote products derived from this software 6249 * without specific prior written permission. 6250 * 6251 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 6252 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 6253 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 6254 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 6255 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 6256 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 6257 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 6258 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 6259 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 6260 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 6261 * SUCH DAMAGE. 6262 */ 6263 6264 /* Type 3 -- x**31 + x**3 + 1 */ 6265 #define DEG_3 31 6266 #define SEP_3 3 6267 6268 6269 /* Protected by tcp_random_lock */ 6270 static int tcp_randtbl[DEG_3 + 1]; 6271 6272 static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 6273 static int *tcp_random_rptr = &tcp_randtbl[1]; 6274 6275 static int *tcp_random_state = &tcp_randtbl[1]; 6276 static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 6277 6278 static void 6279 tcp_random_init(void) 6280 { 6281 int i; 6282 uint32_t hrt; 6283 uint32_t wallclock; 6284 uint32_t result; 6285 6286 /* 6287 * 6288 * XXX We don't have high resolution time in standalone... The 6289 * following is just an approximation of the scheme described below. 6290 * 6291 * Use high-res timer and current time for seed. Gethrtime() returns 6292 * a longlong, which may contain resolution down to nanoseconds. 6293 * The current time will either be a 32-bit or a 64-bit quantity. 6294 * XOR the two together in a 64-bit result variable. 6295 * Convert the result to a 32-bit value by multiplying the high-order 6296 * 32-bits by the low-order 32-bits. 6297 * 6298 * XXX We don't have gethrtime() in the PROM, nor a real wallclock.... 6299 */ 6300 6301 hrt = prom_gettime(); 6302 wallclock = (uint32_t)time(NULL); 6303 result = wallclock ^ hrt; 6304 tcp_random_state[0] = result; 6305 6306 for (i = 1; i < DEG_3; i++) 6307 tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 6308 + 12345; 6309 tcp_random_fptr = &tcp_random_state[SEP_3]; 6310 tcp_random_rptr = &tcp_random_state[0]; 6311 for (i = 0; i < 10 * DEG_3; i++) 6312 (void) tcp_random(); 6313 } 6314 6315 /* 6316 * tcp_random: Return a random number in the range [1, 128K]. 6317 * This range is selected to be approximately centered on TCP_ISS / 2, 6318 * and easy to compute. We get this value by generating a 32-bit random 6319 * number, selecting out the high-order 17 bits, and then adding one so 6320 * that we never return zero. 6321 */ 6322 static int 6323 tcp_random(void) 6324 { 6325 int i; 6326 6327 *tcp_random_fptr += *tcp_random_rptr; 6328 6329 /* 6330 * The high-order bits are more random than the low-order bits, 6331 * so we select out the high-order 17 bits and add one so that 6332 * we never return zero.
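 * (0x1ffff is 2^17 - 1, so the shifted-and-masked value lies in
 * [0, 128K - 1] and the final result in [1, 128K].)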
6333 */ 6334 i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 6335 if (++tcp_random_fptr >= tcp_random_end_ptr) { 6336 tcp_random_fptr = tcp_random_state; 6337 ++tcp_random_rptr; 6338 } else if (++tcp_random_rptr >= tcp_random_end_ptr) 6339 tcp_random_rptr = tcp_random_state; 6340 6341 return (i); 6342 } 6343 6344 /* 6345 * Generate the ISS, taking into account that NDD changes may happen 6346 * halfway through. (If the iss is not zero, set it.) 6347 */ 6348 static void 6349 tcp_iss_init(tcp_t *tcp) 6350 { 6351 tcp_iss_incr_extra += (ISS_INCR >> 1); 6352 tcp->tcp_iss = tcp_iss_incr_extra; 6353 tcp->tcp_iss += (prom_gettime() >> ISS_NSEC_SHT) + tcp_random(); 6354 tcp->tcp_valid_bits = TCP_ISS_VALID; 6355 tcp->tcp_fss = tcp->tcp_iss - 1; 6356 tcp->tcp_suna = tcp->tcp_iss; 6357 tcp->tcp_snxt = tcp->tcp_iss + 1; 6358 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 6359 tcp->tcp_csuna = tcp->tcp_snxt; 6360 } 6361 6362 /* 6363 * Diagnostic routine used to return a string associated with the tcp state. 6364 * Note that if the caller does not supply a buffer, it will use an internal 6365 * static string. This means that if multiple threads call this function at 6366 * the same time, output can be corrupted... Note also that this function 6367 * does not check the size of the supplied buffer. The caller has to make 6368 * sure that it is big enough. 6369 */ 6370 static char * 6371 tcp_display(tcp_t *tcp, char *sup_buf, char format) 6372 { 6373 char buf1[30]; 6374 static char priv_buf[INET_ADDRSTRLEN * 2 + 80]; 6375 char *buf; 6376 char *cp; 6377 char local_addrbuf[INET_ADDRSTRLEN]; 6378 char remote_addrbuf[INET_ADDRSTRLEN]; 6379 struct in_addr addr; 6380 6381 if (sup_buf != NULL) 6382 buf = sup_buf; 6383 else 6384 buf = priv_buf; 6385 6386 if (tcp == NULL) 6387 return ("NULL_TCP"); 6388 switch (tcp->tcp_state) { 6389 case TCPS_CLOSED: 6390 cp = "TCP_CLOSED"; 6391 break; 6392 case TCPS_IDLE: 6393 cp = "TCP_IDLE"; 6394 break; 6395 case TCPS_BOUND: 6396 cp = "TCP_BOUND"; 6397 break; 6398 case TCPS_LISTEN: 6399 cp = "TCP_LISTEN"; 6400 break; 6401 case TCPS_SYN_SENT: 6402 cp = "TCP_SYN_SENT"; 6403 break; 6404 case TCPS_SYN_RCVD: 6405 cp = "TCP_SYN_RCVD"; 6406 break; 6407 case TCPS_ESTABLISHED: 6408 cp = "TCP_ESTABLISHED"; 6409 break; 6410 case TCPS_CLOSE_WAIT: 6411 cp = "TCP_CLOSE_WAIT"; 6412 break; 6413 case TCPS_FIN_WAIT_1: 6414 cp = "TCP_FIN_WAIT_1"; 6415 break; 6416 case TCPS_CLOSING: 6417 cp = "TCP_CLOSING"; 6418 break; 6419 case TCPS_LAST_ACK: 6420 cp = "TCP_LAST_ACK"; 6421 break; 6422 case TCPS_FIN_WAIT_2: 6423 cp = "TCP_FIN_WAIT_2"; 6424 break; 6425 case TCPS_TIME_WAIT: 6426 cp = "TCP_TIME_WAIT"; 6427 break; 6428 default: 6429 (void) sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 6430 cp = buf1; 6431 break; 6432 } 6433 switch (format) { 6434 case DISP_ADDR_AND_PORT: 6435 /* 6436 * Note that we use the remote address in the tcp_t 6437 * structure. This means that it will print out 6438 * the real destination address, not the next hop's 6439 * address if source routing is used.
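 * The resulting string has the form "[local.port, remote.port] STATE".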
6440 */ 6441 addr.s_addr = tcp->tcp_bound_source; 6442 bcopy(inet_ntoa(addr), local_addrbuf, sizeof (local_addrbuf)); 6443 addr.s_addr = tcp->tcp_remote; 6444 bcopy(inet_ntoa(addr), remote_addrbuf, sizeof (remote_addrbuf)); 6445 (void) snprintf(buf, sizeof (priv_buf), "[%s.%u, %s.%u] %s", 6446 local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, 6447 ntohs(tcp->tcp_fport), cp); 6448 break; 6449 case DISP_PORT_ONLY: 6450 default: 6451 (void) snprintf(buf, sizeof (priv_buf), "[%u, %u] %s", 6452 ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); 6453 break; 6454 } 6455 6456 return (buf); 6457 } 6458 6459 /* 6460 * Add a new piece to the tcp reassembly queue. If the gap at the beginning 6461 * is filled, return as much as we can. The message passed in may be 6462 * multi-part, chained using b_cont. "start" is the starting sequence 6463 * number for this piece. 6464 */ 6465 static mblk_t * 6466 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 6467 { 6468 uint32_t end; 6469 mblk_t *mp1; 6470 mblk_t *mp2; 6471 mblk_t *next_mp; 6472 uint32_t u1; 6473 6474 /* Walk through all the new pieces. */ 6475 do { 6476 assert((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 6477 (uintptr_t)INT_MAX); 6478 end = start + (int)(mp->b_wptr - mp->b_rptr); 6479 next_mp = mp->b_cont; 6480 if (start == end) { 6481 /* Empty. Blast it. */ 6482 freeb(mp); 6483 continue; 6484 } 6485 mp->b_cont = NULL; 6486 TCP_REASS_SET_SEQ(mp, start); 6487 TCP_REASS_SET_END(mp, end); 6488 mp1 = tcp->tcp_reass_tail; 6489 if (!mp1) { 6490 tcp->tcp_reass_tail = mp; 6491 tcp->tcp_reass_head = mp; 6492 BUMP_MIB(tcp_mib.tcpInDataUnorderSegs); 6493 UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start); 6494 continue; 6495 } 6496 /* New stuff completely beyond tail? */ 6497 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 6498 /* Link it on end. */ 6499 mp1->b_cont = mp; 6500 tcp->tcp_reass_tail = mp; 6501 BUMP_MIB(tcp_mib.tcpInDataUnorderSegs); 6502 UPDATE_MIB(tcp_mib.tcpInDataUnorderBytes, end - start); 6503 continue; 6504 } 6505 mp1 = tcp->tcp_reass_head; 6506 u1 = TCP_REASS_SEQ(mp1); 6507 /* New stuff at the front? */ 6508 if (SEQ_LT(start, u1)) { 6509 /* Yes... Check for overlap. */ 6510 mp->b_cont = mp1; 6511 tcp->tcp_reass_head = mp; 6512 tcp_reass_elim_overlap(tcp, mp); 6513 continue; 6514 } 6515 /* 6516 * The new piece fits somewhere between the head and tail. 6517 * We find our slot, where mp1 precedes us and mp2 trails. 6518 */ 6519 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 6520 u1 = TCP_REASS_SEQ(mp2); 6521 if (SEQ_LEQ(start, u1)) 6522 break; 6523 } 6524 /* Link ourselves in */ 6525 mp->b_cont = mp2; 6526 mp1->b_cont = mp; 6527 6528 /* Trim overlap with following mblk(s) first */ 6529 tcp_reass_elim_overlap(tcp, mp); 6530 6531 /* Trim overlap with preceding mblk */ 6532 tcp_reass_elim_overlap(tcp, mp1); 6533 6534 } while (start = end, mp = next_mp); 6535 mp1 = tcp->tcp_reass_head; 6536 /* Anything ready to go? 
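 * Only if the first queued segment starts exactly at tcp_rnxt;
 * otherwise the gap at the front is still open and nothing can be
 * passed up yet.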
*/ 6537 if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 6538 return (NULL); 6539 /* Eat what we can off the queue */ 6540 for (;;) { 6541 mp = mp1->b_cont; 6542 end = TCP_REASS_END(mp1); 6543 TCP_REASS_SET_SEQ(mp1, 0); 6544 TCP_REASS_SET_END(mp1, 0); 6545 if (!mp) { 6546 tcp->tcp_reass_tail = NULL; 6547 break; 6548 } 6549 if (end != TCP_REASS_SEQ(mp)) { 6550 mp1->b_cont = NULL; 6551 break; 6552 } 6553 mp1 = mp; 6554 } 6555 mp1 = tcp->tcp_reass_head; 6556 tcp->tcp_reass_head = mp; 6557 return (mp1); 6558 } 6559 6560 /* Eliminate any overlap that mp may have over later mblks */ 6561 static void 6562 tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 6563 { 6564 uint32_t end; 6565 mblk_t *mp1; 6566 uint32_t u1; 6567 6568 end = TCP_REASS_END(mp); 6569 while ((mp1 = mp->b_cont) != NULL) { 6570 u1 = TCP_REASS_SEQ(mp1); 6571 if (!SEQ_GT(end, u1)) 6572 break; 6573 if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 6574 mp->b_wptr -= end - u1; 6575 TCP_REASS_SET_END(mp, u1); 6576 BUMP_MIB(tcp_mib.tcpInDataPartDupSegs); 6577 UPDATE_MIB(tcp_mib.tcpInDataPartDupBytes, end - u1); 6578 break; 6579 } 6580 mp->b_cont = mp1->b_cont; 6581 freeb(mp1); 6582 BUMP_MIB(tcp_mib.tcpInDataDupSegs); 6583 UPDATE_MIB(tcp_mib.tcpInDataDupBytes, end - u1); 6584 } 6585 if (!mp1) 6586 tcp->tcp_reass_tail = mp; 6587 } 6588 6589 /* 6590 * Remove a connection from the list of detached TIME_WAIT connections. 6591 */ 6592 static void 6593 tcp_time_wait_remove(tcp_t *tcp) 6594 { 6595 if (tcp->tcp_time_wait_expire == 0) { 6596 assert(tcp->tcp_time_wait_next == NULL); 6597 assert(tcp->tcp_time_wait_prev == NULL); 6598 return; 6599 } 6600 assert(tcp->tcp_state == TCPS_TIME_WAIT); 6601 if (tcp == tcp_time_wait_head) { 6602 assert(tcp->tcp_time_wait_prev == NULL); 6603 tcp_time_wait_head = tcp->tcp_time_wait_next; 6604 if (tcp_time_wait_head != NULL) { 6605 tcp_time_wait_head->tcp_time_wait_prev = NULL; 6606 } else { 6607 tcp_time_wait_tail = NULL; 6608 } 6609 } else if (tcp == tcp_time_wait_tail) { 6610 assert(tcp != tcp_time_wait_head); 6611 assert(tcp->tcp_time_wait_next == NULL); 6612 tcp_time_wait_tail = tcp->tcp_time_wait_prev; 6613 assert(tcp_time_wait_tail != NULL); 6614 tcp_time_wait_tail->tcp_time_wait_next = NULL; 6615 } else { 6616 assert(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 6617 assert(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 6618 tcp->tcp_time_wait_prev->tcp_time_wait_next = 6619 tcp->tcp_time_wait_next; 6620 tcp->tcp_time_wait_next->tcp_time_wait_prev = 6621 tcp->tcp_time_wait_prev; 6622 } 6623 tcp->tcp_time_wait_next = NULL; 6624 tcp->tcp_time_wait_prev = NULL; 6625 tcp->tcp_time_wait_expire = 0; 6626 } 6627 6628 /* 6629 * Add a connection to the list of detached TIME_WAIT connections 6630 * and set its time to expire ... 6631 */ 6632 static void 6633 tcp_time_wait_append(tcp_t *tcp) 6634 { 6635 tcp->tcp_time_wait_expire = prom_gettime() + tcp_time_wait_interval; 6636 if (tcp->tcp_time_wait_expire == 0) 6637 tcp->tcp_time_wait_expire = 1; 6638 6639 if (tcp_time_wait_head == NULL) { 6640 assert(tcp_time_wait_tail == NULL); 6641 tcp_time_wait_head = tcp; 6642 } else { 6643 assert(tcp_time_wait_tail != NULL); 6644 assert(tcp_time_wait_tail->tcp_state == TCPS_TIME_WAIT); 6645 tcp_time_wait_tail->tcp_time_wait_next = tcp; 6646 tcp->tcp_time_wait_prev = tcp_time_wait_tail; 6647 } 6648 tcp_time_wait_tail = tcp; 6649 6650 /* for ndd stats about compression */ 6651 tcp_cum_timewait++; 6652 } 6653 6654 /* 6655 * Periodic qtimeout routine run on the default queue. 6656 * Performs 2 functions. 6657 * 1. 
Does TIME_WAIT compression on all recently added tcps. List 6658 * traversal is done backwards from the tail. 6659 * 2. Blows away all tcps whose TIME_WAIT has expired. List traversal 6660 * is done forwards from the head. 6661 */ 6662 void 6663 tcp_time_wait_collector(void) 6664 { 6665 tcp_t *tcp; 6666 uint32_t now; 6667 6668 /* 6669 * In order to reap time waits reliably, we should use a 6670 * source of time that is not adjustable by the user. 6671 */ 6672 now = prom_gettime(); 6673 while ((tcp = tcp_time_wait_head) != NULL) { 6674 /* 6675 * Compare times using modular arithmetic, since 6676 * the time value can wrap around. 6677 */ 6678 if ((int32_t)(now - tcp->tcp_time_wait_expire) < 0) { 6679 break; 6680 } 6681 /* 6682 * Note that the err must be 0 as there is no socket 6683 * associated with this TCP... 6684 */ 6685 (void) tcp_clean_death(-1, tcp, 0); 6686 } 6687 /* Schedule the next run time. */ 6688 tcp_time_wait_runtime = prom_gettime() + 10000; 6689 } 6690 6691 void 6692 tcp_time_wait_report(void) 6693 { 6694 tcp_t *tcp; 6695 6696 printf("Current time %u\n", prom_gettime()); 6697 for (tcp = tcp_time_wait_head; tcp != NULL; 6698 tcp = tcp->tcp_time_wait_next) { 6699 printf("%s expires at %u\n", tcp_display(tcp, NULL, 6700 DISP_ADDR_AND_PORT), tcp->tcp_time_wait_expire); 6701 } 6702 } 6703 6704 /* 6705 * Send up all messages queued on tcp_rcv_list by coalescing them 6706 * into a single inetgram on the socket's input queue. 6707 */ 6708 static void 6709 tcp_rcv_drain(int sock_id, tcp_t *tcp) 6710 { 6711 mblk_t *mp; 6712 struct inetgram *in_gram; 6713 mblk_t *in_mp; 6714 int len; 6715 6716 /* Don't drain if the app has not finished reading all the data. */ 6717 if (sockets[sock_id].so_rcvbuf <= 0) 6718 return; 6719 6720 /* We might have come here just to update the rwnd. */ 6721 if (tcp->tcp_rcv_list == NULL) 6722 goto win_update; 6723 6724 if ((in_gram = (struct inetgram *)bkmem_zalloc( 6725 sizeof (struct inetgram))) == NULL) { 6726 return; 6727 } 6728 if ((in_mp = allocb(tcp->tcp_rcv_cnt, 0)) == NULL) { 6729 bkmem_free((caddr_t)in_gram, sizeof (struct inetgram)); 6730 return; 6731 } 6732 in_gram->igm_level = APP_LVL; 6733 in_gram->igm_mp = in_mp; 6734 in_gram->igm_id = 0; 6735 6736 while ((mp = tcp->tcp_rcv_list) != NULL) { 6737 tcp->tcp_rcv_list = mp->b_cont; 6738 len = mp->b_wptr - mp->b_rptr; 6739 bcopy(mp->b_rptr, in_mp->b_wptr, len); 6740 in_mp->b_wptr += len; 6741 freeb(mp); 6742 } 6743 6744 tcp->tcp_rcv_last_tail = NULL; 6745 tcp->tcp_rcv_cnt = 0; 6746 add_grams(&sockets[sock_id].inq, in_gram); 6747 6748 /* This means that so_rcvbuf can go below 0. */ 6749 sockets[sock_id].so_rcvbuf -= in_mp->b_wptr - in_mp->b_rptr; 6750 win_update: 6751 /* 6752 * Increase the receive window to the max. But we need to do receiver 6753 * SWS avoidance. This means that we need to check that the increase 6754 * of the receive window is at least 1 MSS. 6755 */ 6756 if (sockets[sock_id].so_rcvbuf > 0 && 6757 (tcp->tcp_rwnd_max - tcp->tcp_rwnd >= tcp->tcp_mss)) { 6758 tcp->tcp_rwnd = tcp->tcp_rwnd_max; 6759 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 6760 tcp->tcp_tcph->th_win); 6761 } 6762 } 6763 6764 /* 6765 * Wrapper for recvfrom to call. 6766 */ 6767 void 6768 tcp_rcv_drain_sock(int sock_id) 6769 { 6770 tcp_t *tcp; 6771 if ((tcp = sockets[sock_id].pcb) == NULL) 6772 return; 6773 tcp_rcv_drain(sock_id, tcp); 6774 } 6775 6776 /* 6777 * If the inq == NULL and the tcp_rcv_list != NULL, we have data that 6778 * recvfrom could read.
Place a magic message in the inq to let recvfrom 6779 * know that it needs to call tcp_rcv_drain_sock to pullup the data. 6780 */ 6781 static void 6782 tcp_drain_needed(int sock_id, tcp_t *tcp) 6783 { 6784 struct inetgram *in_gram; 6785 #ifdef DEBUG 6786 printf("tcp_drain_needed: inq %x, tcp_rcv_list %x\n", 6787 sockets[sock_id].inq, tcp->tcp_rcv_list); 6788 #endif 6789 if ((sockets[sock_id].inq != NULL) || 6790 (tcp->tcp_rcv_list == NULL)) 6791 return; 6792 6793 if ((in_gram = (struct inetgram *)bkmem_zalloc( 6794 sizeof (struct inetgram))) == NULL) 6795 return; 6796 6797 in_gram->igm_level = APP_LVL; 6798 in_gram->igm_mp = NULL; 6799 in_gram->igm_id = TCP_CALLB_MAGIC_ID; 6800 6801 add_grams(&sockets[sock_id].inq, in_gram); 6802 } 6803 6804 /* 6805 * Queue data on tcp_rcv_list, appending new mblks to the b_cont chain. 6806 * tcp_rcv_last_tail points at the last mblk of the list so new data 6807 * can be linked without walking the whole chain; the receive window 6808 * is shrunk by the amount queued, and the new window is advertised 6809 * in the header template. 6810 */ 6811 static void 6812 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) 6813 { 6814 assert(seg_len == msgdsize(mp)); 6815 if (tcp->tcp_rcv_list == NULL) { 6816 tcp->tcp_rcv_list = mp; 6817 } else { 6818 tcp->tcp_rcv_last_tail->b_cont = mp; 6819 } 6820 while (mp->b_cont) 6821 mp = mp->b_cont; 6822 tcp->tcp_rcv_last_tail = mp; 6823 tcp->tcp_rcv_cnt += seg_len; 6824 tcp->tcp_rwnd -= seg_len; 6825 #ifdef DEBUG 6826 printf("tcp_rcv_enqueue rwnd %d\n", tcp->tcp_rwnd); 6827 #endif 6828 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 6829 } 6830 6831 /* The minimum smoothed mean deviation used in the RTO calculation. */ 6832 #define TCP_SD_MIN 400 6833 6834 /* 6835 * Set the RTO for this connection. The formula is from Jacobson and Karels' 6836 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 6837 * are the same as those in Appendix A.2 of that paper. 6838 * 6839 * m = new measurement 6840 * sa = smoothed RTT average (8 * average estimates). 6841 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). 6842 */ 6843 static void 6844 tcp_set_rto(tcp_t *tcp, int32_t rtt) 6845 { 6846 int32_t m = rtt; 6847 uint32_t sa = tcp->tcp_rtt_sa; 6848 uint32_t sv = tcp->tcp_rtt_sd; 6849 uint32_t rto; 6850 6851 BUMP_MIB(tcp_mib.tcpRttUpdate); 6852 tcp->tcp_rtt_update++; 6853 6854 /* A nonzero tcp_rtt_sa means there is an existing estimate to update. */ 6855 if (sa != 0) { 6856 /* 6857 * Update average estimator: 6858 * new rtt = 7/8 old rtt + 1/8 Error 6859 */ 6860 6861 /* m is now Error in estimate. */ 6862 m -= sa >> 3; 6863 if ((int32_t)(sa += m) <= 0) { 6864 /* 6865 * Don't allow the smoothed average to be negative. 6866 * We use 0 to denote reinitialization of the 6867 * variables. 6868 */ 6869 sa = 1; 6870 } 6871 6872 /* 6873 * Update deviation estimator: 6874 * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev) 6875 */ 6876 if (m < 0) 6877 m = -m; 6878 m -= sv >> 2; 6879 sv += m; 6880 } else { 6881 /* 6882 * This follows BSD's implementation. So the reinitialized 6883 * RTO is 3 * m. We cannot go less than 2 because if the 6884 * link is bandwidth dominated, doubling the window size 6885 * during slow start means doubling the RTT. We want to be 6886 * more conservative when we reinitialize our estimates. 3 6887 * is just a convenient number. 6888 */ 6889 sa = m << 3; 6890 sv = m << 1; 6891 } 6892 if (sv < TCP_SD_MIN) { 6893 /* 6894 * We do not know whether sa captures the delayed-ACK 6895 * effect, since in a long train of segments a receiver 6896 * does not delay its ACKs.
So set the minimum of sv 6897 * to TCP_SD_MIN, which defaults to 400 ms, twice 6898 * the BSD delayed ACK timeout (DATO). That means the minimum 6899 * mean deviation is 100 ms. 6900 * 6901 */ 6902 sv = TCP_SD_MIN; 6903 } 6904 tcp->tcp_rtt_sa = sa; 6905 tcp->tcp_rtt_sd = sv; 6906 /* 6907 * RTO = average estimate (sa / 8) + 4 * deviation estimate (sv) 6908 * 6909 * Add tcp_rexmit_interval_extra in case of an extreme environment 6910 * where the algorithm fails to work. The default value of 6911 * tcp_rexmit_interval_extra should be 0. 6912 * 6913 * As we use a finer-grained clock than BSD and update the 6914 * RTO for every ACK, add in another .25 of RTT to the 6915 * deviation of the RTO to accommodate burstiness of 1/4 of 6916 * the window size. For example, with sa = 8000 and sv = 400 (an average RTT of 1000 ms and a mean deviation of 100 ms), this works out to 1000 + 400 + 250 = 1650 ms plus tcp_rexmit_interval_extra. 6917 */ 6918 rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5); 6919 6920 if (rto > tcp_rexmit_interval_max) { 6921 tcp->tcp_rto = tcp_rexmit_interval_max; 6922 } else if (rto < tcp_rexmit_interval_min) { 6923 tcp->tcp_rto = tcp_rexmit_interval_min; 6924 } else { 6925 tcp->tcp_rto = rto; 6926 } 6927 6928 /* Now, we can reset tcp_timer_backoff to use the new RTO... */ 6929 tcp->tcp_timer_backoff = 0; 6930 } 6931 6932 /* 6933 * Initiate the closedown sequence on an active connection. 6934 * Return value zero for OK return, non-zero for error return. 6935 */ 6936 static int 6937 tcp_xmit_end(tcp_t *tcp, int sock_id) 6938 { 6939 mblk_t *mp; 6940 6941 if (tcp->tcp_state < TCPS_SYN_RCVD || 6942 tcp->tcp_state > TCPS_CLOSE_WAIT) { 6943 /* 6944 * Invalid state; only TCPS_SYN_RCVD, 6945 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid. 6946 */ 6947 return (-1); 6948 } 6949 6950 tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 6951 tcp->tcp_valid_bits |= TCP_FSS_VALID; 6952 /* 6953 * If there is nothing more unsent, send the FIN now. 6954 * Otherwise, it will go out with the last segment. 6955 */ 6956 if (tcp->tcp_unsent == 0) { 6957 mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 6958 tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 6959 6960 if (mp != NULL) { 6961 /* Dump the packet when debugging. */ 6962 TCP_DUMP_PACKET("tcp_xmit_end", mp); 6963 (void) ipv4_tcp_output(sock_id, mp); 6964 freeb(mp); 6965 } else { 6966 /* 6967 * Couldn't allocate msg. Pretend we got it out. 6968 * Wait for the rexmit timeout. 6969 */ 6970 tcp->tcp_snxt = tcp->tcp_fss + 1; 6971 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 6972 } 6973 6974 /* 6975 * If needed, update tcp_rexmit_nxt as tcp_snxt is 6976 * changed.
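 * (This keeps the retransmit pointer consistent with the advanced
 * tcp_snxt.)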
6977 */ 6978 if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 6979 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 6980 } 6981 } else { 6982 tcp_wput_data(tcp, NULL, B_FALSE); 6983 } 6984 6985 return (0); 6986 } 6987 6988 int 6989 tcp_opt_set(tcp_t *tcp, int level, int option, const void *optval, 6990 socklen_t optlen) 6991 { 6992 switch (level) { 6993 case SOL_SOCKET: { 6994 switch (option) { 6995 case SO_RCVBUF: 6996 if (optlen == sizeof (int)) { 6997 int val = *(int *)optval; 6998 6999 if (val > tcp_max_buf) { 7000 errno = ENOBUFS; 7001 break; 7002 } 7003 /* Silently ignore zero */ 7004 if (val != 0) { 7005 val = MSS_ROUNDUP(val, tcp->tcp_mss); 7006 (void) tcp_rwnd_set(tcp, val); 7007 } 7008 } else { 7009 errno = EINVAL; 7010 } 7011 break; 7012 case SO_SNDBUF: 7013 if (optlen == sizeof (int)) { 7014 tcp->tcp_xmit_hiwater = *(int *)optval; 7015 if (tcp->tcp_xmit_hiwater > tcp_max_buf) 7016 tcp->tcp_xmit_hiwater = tcp_max_buf; 7017 } else { 7018 errno = EINVAL; 7019 } 7020 break; 7021 case SO_LINGER: 7022 if (optlen == sizeof (struct linger)) { 7023 struct linger *lgr = (struct linger *)optval; 7024 7025 if (lgr->l_onoff) { 7026 tcp->tcp_linger = 1; 7027 tcp->tcp_lingertime = lgr->l_linger; 7028 } else { 7029 tcp->tcp_linger = 0; 7030 tcp->tcp_lingertime = 0; 7031 } 7032 } else { 7033 errno = EINVAL; 7034 } 7035 break; 7036 default: 7037 errno = ENOPROTOOPT; 7038 break; 7039 } 7040 break; 7041 } /* case SOL_SOCKET */ 7042 case IPPROTO_TCP: { 7043 switch (option) { 7044 default: 7045 errno = ENOPROTOOPT; 7046 break; 7047 } 7048 break; 7049 } /* case IPPROTO_TCP */ 7050 case IPPROTO_IP: { 7051 switch (option) { 7052 default: 7053 errno = ENOPROTOOPT; 7054 break; 7055 } 7056 break; 7057 } /* case IPPROTO_IP */ 7058 default: 7059 errno = ENOPROTOOPT; 7060 break; 7061 } /* switch (level) */ 7062 7063 if (errno != 0) 7064 return (-1); 7065 else 7066 return (0); 7067 } 7068
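/*
 * Usage sketch (hypothetical; nothing in this file calls it this way):
 * setting the send buffer on a socket's PCB with tcp_opt_set(). The
 * sockets[]/pcb lookup mirrors how the other wrappers here obtain a
 * tcp_t from a sock_id.
 *
 *	tcp_t *tcp = sockets[sock_id].pcb;
 *	int val = 32 * 1024;
 *
 *	if (tcp_opt_set(tcp, SOL_SOCKET, SO_SNDBUF, &val,
 *	    sizeof (val)) < 0)
 *		printf("SO_SNDBUF failed, errno %d\n", errno);
 */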