1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsun.h> 31 #include <sys/strsubr.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/timod.h> 37 #include <sys/ddi.h> 38 #include <sys/sunddi.h> 39 #include <sys/suntpi.h> 40 #include <sys/xti_inet.h> 41 #include <sys/cmn_err.h> 42 #include <sys/debug.h> 43 #include <sys/sdt.h> 44 #include <sys/vtrace.h> 45 #include <sys/kmem.h> 46 #include <sys/ethernet.h> 47 #include <sys/cpuvar.h> 48 #include <sys/dlpi.h> 49 #include <sys/pattr.h> 50 #include <sys/policy.h> 51 #include <sys/priv.h> 52 #include <sys/zone.h> 53 #include <sys/sunldi.h> 54 55 #include <sys/errno.h> 56 #include <sys/signal.h> 57 #include <sys/socket.h> 58 #include <sys/socketvar.h> 59 #include <sys/sockio.h> 60 #include <sys/isa_defs.h> 61 #include <sys/md5.h> 62 #include <sys/random.h> 63 #include <sys/uio.h> 64 #include <sys/systm.h> 65 #include <netinet/in.h> 66 #include <netinet/tcp.h> 67 #include <netinet/ip6.h> 68 #include <netinet/icmp6.h> 69 #include <net/if.h> 70 #include <net/route.h> 71 #include <inet/ipsec_impl.h> 72 73 #include <inet/common.h> 74 #include <inet/ip.h> 75 #include <inet/ip_impl.h> 76 #include <inet/ip6.h> 77 #include <inet/ip_ndp.h> 78 #include <inet/proto_set.h> 79 #include <inet/mib2.h> 80 #include <inet/nd.h> 81 #include <inet/optcom.h> 82 #include <inet/snmpcom.h> 83 #include <inet/kstatcom.h> 84 #include <inet/tcp.h> 85 #include <inet/tcp_impl.h> 86 #include <inet/udp_impl.h> 87 #include <net/pfkeyv2.h> 88 #include <inet/ipdrop.h> 89 90 #include <inet/ipclassifier.h> 91 #include <inet/ip_ire.h> 92 #include <inet/ip_ftable.h> 93 #include <inet/ip_if.h> 94 #include <inet/ipp_common.h> 95 #include <inet/ip_rts.h> 96 #include <inet/ip_netinfo.h> 97 #include <sys/squeue_impl.h> 98 #include <sys/squeue.h> 99 #include <inet/kssl/ksslapi.h> 100 #include <sys/tsol/label.h> 101 #include <sys/tsol/tnet.h> 102 #include <rpc/pmap_prot.h> 103 #include <sys/callo.h> 104 105 #include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ 106 107 /* 108 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433) 109 * 110 * (Read the detailed design doc in PSARC case directory) 111 * 112 * The entire tcp state is contained in tcp_t and conn_t structure 113 * which are allocated in tandem using ipcl_conn_create() and passing 114 * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect 115 * the references on the tcp_t. 
The tcp_t structure is never compressed 116 * and packets always land on the correct TCP perimeter from the time 117 * eager is created till the time tcp_t dies (as such the old mentat 118 * TCP global queue is not used for detached state and no IPSEC checking 119 * is required). The global queue is still allocated to send out resets 120 * for connection which have no listeners and IP directly calls 121 * tcp_xmit_listeners_reset() which does any policy check. 122 * 123 * Protection and Synchronisation mechanism: 124 * 125 * The tcp data structure does not use any kind of lock for protecting 126 * its state but instead uses 'squeues' for mutual exclusion from various 127 * read and write side threads. To access a tcp member, the thread should 128 * always be behind squeue (via squeue_enter with flags as SQ_FILL, SQ_PROCESS, 129 * or SQ_NODRAIN). Since the squeues allow a direct function call, caller 130 * can pass any tcp function having prototype of edesc_t as argument 131 * (different from traditional STREAMs model where packets come in only 132 * designated entry points). The list of functions that can be directly 133 * called via squeue are listed before the usual function prototype. 134 * 135 * Referencing: 136 * 137 * TCP is MT-Hot and we use a reference based scheme to make sure that the 138 * tcp structure doesn't disappear when its needed. When the application 139 * creates an outgoing connection or accepts an incoming connection, we 140 * start out with 2 references on 'conn_ref'. One for TCP and one for IP. 141 * The IP reference is just a symbolic reference since ip_tcpclose() 142 * looks at tcp structure after tcp_close_output() returns which could 143 * have dropped the last TCP reference. So as long as the connection is 144 * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the 145 * conn_t. The classifier puts its own reference when the connection is 146 * inserted in listen or connected hash. Anytime a thread needs to enter 147 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr 148 * on write side or by doing a classify on read side and then puts a 149 * reference on the conn before doing squeue_enter/tryenter/fill. For 150 * read side, the classifier itself puts the reference under fanout lock 151 * to make sure that tcp can't disappear before it gets processed. The 152 * squeue will drop this reference automatically so the called function 153 * doesn't have to do a DEC_REF. 154 * 155 * Opening a new connection: 156 * 157 * The outgoing connection open is pretty simple. tcp_open() does the 158 * work in creating the conn/tcp structure and initializing it. The 159 * squeue assignment is done based on the CPU the application 160 * is running on. So for outbound connections, processing is always done 161 * on application CPU which might be different from the incoming CPU 162 * being interrupted by the NIC. An optimal way would be to figure out 163 * the NIC <-> CPU binding at listen time, and assign the outgoing 164 * connection to the squeue attached to the CPU that will be interrupted 165 * for incoming packets (we know the NIC based on the bind IP address). 166 * This might seem like a problem if more data is going out but the 167 * fact is that in most cases the transmit is ACK driven transmit where 168 * the outgoing data normally sits on TCP's xmit queue waiting to be 169 * transmitted. 
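 *
 * As an illustration of the referencing and squeue hand-off described
 * above (a sketch only; the helper macros named here are the usual
 * conn/squeue helpers and are assumed, not quoted from this file):
 *
 *	conn_t *connp = Q_TO_CONN(q);		(write side, from q->q_ptr)
 *	CONN_INC_REF(connp);
 *	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, NULL,
 *	    SQ_FILL, SQTAG_TCP_OUTPUT);
 *
 * The squeue drops that reference once tcp_output() has run, so the called
 * function does not need an explicit CONN_DEC_REF().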
 *
 * Accepting a connection:
 *
 * This is a more interesting case because of the various races involved in
 * establishing an eager in its own perimeter. Read the meta comment on
 * top of tcp_input_listener(). But briefly, the squeue is picked by
 * ip_fanout based on the ring or the sender (if loopback).
 *
 * Closing a connection:
 *
 * The close is fairly straightforward. tcp_close() calls tcp_close_output()
 * via squeue to do the close and mark the tcp as detached if the connection
 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps
 * its reference but tcp_close() always drops IP's reference. So if the tcp
 * was not killed, it is sitting in the time_wait list with 2 references - 1
 * for TCP and 1 because it is in the classifier's connected hash. This is
 * the condition we use to determine that it's OK to clean up the tcp outside
 * of the squeue when time wait expires (check the ref under fanout and
 * conn_lock and if it is 2, remove it from the fanout hash and kill it).
 *
 * Although close just drops the necessary references and marks the
 * tcp_detached state, tcp_close needs to know that tcp_detached has been
 * set (under squeue) before letting the STREAM go away (because an
 * inbound packet might attempt to go up the STREAM while the close
 * has happened and tcp_detached is not set). So a special lock and
 * flag is used along with a condition variable (tcp_closelock, tcp_closed,
 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
 * tcp_detached.
 *
 * Special provisions and fast paths:
 *
 * We make special provisions for sockfs by marking tcp_issocket
 * whenever we have only sockfs on top of TCP. This allows us to skip
 * putting the tcp in the acceptor hash since a sockfs listener can never
 * become an acceptor, and also to avoid allocating a tcp_t for the acceptor
 * STREAM since the eager has already been allocated and the accept now
 * happens on the acceptor STREAM. There is a big blob of comment on top of
 * tcp_input_listener explaining the new accept. When the socket is POP'd,
 * sockfs sends us an ioctl to mark the fact and we go back to the old
 * behaviour. Once tcp_issocket is unset, it is never set again for the
 * life of that connection.
 *
 * IPsec notes:
 *
 * Since a packet is always executed on the correct TCP perimeter,
 * all IPsec processing is deferred to IP, including checking new
 * connections and setting IPsec policies for new connections. The
 * only exception is tcp_xmit_listeners_reset(), which is called
 * directly from IP and needs to do a policy check to see if a TH_RST
 * can be sent out.
 */

/*
 * Values for squeue switch:
 * 1: SQ_NODRAIN
 * 2: SQ_PROCESS
 * 3: SQ_FILL
 */
int tcp_squeue_wput = 2;	/* /etc/system */
int tcp_squeue_flag;

/*
 * This controls how tiny a write must be before we try to copy it
 * into the mblk on the tail of the transmit queue.  Not much
 * speedup is observed for values larger than sixteen.  Zero will
 * disable the optimisation.
 */
int tcp_tx_pull_len = 16;

/*
 * TCP Statistics.
 *
 * How TCP statistics work.
 *
 * There are two types of statistics invoked by two macros.
 *
 * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
 * supposed to be used in non MT-hot paths of the code.
 *
 * TCP_DBGSTAT(name) does atomic increment of a named stat counter.
It is 250 * supposed to be used for DEBUG purposes and may be used on a hot path. 251 * 252 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat 253 * (use "kstat tcp" to get them). 254 * 255 * There is also additional debugging facility that marks tcp_clean_death() 256 * instances and saves them in tcp_t structure. It is triggered by 257 * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for 258 * tcp_clean_death() calls that counts the number of times each tag was hit. It 259 * is triggered by TCP_CLD_COUNTERS define. 260 * 261 * How to add new counters. 262 * 263 * 1) Add a field in the tcp_stat structure describing your counter. 264 * 2) Add a line in the template in tcp_kstat2_init() with the name 265 * of the counter. 266 * 267 * IMPORTANT!! - make sure that both are in sync !! 268 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name. 269 * 270 * Please avoid using private counters which are not kstat-exported. 271 * 272 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances 273 * in tcp_t structure. 274 * 275 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags. 276 */ 277 278 #ifndef TCP_DEBUG_COUNTER 279 #ifdef DEBUG 280 #define TCP_DEBUG_COUNTER 1 281 #else 282 #define TCP_DEBUG_COUNTER 0 283 #endif 284 #endif 285 286 #define TCP_CLD_COUNTERS 0 287 288 #define TCP_TAG_CLEAN_DEATH 1 289 #define TCP_MAX_CLEAN_DEATH_TAG 32 290 291 #ifdef lint 292 static int _lint_dummy_; 293 #endif 294 295 #if TCP_CLD_COUNTERS 296 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; 297 #define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++ 298 #elif defined(lint) 299 #define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0); 300 #else 301 #define TCP_CLD_STAT(x) 302 #endif 303 304 #if TCP_DEBUG_COUNTER 305 #define TCP_DBGSTAT(tcps, x) \ 306 atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1) 307 #define TCP_G_DBGSTAT(x) \ 308 atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1) 309 #elif defined(lint) 310 #define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0); 311 #define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0); 312 #else 313 #define TCP_DBGSTAT(tcps, x) 314 #define TCP_G_DBGSTAT(x) 315 #endif 316 317 #define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++) 318 319 tcp_g_stat_t tcp_g_statistics; 320 kstat_t *tcp_g_kstat; 321 322 /* Macros for timestamp comparisons */ 323 #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 324 #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 325 326 /* 327 * Parameters for TCP Initial Send Sequence number (ISS) generation. When 328 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated 329 * by adding three components: a time component which grows by 1 every 4096 330 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); 331 * a per-connection component which grows by 125000 for every new connection; 332 * and an "extra" component that grows by a random amount centered 333 * approximately on 64000. This causes the ISS generator to cycle every 334 * 4.89 hours if no TCP connections are made, and faster if connections are 335 * made. 336 * 337 * When tcp_strong_iss is set to 0, ISS is calculated by adding two 338 * components: a time component which grows by 250000 every second; and 339 * a per-connection component which grows by 125000 for every new connections. 340 * 341 * A third method, when tcp_strong_iss is set to 2, for generating ISS is 342 * prescribed by Steve Bellovin. 
This involves adding time, the 125000 per 343 * connection, and a one-way hash (MD5) of the connection ID <sport, dport, 344 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered 345 * password. 346 */ 347 #define ISS_INCR 250000 348 #define ISS_NSEC_SHT 12 349 350 static sin_t sin_null; /* Zero address for quick clears */ 351 static sin6_t sin6_null; /* Zero address for quick clears */ 352 353 /* 354 * This implementation follows the 4.3BSD interpretation of the urgent 355 * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause 356 * incompatible changes in protocols like telnet and rlogin. 357 */ 358 #define TCP_OLD_URP_INTERPRETATION 1 359 360 /* 361 * Since tcp_listener is not cleared atomically with tcp_detached 362 * being cleared we need this extra bit to tell a detached connection 363 * apart from one that is in the process of being accepted. 364 */ 365 #define TCP_IS_DETACHED_NONEAGER(tcp) \ 366 (TCP_IS_DETACHED(tcp) && \ 367 (!(tcp)->tcp_hard_binding)) 368 369 /* 370 * TCP reassembly macros. We hide starting and ending sequence numbers in 371 * b_next and b_prev of messages on the reassembly queue. The messages are 372 * chained using b_cont. These macros are used in tcp_reass() so we don't 373 * have to see the ugly casts and assignments. 374 */ 375 #define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next)) 376 #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \ 377 (mblk_t *)(uintptr_t)(u)) 378 #define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev)) 379 #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \ 380 (mblk_t *)(uintptr_t)(u)) 381 382 /* 383 * Implementation of TCP Timers. 384 * ============================= 385 * 386 * INTERFACE: 387 * 388 * There are two basic functions dealing with tcp timers: 389 * 390 * timeout_id_t tcp_timeout(connp, func, time) 391 * clock_t tcp_timeout_cancel(connp, timeout_id) 392 * TCP_TIMER_RESTART(tcp, intvl) 393 * 394 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' 395 * after 'time' ticks passed. The function called by timeout() must adhere to 396 * the same restrictions as a driver soft interrupt handler - it must not sleep 397 * or call other functions that might sleep. The value returned is the opaque 398 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to 399 * cancel the request. The call to tcp_timeout() may fail in which case it 400 * returns zero. This is different from the timeout(9F) function which never 401 * fails. 402 * 403 * The call-back function 'func' always receives 'connp' as its single 404 * argument. It is always executed in the squeue corresponding to the tcp 405 * structure. The tcp structure is guaranteed to be present at the time the 406 * call-back is called. 407 * 408 * NOTE: The call-back function 'func' is never called if tcp is in 409 * the TCPS_CLOSED state. 410 * 411 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() 412 * request. locks acquired by the call-back routine should not be held across 413 * the call to tcp_timeout_cancel() or a deadlock may result. 414 * 415 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request. 416 * Otherwise, it returns an integer value greater than or equal to 0. In 417 * particular, if the call-back function is already placed on the squeue, it can 418 * not be canceled. 419 * 420 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called 421 * within squeue context corresponding to the tcp instance. 
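 *
 * A purely illustrative sketch of the interface described above (the
 * call-back 'func' and the 500 ms interval are made up for the example):
 *
 *	timeout_id_t tid;
 *
 *	tid = tcp_timeout(connp, func, MSEC_TO_TICK(500));
 *	if (tid == 0)
 *		... allocation failed, no timer is pending ...
 *	...
 *	if (tid != 0 && tcp_timeout_cancel(connp, tid) >= 0)
 *		tid = 0;	(the call-back will not run)
 *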
Since the 422 * call-back is also called via the same squeue, there are no race 423 * conditions described in untimeout(9F) manual page since all calls are 424 * strictly serialized. 425 * 426 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout 427 * stored in tcp_timer_tid and starts a new one using 428 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back 429 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid 430 * field. 431 * 432 * NOTE: since the timeout cancellation is not guaranteed, the cancelled 433 * call-back may still be called, so it is possible tcp_timer() will be 434 * called several times. This should not be a problem since tcp_timer() 435 * should always check the tcp instance state. 436 * 437 * 438 * IMPLEMENTATION: 439 * 440 * TCP timers are implemented using three-stage process. The call to 441 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function 442 * when the timer expires. The tcp_timer_callback() arranges the call of the 443 * tcp_timer_handler() function via squeue corresponding to the tcp 444 * instance. The tcp_timer_handler() calls actual requested timeout call-back 445 * and passes tcp instance as an argument to it. Information is passed between 446 * stages using the tcp_timer_t structure which contains the connp pointer, the 447 * tcp call-back to call and the timeout id returned by the timeout(9F). 448 * 449 * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t - 450 * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo 451 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout() 452 * returns the pointer to this mblk. 453 * 454 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It 455 * looks like a normal mblk without actual dblk attached to it. 456 * 457 * To optimize performance each tcp instance holds a small cache of timer 458 * mblocks. In the current implementation it caches up to two timer mblocks per 459 * tcp instance. The cache is preserved over tcp frees and is only freed when 460 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp 461 * timer processing happens on a corresponding squeue, the cache manipulation 462 * does not require any locks. Experiments show that majority of timer mblocks 463 * allocations are satisfied from the tcp cache and do not involve kmem calls. 464 * 465 * The tcp_timeout() places a refhold on the connp instance which guarantees 466 * that it will be present at the time the call-back function fires. The 467 * tcp_timer_handler() drops the reference after calling the call-back, so the 468 * call-back function does not need to manipulate the references explicitly. 469 */ 470 471 typedef struct tcp_timer_s { 472 conn_t *connp; 473 void (*tcpt_proc)(void *); 474 callout_id_t tcpt_tid; 475 } tcp_timer_t; 476 477 static kmem_cache_t *tcp_timercache; 478 kmem_cache_t *tcp_sack_info_cache; 479 480 /* 481 * For scalability, we must not run a timer for every TCP connection 482 * in TIME_WAIT state. To see why, consider (for time wait interval of 483 * 4 minutes): 484 * 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's 485 * 486 * This list is ordered by time, so you need only delete from the head 487 * until you get to entries which aren't old enough to delete yet. 488 * The list consists of only the detached TIME_WAIT connections. 
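 *
 * As a purely illustrative sketch of that head-deletion idea (the variable
 * names are assumed, not quoted from the actual collector code):
 *
 *	mutex_enter(&tw->tcp_time_wait_lock);
 *	while ((tcp = tw->tcp_time_wait_head) != NULL &&
 *	    (ddi_get_lbolt() - tcp->tcp_time_wait_expire) >= 0) {
 *		(void) tcp_time_wait_remove(tcp, tw);
 *		... reap the now-expired tcp ...
 *	}
 *	mutex_exit(&tw->tcp_time_wait_lock);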
 *
 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
 * becomes detached TIME_WAIT (either by changing the state and already
 * being detached or the other way around). This means that the TIME_WAIT
 * state can be extended (up to doubled) if the connection doesn't become
 * detached for a long time.
 *
 * The list manipulations (including tcp_time_wait_next/prev)
 * are protected by the tcp_time_wait_lock. The content of the
 * detached TIME_WAIT connections is protected by the normal perimeters.
 *
 * This list is per squeue and squeues are shared across the tcp_stack_t's.
 * Things on tcp_time_wait_head remain associated with the tcp_stack_t
 * and conn_netstack.
 * The tcp_t's that are added to tcp_free_list are disassociated and
 * have NULL tcp_tcps and conn_netstack pointers.
 */
typedef struct tcp_squeue_priv_s {
	kmutex_t	tcp_time_wait_lock;
	callout_id_t	tcp_time_wait_tid;
	tcp_t		*tcp_time_wait_head;
	tcp_t		*tcp_time_wait_tail;
	tcp_t		*tcp_free_list;
	uint_t		tcp_free_list_cnt;
} tcp_squeue_priv_t;

/*
 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
 * Running it every 5 seconds seems to give the best results.
 */
#define	TCP_TIME_WAIT_DELAY	drv_usectohz(5000000)

/*
 * To prevent memory hogging, limit the number of entries in tcp_free_list
 * to 1% of available memory / number of cpus.
 */
uint_t tcp_free_list_max_cnt = 0;

#define	TCP_XMIT_LOWATER	4096
#define	TCP_XMIT_HIWATER	49152
#define	TCP_RECV_LOWATER	2048
#define	TCP_RECV_HIWATER	128000

/*
 * PAWS needs a timer for 24 days. This is the number of ticks in 24 days.
 */
#define	PAWS_TIMEOUT	((clock_t)(24*24*60*60*hz))

#define	TIDUSZ	4096	/* transport interface data unit size */

/*
 * Bind hash list size and hash function. It has to be a power of 2 for
 * hashing.
 */
#define	TCP_BIND_FANOUT_SIZE	512
#define	TCP_BIND_HASH(lport)	(ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))

/*
 * Size of acceptor hash list. It has to be a power of 2 for hashing.
 */
#define	TCP_ACCEPTOR_FANOUT_SIZE	256

#ifdef	_ILP32
#define	TCP_ACCEPTOR_HASH(accid)					\
		(((uint_t)(accid) >> 8) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))
#else
#define	TCP_ACCEPTOR_HASH(accid)					\
		((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1))
#endif	/* _ILP32 */

#define	IP_ADDR_CACHE_SIZE	2048
#define	IP_ADDR_CACHE_HASH(faddr)					\
	(ntohl(faddr) & (IP_ADDR_CACHE_SIZE - 1))

/*
 * If there is a limit set on the number of connections allowed per
 * listener, the following struct is used to store that counter. This needs
 * to be separated from the listener since the listener can go away before
 * all the connections are gone. When the struct is allocated, tlc_cnt is set
 * to 1. When the listener goes away, tlc_cnt is decremented by one. And
 * the last connection (or the listener) which decrements tlc_cnt to zero
 * frees the struct.
 *
 * tlc_max is the threshold value tcps_conn_listen_port. It is set when the
 * tcp_listen_cnt_t is allocated.
 *
 * tlc_report_time stores the time when cmn_err() is called to report that the
 * max has been exceeded. The report is done at most once every
 * TCP_TLC_REPORT_INTERVAL minutes for a listener.
 *
 * tlc_drop stores the number of connection attempts dropped because the
 * limit has been reached.
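 *
 * For example (illustrative): a listener that currently has three live
 * connections counted against it has tlc_cnt == 4 (one for the listener
 * itself plus one per connection). Closing the listener drops the count to
 * 3, and the last of those three connections to go away decrements it to 0
 * and frees the structure.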
581 */ 582 typedef struct tcp_listen_cnt_s { 583 uint32_t tlc_max; 584 uint32_t tlc_cnt; 585 int64_t tlc_report_time; 586 uint32_t tlc_drop; 587 } tcp_listen_cnt_t; 588 589 #define TCP_TLC_REPORT_INTERVAL (1 * MINUTES) 590 591 #define TCP_DECR_LISTEN_CNT(tcp) \ 592 { \ 593 ASSERT((tcp)->tcp_listen_cnt->tlc_cnt > 0); \ 594 if (atomic_add_32_nv(&(tcp)->tcp_listen_cnt->tlc_cnt, -1) == 0) \ 595 kmem_free((tcp)->tcp_listen_cnt, sizeof (tcp_listen_cnt_t)); \ 596 (tcp)->tcp_listen_cnt = NULL; \ 597 } 598 599 /* Minimum number of connections per listener. */ 600 uint32_t tcp_min_conn_listener = 2; 601 602 /* 603 * Linked list struct to store listener connection limit configuration per 604 * IP stack. 605 */ 606 typedef struct tcp_listener_s { 607 in_port_t tl_port; 608 uint32_t tl_ratio; 609 list_node_t tl_link; 610 } tcp_listener_t; 611 612 /* 613 * The shift factor applied to tcp_mss to decide if the peer sends us a 614 * valid initial receive window. By default, if the peer receive window 615 * is smaller than 1 MSS (shift factor is 0), it is considered as invalid. 616 */ 617 uint32_t tcp_init_wnd_shft = 0; 618 619 /* 620 * When the system is under memory pressure, stack variable tcps_reclaim is 621 * true, we shorten the connection timeout abort interval to tcp_early_abort 622 * seconds. 623 */ 624 uint32_t tcp_early_abort = 30; 625 626 /* 627 * TCP options struct returned from tcp_parse_options. 628 */ 629 typedef struct tcp_opt_s { 630 uint32_t tcp_opt_mss; 631 uint32_t tcp_opt_wscale; 632 uint32_t tcp_opt_ts_val; 633 uint32_t tcp_opt_ts_ecr; 634 tcp_t *tcp; 635 } tcp_opt_t; 636 637 /* 638 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing 639 */ 640 641 #ifdef _BIG_ENDIAN 642 #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 643 (TCPOPT_TSTAMP << 8) | 10) 644 #else 645 #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ 646 (TCPOPT_NOP << 8) | TCPOPT_NOP) 647 #endif 648 649 /* 650 * Flags returned from tcp_parse_options. 651 */ 652 #define TCP_OPT_MSS_PRESENT 1 653 #define TCP_OPT_WSCALE_PRESENT 2 654 #define TCP_OPT_TSTAMP_PRESENT 4 655 #define TCP_OPT_SACK_OK_PRESENT 8 656 #define TCP_OPT_SACK_PRESENT 16 657 658 /* TCP option length */ 659 #define TCPOPT_NOP_LEN 1 660 #define TCPOPT_MAXSEG_LEN 4 661 #define TCPOPT_WS_LEN 3 662 #define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1) 663 #define TCPOPT_TSTAMP_LEN 10 664 #define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2) 665 #define TCPOPT_SACK_OK_LEN 2 666 #define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2) 667 #define TCPOPT_REAL_SACK_LEN 4 668 #define TCPOPT_MAX_SACK_LEN 36 669 #define TCPOPT_HEADER_LEN 2 670 671 /* TCP cwnd burst factor. */ 672 #define TCP_CWND_INFINITE 65535 673 #define TCP_CWND_SS 3 674 #define TCP_CWND_NORMAL 5 675 676 /* Maximum TCP initial cwin (start/restart). */ 677 #define TCP_MAX_INIT_CWND 8 678 679 /* 680 * Initialize cwnd according to RFC 3390. def_max_init_cwnd is 681 * either tcp_slow_start_initial or tcp_slow_start_after idle 682 * depending on the caller. If the upper layer has not used the 683 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd 684 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd. 685 * If the upper layer has changed set the tcp_init_cwnd, just use 686 * it to calculate the tcp_cwnd. 
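 *
 * For example (illustrative, assuming def_max_init_cwnd is 4): with an MSS
 * of 1460 the formula below reduces to MIN(4 * 1460, MAX(2 * 1460, 4380)),
 * i.e. an initial cwnd of 4380 bytes (three segments); with an MSS of 536
 * it yields 2144 bytes (four segments).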
687 */ 688 #define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \ 689 { \ 690 if ((tcp)->tcp_init_cwnd == 0) { \ 691 (tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \ 692 MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \ 693 } else { \ 694 (tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \ 695 } \ 696 tcp->tcp_cwnd_cnt = 0; \ 697 } 698 699 /* TCP Timer control structure */ 700 typedef struct tcpt_s { 701 pfv_t tcpt_pfv; /* The routine we are to call */ 702 tcp_t *tcpt_tcp; /* The parameter we are to pass in */ 703 } tcpt_t; 704 705 /* 706 * Functions called directly via squeue having a prototype of edesc_t. 707 */ 708 void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, 709 ip_recv_attr_t *ira); 710 static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, 711 ip_recv_attr_t *dummy); 712 void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, 713 ip_recv_attr_t *dummy); 714 static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, 715 ip_recv_attr_t *dummy); 716 static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, 717 ip_recv_attr_t *dummy); 718 void tcp_input_data(void *arg, mblk_t *mp, void *arg2, 719 ip_recv_attr_t *ira); 720 static void tcp_close_output(void *arg, mblk_t *mp, void *arg2, 721 ip_recv_attr_t *dummy); 722 void tcp_output(void *arg, mblk_t *mp, void *arg2, 723 ip_recv_attr_t *dummy); 724 void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, 725 ip_recv_attr_t *dummy); 726 static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, 727 ip_recv_attr_t *dummy); 728 static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, 729 ip_recv_attr_t *dummy); 730 static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, 731 ip_recv_attr_t *dummy); 732 static void tcp_send_synack(void *arg, mblk_t *mp, void *arg2, 733 ip_recv_attr_t *dummy); 734 735 736 /* Prototype for TCP functions */ 737 static void tcp_random_init(void); 738 int tcp_random(void); 739 static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp); 740 static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, 741 tcp_t *eager); 742 static int tcp_set_destination(tcp_t *tcp); 743 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 744 int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only, 745 boolean_t user_specified); 746 static void tcp_closei_local(tcp_t *tcp); 747 static void tcp_close_detached(tcp_t *tcp); 748 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, 749 mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira); 750 static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp); 751 static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, 752 in_port_t dstport, uint_t srcid); 753 static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, 754 in_port_t dstport, uint32_t flowinfo, 755 uint_t srcid, uint32_t scope_id); 756 static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); 757 static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); 758 static char *tcp_display(tcp_t *tcp, char *, char); 759 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum); 760 static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only); 761 static void tcp_eager_unlink(tcp_t *tcp); 762 static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr, 763 int unixerr); 764 static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 765 int tlierr, int unixerr); 766 static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, 767 cred_t *cr); 768 static int tcp_extra_priv_ports_add(queue_t *q, mblk_t 
*mp, 769 char *value, caddr_t cp, cred_t *cr); 770 static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, 771 char *value, caddr_t cp, cred_t *cr); 772 static int tcp_tpistate(tcp_t *tcp); 773 static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp, 774 int caller_holds_lock); 775 static void tcp_bind_hash_remove(tcp_t *tcp); 776 static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *); 777 void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp); 778 static void tcp_acceptor_hash_remove(tcp_t *tcp); 779 static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); 780 static void tcp_info_req(tcp_t *tcp, mblk_t *mp); 781 static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); 782 static void tcp_init_values(tcp_t *tcp); 783 static void tcp_ip_notify(tcp_t *tcp); 784 static void tcp_iss_init(tcp_t *tcp); 785 static void tcp_keepalive_killer(void *arg); 786 static int tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt); 787 static void tcp_mss_set(tcp_t *tcp, uint32_t size); 788 static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, 789 int *do_disconnectp, int *t_errorp, int *sys_errorp); 790 static boolean_t tcp_allow_connopt_set(int level, int name); 791 int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); 792 static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); 793 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, 794 tcp_stack_t *); 795 static int tcp_param_set(queue_t *q, mblk_t *mp, char *value, 796 caddr_t cp, cred_t *cr); 797 static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, 798 caddr_t cp, cred_t *cr); 799 static void tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *); 800 static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, 801 caddr_t cp, cred_t *cr); 802 static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt); 803 static void tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt); 804 static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start); 805 static void tcp_reass_timer(void *arg); 806 static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp); 807 static void tcp_reinit(tcp_t *tcp); 808 static void tcp_reinit_values(tcp_t *tcp); 809 810 static uint_t tcp_rwnd_reopen(tcp_t *tcp); 811 static uint_t tcp_rcv_drain(tcp_t *tcp); 812 static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); 813 static boolean_t tcp_send_rst_chk(tcp_stack_t *); 814 static void tcp_ss_rexmit(tcp_t *tcp); 815 static mblk_t *tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, 816 ip_recv_attr_t *); 817 static void tcp_process_options(tcp_t *, tcpha_t *); 818 static void tcp_rsrv(queue_t *q); 819 static int tcp_snmp_state(tcp_t *tcp); 820 static void tcp_timer(void *arg); 821 static void tcp_timer_callback(void *); 822 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp, 823 boolean_t random); 824 static in_port_t tcp_get_next_priv_port(const tcp_t *); 825 static void tcp_wput_sock(queue_t *q, mblk_t *mp); 826 static void tcp_wput_fallback(queue_t *q, mblk_t *mp); 827 void tcp_tpi_accept(queue_t *q, mblk_t *mp); 828 static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); 829 static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); 830 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); 831 static int tcp_send(tcp_t *tcp, const int mss, 832 const int total_hdr_len, const int tcp_hdr_len, 833 const int num_sack_blk, int *usable, uint_t *snxt, 834 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time); 835 static void tcp_fill_header(tcp_t 
*tcp, uchar_t *rptr, clock_t now, 836 int num_sack_blk); 837 static void tcp_wsrv(queue_t *q); 838 static int tcp_xmit_end(tcp_t *tcp); 839 static void tcp_ack_timer(void *arg); 840 static mblk_t *tcp_ack_mp(tcp_t *tcp); 841 static void tcp_xmit_early_reset(char *str, mblk_t *mp, 842 uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *, 843 ip_stack_t *, conn_t *); 844 static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, 845 uint32_t ack, int ctl); 846 static void tcp_set_rto(tcp_t *, time_t); 847 static void tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); 848 static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *); 849 static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *, 850 ip_recv_attr_t *); 851 static int tcp_build_hdrs(tcp_t *); 852 static void tcp_time_wait_append(tcp_t *tcp); 853 static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, 854 uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha, 855 ip_recv_attr_t *ira); 856 boolean_t tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp); 857 static boolean_t tcp_zcopy_check(tcp_t *); 858 static void tcp_zcopy_notify(tcp_t *); 859 static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); 860 static void tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa); 861 static void tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only); 862 static void tcp_update_zcopy(tcp_t *tcp); 863 static void tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t, 864 ixa_notify_arg_t); 865 static void tcp_rexmit_after_error(tcp_t *tcp); 866 static void tcp_send_data(tcp_t *, mblk_t *); 867 extern mblk_t *tcp_timermp_alloc(int); 868 extern void tcp_timermp_free(tcp_t *); 869 static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); 870 static void tcp_stop_lingering(tcp_t *tcp); 871 static void tcp_close_linger_timeout(void *arg); 872 static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns); 873 static void tcp_stack_fini(netstackid_t stackid, void *arg); 874 static void *tcp_g_kstat_init(tcp_g_stat_t *); 875 static void tcp_g_kstat_fini(kstat_t *); 876 static void *tcp_kstat_init(netstackid_t, tcp_stack_t *); 877 static void tcp_kstat_fini(netstackid_t, kstat_t *); 878 static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *); 879 static void tcp_kstat2_fini(netstackid_t, kstat_t *); 880 static int tcp_kstat_update(kstat_t *kp, int rw); 881 static mblk_t *tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, 882 ip_recv_attr_t *ira); 883 static mblk_t *tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, 884 ip_recv_attr_t *ira); 885 static int tcp_squeue_switch(int); 886 887 static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t); 888 static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *); 889 static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *); 890 static int tcp_tpi_close(queue_t *, int); 891 static int tcp_tpi_close_accept(queue_t *); 892 893 static void tcp_squeue_add(squeue_t *); 894 static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); 895 896 extern void tcp_kssl_input(tcp_t *, mblk_t *, cred_t *); 897 898 void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy); 899 void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, 900 ip_recv_attr_t *dummy); 901 902 static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, 903 sock_upper_handle_t, cred_t *); 904 static int tcp_listen(sock_lower_handle_t, int, cred_t *); 905 static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, 
cred_t *, 906 boolean_t); 907 static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t, 908 cred_t *, pid_t); 909 static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *, 910 boolean_t); 911 static int tcp_do_unbind(conn_t *); 912 static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *, 913 boolean_t); 914 915 static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *); 916 917 static uint32_t tcp_find_listener_conf(tcp_stack_t *, in_port_t); 918 static int tcp_listener_conf_get(queue_t *, mblk_t *, caddr_t, cred_t *); 919 static int tcp_listener_conf_add(queue_t *, mblk_t *, char *, caddr_t, 920 cred_t *); 921 static int tcp_listener_conf_del(queue_t *, mblk_t *, char *, caddr_t, 922 cred_t *); 923 static void tcp_listener_conf_cleanup(tcp_stack_t *); 924 925 /* 926 * Routines related to the TCP_IOC_ABORT_CONN ioctl command. 927 * 928 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting 929 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure 930 * (defined in tcp.h) needs to be filled in and passed into the kernel 931 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t 932 * structure contains the four-tuple of a TCP connection and a range of TCP 933 * states (specified by ac_start and ac_end). The use of wildcard addresses 934 * and ports is allowed. Connections with a matching four tuple and a state 935 * within the specified range will be aborted. The valid states for the 936 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, 937 * inclusive. 938 * 939 * An application which has its connection aborted by this ioctl will receive 940 * an error that is dependent on the connection state at the time of the abort. 941 * If the connection state is < TCPS_TIME_WAIT, an application should behave as 942 * though a RST packet has been received. If the connection state is equal to 943 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel 944 * and all resources associated with the connection will be freed. 945 */ 946 static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); 947 static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); 948 static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, 949 ip_recv_attr_t *dummy); 950 static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); 951 static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); 952 static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, 953 boolean_t, tcp_stack_t *); 954 955 static struct module_info tcp_rinfo = { 956 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER 957 }; 958 959 static struct module_info tcp_winfo = { 960 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16 961 }; 962 963 /* 964 * Entry points for TCP as a device. The normal case which supports 965 * the TCP functionality. 966 * We have separate open functions for the /dev/tcp and /dev/tcp6 devices. 967 */ 968 struct qinit tcp_rinitv4 = { 969 NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo 970 }; 971 972 struct qinit tcp_rinitv6 = { 973 NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo 974 }; 975 976 struct qinit tcp_winit = { 977 (pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo 978 }; 979 980 /* Initial entry point for TCP in socket mode. 
*/ 981 struct qinit tcp_sock_winit = { 982 (pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo 983 }; 984 985 /* TCP entry point during fallback */ 986 struct qinit tcp_fallback_sock_winit = { 987 (pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo 988 }; 989 990 /* 991 * Entry points for TCP as a acceptor STREAM opened by sockfs when doing 992 * an accept. Avoid allocating data structures since eager has already 993 * been created. 994 */ 995 struct qinit tcp_acceptor_rinit = { 996 NULL, (pfi_t)tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo 997 }; 998 999 struct qinit tcp_acceptor_winit = { 1000 (pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo 1001 }; 1002 1003 /* For AF_INET aka /dev/tcp */ 1004 struct streamtab tcpinfov4 = { 1005 &tcp_rinitv4, &tcp_winit 1006 }; 1007 1008 /* For AF_INET6 aka /dev/tcp6 */ 1009 struct streamtab tcpinfov6 = { 1010 &tcp_rinitv6, &tcp_winit 1011 }; 1012 1013 sock_downcalls_t sock_tcp_downcalls; 1014 1015 /* Setable only in /etc/system. Move to ndd? */ 1016 boolean_t tcp_icmp_source_quench = B_FALSE; 1017 1018 /* 1019 * Following assumes TPI alignment requirements stay along 32 bit 1020 * boundaries 1021 */ 1022 #define ROUNDUP32(x) \ 1023 (((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1)) 1024 1025 /* Template for response to info request. */ 1026 static struct T_info_ack tcp_g_t_info_ack = { 1027 T_INFO_ACK, /* PRIM_type */ 1028 0, /* TSDU_size */ 1029 T_INFINITE, /* ETSDU_size */ 1030 T_INVALID, /* CDATA_size */ 1031 T_INVALID, /* DDATA_size */ 1032 sizeof (sin_t), /* ADDR_size */ 1033 0, /* OPT_size - not initialized here */ 1034 TIDUSZ, /* TIDU_size */ 1035 T_COTS_ORD, /* SERV_type */ 1036 TCPS_IDLE, /* CURRENT_state */ 1037 (XPG4_1|EXPINLINE) /* PROVIDER_flag */ 1038 }; 1039 1040 static struct T_info_ack tcp_g_t_info_ack_v6 = { 1041 T_INFO_ACK, /* PRIM_type */ 1042 0, /* TSDU_size */ 1043 T_INFINITE, /* ETSDU_size */ 1044 T_INVALID, /* CDATA_size */ 1045 T_INVALID, /* DDATA_size */ 1046 sizeof (sin6_t), /* ADDR_size */ 1047 0, /* OPT_size - not initialized here */ 1048 TIDUSZ, /* TIDU_size */ 1049 T_COTS_ORD, /* SERV_type */ 1050 TCPS_IDLE, /* CURRENT_state */ 1051 (XPG4_1|EXPINLINE) /* PROVIDER_flag */ 1052 }; 1053 1054 #define MS 1L 1055 #define SECONDS (1000 * MS) 1056 #define MINUTES (60 * SECONDS) 1057 #define HOURS (60 * MINUTES) 1058 #define DAYS (24 * HOURS) 1059 1060 #define PARAM_MAX (~(uint32_t)0) 1061 1062 /* Max size IP datagram is 64k - 1 */ 1063 #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t))) 1064 #define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t))) 1065 /* Max of the above */ 1066 #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 1067 1068 /* Largest TCP port number */ 1069 #define TCP_MAX_PORT (64 * 1024 - 1) 1070 1071 /* 1072 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link 1073 * layer header. It has to be a multiple of 4. 1074 */ 1075 static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" }; 1076 #define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val 1077 1078 #define MB (1024 * 1024) 1079 1080 /* 1081 * All of these are alterable, within the min/max values given, at run time. 1082 * Note that the default value of "tcp_time_wait_interval" is four minutes, 1083 * per the TCP spec. 
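 *
 * For example (illustrative), since these values are ndd-settable at run
 * time:
 *
 *	ndd -set /dev/tcp tcp_time_wait_interval 60000
 *
 * sets the TIME-WAIT interval to 60 seconds (the value is in milliseconds),
 * subject to the min/max bounds in the table below.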
1084 */ 1085 /* BEGIN CSTYLED */ 1086 static tcpparam_t lcl_tcp_param_arr[] = { 1087 /*min max value name */ 1088 { 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"}, 1089 { 1, PARAM_MAX, 128, "tcp_conn_req_max_q" }, 1090 { 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" }, 1091 { 1, 1024, 1, "tcp_conn_req_min" }, 1092 { 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" }, 1093 { 128, (1<<30), 1*MB, "tcp_cwnd_max" }, 1094 { 0, 10, 0, "tcp_debug" }, 1095 { 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"}, 1096 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"}, 1097 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"}, 1098 { 500*MS, PARAM_MAX, 5*MINUTES, "tcp_ip_abort_interval"}, 1099 { 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"}, 1100 { 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"}, 1101 { 1, 255, 64, "tcp_ipv4_ttl"}, 1102 { 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"}, 1103 { 0, 100, 10, "tcp_maxpsz_multiplier" }, 1104 { 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"}, 1105 { 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"}, 1106 { 1, TCP_MSS_MAX, 108, "tcp_mss_min"}, 1107 { 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"}, 1108 { 1*MS, 20*SECONDS, 1*SECONDS, "tcp_rexmit_interval_initial"}, 1109 { 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"}, 1110 { 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"}, 1111 { 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" }, 1112 { 0, 16, 0, "tcp_snd_lowat_fraction" }, 1113 { 1, 10000, 3, "tcp_dupack_fast_retransmit" }, 1114 { 0, 1, 0, "tcp_ignore_path_mtu" }, 1115 { 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"}, 1116 { 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"}, 1117 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"}, 1118 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"}, 1119 { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"}, 1120 { 1, 65536, 4, "tcp_recv_hiwat_minmss"}, 1121 { 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"}, 1122 { 8192, (1<<30), 1*MB, "tcp_max_buf"}, 1123 /* 1124 * Question: What default value should I set for tcp_strong_iss? 1125 */ 1126 { 0, 2, 1, "tcp_strong_iss"}, 1127 { 0, 65536, 20, "tcp_rtt_updates"}, 1128 { 0, 1, 1, "tcp_wscale_always"}, 1129 { 0, 1, 0, "tcp_tstamp_always"}, 1130 { 0, 1, 1, "tcp_tstamp_if_wscale"}, 1131 { 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"}, 1132 { 0, 16, 2, "tcp_deferred_acks_max"}, 1133 { 1, 16384, 4, "tcp_slow_start_after_idle"}, 1134 { 1, 4, 4, "tcp_slow_start_initial"}, 1135 { 0, 2, 2, "tcp_sack_permitted"}, 1136 { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"}, 1137 { 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"}, 1138 { 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"}, 1139 { 0, 1, 0, "tcp_rev_src_routes"}, 1140 { 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"}, 1141 { 0, 16, 8, "tcp_local_dacks_max"}, 1142 { 0, 2, 1, "tcp_ecn_permitted"}, 1143 { 0, 1, 1, "tcp_rst_sent_rate_enabled"}, 1144 { 0, PARAM_MAX, 40, "tcp_rst_sent_rate"}, 1145 { 0, 100*MS, 50*MS, "tcp_push_timer_interval"}, 1146 { 0, 1, 0, "tcp_use_smss_as_mss_opt"}, 1147 { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"}, 1148 { 0, 1, 0, "tcp_dev_flow_ctl"}, 1149 { 0, PARAM_MAX, 100*SECONDS, "tcp_reass_timeout"} 1150 }; 1151 /* END CSTYLED */ 1152 1153 /* Round up the value to the nearest mss. 
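 * For example (illustrative): MSS_ROUNDUP(4097, 1460) == 3 * 1460 == 4380.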
*/ 1154 #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) 1155 1156 /* 1157 * Set ECN capable transport (ECT) code point in IP header. 1158 * 1159 * Note that there are 2 ECT code points '01' and '10', which are called 1160 * ECT(1) and ECT(0) respectively. Here we follow the original ECT code 1161 * point ECT(0) for TCP as described in RFC 2481. 1162 */ 1163 #define SET_ECT(tcp, iph) \ 1164 if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \ 1165 /* We need to clear the code point first. */ \ 1166 ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ 1167 ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ 1168 } else { \ 1169 ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \ 1170 ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ 1171 } 1172 1173 /* 1174 * The format argument to pass to tcp_display(). 1175 * DISP_PORT_ONLY means that the returned string has only port info. 1176 * DISP_ADDR_AND_PORT means that the returned string also contains the 1177 * remote and local IP address. 1178 */ 1179 #define DISP_PORT_ONLY 1 1180 #define DISP_ADDR_AND_PORT 2 1181 1182 #define IS_VMLOANED_MBLK(mp) \ 1183 (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) 1184 1185 uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ 1186 1187 /* 1188 * Forces all connections to obey the value of the tcps_maxpsz_multiplier 1189 * tunable settable via NDD. Otherwise, the per-connection behavior is 1190 * determined dynamically during tcp_set_destination(), which is the default. 1191 */ 1192 boolean_t tcp_static_maxpsz = B_FALSE; 1193 1194 /* Setable in /etc/system */ 1195 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ 1196 uint32_t tcp_random_anon_port = 1; 1197 1198 /* 1199 * To reach to an eager in Q0 which can be dropped due to an incoming 1200 * new SYN request when Q0 is full, a new doubly linked list is 1201 * introduced. This list allows to select an eager from Q0 in O(1) time. 1202 * This is needed to avoid spending too much time walking through the 1203 * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of 1204 * this new list has to be a member of Q0. 1205 * This list is headed by listener's tcp_t. When the list is empty, 1206 * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0, 1207 * of listener's tcp_t point to listener's tcp_t itself. 1208 * 1209 * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager 1210 * in the list. MAKE_UNDROPPABLE() takes the eager out of the list. 1211 * These macros do not affect the eager's membership to Q0. 
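 *
 * Illustrative usage (the surrounding context is assumed, not quoted from
 * this file): an eager is made droppable when it lands on Q0 and
 * undroppable once it leaves Q0:
 *
 *	MAKE_DROPPABLE(listener, eager);	(eager placed on Q0)
 *	...
 *	MAKE_UNDROPPABLE(eager);		(eager leaving Q0)
 *
 * tcp_drop_q0() can then pick a victim from this list in O(1) time instead
 * of walking the full Q0 list.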
1212 */ 1213 1214 1215 #define MAKE_DROPPABLE(listener, eager) \ 1216 if ((eager)->tcp_eager_next_drop_q0 == NULL) { \ 1217 (listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\ 1218 = (eager); \ 1219 (eager)->tcp_eager_prev_drop_q0 = (listener); \ 1220 (eager)->tcp_eager_next_drop_q0 = \ 1221 (listener)->tcp_eager_next_drop_q0; \ 1222 (listener)->tcp_eager_next_drop_q0 = (eager); \ 1223 } 1224 1225 #define MAKE_UNDROPPABLE(eager) \ 1226 if ((eager)->tcp_eager_next_drop_q0 != NULL) { \ 1227 (eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \ 1228 = (eager)->tcp_eager_prev_drop_q0; \ 1229 (eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \ 1230 = (eager)->tcp_eager_next_drop_q0; \ 1231 (eager)->tcp_eager_prev_drop_q0 = NULL; \ 1232 (eager)->tcp_eager_next_drop_q0 = NULL; \ 1233 } 1234 1235 /* 1236 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more 1237 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent 1238 * data, TCP will not respond with an ACK. RFC 793 requires that 1239 * TCP responds with an ACK for such a bogus ACK. By not following 1240 * the RFC, we prevent TCP from getting into an ACK storm if somehow 1241 * an attacker successfully spoofs an acceptable segment to our 1242 * peer; or when our peer is "confused." 1243 */ 1244 uint32_t tcp_drop_ack_unsent_cnt = 10; 1245 1246 /* 1247 * Hook functions to enable cluster networking 1248 * On non-clustered systems these vectors must always be NULL. 1249 */ 1250 1251 void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol, 1252 sa_family_t addr_family, uint8_t *laddrp, 1253 in_port_t lport, void *args) = NULL; 1254 void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol, 1255 sa_family_t addr_family, uint8_t *laddrp, 1256 in_port_t lport, void *args) = NULL; 1257 1258 int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol, 1259 boolean_t is_outgoing, 1260 sa_family_t addr_family, 1261 uint8_t *laddrp, in_port_t lport, 1262 uint8_t *faddrp, in_port_t fport, 1263 void *args) = NULL; 1264 void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol, 1265 sa_family_t addr_family, uint8_t *laddrp, 1266 in_port_t lport, uint8_t *faddrp, 1267 in_port_t fport, void *args) = NULL; 1268 1269 1270 /* 1271 * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err) 1272 */ 1273 #define CL_INET_CONNECT(connp, is_outgoing, err) { \ 1274 (err) = 0; \ 1275 if (cl_inet_connect2 != NULL) { \ 1276 /* \ 1277 * Running in cluster mode - register active connection \ 1278 * information \ 1279 */ \ 1280 if ((connp)->conn_ipversion == IPV4_VERSION) { \ 1281 if ((connp)->conn_laddr_v4 != 0) { \ 1282 (err) = (*cl_inet_connect2)( \ 1283 (connp)->conn_netstack->netstack_stackid,\ 1284 IPPROTO_TCP, is_outgoing, AF_INET, \ 1285 (uint8_t *)(&((connp)->conn_laddr_v4)),\ 1286 (in_port_t)(connp)->conn_lport, \ 1287 (uint8_t *)(&((connp)->conn_faddr_v4)),\ 1288 (in_port_t)(connp)->conn_fport, NULL); \ 1289 } \ 1290 } else { \ 1291 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1292 &(connp)->conn_laddr_v6)) { \ 1293 (err) = (*cl_inet_connect2)( \ 1294 (connp)->conn_netstack->netstack_stackid,\ 1295 IPPROTO_TCP, is_outgoing, AF_INET6, \ 1296 (uint8_t *)(&((connp)->conn_laddr_v6)),\ 1297 (in_port_t)(connp)->conn_lport, \ 1298 (uint8_t *)(&((connp)->conn_faddr_v6)), \ 1299 (in_port_t)(connp)->conn_fport, NULL); \ 1300 } \ 1301 } \ 1302 } \ 1303 } 1304 1305 #define CL_INET_DISCONNECT(connp) { \ 1306 if (cl_inet_disconnect != NULL) { \ 1307 /* \ 1308 * Running in cluster mode - deregister 
active \ 1309 * connection information \ 1310 */ \ 1311 if ((connp)->conn_ipversion == IPV4_VERSION) { \ 1312 if ((connp)->conn_laddr_v4 != 0) { \ 1313 (*cl_inet_disconnect)( \ 1314 (connp)->conn_netstack->netstack_stackid,\ 1315 IPPROTO_TCP, AF_INET, \ 1316 (uint8_t *)(&((connp)->conn_laddr_v4)),\ 1317 (in_port_t)(connp)->conn_lport, \ 1318 (uint8_t *)(&((connp)->conn_faddr_v4)),\ 1319 (in_port_t)(connp)->conn_fport, NULL); \ 1320 } \ 1321 } else { \ 1322 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1323 &(connp)->conn_laddr_v6)) { \ 1324 (*cl_inet_disconnect)( \ 1325 (connp)->conn_netstack->netstack_stackid,\ 1326 IPPROTO_TCP, AF_INET6, \ 1327 (uint8_t *)(&((connp)->conn_laddr_v6)),\ 1328 (in_port_t)(connp)->conn_lport, \ 1329 (uint8_t *)(&((connp)->conn_faddr_v6)), \ 1330 (in_port_t)(connp)->conn_fport, NULL); \ 1331 } \ 1332 } \ 1333 } \ 1334 } 1335 1336 /* 1337 * Steps to do when a tcp_t moves to TIME-WAIT state. 1338 * 1339 * This connection is done, we don't need to account for it. Decrement 1340 * the listener connection counter if needed. 1341 * 1342 * Unconditionally clear the exclusive binding bit so this TIME-WAIT 1343 * connection won't interfere with new ones. 1344 * 1345 * Start the TIME-WAIT timer. If upper layer has not closed the connection, 1346 * the timer is handled within the context of this tcp_t. When the timer 1347 * fires, tcp_clean_death() is called. If upper layer closes the connection 1348 * during this period, tcp_time_wait_append() will be called to add this 1349 * tcp_t to the global TIME-WAIT list. Note that this means that the 1350 * actual wait time in TIME-WAIT state will be longer than the 1351 * tcps_time_wait_interval since the period before upper layer closes the 1352 * connection is not accounted for when tcp_time_wait_append() is called. 1353 * 1354 * If uppser layer has closed the connection, call tcp_time_wait_append() 1355 * directly. 1356 */ 1357 #define SET_TIME_WAIT(tcps, tcp, connp) \ 1358 { \ 1359 (tcp)->tcp_state = TCPS_TIME_WAIT; \ 1360 if ((tcp)->tcp_listen_cnt != NULL) \ 1361 TCP_DECR_LISTEN_CNT(tcp); \ 1362 (connp)->conn_exclbind = 0; \ 1363 if (!TCP_IS_DETACHED(tcp)) { \ 1364 TCP_TIMER_RESTART(tcp, (tcps)->tcps_time_wait_interval); \ 1365 } else { \ 1366 tcp_time_wait_append(tcp); \ 1367 TCP_DBGSTAT(tcps, tcp_rput_time_wait); \ 1368 } \ 1369 } 1370 1371 /* 1372 * Cluster networking hook for traversing current connection list. 1373 * This routine is used to extract the current list of live connections 1374 * which must continue to to be dispatched to this node. 1375 */ 1376 int cl_tcp_walk_list(netstackid_t stack_id, 1377 int (*callback)(cl_tcp_info_t *, void *), void *arg); 1378 1379 static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), 1380 void *arg, tcp_stack_t *tcps); 1381 1382 static void 1383 tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh) 1384 { 1385 uint32_t default_threshold = SOCKET_RECVHIWATER >> 3; 1386 1387 if (IPCL_IS_NONSTR(tcp->tcp_connp)) { 1388 conn_t *connp = tcp->tcp_connp; 1389 struct sock_proto_props sopp; 1390 1391 /* 1392 * only increase rcvthresh upto default_threshold 1393 */ 1394 if (new_rcvthresh > default_threshold) 1395 new_rcvthresh = default_threshold; 1396 1397 sopp.sopp_flags = SOCKOPT_RCVTHRESH; 1398 sopp.sopp_rcvthresh = new_rcvthresh; 1399 1400 (*connp->conn_upcalls->su_set_proto_props) 1401 (connp->conn_upper_handle, &sopp); 1402 } 1403 } 1404 /* 1405 * Figure out the value of window scale opton. 
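 * For example (illustrative): with a 1 MB receive window the loop below
 * shifts rwnd right until it fits in the 16-bit window field;
 * 1048576 >> 5 == 32768 <= TCP_MAXWIN (65535), so tcp_rcv_ws becomes 5.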
Note that the rwnd is 1406 * ASSUMED to be rounded up to the nearest MSS before the calculation. 1407 * We cannot find the scale value and then do a round up of tcp_rwnd 1408 * because the scale value may not be correct after that. 1409 * 1410 * Set the compiler flag to make this function inline. 1411 */ 1412 static void 1413 tcp_set_ws_value(tcp_t *tcp) 1414 { 1415 int i; 1416 uint32_t rwnd = tcp->tcp_rwnd; 1417 1418 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; 1419 i++, rwnd >>= 1) 1420 ; 1421 tcp->tcp_rcv_ws = i; 1422 } 1423 1424 /* 1425 * Remove a connection from the list of detached TIME_WAIT connections. 1426 * It returns B_FALSE if it can't remove the connection from the list 1427 * as the connection has already been removed from the list due to an 1428 * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. 1429 */ 1430 static boolean_t 1431 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) 1432 { 1433 boolean_t locked = B_FALSE; 1434 1435 if (tcp_time_wait == NULL) { 1436 tcp_time_wait = *((tcp_squeue_priv_t **) 1437 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); 1438 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1439 locked = B_TRUE; 1440 } else { 1441 ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); 1442 } 1443 1444 if (tcp->tcp_time_wait_expire == 0) { 1445 ASSERT(tcp->tcp_time_wait_next == NULL); 1446 ASSERT(tcp->tcp_time_wait_prev == NULL); 1447 if (locked) 1448 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1449 return (B_FALSE); 1450 } 1451 ASSERT(TCP_IS_DETACHED(tcp)); 1452 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1453 1454 if (tcp == tcp_time_wait->tcp_time_wait_head) { 1455 ASSERT(tcp->tcp_time_wait_prev == NULL); 1456 tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; 1457 if (tcp_time_wait->tcp_time_wait_head != NULL) { 1458 tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = 1459 NULL; 1460 } else { 1461 tcp_time_wait->tcp_time_wait_tail = NULL; 1462 } 1463 } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { 1464 ASSERT(tcp != tcp_time_wait->tcp_time_wait_head); 1465 ASSERT(tcp->tcp_time_wait_next == NULL); 1466 tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; 1467 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1468 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; 1469 } else { 1470 ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 1471 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 1472 tcp->tcp_time_wait_prev->tcp_time_wait_next = 1473 tcp->tcp_time_wait_next; 1474 tcp->tcp_time_wait_next->tcp_time_wait_prev = 1475 tcp->tcp_time_wait_prev; 1476 } 1477 tcp->tcp_time_wait_next = NULL; 1478 tcp->tcp_time_wait_prev = NULL; 1479 tcp->tcp_time_wait_expire = 0; 1480 1481 if (locked) 1482 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1483 return (B_TRUE); 1484 } 1485 1486 /* 1487 * Add a connection to the list of detached TIME_WAIT connections 1488 * and set its time to expire. 
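 *
 * A minimal sketch of the two steps performed below, with hypothetical
 * names (the real code also stops timers, asserts the detached
 * TIME_WAIT state and holds the per-squeue tcp_time_wait_lock):
 *
 *	node->expire = ddi_get_lbolt() +
 *	    drv_usectohz(interval_msec * 1000);	/* may wrap, never 0 */
 *	if (node->expire == 0)
 *		node->expire = 1;
 *	if (list->tail == NULL)			/* empty list */
 *		list->head = node;
 *	else {					/* append at the tail */
 *		list->tail->next = node;
 *		node->prev = list->tail;
 *	}
 *	list->tail = node;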
1489 */ 1490 static void 1491 tcp_time_wait_append(tcp_t *tcp) 1492 { 1493 tcp_stack_t *tcps = tcp->tcp_tcps; 1494 tcp_squeue_priv_t *tcp_time_wait = 1495 *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, 1496 SQPRIVATE_TCP)); 1497 1498 tcp_timers_stop(tcp); 1499 1500 /* Freed above */ 1501 ASSERT(tcp->tcp_timer_tid == 0); 1502 ASSERT(tcp->tcp_ack_tid == 0); 1503 1504 /* must have happened at the time of detaching the tcp */ 1505 ASSERT(tcp->tcp_ptpahn == NULL); 1506 ASSERT(tcp->tcp_flow_stopped == 0); 1507 ASSERT(tcp->tcp_time_wait_next == NULL); 1508 ASSERT(tcp->tcp_time_wait_prev == NULL); 1509 ASSERT(tcp->tcp_time_wait_expire == NULL); 1510 ASSERT(tcp->tcp_listener == NULL); 1511 1512 tcp->tcp_time_wait_expire = ddi_get_lbolt(); 1513 /* 1514 * The value computed below in tcp->tcp_time_wait_expire may 1515 * appear negative or wrap around. That is ok since our 1516 * interest is only in the difference between the current lbolt 1517 * value and tcp->tcp_time_wait_expire. But the value should not 1518 * be zero, since it means the tcp is not in the TIME_WAIT list. 1519 * The corresponding comparison in tcp_time_wait_collector() uses 1520 * modular arithmetic. 1521 */ 1522 tcp->tcp_time_wait_expire += 1523 drv_usectohz(tcps->tcps_time_wait_interval * 1000); 1524 if (tcp->tcp_time_wait_expire == 0) 1525 tcp->tcp_time_wait_expire = 1; 1526 1527 ASSERT(TCP_IS_DETACHED(tcp)); 1528 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1529 ASSERT(tcp->tcp_time_wait_next == NULL); 1530 ASSERT(tcp->tcp_time_wait_prev == NULL); 1531 TCP_DBGSTAT(tcps, tcp_time_wait); 1532 1533 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1534 if (tcp_time_wait->tcp_time_wait_head == NULL) { 1535 ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); 1536 tcp_time_wait->tcp_time_wait_head = tcp; 1537 } else { 1538 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1539 ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == 1540 TCPS_TIME_WAIT); 1541 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; 1542 tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; 1543 } 1544 tcp_time_wait->tcp_time_wait_tail = tcp; 1545 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1546 } 1547 1548 /* ARGSUSED */ 1549 void 1550 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 1551 { 1552 conn_t *connp = (conn_t *)arg; 1553 tcp_t *tcp = connp->conn_tcp; 1554 tcp_stack_t *tcps = tcp->tcp_tcps; 1555 1556 ASSERT(tcp != NULL); 1557 if (tcp->tcp_state == TCPS_CLOSED) { 1558 return; 1559 } 1560 1561 ASSERT((connp->conn_family == AF_INET && 1562 connp->conn_ipversion == IPV4_VERSION) || 1563 (connp->conn_family == AF_INET6 && 1564 (connp->conn_ipversion == IPV4_VERSION || 1565 connp->conn_ipversion == IPV6_VERSION))); 1566 ASSERT(!tcp->tcp_listener); 1567 1568 TCP_STAT(tcps, tcp_time_wait_reap); 1569 ASSERT(TCP_IS_DETACHED(tcp)); 1570 1571 /* 1572 * Because they have no upstream client to rebind or tcp_close() 1573 * them later, we axe the connection here and now. 1574 */ 1575 tcp_close_detached(tcp); 1576 } 1577 1578 /* 1579 * Remove cached/latched IPsec references. 
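 *
 * Every reference dropped below follows the same release-and-clear
 * idiom; a generic sketch of that idiom (hypothetical names) is:
 *
 *	if (holder->ref != NULL) {
 *		REFRELE(holder->ref);	/* drop our reference */
 *		holder->ref = NULL;	/* and forget the pointer */
 *	}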
1580 */ 1581 void 1582 tcp_ipsec_cleanup(tcp_t *tcp) 1583 { 1584 conn_t *connp = tcp->tcp_connp; 1585 1586 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1587 1588 if (connp->conn_latch != NULL) { 1589 IPLATCH_REFRELE(connp->conn_latch); 1590 connp->conn_latch = NULL; 1591 } 1592 if (connp->conn_latch_in_policy != NULL) { 1593 IPPOL_REFRELE(connp->conn_latch_in_policy); 1594 connp->conn_latch_in_policy = NULL; 1595 } 1596 if (connp->conn_latch_in_action != NULL) { 1597 IPACT_REFRELE(connp->conn_latch_in_action); 1598 connp->conn_latch_in_action = NULL; 1599 } 1600 if (connp->conn_policy != NULL) { 1601 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 1602 connp->conn_policy = NULL; 1603 } 1604 } 1605 1606 /* 1607 * Cleaup before placing on free list. 1608 * Disassociate from the netstack/tcp_stack_t since the freelist 1609 * is per squeue and not per netstack. 1610 */ 1611 void 1612 tcp_cleanup(tcp_t *tcp) 1613 { 1614 mblk_t *mp; 1615 tcp_sack_info_t *tcp_sack_info; 1616 conn_t *connp = tcp->tcp_connp; 1617 tcp_stack_t *tcps = tcp->tcp_tcps; 1618 netstack_t *ns = tcps->tcps_netstack; 1619 mblk_t *tcp_rsrv_mp; 1620 1621 tcp_bind_hash_remove(tcp); 1622 1623 /* Cleanup that which needs the netstack first */ 1624 tcp_ipsec_cleanup(tcp); 1625 ixa_cleanup(connp->conn_ixa); 1626 1627 if (connp->conn_ht_iphc != NULL) { 1628 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 1629 connp->conn_ht_iphc = NULL; 1630 connp->conn_ht_iphc_allocated = 0; 1631 connp->conn_ht_iphc_len = 0; 1632 connp->conn_ht_ulp = NULL; 1633 connp->conn_ht_ulp_len = 0; 1634 tcp->tcp_ipha = NULL; 1635 tcp->tcp_ip6h = NULL; 1636 tcp->tcp_tcpha = NULL; 1637 } 1638 1639 /* We clear any IP_OPTIONS and extension headers */ 1640 ip_pkt_free(&connp->conn_xmit_ipp); 1641 1642 tcp_free(tcp); 1643 1644 /* Release any SSL context */ 1645 if (tcp->tcp_kssl_ent != NULL) { 1646 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 1647 tcp->tcp_kssl_ent = NULL; 1648 } 1649 1650 if (tcp->tcp_kssl_ctx != NULL) { 1651 kssl_release_ctx(tcp->tcp_kssl_ctx); 1652 tcp->tcp_kssl_ctx = NULL; 1653 } 1654 tcp->tcp_kssl_pending = B_FALSE; 1655 1656 /* 1657 * Since we will bzero the entire structure, we need to 1658 * remove it and reinsert it in global hash list. We 1659 * know the walkers can't get to this conn because we 1660 * had set CONDEMNED flag earlier and checked reference 1661 * under conn_lock so walker won't pick it and when we 1662 * go the ipcl_globalhash_remove() below, no walker 1663 * can get to it. 1664 */ 1665 ipcl_globalhash_remove(connp); 1666 1667 /* Save some state */ 1668 mp = tcp->tcp_timercache; 1669 1670 tcp_sack_info = tcp->tcp_sack_info; 1671 tcp_rsrv_mp = tcp->tcp_rsrv_mp; 1672 1673 if (connp->conn_cred != NULL) { 1674 crfree(connp->conn_cred); 1675 connp->conn_cred = NULL; 1676 } 1677 ipcl_conn_cleanup(connp); 1678 connp->conn_flags = IPCL_TCPCONN; 1679 1680 /* 1681 * Now it is safe to decrement the reference counts. 1682 * This might be the last reference on the netstack 1683 * in which case it will cause the freeing of the IP Instance. 
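 *
 * What follows is a save/clear/restore sequence so the tcp_t can be
 * parked on a per-squeue free list; in compressed, hypothetical form:
 *
 *	saved = tcp->survives_reuse;	/* keep what outlives the reset */
 *	bzero(tcp, sizeof (tcp_t));	/* wipe everything else */
 *	tcp->survives_reuse = saved;	/* put the survivors back */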
1684 */ 1685 connp->conn_netstack = NULL; 1686 connp->conn_ixa->ixa_ipst = NULL; 1687 netstack_rele(ns); 1688 ASSERT(tcps != NULL); 1689 tcp->tcp_tcps = NULL; 1690 1691 bzero(tcp, sizeof (tcp_t)); 1692 1693 /* restore the state */ 1694 tcp->tcp_timercache = mp; 1695 1696 tcp->tcp_sack_info = tcp_sack_info; 1697 tcp->tcp_rsrv_mp = tcp_rsrv_mp; 1698 1699 tcp->tcp_connp = connp; 1700 1701 ASSERT(connp->conn_tcp == tcp); 1702 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1703 connp->conn_state_flags = CONN_INCIPIENT; 1704 ASSERT(connp->conn_proto == IPPROTO_TCP); 1705 ASSERT(connp->conn_ref == 1); 1706 } 1707 1708 /* 1709 * Blows away all tcps whose TIME_WAIT has expired. List traversal 1710 * is done forwards from the head. 1711 * This walks all stack instances since 1712 * tcp_time_wait remains global across all stacks. 1713 */ 1714 /* ARGSUSED */ 1715 void 1716 tcp_time_wait_collector(void *arg) 1717 { 1718 tcp_t *tcp; 1719 clock_t now; 1720 mblk_t *mp; 1721 conn_t *connp; 1722 kmutex_t *lock; 1723 boolean_t removed; 1724 1725 squeue_t *sqp = (squeue_t *)arg; 1726 tcp_squeue_priv_t *tcp_time_wait = 1727 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 1728 1729 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1730 tcp_time_wait->tcp_time_wait_tid = 0; 1731 1732 if (tcp_time_wait->tcp_free_list != NULL && 1733 tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { 1734 TCP_G_STAT(tcp_freelist_cleanup); 1735 while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { 1736 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 1737 tcp->tcp_time_wait_next = NULL; 1738 tcp_time_wait->tcp_free_list_cnt--; 1739 ASSERT(tcp->tcp_tcps == NULL); 1740 CONN_DEC_REF(tcp->tcp_connp); 1741 } 1742 ASSERT(tcp_time_wait->tcp_free_list_cnt == 0); 1743 } 1744 1745 /* 1746 * In order to reap time waits reliably, we should use a 1747 * source of time that is not adjustable by the user -- hence 1748 * the call to ddi_get_lbolt(). 1749 */ 1750 now = ddi_get_lbolt(); 1751 while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { 1752 /* 1753 * Compare times using modular arithmetic, since 1754 * lbolt can wrapover. 1755 */ 1756 if ((now - tcp->tcp_time_wait_expire) < 0) { 1757 break; 1758 } 1759 1760 removed = tcp_time_wait_remove(tcp, tcp_time_wait); 1761 ASSERT(removed); 1762 1763 connp = tcp->tcp_connp; 1764 ASSERT(connp->conn_fanout != NULL); 1765 lock = &connp->conn_fanout->connf_lock; 1766 /* 1767 * This is essentially a TW reclaim fast path optimization for 1768 * performance where the timewait collector checks under the 1769 * fanout lock (so that no one else can get access to the 1770 * conn_t) that the refcnt is 2 i.e. one for TCP and one for 1771 * the classifier hash list. If ref count is indeed 2, we can 1772 * just remove the conn under the fanout lock and avoid 1773 * cleaning up the conn under the squeue, provided that 1774 * clustering callbacks are not enabled. If clustering is 1775 * enabled, we need to make the clustering callback before 1776 * setting the CONDEMNED flag and after dropping all locks and 1777 * so we forego this optimization and fall back to the slow 1778 * path. Also please see the comments in tcp_closei_local 1779 * regarding the refcnt logic. 1780 * 1781 * Since we are holding the tcp_time_wait_lock, its better 1782 * not to block on the fanout_lock because other connections 1783 * can't add themselves to time_wait list. So we do a 1784 * tryenter instead of mutex_enter. 
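 *
 * In outline, the fast path below is (hypothetical condensation):
 *
 *	if (mutex_tryenter(fanout_lock)) {	/* never block here */
 *		if (conn_ref == 2 && cl_inet_disconnect == NULL) {
 *			/* remove from hash, recycle or free inline */
 *		} else {
 *			/* take a ref and hand off to the squeue */
 *		}
 *	} else {
 *		/* lock busy: take a ref and hand off to the squeue */
 *	}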
1785 */ 1786 if (mutex_tryenter(lock)) { 1787 mutex_enter(&connp->conn_lock); 1788 if ((connp->conn_ref == 2) && 1789 (cl_inet_disconnect == NULL)) { 1790 ipcl_hash_remove_locked(connp, 1791 connp->conn_fanout); 1792 /* 1793 * Set the CONDEMNED flag now itself so that 1794 * the refcnt cannot increase due to any 1795 * walker. 1796 */ 1797 connp->conn_state_flags |= CONN_CONDEMNED; 1798 mutex_exit(lock); 1799 mutex_exit(&connp->conn_lock); 1800 if (tcp_time_wait->tcp_free_list_cnt < 1801 tcp_free_list_max_cnt) { 1802 /* Add to head of tcp_free_list */ 1803 mutex_exit( 1804 &tcp_time_wait->tcp_time_wait_lock); 1805 tcp_cleanup(tcp); 1806 ASSERT(connp->conn_latch == NULL); 1807 ASSERT(connp->conn_policy == NULL); 1808 ASSERT(tcp->tcp_tcps == NULL); 1809 ASSERT(connp->conn_netstack == NULL); 1810 1811 mutex_enter( 1812 &tcp_time_wait->tcp_time_wait_lock); 1813 tcp->tcp_time_wait_next = 1814 tcp_time_wait->tcp_free_list; 1815 tcp_time_wait->tcp_free_list = tcp; 1816 tcp_time_wait->tcp_free_list_cnt++; 1817 continue; 1818 } else { 1819 /* Do not add to tcp_free_list */ 1820 mutex_exit( 1821 &tcp_time_wait->tcp_time_wait_lock); 1822 tcp_bind_hash_remove(tcp); 1823 ixa_cleanup(tcp->tcp_connp->conn_ixa); 1824 tcp_ipsec_cleanup(tcp); 1825 CONN_DEC_REF(tcp->tcp_connp); 1826 } 1827 } else { 1828 CONN_INC_REF_LOCKED(connp); 1829 mutex_exit(lock); 1830 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1831 mutex_exit(&connp->conn_lock); 1832 /* 1833 * We can reuse the closemp here since conn has 1834 * detached (otherwise we wouldn't even be in 1835 * time_wait list). tcp_closemp_used can safely 1836 * be changed without taking a lock as no other 1837 * thread can concurrently access it at this 1838 * point in the connection lifecycle. 1839 */ 1840 1841 if (tcp->tcp_closemp.b_prev == NULL) 1842 tcp->tcp_closemp_used = B_TRUE; 1843 else 1844 cmn_err(CE_PANIC, 1845 "tcp_timewait_collector: " 1846 "concurrent use of tcp_closemp: " 1847 "connp %p tcp %p\n", (void *)connp, 1848 (void *)tcp); 1849 1850 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 1851 mp = &tcp->tcp_closemp; 1852 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 1853 tcp_timewait_output, connp, NULL, 1854 SQ_FILL, SQTAG_TCP_TIMEWAIT); 1855 } 1856 } else { 1857 mutex_enter(&connp->conn_lock); 1858 CONN_INC_REF_LOCKED(connp); 1859 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1860 mutex_exit(&connp->conn_lock); 1861 /* 1862 * We can reuse the closemp here since conn has 1863 * detached (otherwise we wouldn't even be in 1864 * time_wait list). tcp_closemp_used can safely 1865 * be changed without taking a lock as no other 1866 * thread can concurrently access it at this 1867 * point in the connection lifecycle. 
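 *
 * tcp_closemp is an mblk embedded in the tcp_t itself (roughly
 * "mblk_t tcp_closemp;" inside the tcp structure), so queueing the
 * close work below can never fail for want of memory; the b_prev check
 * only guards against queueing the same mblk twice.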
1868 */ 1869 1870 if (tcp->tcp_closemp.b_prev == NULL) 1871 tcp->tcp_closemp_used = B_TRUE; 1872 else 1873 cmn_err(CE_PANIC, "tcp_timewait_collector: " 1874 "concurrent use of tcp_closemp: " 1875 "connp %p tcp %p\n", (void *)connp, 1876 (void *)tcp); 1877 1878 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 1879 mp = &tcp->tcp_closemp; 1880 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 1881 tcp_timewait_output, connp, NULL, 1882 SQ_FILL, SQTAG_TCP_TIMEWAIT); 1883 } 1884 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1885 } 1886 1887 if (tcp_time_wait->tcp_free_list != NULL) 1888 tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; 1889 1890 tcp_time_wait->tcp_time_wait_tid = 1891 timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp, 1892 TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION, 1893 CALLOUT_FLAG_ROUNDUP); 1894 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1895 } 1896 1897 /* 1898 * Reply to a clients T_CONN_RES TPI message. This function 1899 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES 1900 * on the acceptor STREAM and processed in tcp_accept_common(). 1901 * Read the block comment on top of tcp_input_listener(). 1902 */ 1903 static void 1904 tcp_tli_accept(tcp_t *listener, mblk_t *mp) 1905 { 1906 tcp_t *acceptor; 1907 tcp_t *eager; 1908 tcp_t *tcp; 1909 struct T_conn_res *tcr; 1910 t_uscalar_t acceptor_id; 1911 t_scalar_t seqnum; 1912 mblk_t *discon_mp = NULL; 1913 mblk_t *ok_mp; 1914 mblk_t *mp1; 1915 tcp_stack_t *tcps = listener->tcp_tcps; 1916 conn_t *econnp; 1917 1918 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1919 tcp_err_ack(listener, mp, TPROTO, 0); 1920 return; 1921 } 1922 tcr = (struct T_conn_res *)mp->b_rptr; 1923 1924 /* 1925 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1926 * read side queue of the streams device underneath us i.e. the 1927 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1928 * look it up in the queue_hash. Under LP64 it sends down the 1929 * minor_t of the accepting endpoint. 1930 * 1931 * Once the acceptor/eager are modified (in tcp_accept_swap) the 1932 * fanout hash lock is held. 1933 * This prevents any thread from entering the acceptor queue from 1934 * below (since it has not been hard bound yet i.e. any inbound 1935 * packets will arrive on the listener conn_t and 1936 * go through the classifier). 1937 * The CONN_INC_REF will prevent the acceptor from closing. 1938 * 1939 * XXX It is still possible for a tli application to send down data 1940 * on the accepting stream while another thread calls t_accept. 1941 * This should not be a problem for well-behaved applications since 1942 * the T_OK_ACK is sent after the queue swapping is completed. 1943 * 1944 * If the accepting fd is the same as the listening fd, avoid 1945 * queue hash lookup since that will return an eager listener in a 1946 * already established state. 1947 */ 1948 acceptor_id = tcr->ACCEPTOR_id; 1949 mutex_enter(&listener->tcp_eager_lock); 1950 if (listener->tcp_acceptor_id == acceptor_id) { 1951 eager = listener->tcp_eager_next_q; 1952 /* only count how many T_CONN_INDs so don't count q0 */ 1953 if ((listener->tcp_conn_req_cnt_q != 1) || 1954 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 1955 mutex_exit(&listener->tcp_eager_lock); 1956 tcp_err_ack(listener, mp, TBADF, 0); 1957 return; 1958 } 1959 if (listener->tcp_conn_req_cnt_q0 != 0) { 1960 /* Throw away all the eagers on q0. 
*/ 1961 tcp_eager_cleanup(listener, 1); 1962 } 1963 if (listener->tcp_syn_defense) { 1964 listener->tcp_syn_defense = B_FALSE; 1965 if (listener->tcp_ip_addr_cache != NULL) { 1966 kmem_free(listener->tcp_ip_addr_cache, 1967 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1968 listener->tcp_ip_addr_cache = NULL; 1969 } 1970 } 1971 /* 1972 * Transfer tcp_conn_req_max to the eager so that when 1973 * a disconnect occurs we can revert the endpoint to the 1974 * listen state. 1975 */ 1976 eager->tcp_conn_req_max = listener->tcp_conn_req_max; 1977 ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 1978 /* 1979 * Get a reference on the acceptor just like the 1980 * tcp_acceptor_hash_lookup below. 1981 */ 1982 acceptor = listener; 1983 CONN_INC_REF(acceptor->tcp_connp); 1984 } else { 1985 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); 1986 if (acceptor == NULL) { 1987 if (listener->tcp_connp->conn_debug) { 1988 (void) strlog(TCP_MOD_ID, 0, 1, 1989 SL_ERROR|SL_TRACE, 1990 "tcp_accept: did not find acceptor 0x%x\n", 1991 acceptor_id); 1992 } 1993 mutex_exit(&listener->tcp_eager_lock); 1994 tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 1995 return; 1996 } 1997 /* 1998 * Verify acceptor state. The acceptable states for an acceptor 1999 * include TCPS_IDLE and TCPS_BOUND. 2000 */ 2001 switch (acceptor->tcp_state) { 2002 case TCPS_IDLE: 2003 /* FALLTHRU */ 2004 case TCPS_BOUND: 2005 break; 2006 default: 2007 CONN_DEC_REF(acceptor->tcp_connp); 2008 mutex_exit(&listener->tcp_eager_lock); 2009 tcp_err_ack(listener, mp, TOUTSTATE, 0); 2010 return; 2011 } 2012 } 2013 2014 /* The listener must be in TCPS_LISTEN */ 2015 if (listener->tcp_state != TCPS_LISTEN) { 2016 CONN_DEC_REF(acceptor->tcp_connp); 2017 mutex_exit(&listener->tcp_eager_lock); 2018 tcp_err_ack(listener, mp, TOUTSTATE, 0); 2019 return; 2020 } 2021 2022 /* 2023 * Rendezvous with an eager connection request packet hanging off 2024 * 'tcp' that has the 'seqnum' tag. We tagged the detached open 2025 * tcp structure when the connection packet arrived in 2026 * tcp_input_listener(). 2027 */ 2028 seqnum = tcr->SEQ_number; 2029 eager = listener; 2030 do { 2031 eager = eager->tcp_eager_next_q; 2032 if (eager == NULL) { 2033 CONN_DEC_REF(acceptor->tcp_connp); 2034 mutex_exit(&listener->tcp_eager_lock); 2035 tcp_err_ack(listener, mp, TBADSEQ, 0); 2036 return; 2037 } 2038 } while (eager->tcp_conn_req_seqnum != seqnum); 2039 mutex_exit(&listener->tcp_eager_lock); 2040 2041 /* 2042 * At this point, both acceptor and listener have 2 ref 2043 * that they begin with. Acceptor has one additional ref 2044 * we placed in lookup while listener has 3 additional 2045 * ref for being behind the squeue (tcp_accept() is 2046 * done on listener's squeue); being in classifier hash; 2047 * and eager's ref on listener. 2048 */ 2049 ASSERT(listener->tcp_connp->conn_ref >= 5); 2050 ASSERT(acceptor->tcp_connp->conn_ref >= 3); 2051 2052 /* 2053 * The eager at this point is set in its own squeue and 2054 * could easily have been killed (tcp_accept_finish will 2055 * deal with that) because of a TH_RST so we can only 2056 * ASSERT for a single ref. 2057 */ 2058 ASSERT(eager->tcp_connp->conn_ref >= 1); 2059 2060 /* 2061 * Pre allocate the discon_ind mblk also. tcp_accept_finish will 2062 * use it if something failed. 
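 *
 * This is the usual allocate-before-commit pattern: anything a failure
 * path may need is obtained while it is still harmless to give up.  A
 * stripped-down, hypothetical sketch:
 *
 *	mblk_t *fail_mp = allocb(worst_case_size, BPRI_HI);
 *	if (fail_mp == NULL) {
 *		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
 *		return;			/* nothing to undo yet */
 *	}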
2063 */ 2064 discon_mp = allocb(MAX(sizeof (struct T_discon_ind), 2065 sizeof (struct stroptions)), BPRI_HI); 2066 if (discon_mp == NULL) { 2067 CONN_DEC_REF(acceptor->tcp_connp); 2068 CONN_DEC_REF(eager->tcp_connp); 2069 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2070 return; 2071 } 2072 2073 econnp = eager->tcp_connp; 2074 2075 /* Hold a copy of mp, in case reallocb fails */ 2076 if ((mp1 = copymsg(mp)) == NULL) { 2077 CONN_DEC_REF(acceptor->tcp_connp); 2078 CONN_DEC_REF(eager->tcp_connp); 2079 freemsg(discon_mp); 2080 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2081 return; 2082 } 2083 2084 tcr = (struct T_conn_res *)mp1->b_rptr; 2085 2086 /* 2087 * This is an expanded version of mi_tpi_ok_ack_alloc() 2088 * which allocates a larger mblk and appends the new 2089 * local address to the ok_ack. The address is copied by 2090 * soaccept() for getsockname(). 2091 */ 2092 { 2093 int extra; 2094 2095 extra = (econnp->conn_family == AF_INET) ? 2096 sizeof (sin_t) : sizeof (sin6_t); 2097 2098 /* 2099 * Try to re-use mp, if possible. Otherwise, allocate 2100 * an mblk and return it as ok_mp. In any case, mp 2101 * is no longer usable upon return. 2102 */ 2103 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 2104 CONN_DEC_REF(acceptor->tcp_connp); 2105 CONN_DEC_REF(eager->tcp_connp); 2106 freemsg(discon_mp); 2107 /* Original mp has been freed by now, so use mp1 */ 2108 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 2109 return; 2110 } 2111 2112 mp = NULL; /* We should never use mp after this point */ 2113 2114 switch (extra) { 2115 case sizeof (sin_t): { 2116 sin_t *sin = (sin_t *)ok_mp->b_wptr; 2117 2118 ok_mp->b_wptr += extra; 2119 sin->sin_family = AF_INET; 2120 sin->sin_port = econnp->conn_lport; 2121 sin->sin_addr.s_addr = econnp->conn_laddr_v4; 2122 break; 2123 } 2124 case sizeof (sin6_t): { 2125 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 2126 2127 ok_mp->b_wptr += extra; 2128 sin6->sin6_family = AF_INET6; 2129 sin6->sin6_port = econnp->conn_lport; 2130 sin6->sin6_addr = econnp->conn_laddr_v6; 2131 sin6->sin6_flowinfo = econnp->conn_flowinfo; 2132 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 2133 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 2134 sin6->sin6_scope_id = 2135 econnp->conn_ixa->ixa_scopeid; 2136 } else { 2137 sin6->sin6_scope_id = 0; 2138 } 2139 sin6->__sin6_src_id = 0; 2140 break; 2141 } 2142 default: 2143 break; 2144 } 2145 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 2146 } 2147 2148 /* 2149 * If there are no options we know that the T_CONN_RES will 2150 * succeed. However, we can't send the T_OK_ACK upstream until 2151 * the tcp_accept_swap is done since it would be dangerous to 2152 * let the application start using the new fd prior to the swap. 2153 */ 2154 tcp_accept_swap(listener, acceptor, eager); 2155 2156 /* 2157 * tcp_accept_swap unlinks eager from listener but does not drop 2158 * the eager's reference on the listener. 2159 */ 2160 ASSERT(eager->tcp_listener == NULL); 2161 ASSERT(listener->tcp_connp->conn_ref >= 5); 2162 2163 /* 2164 * The eager is now associated with its own queue. Insert in 2165 * the hash so that the connection can be reused for a future 2166 * T_CONN_RES. 2167 */ 2168 tcp_acceptor_hash_insert(acceptor_id, eager); 2169 2170 /* 2171 * We now do the processing of options with T_CONN_RES. 2172 * We delay till now since we wanted to have queue to pass to 2173 * option processing routines that points back to the right 2174 * instance structure which does not happen until after 2175 * tcp_accept_swap(). 
2176 * 2177 * Note: 2178 * The sanity of the logic here assumes that whatever options 2179 * are appropriate to inherit from listner=>eager are done 2180 * before this point, and whatever were to be overridden (or not) 2181 * in transfer logic from eager=>acceptor in tcp_accept_swap(). 2182 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 2183 * before its ACCEPTOR_id comes down in T_CONN_RES ] 2184 * This may not be true at this point in time but can be fixed 2185 * independently. This option processing code starts with 2186 * the instantiated acceptor instance and the final queue at 2187 * this point. 2188 */ 2189 2190 if (tcr->OPT_length != 0) { 2191 /* Options to process */ 2192 int t_error = 0; 2193 int sys_error = 0; 2194 int do_disconnect = 0; 2195 2196 if (tcp_conprim_opt_process(eager, mp1, 2197 &do_disconnect, &t_error, &sys_error) < 0) { 2198 eager->tcp_accept_error = 1; 2199 if (do_disconnect) { 2200 /* 2201 * An option failed which does not allow 2202 * connection to be accepted. 2203 * 2204 * We allow T_CONN_RES to succeed and 2205 * put a T_DISCON_IND on the eager queue. 2206 */ 2207 ASSERT(t_error == 0 && sys_error == 0); 2208 eager->tcp_send_discon_ind = 1; 2209 } else { 2210 ASSERT(t_error != 0); 2211 freemsg(ok_mp); 2212 /* 2213 * Original mp was either freed or set 2214 * to ok_mp above, so use mp1 instead. 2215 */ 2216 tcp_err_ack(listener, mp1, t_error, sys_error); 2217 goto finish; 2218 } 2219 } 2220 /* 2221 * Most likely success in setting options (except if 2222 * eager->tcp_send_discon_ind set). 2223 * mp1 option buffer represented by OPT_length/offset 2224 * potentially modified and contains results of setting 2225 * options at this point 2226 */ 2227 } 2228 2229 /* We no longer need mp1, since all options processing has passed */ 2230 freemsg(mp1); 2231 2232 putnext(listener->tcp_connp->conn_rq, ok_mp); 2233 2234 mutex_enter(&listener->tcp_eager_lock); 2235 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 2236 tcp_t *tail; 2237 mblk_t *conn_ind; 2238 2239 /* 2240 * This path should not be executed if listener and 2241 * acceptor streams are the same. 2242 */ 2243 ASSERT(listener != acceptor); 2244 2245 tcp = listener->tcp_eager_prev_q0; 2246 /* 2247 * listener->tcp_eager_prev_q0 points to the TAIL of the 2248 * deferred T_conn_ind queue. We need to get to the head of 2249 * the queue in order to send up T_conn_ind the same order as 2250 * how the 3WHS is completed. 2251 */ 2252 while (tcp != listener) { 2253 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 2254 break; 2255 else 2256 tcp = tcp->tcp_eager_prev_q0; 2257 } 2258 ASSERT(tcp != listener); 2259 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 2260 ASSERT(conn_ind != NULL); 2261 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 2262 2263 /* Move from q0 to q */ 2264 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 2265 listener->tcp_conn_req_cnt_q0--; 2266 listener->tcp_conn_req_cnt_q++; 2267 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 2268 tcp->tcp_eager_prev_q0; 2269 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 2270 tcp->tcp_eager_next_q0; 2271 tcp->tcp_eager_prev_q0 = NULL; 2272 tcp->tcp_eager_next_q0 = NULL; 2273 tcp->tcp_conn_def_q0 = B_FALSE; 2274 2275 /* Make sure the tcp isn't in the list of droppables */ 2276 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && 2277 tcp->tcp_eager_prev_drop_q0 == NULL); 2278 2279 /* 2280 * Insert at end of the queue because sockfs sends 2281 * down T_CONN_RES in chronological order. 
Leaving 2282 * the older conn indications at front of the queue 2283 * helps reducing search time. 2284 */ 2285 tail = listener->tcp_eager_last_q; 2286 if (tail != NULL) 2287 tail->tcp_eager_next_q = tcp; 2288 else 2289 listener->tcp_eager_next_q = tcp; 2290 listener->tcp_eager_last_q = tcp; 2291 tcp->tcp_eager_next_q = NULL; 2292 mutex_exit(&listener->tcp_eager_lock); 2293 putnext(tcp->tcp_connp->conn_rq, conn_ind); 2294 } else { 2295 mutex_exit(&listener->tcp_eager_lock); 2296 } 2297 2298 /* 2299 * Done with the acceptor - free it 2300 * 2301 * Note: from this point on, no access to listener should be made 2302 * as listener can be equal to acceptor. 2303 */ 2304 finish: 2305 ASSERT(acceptor->tcp_detached); 2306 acceptor->tcp_connp->conn_rq = NULL; 2307 ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); 2308 acceptor->tcp_connp->conn_wq = NULL; 2309 (void) tcp_clean_death(acceptor, 0, 2); 2310 CONN_DEC_REF(acceptor->tcp_connp); 2311 2312 /* 2313 * We pass discon_mp to tcp_accept_finish to get on the right squeue. 2314 * 2315 * It will update the setting for sockfs/stream head and also take 2316 * care of any data that arrived before accept() wad called. 2317 * In case we already received a FIN then tcp_accept_finish will send up 2318 * the ordrel. It will also send up a window update if the window 2319 * has opened up. 2320 */ 2321 2322 /* 2323 * XXX: we currently have a problem if XTI application closes the 2324 * acceptor stream in between. This problem exists in on10-gate also 2325 * and is well know but nothing can be done short of major rewrite 2326 * to fix it. Now it is possible to take care of it by assigning TLI/XTI 2327 * eager same squeue as listener (we can distinguish non socket 2328 * listeners at the time of handling a SYN in tcp_input_listener) 2329 * and do most of the work that tcp_accept_finish does here itself 2330 * and then get behind the acceptor squeue to access the acceptor 2331 * queue. 2332 */ 2333 /* 2334 * We already have a ref on tcp so no need to do one before squeue_enter 2335 */ 2336 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, 2337 tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, 2338 SQTAG_TCP_ACCEPT_FINISH); 2339 } 2340 2341 /* 2342 * Swap information between the eager and acceptor for a TLI/XTI client. 2343 * The sockfs accept is done on the acceptor stream and control goes 2344 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not 2345 * called. In either case, both the eager and listener are in their own 2346 * perimeter (squeue) and the code has to deal with potential race. 2347 * 2348 * See the block comment on top of tcp_accept() and tcp_tli_accept(). 2349 */ 2350 static void 2351 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) 2352 { 2353 conn_t *econnp, *aconnp; 2354 2355 ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq); 2356 ASSERT(eager->tcp_detached && !acceptor->tcp_detached); 2357 ASSERT(!TCP_IS_SOCKET(acceptor)); 2358 ASSERT(!TCP_IS_SOCKET(eager)); 2359 ASSERT(!TCP_IS_SOCKET(listener)); 2360 2361 /* 2362 * Trusted Extensions may need to use a security label that is 2363 * different from the acceptor's label on MLP and MAC-Exempt 2364 * sockets. If this is the case, the required security label 2365 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the 2366 * acceptor stream refer to econnp we atomatically get that label. 2367 */ 2368 2369 acceptor->tcp_detached = B_TRUE; 2370 /* 2371 * To permit stream re-use by TLI/XTI, the eager needs a copy of 2372 * the acceptor id. 
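 *
 * The swap itself is a plain exchange of STREAMS queue pointers; a
 * hypothetical condensation of the code below:
 *
 *	econnp->conn_rq = aconnp->conn_rq;	/* eager takes over the */
 *	econnp->conn_wq = aconnp->conn_wq;	/* acceptor's queues */
 *	econnp->conn_rq->q_ptr = econnp;	/* queues point at eager */
 *	econnp->conn_wq->q_ptr = econnp;
 *	membar_producer();			/* publish, then clear */
 *	eager->tcp_detached = B_FALSE;		/* tcp_detached */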
2373 */ 2374 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; 2375 2376 /* remove eager from listen list... */ 2377 mutex_enter(&listener->tcp_eager_lock); 2378 tcp_eager_unlink(eager); 2379 ASSERT(eager->tcp_eager_next_q == NULL && 2380 eager->tcp_eager_last_q == NULL); 2381 ASSERT(eager->tcp_eager_next_q0 == NULL && 2382 eager->tcp_eager_prev_q0 == NULL); 2383 mutex_exit(&listener->tcp_eager_lock); 2384 2385 econnp = eager->tcp_connp; 2386 aconnp = acceptor->tcp_connp; 2387 econnp->conn_rq = aconnp->conn_rq; 2388 econnp->conn_wq = aconnp->conn_wq; 2389 econnp->conn_rq->q_ptr = econnp; 2390 econnp->conn_wq->q_ptr = econnp; 2391 2392 /* 2393 * In the TLI/XTI loopback case, we are inside the listener's squeue, 2394 * which might be a different squeue from our peer TCP instance. 2395 * For TCP Fusion, the peer expects that whenever tcp_detached is 2396 * clear, our TCP queues point to the acceptor's queues. Thus, use 2397 * membar_producer() to ensure that the assignments of conn_rq/conn_wq 2398 * above reach global visibility prior to the clearing of tcp_detached. 2399 */ 2400 membar_producer(); 2401 eager->tcp_detached = B_FALSE; 2402 2403 ASSERT(eager->tcp_ack_tid == 0); 2404 2405 econnp->conn_dev = aconnp->conn_dev; 2406 econnp->conn_minor_arena = aconnp->conn_minor_arena; 2407 2408 ASSERT(econnp->conn_minor_arena != NULL); 2409 if (econnp->conn_cred != NULL) 2410 crfree(econnp->conn_cred); 2411 econnp->conn_cred = aconnp->conn_cred; 2412 aconnp->conn_cred = NULL; 2413 econnp->conn_cpid = aconnp->conn_cpid; 2414 ASSERT(econnp->conn_netstack == aconnp->conn_netstack); 2415 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); 2416 2417 econnp->conn_zoneid = aconnp->conn_zoneid; 2418 econnp->conn_allzones = aconnp->conn_allzones; 2419 econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid; 2420 2421 econnp->conn_mac_mode = aconnp->conn_mac_mode; 2422 econnp->conn_zone_is_global = aconnp->conn_zone_is_global; 2423 aconnp->conn_mac_mode = CONN_MAC_DEFAULT; 2424 2425 /* Do the IPC initialization */ 2426 CONN_INC_REF(econnp); 2427 2428 /* Done with old IPC. Drop its ref on its connp */ 2429 CONN_DEC_REF(aconnp); 2430 } 2431 2432 2433 /* 2434 * Adapt to the information, such as rtt and rtt_sd, provided from the 2435 * DCE and IRE maintained by IP. 2436 * 2437 * Checks for multicast and broadcast destination address. 2438 * Returns zero if ok; an errno on failure. 2439 * 2440 * Note that the MSS calculation here is based on the info given in 2441 * the DCE and IRE. We do not do any calculation based on TCP options. They 2442 * will be handled in tcp_input_data() when TCP knows which options to use. 2443 * 2444 * Note on how TCP gets its parameters for a connection. 2445 * 2446 * When a tcp_t structure is allocated, it gets all the default parameters. 2447 * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd, 2448 * spipe, rpipe, ... from the route metrics. Route metric overrides the 2449 * default. 2450 * 2451 * An incoming SYN with a multicast or broadcast destination address is dropped 2452 * in ip_fanout_v4/v6. 2453 * 2454 * An incoming SYN with a multicast or broadcast source address is always 2455 * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in 2456 * conn_connect. 2457 * The same logic in tcp_set_destination also serves to 2458 * reject an attempt to connect to a broadcast or multicast (destination) 2459 * address. 
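 *
 * When the cached metrics carry an RTT estimate, the retransmit timeout
 * seeded below is, in outline,
 *
 *	rto = (rtt_sa >> 3) + rtt_sd + rexmit_interval_extra + (rtt_sa >> 5)
 *
 * clamped to [tcps_rexmit_interval_min, tcps_rexmit_interval_max].  The
 * shifts reflect that the smoothed RTT is stored in scaled fixed-point
 * form; the units are those of the stored metric.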
2460 */ 2461 static int 2462 tcp_set_destination(tcp_t *tcp) 2463 { 2464 uint32_t mss_max; 2465 uint32_t mss; 2466 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 2467 conn_t *connp = tcp->tcp_connp; 2468 tcp_stack_t *tcps = tcp->tcp_tcps; 2469 iulp_t uinfo; 2470 int error; 2471 uint32_t flags; 2472 2473 flags = IPDF_LSO | IPDF_ZCOPY; 2474 /* 2475 * Make sure we have a dce for the destination to avoid dce_ident 2476 * contention for connected sockets. 2477 */ 2478 flags |= IPDF_UNIQUE_DCE; 2479 2480 if (!tcps->tcps_ignore_path_mtu) 2481 connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; 2482 2483 /* Use conn_lock to satify ASSERT; tcp is already serialized */ 2484 mutex_enter(&connp->conn_lock); 2485 error = conn_connect(connp, &uinfo, flags); 2486 mutex_exit(&connp->conn_lock); 2487 if (error != 0) 2488 return (error); 2489 2490 error = tcp_build_hdrs(tcp); 2491 if (error != 0) 2492 return (error); 2493 2494 tcp->tcp_localnet = uinfo.iulp_localnet; 2495 2496 if (uinfo.iulp_rtt != 0) { 2497 clock_t rto; 2498 2499 tcp->tcp_rtt_sa = uinfo.iulp_rtt; 2500 tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd; 2501 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 2502 tcps->tcps_rexmit_interval_extra + 2503 (tcp->tcp_rtt_sa >> 5); 2504 2505 if (rto > tcps->tcps_rexmit_interval_max) { 2506 tcp->tcp_rto = tcps->tcps_rexmit_interval_max; 2507 } else if (rto < tcps->tcps_rexmit_interval_min) { 2508 tcp->tcp_rto = tcps->tcps_rexmit_interval_min; 2509 } else { 2510 tcp->tcp_rto = rto; 2511 } 2512 } 2513 if (uinfo.iulp_ssthresh != 0) 2514 tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; 2515 else 2516 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 2517 if (uinfo.iulp_spipe > 0) { 2518 connp->conn_sndbuf = MIN(uinfo.iulp_spipe, 2519 tcps->tcps_max_buf); 2520 if (tcps->tcps_snd_lowat_fraction != 0) { 2521 connp->conn_sndlowat = connp->conn_sndbuf / 2522 tcps->tcps_snd_lowat_fraction; 2523 } 2524 (void) tcp_maxpsz_set(tcp, B_TRUE); 2525 } 2526 /* 2527 * Note that up till now, acceptor always inherits receive 2528 * window from the listener. But if there is a metrics 2529 * associated with a host, we should use that instead of 2530 * inheriting it from listener. Thus we need to pass this 2531 * info back to the caller. 2532 */ 2533 if (uinfo.iulp_rpipe > 0) { 2534 tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe, 2535 tcps->tcps_max_buf); 2536 } 2537 2538 if (uinfo.iulp_rtomax > 0) { 2539 tcp->tcp_second_timer_threshold = 2540 uinfo.iulp_rtomax; 2541 } 2542 2543 /* 2544 * Use the metric option settings, iulp_tstamp_ok and 2545 * iulp_wscale_ok, only for active open. What this means 2546 * is that if the other side uses timestamp or window 2547 * scale option, TCP will also use those options. That 2548 * is for passive open. If the application sets a 2549 * large window, window scale is enabled regardless of 2550 * the value in iulp_wscale_ok. This is the behavior 2551 * since 2.6. So we keep it. 2552 * The only case left in passive open processing is the 2553 * check for SACK. 2554 * For ECN, it should probably be like SACK. But the 2555 * current value is binary, so we treat it like the other 2556 * cases. The metric only controls active open.For passive 2557 * open, the ndd param, tcp_ecn_permitted, controls the 2558 * behavior. 2559 */ 2560 if (!tcp_detached) { 2561 /* 2562 * The if check means that the following can only 2563 * be turned on by the metrics only IRE, but not off. 
2564 */ 2565 if (uinfo.iulp_tstamp_ok) 2566 tcp->tcp_snd_ts_ok = B_TRUE; 2567 if (uinfo.iulp_wscale_ok) 2568 tcp->tcp_snd_ws_ok = B_TRUE; 2569 if (uinfo.iulp_sack == 2) 2570 tcp->tcp_snd_sack_ok = B_TRUE; 2571 if (uinfo.iulp_ecn_ok) 2572 tcp->tcp_ecn_ok = B_TRUE; 2573 } else { 2574 /* 2575 * Passive open. 2576 * 2577 * As above, the if check means that SACK can only be 2578 * turned on by the metric only IRE. 2579 */ 2580 if (uinfo.iulp_sack > 0) { 2581 tcp->tcp_snd_sack_ok = B_TRUE; 2582 } 2583 } 2584 2585 /* 2586 * XXX Note that currently, iulp_mtu can be as small as 68 2587 * because of PMTUd. So tcp_mss may go to negative if combined 2588 * length of all those options exceeds 28 bytes. But because 2589 * of the tcp_mss_min check below, we may not have a problem if 2590 * tcp_mss_min is of a reasonable value. The default is 1 so 2591 * the negative problem still exists. And the check defeats PMTUd. 2592 * In fact, if PMTUd finds that the MSS should be smaller than 2593 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min 2594 * value. 2595 * 2596 * We do not deal with that now. All those problems related to 2597 * PMTUd will be fixed later. 2598 */ 2599 ASSERT(uinfo.iulp_mtu != 0); 2600 mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu; 2601 2602 /* Sanity check for MSS value. */ 2603 if (connp->conn_ipversion == IPV4_VERSION) 2604 mss_max = tcps->tcps_mss_max_ipv4; 2605 else 2606 mss_max = tcps->tcps_mss_max_ipv6; 2607 2608 if (tcp->tcp_ipsec_overhead == 0) 2609 tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); 2610 2611 mss -= tcp->tcp_ipsec_overhead; 2612 2613 if (mss < tcps->tcps_mss_min) 2614 mss = tcps->tcps_mss_min; 2615 if (mss > mss_max) 2616 mss = mss_max; 2617 2618 /* Note that this is the maximum MSS, excluding all options. */ 2619 tcp->tcp_mss = mss; 2620 2621 /* 2622 * Update the tcp connection with LSO capability. 2623 */ 2624 tcp_update_lso(tcp, connp->conn_ixa); 2625 2626 /* 2627 * Initialize the ISS here now that we have the full connection ID. 2628 * The RFC 1948 method of initial sequence number generation requires 2629 * knowledge of the full connection ID before setting the ISS. 2630 */ 2631 tcp_iss_init(tcp); 2632 2633 tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local); 2634 2635 /* 2636 * Make sure that conn is not marked incipient 2637 * for incoming connections. A blind 2638 * removal of incipient flag is cheaper than 2639 * check and removal. 2640 */ 2641 mutex_enter(&connp->conn_lock); 2642 connp->conn_state_flags &= ~CONN_INCIPIENT; 2643 mutex_exit(&connp->conn_lock); 2644 return (0); 2645 } 2646 2647 static void 2648 tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) 2649 { 2650 int error; 2651 conn_t *connp = tcp->tcp_connp; 2652 struct sockaddr *sa; 2653 mblk_t *mp1; 2654 struct T_bind_req *tbr; 2655 int backlog; 2656 socklen_t len; 2657 sin_t *sin; 2658 sin6_t *sin6; 2659 cred_t *cr; 2660 2661 /* 2662 * All Solaris components should pass a db_credp 2663 * for this TPI message, hence we ASSERT. 2664 * But in case there is some other M_PROTO that looks 2665 * like a TPI message sent by some other kernel 2666 * component, we check and return an error. 
2667 */ 2668 cr = msg_getcred(mp, NULL); 2669 ASSERT(cr != NULL); 2670 if (cr == NULL) { 2671 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 2672 return; 2673 } 2674 2675 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 2676 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 2677 if (connp->conn_debug) { 2678 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 2679 "tcp_tpi_bind: bad req, len %u", 2680 (uint_t)(mp->b_wptr - mp->b_rptr)); 2681 } 2682 tcp_err_ack(tcp, mp, TPROTO, 0); 2683 return; 2684 } 2685 /* Make sure the largest address fits */ 2686 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); 2687 if (mp1 == NULL) { 2688 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 2689 return; 2690 } 2691 mp = mp1; 2692 tbr = (struct T_bind_req *)mp->b_rptr; 2693 2694 backlog = tbr->CONIND_number; 2695 len = tbr->ADDR_length; 2696 2697 switch (len) { 2698 case 0: /* request for a generic port */ 2699 tbr->ADDR_offset = sizeof (struct T_bind_req); 2700 if (connp->conn_family == AF_INET) { 2701 tbr->ADDR_length = sizeof (sin_t); 2702 sin = (sin_t *)&tbr[1]; 2703 *sin = sin_null; 2704 sin->sin_family = AF_INET; 2705 sa = (struct sockaddr *)sin; 2706 len = sizeof (sin_t); 2707 mp->b_wptr = (uchar_t *)&sin[1]; 2708 } else { 2709 ASSERT(connp->conn_family == AF_INET6); 2710 tbr->ADDR_length = sizeof (sin6_t); 2711 sin6 = (sin6_t *)&tbr[1]; 2712 *sin6 = sin6_null; 2713 sin6->sin6_family = AF_INET6; 2714 sa = (struct sockaddr *)sin6; 2715 len = sizeof (sin6_t); 2716 mp->b_wptr = (uchar_t *)&sin6[1]; 2717 } 2718 break; 2719 2720 case sizeof (sin_t): /* Complete IPv4 address */ 2721 sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, 2722 sizeof (sin_t)); 2723 break; 2724 2725 case sizeof (sin6_t): /* Complete IPv6 address */ 2726 sa = (struct sockaddr *)mi_offset_param(mp, 2727 tbr->ADDR_offset, sizeof (sin6_t)); 2728 break; 2729 2730 default: 2731 if (connp->conn_debug) { 2732 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 2733 "tcp_tpi_bind: bad address length, %d", 2734 tbr->ADDR_length); 2735 } 2736 tcp_err_ack(tcp, mp, TBADADDR, 0); 2737 return; 2738 } 2739 2740 if (backlog > 0) { 2741 error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp), 2742 tbr->PRIM_type != O_T_BIND_REQ); 2743 } else { 2744 error = tcp_do_bind(connp, sa, len, DB_CRED(mp), 2745 tbr->PRIM_type != O_T_BIND_REQ); 2746 } 2747 done: 2748 if (error > 0) { 2749 tcp_err_ack(tcp, mp, TSYSERR, error); 2750 } else if (error < 0) { 2751 tcp_err_ack(tcp, mp, -error, 0); 2752 } else { 2753 /* 2754 * Update port information as sockfs/tpi needs it for checking 2755 */ 2756 if (connp->conn_family == AF_INET) { 2757 sin = (sin_t *)sa; 2758 sin->sin_port = connp->conn_lport; 2759 } else { 2760 sin6 = (sin6_t *)sa; 2761 sin6->sin6_port = connp->conn_lport; 2762 } 2763 mp->b_datap->db_type = M_PCPROTO; 2764 tbr->PRIM_type = T_BIND_ACK; 2765 putnext(connp->conn_rq, mp); 2766 } 2767 } 2768 2769 /* 2770 * If the "bind_to_req_port_only" parameter is set, if the requested port 2771 * number is available, return it, If not return 0 2772 * 2773 * If "bind_to_req_port_only" parameter is not set and 2774 * If the requested port number is available, return it. If not, return 2775 * the first anonymous port we happen across. If no anonymous ports are 2776 * available, return 0. addr is the requested local address, if any. 2777 * 2778 * In either case, when succeeding update the tcp_t to record the port number 2779 * and insert it in the bind hash table. 
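 *
 * As an illustration of the search bound computed below: with the usual
 * anonymous range of 32768-65535 the loop tries at most
 * 65535 - 32768 + 1 = 32768 candidate ports before returning 0, and for
 * a privileged anonymous bind the bound is
 * IPPORT_RESERVED - tcps_min_anonpriv_port instead.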
2780 * 2781 * Note that TCP over IPv4 and IPv6 sockets can use the same port number 2782 * without setting SO_REUSEADDR. This is needed so that they 2783 * can be viewed as two independent transport protocols. 2784 */ 2785 static in_port_t 2786 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 2787 int reuseaddr, boolean_t quick_connect, 2788 boolean_t bind_to_req_port_only, boolean_t user_specified) 2789 { 2790 /* number of times we have run around the loop */ 2791 int count = 0; 2792 /* maximum number of times to run around the loop */ 2793 int loopmax; 2794 conn_t *connp = tcp->tcp_connp; 2795 tcp_stack_t *tcps = tcp->tcp_tcps; 2796 2797 /* 2798 * Lookup for free addresses is done in a loop and "loopmax" 2799 * influences how long we spin in the loop 2800 */ 2801 if (bind_to_req_port_only) { 2802 /* 2803 * If the requested port is busy, don't bother to look 2804 * for a new one. Setting loop maximum count to 1 has 2805 * that effect. 2806 */ 2807 loopmax = 1; 2808 } else { 2809 /* 2810 * If the requested port is busy, look for a free one 2811 * in the anonymous port range. 2812 * Set loopmax appropriately so that one does not look 2813 * forever in the case all of the anonymous ports are in use. 2814 */ 2815 if (connp->conn_anon_priv_bind) { 2816 /* 2817 * loopmax = 2818 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 2819 */ 2820 loopmax = IPPORT_RESERVED - 2821 tcps->tcps_min_anonpriv_port; 2822 } else { 2823 loopmax = (tcps->tcps_largest_anon_port - 2824 tcps->tcps_smallest_anon_port + 1); 2825 } 2826 } 2827 do { 2828 uint16_t lport; 2829 tf_t *tbf; 2830 tcp_t *ltcp; 2831 conn_t *lconnp; 2832 2833 lport = htons(port); 2834 2835 /* 2836 * Ensure that the tcp_t is not currently in the bind hash. 2837 * Hold the lock on the hash bucket to ensure that 2838 * the duplicate check plus the insertion is an atomic 2839 * operation. 2840 * 2841 * This function does an inline lookup on the bind hash list 2842 * Make sure that we access only members of tcp_t 2843 * and that we don't look at tcp_tcp, since we are not 2844 * doing a CONN_INC_REF. 2845 */ 2846 tcp_bind_hash_remove(tcp); 2847 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; 2848 mutex_enter(&tbf->tf_lock); 2849 for (ltcp = tbf->tf_tcp; ltcp != NULL; 2850 ltcp = ltcp->tcp_bind_hash) { 2851 if (lport == ltcp->tcp_connp->conn_lport) 2852 break; 2853 } 2854 2855 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { 2856 boolean_t not_socket; 2857 boolean_t exclbind; 2858 2859 lconnp = ltcp->tcp_connp; 2860 2861 /* 2862 * On a labeled system, we must treat bindings to ports 2863 * on shared IP addresses by sockets with MAC exemption 2864 * privilege as being in all zones, as there's 2865 * otherwise no way to identify the right receiver. 2866 */ 2867 if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) 2868 continue; 2869 2870 /* 2871 * If TCP_EXCLBIND is set for either the bound or 2872 * binding endpoint, the semantics of bind 2873 * is changed according to the following. 2874 * 2875 * spec = specified address (v4 or v6) 2876 * unspec = unspecified address (v4 or v6) 2877 * A = specified addresses are different for endpoints 2878 * 2879 * bound bind to allowed 2880 * ------------------------------------- 2881 * unspec unspec no 2882 * unspec spec no 2883 * spec unspec no 2884 * spec spec yes if A 2885 * 2886 * For labeled systems, SO_MAC_EXEMPT behaves the same 2887 * as TCP_EXCLBIND, except that zoneid is ignored. 2888 * 2889 * Note: 2890 * 2891 * 1. 
Because of TLI semantics, an endpoint can go 2892 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or 2893 * TCPS_BOUND, depending on whether it is originally 2894 * a listener or not. That is why we need to check 2895 * for states greater than or equal to TCPS_BOUND 2896 * here. 2897 * 2898 * 2. Ideally, we should only check for state equals 2899 * to TCPS_LISTEN. And the following check should be 2900 * added. 2901 * 2902 * if (ltcp->tcp_state == TCPS_LISTEN || 2903 * !reuseaddr || !lconnp->conn_reuseaddr) { 2904 * ... 2905 * } 2906 * 2907 * The semantics will be changed to this. If the 2908 * endpoint on the list is in state not equal to 2909 * TCPS_LISTEN and both endpoints have SO_REUSEADDR 2910 * set, let the bind succeed. 2911 * 2912 * Because of (1), we cannot do that for TLI 2913 * endpoints. But we can do that for socket endpoints. 2914 * If in future, we can change this going back 2915 * semantics, we can use the above check for TLI also. 2916 */ 2917 not_socket = !(TCP_IS_SOCKET(ltcp) && 2918 TCP_IS_SOCKET(tcp)); 2919 exclbind = lconnp->conn_exclbind || 2920 connp->conn_exclbind; 2921 2922 if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || 2923 (connp->conn_mac_mode != CONN_MAC_DEFAULT) || 2924 (exclbind && (not_socket || 2925 ltcp->tcp_state <= TCPS_ESTABLISHED))) { 2926 if (V6_OR_V4_INADDR_ANY( 2927 lconnp->conn_bound_addr_v6) || 2928 V6_OR_V4_INADDR_ANY(*laddr) || 2929 IN6_ARE_ADDR_EQUAL(laddr, 2930 &lconnp->conn_bound_addr_v6)) { 2931 break; 2932 } 2933 continue; 2934 } 2935 2936 /* 2937 * Check ipversion to allow IPv4 and IPv6 sockets to 2938 * have disjoint port number spaces, if *_EXCLBIND 2939 * is not set and only if the application binds to a 2940 * specific port. We use the same autoassigned port 2941 * number space for IPv4 and IPv6 sockets. 2942 */ 2943 if (connp->conn_ipversion != lconnp->conn_ipversion && 2944 bind_to_req_port_only) 2945 continue; 2946 2947 /* 2948 * Ideally, we should make sure that the source 2949 * address, remote address, and remote port in the 2950 * four tuple for this tcp-connection is unique. 2951 * However, trying to find out the local source 2952 * address would require too much code duplication 2953 * with IP, since IP needs needs to have that code 2954 * to support userland TCP implementations. 2955 */ 2956 if (quick_connect && 2957 (ltcp->tcp_state > TCPS_LISTEN) && 2958 ((connp->conn_fport != lconnp->conn_fport) || 2959 !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, 2960 &lconnp->conn_faddr_v6))) 2961 continue; 2962 2963 if (!reuseaddr) { 2964 /* 2965 * No socket option SO_REUSEADDR. 2966 * If existing port is bound to 2967 * a non-wildcard IP address 2968 * and the requesting stream is 2969 * bound to a distinct 2970 * different IP addresses 2971 * (non-wildcard, also), keep 2972 * going. 2973 */ 2974 if (!V6_OR_V4_INADDR_ANY(*laddr) && 2975 !V6_OR_V4_INADDR_ANY( 2976 lconnp->conn_bound_addr_v6) && 2977 !IN6_ARE_ADDR_EQUAL(laddr, 2978 &lconnp->conn_bound_addr_v6)) 2979 continue; 2980 if (ltcp->tcp_state >= TCPS_BOUND) { 2981 /* 2982 * This port is being used and 2983 * its state is >= TCPS_BOUND, 2984 * so we can't bind to it. 2985 */ 2986 break; 2987 } 2988 } else { 2989 /* 2990 * socket option SO_REUSEADDR is set on the 2991 * binding tcp_t. 2992 * 2993 * If two streams are bound to 2994 * same IP address or both addr 2995 * and bound source are wildcards 2996 * (INADDR_ANY), we want to stop 2997 * searching. 
2998 * We have found a match of IP source 2999 * address and source port, which is 3000 * refused regardless of the 3001 * SO_REUSEADDR setting, so we break. 3002 */ 3003 if (IN6_ARE_ADDR_EQUAL(laddr, 3004 &lconnp->conn_bound_addr_v6) && 3005 (ltcp->tcp_state == TCPS_LISTEN || 3006 ltcp->tcp_state == TCPS_BOUND)) 3007 break; 3008 } 3009 } 3010 if (ltcp != NULL) { 3011 /* The port number is busy */ 3012 mutex_exit(&tbf->tf_lock); 3013 } else { 3014 /* 3015 * This port is ours. Insert in fanout and mark as 3016 * bound to prevent others from getting the port 3017 * number. 3018 */ 3019 tcp->tcp_state = TCPS_BOUND; 3020 connp->conn_lport = htons(port); 3021 3022 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( 3023 connp->conn_lport)] == tbf); 3024 tcp_bind_hash_insert(tbf, tcp, 1); 3025 3026 mutex_exit(&tbf->tf_lock); 3027 3028 /* 3029 * We don't want tcp_next_port_to_try to "inherit" 3030 * a port number supplied by the user in a bind. 3031 */ 3032 if (user_specified) 3033 return (port); 3034 3035 /* 3036 * This is the only place where tcp_next_port_to_try 3037 * is updated. After the update, it may or may not 3038 * be in the valid range. 3039 */ 3040 if (!connp->conn_anon_priv_bind) 3041 tcps->tcps_next_port_to_try = port + 1; 3042 return (port); 3043 } 3044 3045 if (connp->conn_anon_priv_bind) { 3046 port = tcp_get_next_priv_port(tcp); 3047 } else { 3048 if (count == 0 && user_specified) { 3049 /* 3050 * We may have to return an anonymous port. So 3051 * get one to start with. 3052 */ 3053 port = 3054 tcp_update_next_port( 3055 tcps->tcps_next_port_to_try, 3056 tcp, B_TRUE); 3057 user_specified = B_FALSE; 3058 } else { 3059 port = tcp_update_next_port(port + 1, tcp, 3060 B_FALSE); 3061 } 3062 } 3063 if (port == 0) 3064 break; 3065 3066 /* 3067 * Don't let this loop run forever in the case where 3068 * all of the anonymous ports are in use. 3069 */ 3070 } while (++count < loopmax); 3071 return (0); 3072 } 3073 3074 /* 3075 * tcp_clean_death / tcp_close_detached must not be called more than once 3076 * on a tcp. Thus every function that potentially calls tcp_clean_death 3077 * must check for the tcp state before calling tcp_clean_death. 3078 * Eg. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper, 3079 * tcp_timer_handler, all check for the tcp state. 3080 */ 3081 /* ARGSUSED */ 3082 void 3083 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, 3084 ip_recv_attr_t *dummy) 3085 { 3086 tcp_t *tcp = ((conn_t *)arg)->conn_tcp; 3087 3088 freemsg(mp); 3089 if (tcp->tcp_state > TCPS_BOUND) 3090 (void) tcp_clean_death(((conn_t *)arg)->conn_tcp, 3091 ETIMEDOUT, 5); 3092 } 3093 3094 /* 3095 * We are dying for some reason. Try to do it gracefully. (May be called 3096 * as writer.) 3097 * 3098 * Return -1 if the structure was not cleaned up (if the cleanup had to be 3099 * done by a service procedure). 3100 * TBD - Should the return value distinguish between the tcp_t being 3101 * freed and it being reinitialized? 
3102 */ 3103 static int 3104 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) 3105 { 3106 mblk_t *mp; 3107 queue_t *q; 3108 conn_t *connp = tcp->tcp_connp; 3109 tcp_stack_t *tcps = tcp->tcp_tcps; 3110 3111 TCP_CLD_STAT(tag); 3112 3113 #if TCP_TAG_CLEAN_DEATH 3114 tcp->tcp_cleandeathtag = tag; 3115 #endif 3116 3117 if (tcp->tcp_fused) 3118 tcp_unfuse(tcp); 3119 3120 if (tcp->tcp_linger_tid != 0 && 3121 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 3122 tcp_stop_lingering(tcp); 3123 } 3124 3125 ASSERT(tcp != NULL); 3126 ASSERT((connp->conn_family == AF_INET && 3127 connp->conn_ipversion == IPV4_VERSION) || 3128 (connp->conn_family == AF_INET6 && 3129 (connp->conn_ipversion == IPV4_VERSION || 3130 connp->conn_ipversion == IPV6_VERSION))); 3131 3132 if (TCP_IS_DETACHED(tcp)) { 3133 if (tcp->tcp_hard_binding) { 3134 /* 3135 * Its an eager that we are dealing with. We close the 3136 * eager but in case a conn_ind has already gone to the 3137 * listener, let tcp_accept_finish() send a discon_ind 3138 * to the listener and drop the last reference. If the 3139 * listener doesn't even know about the eager i.e. the 3140 * conn_ind hasn't gone up, blow away the eager and drop 3141 * the last reference as well. If the conn_ind has gone 3142 * up, state should be BOUND. tcp_accept_finish 3143 * will figure out that the connection has received a 3144 * RST and will send a DISCON_IND to the application. 3145 */ 3146 tcp_closei_local(tcp); 3147 if (!tcp->tcp_tconnind_started) { 3148 CONN_DEC_REF(connp); 3149 } else { 3150 tcp->tcp_state = TCPS_BOUND; 3151 } 3152 } else { 3153 tcp_close_detached(tcp); 3154 } 3155 return (0); 3156 } 3157 3158 TCP_STAT(tcps, tcp_clean_death_nondetached); 3159 3160 /* 3161 * The connection is dead. Decrement listener connection counter if 3162 * necessary. 3163 */ 3164 if (tcp->tcp_listen_cnt != NULL) 3165 TCP_DECR_LISTEN_CNT(tcp); 3166 3167 q = connp->conn_rq; 3168 3169 /* Trash all inbound data */ 3170 if (!IPCL_IS_NONSTR(connp)) { 3171 ASSERT(q != NULL); 3172 flushq(q, FLUSHALL); 3173 } 3174 3175 /* 3176 * If we are at least part way open and there is error 3177 * (err==0 implies no error) 3178 * notify our client by a T_DISCON_IND. 3179 */ 3180 if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) { 3181 if (tcp->tcp_state >= TCPS_ESTABLISHED && 3182 !TCP_IS_SOCKET(tcp)) { 3183 /* 3184 * Send M_FLUSH according to TPI. Because sockets will 3185 * (and must) ignore FLUSHR we do that only for TPI 3186 * endpoints and sockets in STREAMS mode. 
3187 */ 3188 (void) putnextctl1(q, M_FLUSH, FLUSHR); 3189 } 3190 if (connp->conn_debug) { 3191 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 3192 "tcp_clean_death: discon err %d", err); 3193 } 3194 if (IPCL_IS_NONSTR(connp)) { 3195 /* Direct socket, use upcall */ 3196 (*connp->conn_upcalls->su_disconnected)( 3197 connp->conn_upper_handle, tcp->tcp_connid, err); 3198 } else { 3199 mp = mi_tpi_discon_ind(NULL, err, 0); 3200 if (mp != NULL) { 3201 putnext(q, mp); 3202 } else { 3203 if (connp->conn_debug) { 3204 (void) strlog(TCP_MOD_ID, 0, 1, 3205 SL_ERROR|SL_TRACE, 3206 "tcp_clean_death, sending M_ERROR"); 3207 } 3208 (void) putnextctl1(q, M_ERROR, EPROTO); 3209 } 3210 } 3211 if (tcp->tcp_state <= TCPS_SYN_RCVD) { 3212 /* SYN_SENT or SYN_RCVD */ 3213 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); 3214 } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) { 3215 /* ESTABLISHED or CLOSE_WAIT */ 3216 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); 3217 } 3218 } 3219 3220 tcp_reinit(tcp); 3221 if (IPCL_IS_NONSTR(connp)) 3222 (void) tcp_do_unbind(connp); 3223 3224 return (-1); 3225 } 3226 3227 /* 3228 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout 3229 * to expire, stop the wait and finish the close. 3230 */ 3231 static void 3232 tcp_stop_lingering(tcp_t *tcp) 3233 { 3234 clock_t delta = 0; 3235 tcp_stack_t *tcps = tcp->tcp_tcps; 3236 conn_t *connp = tcp->tcp_connp; 3237 3238 tcp->tcp_linger_tid = 0; 3239 if (tcp->tcp_state > TCPS_LISTEN) { 3240 tcp_acceptor_hash_remove(tcp); 3241 mutex_enter(&tcp->tcp_non_sq_lock); 3242 if (tcp->tcp_flow_stopped) { 3243 tcp_clrqfull(tcp); 3244 } 3245 mutex_exit(&tcp->tcp_non_sq_lock); 3246 3247 if (tcp->tcp_timer_tid != 0) { 3248 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 3249 tcp->tcp_timer_tid = 0; 3250 } 3251 /* 3252 * Need to cancel those timers which will not be used when 3253 * TCP is detached. This has to be done before the conn_wq 3254 * is cleared. 3255 */ 3256 tcp_timers_stop(tcp); 3257 3258 tcp->tcp_detached = B_TRUE; 3259 connp->conn_rq = NULL; 3260 connp->conn_wq = NULL; 3261 3262 if (tcp->tcp_state == TCPS_TIME_WAIT) { 3263 tcp_time_wait_append(tcp); 3264 TCP_DBGSTAT(tcps, tcp_detach_time_wait); 3265 goto finish; 3266 } 3267 3268 /* 3269 * If delta is zero the timer event wasn't executed and was 3270 * successfully canceled. In this case we need to restart it 3271 * with the minimal delta possible. 3272 */ 3273 if (delta >= 0) { 3274 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 3275 delta ? delta : 1); 3276 } 3277 } else { 3278 tcp_closei_local(tcp); 3279 CONN_DEC_REF(connp); 3280 } 3281 finish: 3282 /* Signal closing thread that it can complete close */ 3283 mutex_enter(&tcp->tcp_closelock); 3284 tcp->tcp_detached = B_TRUE; 3285 connp->conn_rq = NULL; 3286 connp->conn_wq = NULL; 3287 3288 tcp->tcp_closed = 1; 3289 cv_signal(&tcp->tcp_closecv); 3290 mutex_exit(&tcp->tcp_closelock); 3291 } 3292 3293 /* 3294 * Handle lingering timeouts. This function is called when the SO_LINGER timeout 3295 * expires. 3296 */ 3297 static void 3298 tcp_close_linger_timeout(void *arg) 3299 { 3300 conn_t *connp = (conn_t *)arg; 3301 tcp_t *tcp = connp->conn_tcp; 3302 3303 tcp->tcp_client_errno = ETIMEDOUT; 3304 tcp_stop_lingering(tcp); 3305 } 3306 3307 static void 3308 tcp_close_common(conn_t *connp, int flags) 3309 { 3310 tcp_t *tcp = connp->conn_tcp; 3311 mblk_t *mp = &tcp->tcp_closemp; 3312 boolean_t conn_ioctl_cleanup_reqd = B_FALSE; 3313 mblk_t *bp; 3314 3315 ASSERT(connp->conn_ref >= 2); 3316 3317 /* 3318 * Mark the conn as closing. 
ipsq_pending_mp_add will not 3319 * add any mp to the pending mp list, after this conn has 3320 * started closing. 3321 */ 3322 mutex_enter(&connp->conn_lock); 3323 connp->conn_state_flags |= CONN_CLOSING; 3324 if (connp->conn_oper_pending_ill != NULL) 3325 conn_ioctl_cleanup_reqd = B_TRUE; 3326 CONN_INC_REF_LOCKED(connp); 3327 mutex_exit(&connp->conn_lock); 3328 tcp->tcp_closeflags = (uint8_t)flags; 3329 ASSERT(connp->conn_ref >= 3); 3330 3331 /* 3332 * tcp_closemp_used is used below without any protection of a lock 3333 * as we don't expect any one else to use it concurrently at this 3334 * point otherwise it would be a major defect. 3335 */ 3336 3337 if (mp->b_prev == NULL) 3338 tcp->tcp_closemp_used = B_TRUE; 3339 else 3340 cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: " 3341 "connp %p tcp %p\n", (void *)connp, (void *)tcp); 3342 3343 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 3344 3345 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp, 3346 NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); 3347 3348 mutex_enter(&tcp->tcp_closelock); 3349 while (!tcp->tcp_closed) { 3350 if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) { 3351 /* 3352 * The cv_wait_sig() was interrupted. We now do the 3353 * following: 3354 * 3355 * 1) If the endpoint was lingering, we allow this 3356 * to be interrupted by cancelling the linger timeout 3357 * and closing normally. 3358 * 3359 * 2) Revert to calling cv_wait() 3360 * 3361 * We revert to using cv_wait() to avoid an 3362 * infinite loop which can occur if the calling 3363 * thread is higher priority than the squeue worker 3364 * thread and is bound to the same cpu. 3365 */ 3366 if (connp->conn_linger && connp->conn_lingertime > 0) { 3367 mutex_exit(&tcp->tcp_closelock); 3368 /* Entering squeue, bump ref count. */ 3369 CONN_INC_REF(connp); 3370 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 3371 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, 3372 tcp_linger_interrupted, connp, NULL, 3373 tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); 3374 mutex_enter(&tcp->tcp_closelock); 3375 } 3376 break; 3377 } 3378 } 3379 while (!tcp->tcp_closed) 3380 cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock); 3381 mutex_exit(&tcp->tcp_closelock); 3382 3383 /* 3384 * In the case of listener streams that have eagers in the q or q0 3385 * we wait for the eagers to drop their reference to us. conn_rq and 3386 * conn_wq of the eagers point to our queues. By waiting for the 3387 * refcnt to drop to 1, we are sure that the eagers have cleaned 3388 * up their queue pointers and also dropped their references to us. 3389 */ 3390 if (tcp->tcp_wait_for_eagers) { 3391 mutex_enter(&connp->conn_lock); 3392 while (connp->conn_ref != 1) { 3393 cv_wait(&connp->conn_cv, &connp->conn_lock); 3394 } 3395 mutex_exit(&connp->conn_lock); 3396 } 3397 /* 3398 * ioctl cleanup. The mp is queued in the ipx_pending_mp. 3399 */ 3400 if (conn_ioctl_cleanup_reqd) 3401 conn_ioctl_cleanup(connp); 3402 3403 connp->conn_cpid = NOPID; 3404 } 3405 3406 static int 3407 tcp_tpi_close(queue_t *q, int flags) 3408 { 3409 conn_t *connp; 3410 3411 ASSERT(WR(q)->q_next == NULL); 3412 3413 if (flags & SO_FALLBACK) { 3414 /* 3415 * stream is being closed while in fallback 3416 * simply free the resources that were allocated 3417 */ 3418 inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); 3419 qprocsoff(q); 3420 goto done; 3421 } 3422 3423 connp = Q_TO_CONN(q); 3424 /* 3425 * We are being closed as /dev/tcp or /dev/tcp6. 
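 * Do the common close processing, then release the minor number and
 * drop IP's reference on the conn below.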
3426 */ 3427 tcp_close_common(connp, flags); 3428 3429 qprocsoff(q); 3430 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 3431 3432 /* 3433 * Drop IP's reference on the conn. This is the last reference 3434 * on the connp if the state was less than established. If the 3435 * connection has gone into timewait state, then we will have 3436 * one ref for the TCP and one more ref (total of two) for the 3437 * classifier connected hash list (a timewait connections stays 3438 * in connected hash till closed). 3439 * 3440 * We can't assert the references because there might be other 3441 * transient reference places because of some walkers or queued 3442 * packets in squeue for the timewait state. 3443 */ 3444 CONN_DEC_REF(connp); 3445 done: 3446 q->q_ptr = WR(q)->q_ptr = NULL; 3447 return (0); 3448 } 3449 3450 static int 3451 tcp_tpi_close_accept(queue_t *q) 3452 { 3453 vmem_t *minor_arena; 3454 dev_t conn_dev; 3455 3456 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); 3457 3458 /* 3459 * We had opened an acceptor STREAM for sockfs which is 3460 * now being closed due to some error. 3461 */ 3462 qprocsoff(q); 3463 3464 minor_arena = (vmem_t *)WR(q)->q_ptr; 3465 conn_dev = (dev_t)RD(q)->q_ptr; 3466 ASSERT(minor_arena != NULL); 3467 ASSERT(conn_dev != 0); 3468 inet_minor_free(minor_arena, conn_dev); 3469 q->q_ptr = WR(q)->q_ptr = NULL; 3470 return (0); 3471 } 3472 3473 /* 3474 * Called by tcp_close() routine via squeue when lingering is 3475 * interrupted by a signal. 3476 */ 3477 3478 /* ARGSUSED */ 3479 static void 3480 tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 3481 { 3482 conn_t *connp = (conn_t *)arg; 3483 tcp_t *tcp = connp->conn_tcp; 3484 3485 freeb(mp); 3486 if (tcp->tcp_linger_tid != 0 && 3487 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 3488 tcp_stop_lingering(tcp); 3489 tcp->tcp_client_errno = EINTR; 3490 } 3491 } 3492 3493 /* 3494 * Called by streams close routine via squeues when our client blows off her 3495 * descriptor, we take this to mean: "close the stream state NOW, close the tcp 3496 * connection politely" When SO_LINGER is set (with a non-zero linger time and 3497 * it is not a nonblocking socket) then this routine sleeps until the FIN is 3498 * acked. 3499 * 3500 * NOTE: tcp_close potentially returns error when lingering. 3501 * However, the stream head currently does not pass these errors 3502 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK 3503 * errors to the application (from tsleep()) and not errors 3504 * like ECONNRESET caused by receiving a reset packet. 
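 *
 * This routine runs on the connection's squeue; it is entered from
 * tcp_close_common() via SQUEUE_ENTER_ONE() and signals tcp_closecv
 * once the close has progressed far enough for the closing thread to
 * continue.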
3505 */ 3506 3507 /* ARGSUSED */ 3508 static void 3509 tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 3510 { 3511 char *msg; 3512 conn_t *connp = (conn_t *)arg; 3513 tcp_t *tcp = connp->conn_tcp; 3514 clock_t delta = 0; 3515 tcp_stack_t *tcps = tcp->tcp_tcps; 3516 3517 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 3518 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 3519 3520 mutex_enter(&tcp->tcp_eager_lock); 3521 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 3522 /* Cleanup for listener */ 3523 tcp_eager_cleanup(tcp, 0); 3524 tcp->tcp_wait_for_eagers = 1; 3525 } 3526 mutex_exit(&tcp->tcp_eager_lock); 3527 3528 tcp->tcp_lso = B_FALSE; 3529 3530 msg = NULL; 3531 switch (tcp->tcp_state) { 3532 case TCPS_CLOSED: 3533 case TCPS_IDLE: 3534 case TCPS_BOUND: 3535 case TCPS_LISTEN: 3536 break; 3537 case TCPS_SYN_SENT: 3538 msg = "tcp_close, during connect"; 3539 break; 3540 case TCPS_SYN_RCVD: 3541 /* 3542 * Close during the connect 3-way handshake 3543 * but here there may or may not be pending data 3544 * already on queue. Process almost same as in 3545 * the ESTABLISHED state. 3546 */ 3547 /* FALLTHRU */ 3548 default: 3549 if (tcp->tcp_fused) 3550 tcp_unfuse(tcp); 3551 3552 /* 3553 * If SO_LINGER has set a zero linger time, abort the 3554 * connection with a reset. 3555 */ 3556 if (connp->conn_linger && connp->conn_lingertime == 0) { 3557 msg = "tcp_close, zero lingertime"; 3558 break; 3559 } 3560 3561 /* 3562 * Abort connection if there is unread data queued. 3563 */ 3564 if (tcp->tcp_rcv_list || tcp->tcp_reass_head) { 3565 msg = "tcp_close, unread data"; 3566 break; 3567 } 3568 /* 3569 * We have done a qwait() above which could have possibly 3570 * drained more messages in turn causing transition to a 3571 * different state. Check whether we have to do the rest 3572 * of the processing or not. 3573 */ 3574 if (tcp->tcp_state <= TCPS_LISTEN) 3575 break; 3576 3577 /* 3578 * Transmit the FIN before detaching the tcp_t. 3579 * After tcp_detach returns this queue/perimeter 3580 * no longer owns the tcp_t thus others can modify it. 3581 */ 3582 (void) tcp_xmit_end(tcp); 3583 3584 /* 3585 * If lingering on close then wait until the fin is acked, 3586 * the SO_LINGER time passes, or a reset is sent/received. 3587 */ 3588 if (connp->conn_linger && connp->conn_lingertime > 0 && 3589 !(tcp->tcp_fin_acked) && 3590 tcp->tcp_state >= TCPS_ESTABLISHED) { 3591 if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { 3592 tcp->tcp_client_errno = EWOULDBLOCK; 3593 } else if (tcp->tcp_client_errno == 0) { 3594 3595 ASSERT(tcp->tcp_linger_tid == 0); 3596 3597 tcp->tcp_linger_tid = TCP_TIMER(tcp, 3598 tcp_close_linger_timeout, 3599 connp->conn_lingertime * hz); 3600 3601 /* tcp_close_linger_timeout will finish close */ 3602 if (tcp->tcp_linger_tid == 0) 3603 tcp->tcp_client_errno = ENOSR; 3604 else 3605 return; 3606 } 3607 3608 /* 3609 * Check if we need to detach or just close 3610 * the instance. 3611 */ 3612 if (tcp->tcp_state <= TCPS_LISTEN) 3613 break; 3614 } 3615 3616 /* 3617 * Make sure that no other thread will access the conn_rq of 3618 * this instance (through lookups etc.) as conn_rq will go 3619 * away shortly. 
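 * Removing the tcp from the acceptor hash below prevents any new
 * lookups from finding this instance.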
3620 */ 3621 tcp_acceptor_hash_remove(tcp); 3622 3623 mutex_enter(&tcp->tcp_non_sq_lock); 3624 if (tcp->tcp_flow_stopped) { 3625 tcp_clrqfull(tcp); 3626 } 3627 mutex_exit(&tcp->tcp_non_sq_lock); 3628 3629 if (tcp->tcp_timer_tid != 0) { 3630 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 3631 tcp->tcp_timer_tid = 0; 3632 } 3633 /* 3634 * Need to cancel those timers which will not be used when 3635 * TCP is detached. This has to be done before the conn_wq 3636 * is set to NULL. 3637 */ 3638 tcp_timers_stop(tcp); 3639 3640 tcp->tcp_detached = B_TRUE; 3641 if (tcp->tcp_state == TCPS_TIME_WAIT) { 3642 tcp_time_wait_append(tcp); 3643 TCP_DBGSTAT(tcps, tcp_detach_time_wait); 3644 ASSERT(connp->conn_ref >= 3); 3645 goto finish; 3646 } 3647 3648 /* 3649 * If delta is zero the timer event wasn't executed and was 3650 * successfully canceled. In this case we need to restart it 3651 * with the minimal delta possible. 3652 */ 3653 if (delta >= 0) 3654 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 3655 delta ? delta : 1); 3656 3657 ASSERT(connp->conn_ref >= 3); 3658 goto finish; 3659 } 3660 3661 /* Detach did not complete. Still need to remove q from stream. */ 3662 if (msg) { 3663 if (tcp->tcp_state == TCPS_ESTABLISHED || 3664 tcp->tcp_state == TCPS_CLOSE_WAIT) 3665 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); 3666 if (tcp->tcp_state == TCPS_SYN_SENT || 3667 tcp->tcp_state == TCPS_SYN_RCVD) 3668 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); 3669 tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST); 3670 } 3671 3672 tcp_closei_local(tcp); 3673 CONN_DEC_REF(connp); 3674 ASSERT(connp->conn_ref >= 2); 3675 3676 finish: 3677 mutex_enter(&tcp->tcp_closelock); 3678 /* 3679 * Don't change the queues in the case of a listener that has 3680 * eagers in its q or q0. It could surprise the eagers. 3681 * Instead wait for the eagers outside the squeue. 3682 */ 3683 if (!tcp->tcp_wait_for_eagers) { 3684 tcp->tcp_detached = B_TRUE; 3685 connp->conn_rq = NULL; 3686 connp->conn_wq = NULL; 3687 } 3688 3689 /* Signal tcp_close() to finish closing. */ 3690 tcp->tcp_closed = 1; 3691 cv_signal(&tcp->tcp_closecv); 3692 mutex_exit(&tcp->tcp_closelock); 3693 } 3694 3695 /* 3696 * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp. 3697 * Some stream heads get upset if they see these later on as anything but NULL. 3698 */ 3699 static void 3700 tcp_close_mpp(mblk_t **mpp) 3701 { 3702 mblk_t *mp; 3703 3704 if ((mp = *mpp) != NULL) { 3705 do { 3706 mp->b_next = NULL; 3707 mp->b_prev = NULL; 3708 } while ((mp = mp->b_cont) != NULL); 3709 3710 mp = *mpp; 3711 *mpp = NULL; 3712 freemsg(mp); 3713 } 3714 } 3715 3716 /* Do detached close. */ 3717 static void 3718 tcp_close_detached(tcp_t *tcp) 3719 { 3720 if (tcp->tcp_fused) 3721 tcp_unfuse(tcp); 3722 3723 /* 3724 * Clustering code serializes TCP disconnect callbacks and 3725 * cluster tcp list walks by blocking a TCP disconnect callback 3726 * if a cluster tcp list walk is in progress. This ensures 3727 * accurate accounting of TCPs in the cluster code even though 3728 * the TCP list walk itself is not atomic. 3729 */ 3730 tcp_closei_local(tcp); 3731 CONN_DEC_REF(tcp->tcp_connp); 3732 } 3733 3734 /* 3735 * Stop all TCP timers, and free the timer mblks if requested. 
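 * This cancels the retransmit (tcp_timer_tid), keepalive (tcp_ka_tid),
 * delayed ACK (tcp_ack_tid), push (tcp_push_tid) and reassembly
 * (tcp_reass_tid) timers.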
3736 */ 3737 void 3738 tcp_timers_stop(tcp_t *tcp) 3739 { 3740 if (tcp->tcp_timer_tid != 0) { 3741 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 3742 tcp->tcp_timer_tid = 0; 3743 } 3744 if (tcp->tcp_ka_tid != 0) { 3745 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); 3746 tcp->tcp_ka_tid = 0; 3747 } 3748 if (tcp->tcp_ack_tid != 0) { 3749 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 3750 tcp->tcp_ack_tid = 0; 3751 } 3752 if (tcp->tcp_push_tid != 0) { 3753 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 3754 tcp->tcp_push_tid = 0; 3755 } 3756 if (tcp->tcp_reass_tid != 0) { 3757 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid); 3758 tcp->tcp_reass_tid = 0; 3759 } 3760 } 3761 3762 /* 3763 * The tcp_t is going away. Remove it from all lists and set it 3764 * to TCPS_CLOSED. The freeing up of memory is deferred until 3765 * tcp_inactive. This is needed since a thread in tcp_rput might have 3766 * done a CONN_INC_REF on this structure before it was removed from the 3767 * hashes. 3768 */ 3769 static void 3770 tcp_closei_local(tcp_t *tcp) 3771 { 3772 conn_t *connp = tcp->tcp_connp; 3773 tcp_stack_t *tcps = tcp->tcp_tcps; 3774 3775 if (!TCP_IS_SOCKET(tcp)) 3776 tcp_acceptor_hash_remove(tcp); 3777 3778 UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs); 3779 tcp->tcp_ibsegs = 0; 3780 UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs); 3781 tcp->tcp_obsegs = 0; 3782 3783 /* 3784 * If we are an eager connection hanging off a listener that 3785 * hasn't formally accepted the connection yet, get off his 3786 * list and blow off any data that we have accumulated. 3787 */ 3788 if (tcp->tcp_listener != NULL) { 3789 tcp_t *listener = tcp->tcp_listener; 3790 mutex_enter(&listener->tcp_eager_lock); 3791 /* 3792 * tcp_tconnind_started == B_TRUE means that the 3793 * conn_ind has already gone to listener. At 3794 * this point, eager will be closed but we 3795 * leave it in listeners eager list so that 3796 * if listener decides to close without doing 3797 * accept, we can clean this up. In tcp_tli_accept 3798 * we take care of the case of accept on closed 3799 * eager. 3800 */ 3801 if (!tcp->tcp_tconnind_started) { 3802 tcp_eager_unlink(tcp); 3803 mutex_exit(&listener->tcp_eager_lock); 3804 /* 3805 * We don't want to have any pointers to the 3806 * listener queue, after we have released our 3807 * reference on the listener 3808 */ 3809 ASSERT(tcp->tcp_detached); 3810 connp->conn_rq = NULL; 3811 connp->conn_wq = NULL; 3812 CONN_DEC_REF(listener->tcp_connp); 3813 } else { 3814 mutex_exit(&listener->tcp_eager_lock); 3815 } 3816 } 3817 3818 /* Stop all the timers */ 3819 tcp_timers_stop(tcp); 3820 3821 if (tcp->tcp_state == TCPS_LISTEN) { 3822 if (tcp->tcp_ip_addr_cache) { 3823 kmem_free((void *)tcp->tcp_ip_addr_cache, 3824 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 3825 tcp->tcp_ip_addr_cache = NULL; 3826 } 3827 } 3828 3829 /* Decrement listerner connection counter if necessary. */ 3830 if (tcp->tcp_listen_cnt != NULL) 3831 TCP_DECR_LISTEN_CNT(tcp); 3832 3833 mutex_enter(&tcp->tcp_non_sq_lock); 3834 if (tcp->tcp_flow_stopped) 3835 tcp_clrqfull(tcp); 3836 mutex_exit(&tcp->tcp_non_sq_lock); 3837 3838 tcp_bind_hash_remove(tcp); 3839 /* 3840 * If the tcp_time_wait_collector (which runs outside the squeue) 3841 * is trying to remove this tcp from the time wait list, we will 3842 * block in tcp_time_wait_remove while trying to acquire the 3843 * tcp_time_wait_lock. 
The logic in tcp_time_wait_collector also 3844 * requires the ipcl_hash_remove to be ordered after the 3845 * tcp_time_wait_remove for the refcnt checks to work correctly. 3846 */ 3847 if (tcp->tcp_state == TCPS_TIME_WAIT) 3848 (void) tcp_time_wait_remove(tcp, NULL); 3849 CL_INET_DISCONNECT(connp); 3850 ipcl_hash_remove(connp); 3851 ixa_cleanup(connp->conn_ixa); 3852 3853 /* 3854 * Mark the conn as CONDEMNED 3855 */ 3856 mutex_enter(&connp->conn_lock); 3857 connp->conn_state_flags |= CONN_CONDEMNED; 3858 mutex_exit(&connp->conn_lock); 3859 3860 /* Need to cleanup any pending ioctls */ 3861 ASSERT(tcp->tcp_time_wait_next == NULL); 3862 ASSERT(tcp->tcp_time_wait_prev == NULL); 3863 ASSERT(tcp->tcp_time_wait_expire == 0); 3864 tcp->tcp_state = TCPS_CLOSED; 3865 3866 /* Release any SSL context */ 3867 if (tcp->tcp_kssl_ent != NULL) { 3868 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 3869 tcp->tcp_kssl_ent = NULL; 3870 } 3871 if (tcp->tcp_kssl_ctx != NULL) { 3872 kssl_release_ctx(tcp->tcp_kssl_ctx); 3873 tcp->tcp_kssl_ctx = NULL; 3874 } 3875 tcp->tcp_kssl_pending = B_FALSE; 3876 3877 tcp_ipsec_cleanup(tcp); 3878 } 3879 3880 /* 3881 * tcp is dying (called from ipcl_conn_destroy and error cases). 3882 * Free the tcp_t in either case. 3883 */ 3884 void 3885 tcp_free(tcp_t *tcp) 3886 { 3887 mblk_t *mp; 3888 conn_t *connp = tcp->tcp_connp; 3889 3890 ASSERT(tcp != NULL); 3891 ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL); 3892 3893 connp->conn_rq = NULL; 3894 connp->conn_wq = NULL; 3895 3896 tcp_close_mpp(&tcp->tcp_xmit_head); 3897 tcp_close_mpp(&tcp->tcp_reass_head); 3898 if (tcp->tcp_rcv_list != NULL) { 3899 /* Free b_next chain */ 3900 tcp_close_mpp(&tcp->tcp_rcv_list); 3901 } 3902 if ((mp = tcp->tcp_urp_mp) != NULL) { 3903 freemsg(mp); 3904 } 3905 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 3906 freemsg(mp); 3907 } 3908 3909 if (tcp->tcp_fused_sigurg_mp != NULL) { 3910 ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 3911 freeb(tcp->tcp_fused_sigurg_mp); 3912 tcp->tcp_fused_sigurg_mp = NULL; 3913 } 3914 3915 if (tcp->tcp_ordrel_mp != NULL) { 3916 ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 3917 freeb(tcp->tcp_ordrel_mp); 3918 tcp->tcp_ordrel_mp = NULL; 3919 } 3920 3921 if (tcp->tcp_sack_info != NULL) { 3922 if (tcp->tcp_notsack_list != NULL) { 3923 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, 3924 tcp); 3925 } 3926 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 3927 } 3928 3929 if (tcp->tcp_hopopts != NULL) { 3930 mi_free(tcp->tcp_hopopts); 3931 tcp->tcp_hopopts = NULL; 3932 tcp->tcp_hopoptslen = 0; 3933 } 3934 ASSERT(tcp->tcp_hopoptslen == 0); 3935 if (tcp->tcp_dstopts != NULL) { 3936 mi_free(tcp->tcp_dstopts); 3937 tcp->tcp_dstopts = NULL; 3938 tcp->tcp_dstoptslen = 0; 3939 } 3940 ASSERT(tcp->tcp_dstoptslen == 0); 3941 if (tcp->tcp_rthdrdstopts != NULL) { 3942 mi_free(tcp->tcp_rthdrdstopts); 3943 tcp->tcp_rthdrdstopts = NULL; 3944 tcp->tcp_rthdrdstoptslen = 0; 3945 } 3946 ASSERT(tcp->tcp_rthdrdstoptslen == 0); 3947 if (tcp->tcp_rthdr != NULL) { 3948 mi_free(tcp->tcp_rthdr); 3949 tcp->tcp_rthdr = NULL; 3950 tcp->tcp_rthdrlen = 0; 3951 } 3952 ASSERT(tcp->tcp_rthdrlen == 0); 3953 3954 /* 3955 * Following is really a blowing away a union. 3956 * It happens to have exactly two members of identical size 3957 * the following code is enough. 3958 */ 3959 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 3960 } 3961 3962 3963 /* 3964 * Put a connection confirmation message upstream built from the 3965 * address/flowid information with the conn and iph. 
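 * For TPI endpoints the confirmation is a T_CONN_CON sent with
 * putnext(); for non-STREAMS sockets the su_connected upcall is made
 * instead.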
Report our success or 3966 * failure. 3967 */ 3968 static boolean_t 3969 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, 3970 mblk_t **defermp, ip_recv_attr_t *ira) 3971 { 3972 sin_t sin; 3973 sin6_t sin6; 3974 mblk_t *mp; 3975 char *optp = NULL; 3976 int optlen = 0; 3977 conn_t *connp = tcp->tcp_connp; 3978 3979 if (defermp != NULL) 3980 *defermp = NULL; 3981 3982 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { 3983 /* 3984 * Return in T_CONN_CON results of option negotiation through 3985 * the T_CONN_REQ. Note: If there is an real end-to-end option 3986 * negotiation, then what is received from remote end needs 3987 * to be taken into account but there is no such thing (yet?) 3988 * in our TCP/IP. 3989 * Note: We do not use mi_offset_param() here as 3990 * tcp_opts_conn_req contents do not directly come from 3991 * an application and are either generated in kernel or 3992 * from user input that was already verified. 3993 */ 3994 mp = tcp->tcp_conn.tcp_opts_conn_req; 3995 optp = (char *)(mp->b_rptr + 3996 ((struct T_conn_req *)mp->b_rptr)->OPT_offset); 3997 optlen = (int) 3998 ((struct T_conn_req *)mp->b_rptr)->OPT_length; 3999 } 4000 4001 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { 4002 4003 /* packet is IPv4 */ 4004 if (connp->conn_family == AF_INET) { 4005 sin = sin_null; 4006 sin.sin_addr.s_addr = connp->conn_faddr_v4; 4007 sin.sin_port = connp->conn_fport; 4008 sin.sin_family = AF_INET; 4009 mp = mi_tpi_conn_con(NULL, (char *)&sin, 4010 (int)sizeof (sin_t), optp, optlen); 4011 } else { 4012 sin6 = sin6_null; 4013 sin6.sin6_addr = connp->conn_faddr_v6; 4014 sin6.sin6_port = connp->conn_fport; 4015 sin6.sin6_family = AF_INET6; 4016 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 4017 (int)sizeof (sin6_t), optp, optlen); 4018 4019 } 4020 } else { 4021 ip6_t *ip6h = (ip6_t *)iphdr; 4022 4023 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); 4024 ASSERT(connp->conn_family == AF_INET6); 4025 sin6 = sin6_null; 4026 sin6.sin6_addr = connp->conn_faddr_v6; 4027 sin6.sin6_port = connp->conn_fport; 4028 sin6.sin6_family = AF_INET6; 4029 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 4030 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 4031 (int)sizeof (sin6_t), optp, optlen); 4032 } 4033 4034 if (!mp) 4035 return (B_FALSE); 4036 4037 mblk_copycred(mp, idmp); 4038 4039 if (defermp == NULL) { 4040 conn_t *connp = tcp->tcp_connp; 4041 if (IPCL_IS_NONSTR(connp)) { 4042 (*connp->conn_upcalls->su_connected) 4043 (connp->conn_upper_handle, tcp->tcp_connid, 4044 ira->ira_cred, ira->ira_cpid); 4045 freemsg(mp); 4046 } else { 4047 if (ira->ira_cred != NULL) { 4048 /* So that getpeerucred works for TPI sockfs */ 4049 mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); 4050 } 4051 putnext(connp->conn_rq, mp); 4052 } 4053 } else { 4054 *defermp = mp; 4055 } 4056 4057 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 4058 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 4059 return (B_TRUE); 4060 } 4061 4062 /* 4063 * Defense for the SYN attack - 4064 * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest 4065 * one from the list of droppable eagers. This list is a subset of q0. 4066 * see comments before the definition of MAKE_DROPPABLE(). 4067 * 2. Don't drop a SYN request before its first timeout. This gives every 4068 * request at least til the first timeout to complete its 3-way handshake. 4069 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many 4070 * requests currently on the queue that has timed out. 
This will be used 4071 * as an indicator of whether an attack is under way, so that appropriate 4072 * actions can be taken. (It's incremented in tcp_timer() and decremented 4073 * either when eager goes into ESTABLISHED, or gets freed up.) 4074 * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on 4075 * # of timeout drops back to <= q0len/32 => SYN alert off 4076 */ 4077 static boolean_t 4078 tcp_drop_q0(tcp_t *tcp) 4079 { 4080 tcp_t *eager; 4081 mblk_t *mp; 4082 tcp_stack_t *tcps = tcp->tcp_tcps; 4083 4084 ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); 4085 ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); 4086 4087 /* Pick oldest eager from the list of droppable eagers */ 4088 eager = tcp->tcp_eager_prev_drop_q0; 4089 4090 /* If list is empty. return B_FALSE */ 4091 if (eager == tcp) { 4092 return (B_FALSE); 4093 } 4094 4095 /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */ 4096 if ((mp = allocb(0, BPRI_HI)) == NULL) 4097 return (B_FALSE); 4098 4099 /* 4100 * Take this eager out from the list of droppable eagers since we are 4101 * going to drop it. 4102 */ 4103 MAKE_UNDROPPABLE(eager); 4104 4105 if (tcp->tcp_connp->conn_debug) { 4106 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 4107 "tcp_drop_q0: listen half-open queue (max=%d) overflow" 4108 " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, 4109 tcp->tcp_conn_req_cnt_q0, 4110 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 4111 } 4112 4113 BUMP_MIB(&tcps->tcps_mib, tcpHalfOpenDrop); 4114 4115 /* Put a reference on the conn as we are enqueueing it in the sqeue */ 4116 CONN_INC_REF(eager->tcp_connp); 4117 4118 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, 4119 tcp_clean_death_wrapper, eager->tcp_connp, NULL, 4120 SQ_FILL, SQTAG_TCP_DROP_Q0); 4121 4122 return (B_TRUE); 4123 } 4124 4125 /* 4126 * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6 4127 */ 4128 static mblk_t * 4129 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, 4130 ip_recv_attr_t *ira) 4131 { 4132 tcp_t *ltcp = lconnp->conn_tcp; 4133 tcp_t *tcp = connp->conn_tcp; 4134 mblk_t *tpi_mp; 4135 ipha_t *ipha; 4136 ip6_t *ip6h; 4137 sin6_t sin6; 4138 uint_t ifindex = ira->ira_ruifindex; 4139 tcp_stack_t *tcps = tcp->tcp_tcps; 4140 4141 if (ira->ira_flags & IRAF_IS_IPV4) { 4142 ipha = (ipha_t *)mp->b_rptr; 4143 4144 connp->conn_ipversion = IPV4_VERSION; 4145 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); 4146 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); 4147 connp->conn_saddr_v6 = connp->conn_laddr_v6; 4148 4149 sin6 = sin6_null; 4150 sin6.sin6_addr = connp->conn_faddr_v6; 4151 sin6.sin6_port = connp->conn_fport; 4152 sin6.sin6_family = AF_INET6; 4153 sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, 4154 IPCL_ZONEID(lconnp), tcps->tcps_netstack); 4155 4156 if (connp->conn_recv_ancillary.crb_recvdstaddr) { 4157 sin6_t sin6d; 4158 4159 sin6d = sin6_null; 4160 sin6d.sin6_addr = connp->conn_laddr_v6; 4161 sin6d.sin6_port = connp->conn_lport; 4162 sin6d.sin6_family = AF_INET; 4163 tpi_mp = mi_tpi_extconn_ind(NULL, 4164 (char *)&sin6d, sizeof (sin6_t), 4165 (char *)&tcp, 4166 (t_scalar_t)sizeof (intptr_t), 4167 (char *)&sin6d, sizeof (sin6_t), 4168 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4169 } else { 4170 tpi_mp = mi_tpi_conn_ind(NULL, 4171 (char *)&sin6, sizeof (sin6_t), 4172 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4173 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4174 } 4175 } else { 4176 ip6h = (ip6_t *)mp->b_rptr; 4177 4178 connp->conn_ipversion = IPV6_VERSION; 
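		/* Take the eager's addresses directly from the IPv6 header */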
4179 connp->conn_laddr_v6 = ip6h->ip6_dst; 4180 connp->conn_faddr_v6 = ip6h->ip6_src; 4181 connp->conn_saddr_v6 = connp->conn_laddr_v6; 4182 4183 sin6 = sin6_null; 4184 sin6.sin6_addr = connp->conn_faddr_v6; 4185 sin6.sin6_port = connp->conn_fport; 4186 sin6.sin6_family = AF_INET6; 4187 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 4188 sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, 4189 IPCL_ZONEID(lconnp), tcps->tcps_netstack); 4190 4191 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 4192 /* Pass up the scope_id of remote addr */ 4193 sin6.sin6_scope_id = ifindex; 4194 } else { 4195 sin6.sin6_scope_id = 0; 4196 } 4197 if (connp->conn_recv_ancillary.crb_recvdstaddr) { 4198 sin6_t sin6d; 4199 4200 sin6d = sin6_null; 4201 sin6.sin6_addr = connp->conn_laddr_v6; 4202 sin6d.sin6_port = connp->conn_lport; 4203 sin6d.sin6_family = AF_INET6; 4204 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6)) 4205 sin6d.sin6_scope_id = ifindex; 4206 4207 tpi_mp = mi_tpi_extconn_ind(NULL, 4208 (char *)&sin6d, sizeof (sin6_t), 4209 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4210 (char *)&sin6d, sizeof (sin6_t), 4211 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4212 } else { 4213 tpi_mp = mi_tpi_conn_ind(NULL, 4214 (char *)&sin6, sizeof (sin6_t), 4215 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4216 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4217 } 4218 } 4219 4220 tcp->tcp_mss = tcps->tcps_mss_def_ipv6; 4221 return (tpi_mp); 4222 } 4223 4224 /* Handle a SYN on an AF_INET socket */ 4225 mblk_t * 4226 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, 4227 ip_recv_attr_t *ira) 4228 { 4229 tcp_t *ltcp = lconnp->conn_tcp; 4230 tcp_t *tcp = connp->conn_tcp; 4231 sin_t sin; 4232 mblk_t *tpi_mp = NULL; 4233 tcp_stack_t *tcps = tcp->tcp_tcps; 4234 ipha_t *ipha; 4235 4236 ASSERT(ira->ira_flags & IRAF_IS_IPV4); 4237 ipha = (ipha_t *)mp->b_rptr; 4238 4239 connp->conn_ipversion = IPV4_VERSION; 4240 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); 4241 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); 4242 connp->conn_saddr_v6 = connp->conn_laddr_v6; 4243 4244 sin = sin_null; 4245 sin.sin_addr.s_addr = connp->conn_faddr_v4; 4246 sin.sin_port = connp->conn_fport; 4247 sin.sin_family = AF_INET; 4248 if (lconnp->conn_recv_ancillary.crb_recvdstaddr) { 4249 sin_t sind; 4250 4251 sind = sin_null; 4252 sind.sin_addr.s_addr = connp->conn_laddr_v4; 4253 sind.sin_port = connp->conn_lport; 4254 sind.sin_family = AF_INET; 4255 tpi_mp = mi_tpi_extconn_ind(NULL, 4256 (char *)&sind, sizeof (sin_t), (char *)&tcp, 4257 (t_scalar_t)sizeof (intptr_t), (char *)&sind, 4258 sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4259 } else { 4260 tpi_mp = mi_tpi_conn_ind(NULL, 4261 (char *)&sin, sizeof (sin_t), 4262 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4263 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4264 } 4265 4266 tcp->tcp_mss = tcps->tcps_mss_def_ipv4; 4267 return (tpi_mp); 4268 } 4269 4270 /* 4271 * tcp_get_conn/tcp_free_conn 4272 * 4273 * tcp_get_conn is used to get a clean tcp connection structure. 4274 * It tries to reuse the connections put on the freelist by the 4275 * time_wait_collector failing which it goes to kmem_cache. This 4276 * way has two benefits compared to just allocating from and 4277 * freeing to kmem_cache. 4278 * 1) The time_wait_collector can free (which includes the cleanup) 4279 * outside the squeue. So when the interrupt comes, we have a clean 4280 * connection sitting in the freelist. Obviously, this buys us 4281 * performance. 
4282 * 4283 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener 4284 * has multiple disadvantages - tying up the squeue during alloc. 4285 * But allocating the conn/tcp in IP land is also not the best since 4286 * we can't check the 'q' and 'q0' which are protected by squeue and 4287 * blindly allocate memory which might have to be freed here if we are 4288 * not allowed to accept the connection. By using the freelist and 4289 * putting the conn/tcp back in freelist, we don't pay a penalty for 4290 * allocating memory without checking 'q/q0' and freeing it if we can't 4291 * accept the connection. 4292 * 4293 * Care should be taken to put the conn back in the same squeue's freelist 4294 * from which it was allocated. Best results are obtained if conn is 4295 * allocated from listener's squeue and freed to the same. Time wait 4296 * collector will free up the freelist is the connection ends up sitting 4297 * there for too long. 4298 */ 4299 void * 4300 tcp_get_conn(void *arg, tcp_stack_t *tcps) 4301 { 4302 tcp_t *tcp = NULL; 4303 conn_t *connp = NULL; 4304 squeue_t *sqp = (squeue_t *)arg; 4305 tcp_squeue_priv_t *tcp_time_wait; 4306 netstack_t *ns; 4307 mblk_t *tcp_rsrv_mp = NULL; 4308 4309 tcp_time_wait = 4310 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 4311 4312 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 4313 tcp = tcp_time_wait->tcp_free_list; 4314 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0)); 4315 if (tcp != NULL) { 4316 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 4317 tcp_time_wait->tcp_free_list_cnt--; 4318 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 4319 tcp->tcp_time_wait_next = NULL; 4320 connp = tcp->tcp_connp; 4321 connp->conn_flags |= IPCL_REUSED; 4322 4323 ASSERT(tcp->tcp_tcps == NULL); 4324 ASSERT(connp->conn_netstack == NULL); 4325 ASSERT(tcp->tcp_rsrv_mp != NULL); 4326 ns = tcps->tcps_netstack; 4327 netstack_hold(ns); 4328 connp->conn_netstack = ns; 4329 connp->conn_ixa->ixa_ipst = ns->netstack_ip; 4330 tcp->tcp_tcps = tcps; 4331 ipcl_globalhash_insert(connp); 4332 4333 connp->conn_ixa->ixa_notify_cookie = tcp; 4334 ASSERT(connp->conn_ixa->ixa_notify == tcp_notify); 4335 connp->conn_recv = tcp_input_data; 4336 ASSERT(connp->conn_recvicmp == tcp_icmp_input); 4337 ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); 4338 return ((void *)connp); 4339 } 4340 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 4341 /* 4342 * Pre-allocate the tcp_rsrv_mp. This mblk will not be freed until 4343 * this conn_t/tcp_t is freed at ipcl_conn_destroy(). 4344 */ 4345 tcp_rsrv_mp = allocb(0, BPRI_HI); 4346 if (tcp_rsrv_mp == NULL) 4347 return (NULL); 4348 4349 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, 4350 tcps->tcps_netstack)) == NULL) { 4351 freeb(tcp_rsrv_mp); 4352 return (NULL); 4353 } 4354 4355 tcp = connp->conn_tcp; 4356 tcp->tcp_rsrv_mp = tcp_rsrv_mp; 4357 mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL); 4358 4359 tcp->tcp_tcps = tcps; 4360 4361 connp->conn_recv = tcp_input_data; 4362 connp->conn_recvicmp = tcp_icmp_input; 4363 connp->conn_verifyicmp = tcp_verifyicmp; 4364 4365 /* 4366 * Register tcp_notify to listen to capability changes detected by IP. 4367 * This upcall is made in the context of the call to conn_ip_output 4368 * thus it is inside the squeue. 
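 * The cookie stored in ixa_notify_cookie below lets that callback map
 * the ip_xmit_attr_t back to its tcp_t.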
4369 */ 4370 connp->conn_ixa->ixa_notify = tcp_notify; 4371 connp->conn_ixa->ixa_notify_cookie = tcp; 4372 4373 return ((void *)connp); 4374 } 4375 4376 /* BEGIN CSTYLED */ 4377 /* 4378 * 4379 * The sockfs ACCEPT path: 4380 * ======================= 4381 * 4382 * The eager is now established in its own perimeter as soon as SYN is 4383 * received in tcp_input_listener(). When sockfs receives conn_ind, it 4384 * completes the accept processing on the acceptor STREAM. The sending 4385 * of conn_ind part is common for both sockfs listener and a TLI/XTI 4386 * listener but a TLI/XTI listener completes the accept processing 4387 * on the listener perimeter. 4388 * 4389 * Common control flow for 3 way handshake: 4390 * ---------------------------------------- 4391 * 4392 * incoming SYN (listener perimeter) -> tcp_input_listener() 4393 * 4394 * incoming SYN-ACK-ACK (eager perim) -> tcp_input_data() 4395 * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() 4396 * 4397 * Sockfs ACCEPT Path: 4398 * ------------------- 4399 * 4400 * open acceptor stream (tcp_open allocates tcp_tli_accept() 4401 * as STREAM entry point) 4402 * 4403 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept() 4404 * 4405 * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager 4406 * association (we are not behind eager's squeue but sockfs is protecting us 4407 * and no one knows about this stream yet. The STREAMS entry point q->q_info 4408 * is changed to point at tcp_wput(). 4409 * 4410 * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to 4411 * listener (done on listener's perimeter). 4412 * 4413 * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish 4414 * accept. 4415 * 4416 * TLI/XTI client ACCEPT path: 4417 * --------------------------- 4418 * 4419 * soaccept() sends T_CONN_RES on the listener STREAM. 4420 * 4421 * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send 4422 * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()). 4423 * 4424 * Locks: 4425 * ====== 4426 * 4427 * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0 and 4428 * and listeners->tcp_eager_next_q. 4429 * 4430 * Referencing: 4431 * ============ 4432 * 4433 * 1) We start out in tcp_input_listener by eager placing a ref on 4434 * listener and listener adding eager to listeners->tcp_eager_next_q0. 4435 * 4436 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before 4437 * doing so we place a ref on the eager. This ref is finally dropped at the 4438 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the 4439 * reference is dropped by the squeue framework. 4440 * 4441 * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish 4442 * 4443 * The reference must be released by the same entity that added the reference 4444 * In the above scheme, the eager is the entity that adds and releases the 4445 * references. Note that tcp_accept_finish executes in the squeue of the eager 4446 * (albeit after it is attached to the acceptor stream). Though 1. executes 4447 * in the listener's squeue, the eager is nascent at this point and the 4448 * reference can be considered to have been added on behalf of the eager. 4449 * 4450 * Eager getting a Reset or listener closing: 4451 * ========================================== 4452 * 4453 * Once the listener and eager are linked, the listener never does the unlink. 
4454 * If the listener needs to close, tcp_eager_cleanup() is called which queues 4455 * a message on all eager perimeter. The eager then does the unlink, clears 4456 * any pointers to the listener's queue and drops the reference to the 4457 * listener. The listener waits in tcp_close outside the squeue until its 4458 * refcount has dropped to 1. This ensures that the listener has waited for 4459 * all eagers to clear their association with the listener. 4460 * 4461 * Similarly, if eager decides to go away, it can unlink itself and close. 4462 * When the T_CONN_RES comes down, we check if eager has closed. Note that 4463 * the reference to eager is still valid because of the extra ref we put 4464 * in tcp_send_conn_ind. 4465 * 4466 * Listener can always locate the eager under the protection 4467 * of the listener->tcp_eager_lock, and then do a refhold 4468 * on the eager during the accept processing. 4469 * 4470 * The acceptor stream accesses the eager in the accept processing 4471 * based on the ref placed on eager before sending T_conn_ind. 4472 * The only entity that can negate this refhold is a listener close 4473 * which is mutually exclusive with an active acceptor stream. 4474 * 4475 * Eager's reference on the listener 4476 * =================================== 4477 * 4478 * If the accept happens (even on a closed eager) the eager drops its 4479 * reference on the listener at the start of tcp_accept_finish. If the 4480 * eager is killed due to an incoming RST before the T_conn_ind is sent up, 4481 * the reference is dropped in tcp_closei_local. If the listener closes, 4482 * the reference is dropped in tcp_eager_kill. In all cases the reference 4483 * is dropped while executing in the eager's context (squeue). 4484 */ 4485 /* END CSTYLED */ 4486 4487 /* Process the SYN packet, mp, directed at the listener 'tcp' */ 4488 4489 /* 4490 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. 4491 * tcp_input_data will not see any packets for listeners since the listener 4492 * has conn_recv set to tcp_input_listener. 4493 */ 4494 /* ARGSUSED */ 4495 void 4496 tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 4497 { 4498 tcpha_t *tcpha; 4499 uint32_t seg_seq; 4500 tcp_t *eager; 4501 int err; 4502 conn_t *econnp = NULL; 4503 squeue_t *new_sqp; 4504 mblk_t *mp1; 4505 uint_t ip_hdr_len; 4506 conn_t *lconnp = (conn_t *)arg; 4507 tcp_t *listener = lconnp->conn_tcp; 4508 tcp_stack_t *tcps = listener->tcp_tcps; 4509 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 4510 uint_t flags; 4511 mblk_t *tpi_mp; 4512 uint_t ifindex = ira->ira_ruifindex; 4513 boolean_t tlc_set = B_FALSE; 4514 4515 ip_hdr_len = ira->ira_ip_hdr_length; 4516 tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; 4517 flags = (unsigned int)tcpha->tha_flags & 0xFF; 4518 4519 if (!(flags & TH_SYN)) { 4520 if ((flags & TH_RST) || (flags & TH_URG)) { 4521 freemsg(mp); 4522 return; 4523 } 4524 if (flags & TH_ACK) { 4525 /* Note this executes in listener's squeue */ 4526 tcp_xmit_listeners_reset(mp, ira, ipst, lconnp); 4527 return; 4528 } 4529 4530 freemsg(mp); 4531 return; 4532 } 4533 4534 if (listener->tcp_state != TCPS_LISTEN) 4535 goto error2; 4536 4537 ASSERT(IPCL_IS_BOUND(lconnp)); 4538 4539 mutex_enter(&listener->tcp_eager_lock); 4540 4541 /* 4542 * The system is under memory pressure, so we need to do our part 4543 * to relieve the pressure. So we only accept new request if there 4544 * is nothing waiting to be accepted or waiting to complete the 3-way 4545 * handshake. 
This means that busy listener will not get too many 4546 * new requests which they cannot handle in time while non-busy 4547 * listener is still functioning properly. 4548 */ 4549 if (tcps->tcps_reclaim && (listener->tcp_conn_req_cnt_q > 0 || 4550 listener->tcp_conn_req_cnt_q0 > 0)) { 4551 mutex_exit(&listener->tcp_eager_lock); 4552 TCP_STAT(tcps, tcp_listen_mem_drop); 4553 goto error2; 4554 } 4555 4556 if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) { 4557 mutex_exit(&listener->tcp_eager_lock); 4558 TCP_STAT(tcps, tcp_listendrop); 4559 BUMP_MIB(&tcps->tcps_mib, tcpListenDrop); 4560 if (lconnp->conn_debug) { 4561 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 4562 "tcp_input_listener: listen backlog (max=%d) " 4563 "overflow (%d pending) on %s", 4564 listener->tcp_conn_req_max, 4565 listener->tcp_conn_req_cnt_q, 4566 tcp_display(listener, NULL, DISP_PORT_ONLY)); 4567 } 4568 goto error2; 4569 } 4570 4571 if (listener->tcp_conn_req_cnt_q0 >= 4572 listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { 4573 /* 4574 * Q0 is full. Drop a pending half-open req from the queue 4575 * to make room for the new SYN req. Also mark the time we 4576 * drop a SYN. 4577 * 4578 * A more aggressive defense against SYN attack will 4579 * be to set the "tcp_syn_defense" flag now. 4580 */ 4581 TCP_STAT(tcps, tcp_listendropq0); 4582 listener->tcp_last_rcv_lbolt = ddi_get_lbolt64(); 4583 if (!tcp_drop_q0(listener)) { 4584 mutex_exit(&listener->tcp_eager_lock); 4585 BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0); 4586 if (lconnp->conn_debug) { 4587 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 4588 "tcp_input_listener: listen half-open " 4589 "queue (max=%d) full (%d pending) on %s", 4590 tcps->tcps_conn_req_max_q0, 4591 listener->tcp_conn_req_cnt_q0, 4592 tcp_display(listener, NULL, 4593 DISP_PORT_ONLY)); 4594 } 4595 goto error2; 4596 } 4597 } 4598 4599 /* 4600 * Enforce the limit set on the number of connections per listener. 4601 * Note that tlc_cnt starts with 1. So need to add 1 to tlc_max 4602 * for comparison. 4603 */ 4604 if (listener->tcp_listen_cnt != NULL) { 4605 tcp_listen_cnt_t *tlc = listener->tcp_listen_cnt; 4606 int64_t now; 4607 4608 if (atomic_add_32_nv(&tlc->tlc_cnt, 1) > tlc->tlc_max + 1) { 4609 mutex_exit(&listener->tcp_eager_lock); 4610 now = ddi_get_lbolt64(); 4611 atomic_add_32(&tlc->tlc_cnt, -1); 4612 TCP_STAT(tcps, tcp_listen_cnt_drop); 4613 tlc->tlc_drop++; 4614 if (now - tlc->tlc_report_time > 4615 MSEC_TO_TICK(TCP_TLC_REPORT_INTERVAL)) { 4616 zcmn_err(lconnp->conn_zoneid, CE_WARN, 4617 "Listener (port %d) connection max (%u) " 4618 "reached: %u attempts dropped total\n", 4619 ntohs(listener->tcp_connp->conn_lport), 4620 tlc->tlc_max, tlc->tlc_drop); 4621 tlc->tlc_report_time = now; 4622 } 4623 goto error2; 4624 } 4625 tlc_set = B_TRUE; 4626 } 4627 4628 mutex_exit(&listener->tcp_eager_lock); 4629 4630 /* 4631 * IP sets ira_sqp to either the senders conn_sqp (for loopback) 4632 * or based on the ring (for packets from GLD). Otherwise it is 4633 * set based on lbolt i.e., a somewhat random number. 
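 *
 * The eager created below inherits this squeue (conn_sqp,
 * conn_initial_sqp and ixa_sqp), so all further packets for the new
 * connection are processed where the SYN arrived.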
4634 */ 4635 ASSERT(ira->ira_sqp != NULL); 4636 new_sqp = ira->ira_sqp; 4637 4638 econnp = (conn_t *)tcp_get_conn(arg2, tcps); 4639 if (econnp == NULL) 4640 goto error2; 4641 4642 ASSERT(econnp->conn_netstack == lconnp->conn_netstack); 4643 econnp->conn_sqp = new_sqp; 4644 econnp->conn_initial_sqp = new_sqp; 4645 econnp->conn_ixa->ixa_sqp = new_sqp; 4646 4647 econnp->conn_fport = tcpha->tha_lport; 4648 econnp->conn_lport = tcpha->tha_fport; 4649 4650 err = conn_inherit_parent(lconnp, econnp); 4651 if (err != 0) 4652 goto error3; 4653 4654 /* We already know the laddr of the new connection is ours */ 4655 econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation; 4656 4657 ASSERT(OK_32PTR(mp->b_rptr)); 4658 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION || 4659 IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); 4660 4661 if (lconnp->conn_family == AF_INET) { 4662 ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); 4663 tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira); 4664 } else { 4665 tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira); 4666 } 4667 4668 if (tpi_mp == NULL) 4669 goto error3; 4670 4671 eager = econnp->conn_tcp; 4672 eager->tcp_detached = B_TRUE; 4673 SOCK_CONNID_INIT(eager->tcp_connid); 4674 4675 tcp_init_values(eager); 4676 4677 ASSERT((econnp->conn_ixa->ixa_flags & 4678 (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | 4679 IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) == 4680 (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | 4681 IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)); 4682 4683 if (!tcps->tcps_dev_flow_ctl) 4684 econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; 4685 4686 /* Prepare for diffing against previous packets */ 4687 eager->tcp_recvifindex = 0; 4688 eager->tcp_recvhops = 0xffffffffU; 4689 4690 if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) { 4691 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) || 4692 IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) { 4693 econnp->conn_incoming_ifindex = ifindex; 4694 econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 4695 econnp->conn_ixa->ixa_scopeid = ifindex; 4696 } 4697 } 4698 4699 if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) == 4700 (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) && 4701 tcps->tcps_rev_src_routes) { 4702 ipha_t *ipha = (ipha_t *)mp->b_rptr; 4703 ip_pkt_t *ipp = &econnp->conn_xmit_ipp; 4704 4705 /* Source routing option copyover (reverse it) */ 4706 err = ip_find_hdr_v4(ipha, ipp, B_TRUE); 4707 if (err != 0) { 4708 freemsg(tpi_mp); 4709 goto error3; 4710 } 4711 ip_pkt_source_route_reverse_v4(ipp); 4712 } 4713 4714 ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL); 4715 ASSERT(!eager->tcp_tconnind_started); 4716 /* 4717 * If the SYN came with a credential, it's a loopback packet or a 4718 * labeled packet; attach the credential to the TPI message. 4719 */ 4720 if (ira->ira_cred != NULL) 4721 mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid); 4722 4723 eager->tcp_conn.tcp_eager_conn_ind = tpi_mp; 4724 4725 /* Inherit the listener's SSL protection state */ 4726 if ((eager->tcp_kssl_ent = listener->tcp_kssl_ent) != NULL) { 4727 kssl_hold_ent(eager->tcp_kssl_ent); 4728 eager->tcp_kssl_pending = B_TRUE; 4729 } 4730 4731 /* Inherit the listener's non-STREAMS flag */ 4732 if (IPCL_IS_NONSTR(lconnp)) { 4733 econnp->conn_flags |= IPCL_NONSTR; 4734 } 4735 4736 ASSERT(eager->tcp_ordrel_mp == NULL); 4737 4738 if (!IPCL_IS_NONSTR(econnp)) { 4739 /* 4740 * Pre-allocate the T_ordrel_ind mblk for TPI socket so that 4741 * at close time, we will always have that to send up. 
4742 * Otherwise, we need to do special handling in case the 4743 * allocation fails at that time. 4744 */ 4745 if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) 4746 goto error3; 4747 } 4748 /* 4749 * Now that the IP addresses and ports are setup in econnp we 4750 * can do the IPsec policy work. 4751 */ 4752 if (ira->ira_flags & IRAF_IPSEC_SECURE) { 4753 if (lconnp->conn_policy != NULL) { 4754 /* 4755 * Inherit the policy from the listener; use 4756 * actions from ira 4757 */ 4758 if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) { 4759 CONN_DEC_REF(econnp); 4760 freemsg(mp); 4761 goto error3; 4762 } 4763 } 4764 } 4765 4766 /* Inherit various TCP parameters from the listener */ 4767 eager->tcp_naglim = listener->tcp_naglim; 4768 eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold; 4769 eager->tcp_second_timer_threshold = 4770 listener->tcp_second_timer_threshold; 4771 eager->tcp_first_ctimer_threshold = 4772 listener->tcp_first_ctimer_threshold; 4773 eager->tcp_second_ctimer_threshold = 4774 listener->tcp_second_ctimer_threshold; 4775 4776 /* 4777 * tcp_set_destination() may set tcp_rwnd according to the route 4778 * metrics. If it does not, the eager's receive window will be set 4779 * to the listener's receive window later in this function. 4780 */ 4781 eager->tcp_rwnd = 0; 4782 4783 /* 4784 * Inherit listener's tcp_init_cwnd. Need to do this before 4785 * calling tcp_process_options() which set the initial cwnd. 4786 */ 4787 eager->tcp_init_cwnd = listener->tcp_init_cwnd; 4788 4789 if (is_system_labeled()) { 4790 ip_xmit_attr_t *ixa = econnp->conn_ixa; 4791 4792 ASSERT(ira->ira_tsl != NULL); 4793 /* Discard any old label */ 4794 if (ixa->ixa_free_flags & IXA_FREE_TSL) { 4795 ASSERT(ixa->ixa_tsl != NULL); 4796 label_rele(ixa->ixa_tsl); 4797 ixa->ixa_free_flags &= ~IXA_FREE_TSL; 4798 ixa->ixa_tsl = NULL; 4799 } 4800 if ((lconnp->conn_mlp_type != mlptSingle || 4801 lconnp->conn_mac_mode != CONN_MAC_DEFAULT) && 4802 ira->ira_tsl != NULL) { 4803 /* 4804 * If this is an MLP connection or a MAC-Exempt 4805 * connection with an unlabeled node, packets are to be 4806 * exchanged using the security label of the received 4807 * SYN packet instead of the server application's label. 4808 * tsol_check_dest called from ip_set_destination 4809 * might later update TSF_UNLABELED by replacing 4810 * ixa_tsl with a new label. 4811 */ 4812 label_hold(ira->ira_tsl); 4813 ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); 4814 DTRACE_PROBE2(mlp_syn_accept, conn_t *, 4815 econnp, ts_label_t *, ixa->ixa_tsl) 4816 } else { 4817 ixa->ixa_tsl = crgetlabel(econnp->conn_cred); 4818 DTRACE_PROBE2(syn_accept, conn_t *, 4819 econnp, ts_label_t *, ixa->ixa_tsl) 4820 } 4821 /* 4822 * conn_connect() called from tcp_set_destination will verify 4823 * the destination is allowed to receive packets at the 4824 * security label of the SYN-ACK we are generating. As part of 4825 * that, tsol_check_dest() may create a new effective label for 4826 * this connection. 4827 * Finally conn_connect() will call conn_update_label. 4828 * All that remains for TCP to do is to call 4829 * conn_build_hdr_template which is done as part of 4830 * tcp_set_destination. 4831 */ 4832 } 4833 4834 /* 4835 * Since we will clear tcp_listener before we clear tcp_detached 4836 * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress 4837 * so we can tell a TCP_DETACHED_NONEAGER apart. 
4838 */ 4839 eager->tcp_hard_binding = B_TRUE; 4840 4841 tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ 4842 TCP_BIND_HASH(econnp->conn_lport)], eager, 0); 4843 4844 CL_INET_CONNECT(econnp, B_FALSE, err); 4845 if (err != 0) { 4846 tcp_bind_hash_remove(eager); 4847 goto error3; 4848 } 4849 4850 /* 4851 * No need to check for multicast destination since ip will only pass 4852 * up multicasts to those that have expressed interest 4853 * TODO: what about rejecting broadcasts? 4854 * Also check that source is not a multicast or broadcast address. 4855 */ 4856 eager->tcp_state = TCPS_SYN_RCVD; 4857 SOCK_CONNID_BUMP(eager->tcp_connid); 4858 4859 /* 4860 * Adapt our mss, ttl, ... based on the remote address. 4861 */ 4862 4863 if (tcp_set_destination(eager) != 0) { 4864 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); 4865 /* Undo the bind_hash_insert */ 4866 tcp_bind_hash_remove(eager); 4867 goto error3; 4868 } 4869 4870 /* Process all TCP options. */ 4871 tcp_process_options(eager, tcpha); 4872 4873 /* Is the other end ECN capable? */ 4874 if (tcps->tcps_ecn_permitted >= 1 && 4875 (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 4876 eager->tcp_ecn_ok = B_TRUE; 4877 } 4878 4879 /* 4880 * The listener's conn_rcvbuf should be the default window size or a 4881 * window size changed via SO_RCVBUF option. First round up the 4882 * eager's tcp_rwnd to the nearest MSS. Then find out the window 4883 * scale option value if needed. Call tcp_rwnd_set() to finish the 4884 * setting. 4885 * 4886 * Note if there is a rpipe metric associated with the remote host, 4887 * we should not inherit receive window size from listener. 4888 */ 4889 eager->tcp_rwnd = MSS_ROUNDUP( 4890 (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf : 4891 eager->tcp_rwnd), eager->tcp_mss); 4892 if (eager->tcp_snd_ws_ok) 4893 tcp_set_ws_value(eager); 4894 /* 4895 * Note that this is the only place tcp_rwnd_set() is called for 4896 * accepting a connection. We need to call it here instead of 4897 * after the 3-way handshake because we need to tell the other 4898 * side our rwnd in the SYN-ACK segment. 4899 */ 4900 (void) tcp_rwnd_set(eager, eager->tcp_rwnd); 4901 4902 ASSERT(eager->tcp_connp->conn_rcvbuf != 0 && 4903 eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd); 4904 4905 ASSERT(econnp->conn_rcvbuf != 0 && 4906 econnp->conn_rcvbuf == eager->tcp_rwnd); 4907 4908 /* Put a ref on the listener for the eager. */ 4909 CONN_INC_REF(lconnp); 4910 mutex_enter(&listener->tcp_eager_lock); 4911 listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 4912 eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0; 4913 listener->tcp_eager_next_q0 = eager; 4914 eager->tcp_eager_prev_q0 = listener; 4915 4916 /* Set tcp_listener before adding it to tcp_conn_fanout */ 4917 eager->tcp_listener = listener; 4918 eager->tcp_saved_listener = listener; 4919 4920 /* 4921 * Set tcp_listen_cnt so that when the connection is done, the counter 4922 * is decremented. 4923 */ 4924 eager->tcp_listen_cnt = listener->tcp_listen_cnt; 4925 4926 /* 4927 * Tag this detached tcp vector for later retrieval 4928 * by our listener client in tcp_accept(). 
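 * The sequence number assigned here is carried as the SEQ_number in
 * the T_CONN_IND for this eager.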
4929 */ 4930 eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum; 4931 listener->tcp_conn_req_cnt_q0++; 4932 if (++listener->tcp_conn_req_seqnum == -1) { 4933 /* 4934 * -1 is "special" and defined in TPI as something 4935 * that should never be used in T_CONN_IND 4936 */ 4937 ++listener->tcp_conn_req_seqnum; 4938 } 4939 mutex_exit(&listener->tcp_eager_lock); 4940 4941 if (listener->tcp_syn_defense) { 4942 /* Don't drop the SYN that comes from a good IP source */ 4943 ipaddr_t *addr_cache; 4944 4945 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 4946 if (addr_cache != NULL && econnp->conn_faddr_v4 == 4947 addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) { 4948 eager->tcp_dontdrop = B_TRUE; 4949 } 4950 } 4951 4952 /* 4953 * We need to insert the eager in its own perimeter but as soon 4954 * as we do that, we expose the eager to the classifier and 4955 * should not touch any field outside the eager's perimeter. 4956 * So do all the work necessary before inserting the eager 4957 * in its own perimeter. Be optimistic that conn_connect() 4958 * will succeed but undo everything if it fails. 4959 */ 4960 seg_seq = ntohl(tcpha->tha_seq); 4961 eager->tcp_irs = seg_seq; 4962 eager->tcp_rack = seg_seq; 4963 eager->tcp_rnxt = seg_seq + 1; 4964 eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt); 4965 BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens); 4966 eager->tcp_state = TCPS_SYN_RCVD; 4967 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, 4968 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); 4969 if (mp1 == NULL) { 4970 /* 4971 * Increment the ref count as we are going to 4972 * enqueueing an mp in squeue 4973 */ 4974 CONN_INC_REF(econnp); 4975 goto error; 4976 } 4977 4978 /* 4979 * We need to start the rto timer. In normal case, we start 4980 * the timer after sending the packet on the wire (or at 4981 * least believing that packet was sent by waiting for 4982 * conn_ip_output() to return). Since this is the first packet 4983 * being sent on the wire for the eager, our initial tcp_rto 4984 * is at least tcp_rexmit_interval_min which is a fairly 4985 * large value to allow the algorithm to adjust slowly to large 4986 * fluctuations of RTT during first few transmissions. 4987 * 4988 * Starting the timer first and then sending the packet in this 4989 * case shouldn't make much difference since tcp_rexmit_interval_min 4990 * is of the order of several 100ms and starting the timer 4991 * first and then sending the packet will result in difference 4992 * of few micro seconds. 4993 * 4994 * Without this optimization, we are forced to hold the fanout 4995 * lock across the ipcl_bind_insert() and sending the packet 4996 * so that we don't race against an incoming packet (maybe RST) 4997 * for this eager. 4998 * 4999 * It is necessary to acquire an extra reference on the eager 5000 * at this point and hold it until after tcp_send_data() to 5001 * ensure against an eager close race. 5002 */ 5003 5004 CONN_INC_REF(econnp); 5005 5006 TCP_TIMER_RESTART(eager, eager->tcp_rto); 5007 5008 /* 5009 * Insert the eager in its own perimeter now. We are ready to deal 5010 * with any packets on eager. 5011 */ 5012 if (ipcl_conn_insert(econnp) != 0) 5013 goto error; 5014 5015 ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp); 5016 freemsg(mp); 5017 /* 5018 * Send the SYN-ACK. Use the right squeue so that conn_ixa is 5019 * only used by one thread at a time. 
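 *
 * Roughly: if the eager shares the listener's squeue we are already
 * inside that perimeter and can call conn_ip_output() directly;
 * otherwise the mblk is handed to the eager's squeue and
 * tcp_send_synack() transmits it from there.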
5020 */
5021 if (econnp->conn_sqp == lconnp->conn_sqp) {
5022 (void) conn_ip_output(mp1, econnp->conn_ixa);
5023 CONN_DEC_REF(econnp);
5024 } else {
5025 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_send_synack,
5026 econnp, NULL, SQ_PROCESS, SQTAG_TCP_SEND_SYNACK);
5027 }
5028 return;
5029 error:
5030 freemsg(mp1);
5031 eager->tcp_closemp_used = B_TRUE;
5032 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
5033 mp1 = &eager->tcp_closemp;
5034 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill,
5035 econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2);
5036
5037 /*
5038 * If a connection already exists, send the mp to that connection so
5039 * that it can be appropriately dealt with.
5040 */
5041 ipst = tcps->tcps_netstack->netstack_ip;
5042
5043 if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) {
5044 if (!IPCL_IS_CONNECTED(econnp)) {
5045 /*
5046 * Something bad happened. ipcl_conn_insert()
5047 * failed because a connection already existed
5048 * in connected hash but we can't find it
5049 * anymore (someone blew it away). Just
5050 * free this message and hopefully remote
5051 * will retransmit at which time the SYN can be
5052 * treated as a new connection or dealt with
5053 * by a TH_RST if a connection already exists.
5054 */
5055 CONN_DEC_REF(econnp);
5056 freemsg(mp);
5057 } else {
5058 SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data,
5059 econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1);
5060 }
5061 } else {
5062 /* Nobody wants this packet */
5063 freemsg(mp);
5064 }
5065 return;
5066 error3:
5067 CONN_DEC_REF(econnp);
5068 error2:
5069 freemsg(mp);
5070 if (tlc_set)
5071 atomic_add_32(&listener->tcp_listen_cnt->tlc_cnt, -1);
5072 }
5073
5074 /* ARGSUSED2 */
5075 void
5076 tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
5077 {
5078 conn_t *econnp = (conn_t *)arg;
5079 tcp_t *tcp = econnp->conn_tcp;
5080
5081 /* Guard against a RST having blown it away while on the squeue */
5082 if (tcp->tcp_state == TCPS_CLOSED) {
5083 freemsg(mp);
5084 return;
5085 }
5086
5087 (void) conn_ip_output(mp, econnp->conn_ixa);
5088 }
5089
5090 /*
5091 * In an ideal case of vertical partition in a NUMA architecture, it's
5092 * beneficial to have the listener and all the incoming connections
5093 * tied to the same squeue. The other constraint is that incoming
5094 * connections should be tied to the squeue attached to the interrupted
5095 * CPU for obvious locality reasons, so this leaves the listener to
5096 * be tied to the same squeue. Our only problem is that when the listener
5097 * is binding, the CPU that will get interrupted by the NIC whose
5098 * IP address the listener is binding to is not even known. So
5099 * the code below allows us to change that binding at the time the
5100 * CPU is interrupted, by virtue of the incoming connection's squeue.
5101 *
5102 * This is useful only in the case of a listener bound to a specific IP
5103 * address. For other kinds of listeners, they get bound the
5104 * very first time and there is no attempt to rebind them.
5105 */
5106 void
5107 tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2,
5108 ip_recv_attr_t *ira)
5109 {
5110 conn_t *connp = (conn_t *)arg;
5111 squeue_t *sqp = (squeue_t *)arg2;
5112 squeue_t *new_sqp;
5113 uint32_t conn_flags;
5114
5115 /*
5116 * IP sets ira_sqp to either the sender's conn_sqp (for loopback)
5117 * or based on the ring (for packets from GLD). Otherwise it is
5118 * set based on lbolt i.e., a somewhat random number.
5119 */
5120 ASSERT(ira->ira_sqp != NULL);
5121 new_sqp = ira->ira_sqp;
5122
5123 if (connp->conn_fanout == NULL)
5124 goto done;
5125
5126 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
5127 mutex_enter(&connp->conn_fanout->connf_lock);
5128 mutex_enter(&connp->conn_lock);
5129 /*
5130 * No one from read or write side can access us now
5131 * except for already queued packets on this squeue.
5132 * But since we haven't changed the squeue yet, they
5133 * can't execute. If they are processed after we have
5134 * changed the squeue, they are sent back to the
5135 * correct squeue down below.
5136 * But a listener close can race with processing of an
5137 * incoming SYN. If incoming SYN processing changes
5138 * the squeue then the listener close which is waiting
5139 * to enter the squeue would operate on the wrong
5140 * squeue. Hence we don't change the squeue here unless
5141 * the refcount is exactly the minimum refcount. The
5142 * minimum refcount of 4 is counted as follows - 1 each for
5143 * TCP and IP, 1 for being in the classifier hash, and
5144 * 1 for the mblk being processed.
5145 */
5146
5147 if (connp->conn_ref != 4 ||
5148 connp->conn_tcp->tcp_state != TCPS_LISTEN) {
5149 mutex_exit(&connp->conn_lock);
5150 mutex_exit(&connp->conn_fanout->connf_lock);
5151 goto done;
5152 }
5153 if (connp->conn_sqp != new_sqp) {
5154 while (connp->conn_sqp != new_sqp)
5155 (void) casptr(&connp->conn_sqp, sqp, new_sqp);
5156 /* No special MT issues for outbound ixa_sqp hint */
5157 connp->conn_ixa->ixa_sqp = new_sqp;
5158 }
5159
5160 do {
5161 conn_flags = connp->conn_flags;
5162 conn_flags |= IPCL_FULLY_BOUND;
5163 (void) cas32(&connp->conn_flags, connp->conn_flags,
5164 conn_flags);
5165 } while (!(connp->conn_flags & IPCL_FULLY_BOUND));
5166
5167 mutex_exit(&connp->conn_fanout->connf_lock);
5168 mutex_exit(&connp->conn_lock);
5169
5170 /*
5171 * Assume we have picked a good squeue for the listener. Make
5172 * subsequent SYNs not try to change the squeue.
5173 */
5174 connp->conn_recv = tcp_input_listener;
5175 }
5176
5177 done:
5178 if (connp->conn_sqp != sqp) {
5179 CONN_INC_REF(connp);
5180 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp,
5181 ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND);
5182 } else {
5183 tcp_input_listener(connp, mp, sqp, ira);
5184 }
5185 }
5186
5187 /*
5188 * Successful connect request processing begins when our client passes
5189 * a T_CONN_REQ message into tcp_wput(), which performs function calls into
5190 * IP and then passes a T_OK_ACK (or T_ERROR_ACK) upstream.
5191 *
5192 * After various error checks are completed, tcp_tpi_connect() lays
5193 * the target address and port into the composite header template.
5194 * Then we ask IP for information, including a source address if we didn't
5195 * already have one. Finally we prepare to send the SYN packet, and then
5196 * send up the T_OK_ACK reply message.
5197 */
5198 static void
5199 tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
5200 {
5201 sin_t *sin;
5202 struct T_conn_req *tcr;
5203 struct sockaddr *sa;
5204 socklen_t len;
5205 int error;
5206 cred_t *cr;
5207 pid_t cpid;
5208 conn_t *connp = tcp->tcp_connp;
5209 queue_t *q = connp->conn_wq;
5210
5211 /*
5212 * All Solaris components should pass a db_credp
5213 * for this TPI message, hence we ASSERT.
5214 * But in case there is some other M_PROTO that looks
5215 * like a TPI message sent by some other kernel
5216 * component, we check and return an error.
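 *
 * Hence the ASSERT below documents the expectation, while the explicit
 * NULL check keeps non-DEBUG kernels from dereferencing a missing cred.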
5217 */
5218 cr = msg_getcred(mp, &cpid);
5219 ASSERT(cr != NULL);
5220 if (cr == NULL) {
5221 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
5222 return;
5223 }
5224
5225 tcr = (struct T_conn_req *)mp->b_rptr;
5226
5227 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
5228 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
5229 tcp_err_ack(tcp, mp, TPROTO, 0);
5230 return;
5231 }
5232
5233 /*
5234 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
5235 * will always have that to send up. Otherwise, we need to do
5236 * special handling in case the allocation fails at that time.
5237 * If the end point is TPI, the tcp_t can be reused and the
5238 * tcp_ordrel_mp may be allocated already.
5239 */
5240 if (tcp->tcp_ordrel_mp == NULL) {
5241 if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
5242 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
5243 return;
5244 }
5245 }
5246
5247 /*
5248 * Determine the packet type based on the type of address passed in;
5249 * the request should contain an IPv4 or IPv6 address.
5250 * Make sure that the address family matches the family of
5251 * the address passed down.
5252 */
5253 switch (tcr->DEST_length) {
5254 default:
5255 tcp_err_ack(tcp, mp, TBADADDR, 0);
5256 return;
5257
5258 case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
5259 /*
5260 * XXX: The check for a valid DEST_length was not there
5261 * in earlier releases and some buggy
5262 * TLI apps (e.g. Sybase) got away with not feeding
5263 * in the sin_zero part of the address.
5264 * We allow that bug to keep those buggy apps humming.
5265 * Test suites require the check on DEST_length.
5266 * We construct a new mblk with a valid DEST_length and
5267 * free the original so the rest of the code does
5268 * not have to keep track of this special shorter
5269 * length address case.
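 *
 * The fix-up below is roughly: allocate a new T_conn_req mblk large
 * enough for a full sin_t plus any options, copy the short address into
 * a zeroed sin_t, copy the options across, then free the original mblk
 * and continue with the rebuilt one.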
5270 */ 5271 mblk_t *nmp; 5272 struct T_conn_req *ntcr; 5273 sin_t *nsin; 5274 5275 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + 5276 tcr->OPT_length, BPRI_HI); 5277 if (nmp == NULL) { 5278 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 5279 return; 5280 } 5281 ntcr = (struct T_conn_req *)nmp->b_rptr; 5282 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ 5283 ntcr->PRIM_type = T_CONN_REQ; 5284 ntcr->DEST_length = sizeof (sin_t); 5285 ntcr->DEST_offset = sizeof (struct T_conn_req); 5286 5287 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); 5288 *nsin = sin_null; 5289 /* Get pointer to shorter address to copy from original mp */ 5290 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 5291 tcr->DEST_length); /* extract DEST_length worth of sin_t */ 5292 if (sin == NULL || !OK_32PTR((char *)sin)) { 5293 freemsg(nmp); 5294 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 5295 return; 5296 } 5297 nsin->sin_family = sin->sin_family; 5298 nsin->sin_port = sin->sin_port; 5299 nsin->sin_addr = sin->sin_addr; 5300 /* Note:nsin->sin_zero zero-fill with sin_null assign above */ 5301 nmp->b_wptr = (uchar_t *)&nsin[1]; 5302 if (tcr->OPT_length != 0) { 5303 ntcr->OPT_length = tcr->OPT_length; 5304 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; 5305 bcopy((uchar_t *)tcr + tcr->OPT_offset, 5306 (uchar_t *)ntcr + ntcr->OPT_offset, 5307 tcr->OPT_length); 5308 nmp->b_wptr += tcr->OPT_length; 5309 } 5310 freemsg(mp); /* original mp freed */ 5311 mp = nmp; /* re-initialize original variables */ 5312 tcr = ntcr; 5313 } 5314 /* FALLTHRU */ 5315 5316 case sizeof (sin_t): 5317 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 5318 sizeof (sin_t)); 5319 len = sizeof (sin_t); 5320 break; 5321 5322 case sizeof (sin6_t): 5323 sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, 5324 sizeof (sin6_t)); 5325 len = sizeof (sin6_t); 5326 break; 5327 } 5328 5329 error = proto_verify_ip_addr(connp->conn_family, sa, len); 5330 if (error != 0) { 5331 tcp_err_ack(tcp, mp, TSYSERR, error); 5332 return; 5333 } 5334 5335 /* 5336 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we 5337 * should key on their sequence number and cut them loose. 5338 */ 5339 5340 /* 5341 * If options passed in, feed it for verification and handling 5342 */ 5343 if (tcr->OPT_length != 0) { 5344 mblk_t *ok_mp; 5345 mblk_t *discon_mp; 5346 mblk_t *conn_opts_mp; 5347 int t_error, sys_error, do_disconnect; 5348 5349 conn_opts_mp = NULL; 5350 5351 if (tcp_conprim_opt_process(tcp, mp, 5352 &do_disconnect, &t_error, &sys_error) < 0) { 5353 if (do_disconnect) { 5354 ASSERT(t_error == 0 && sys_error == 0); 5355 discon_mp = mi_tpi_discon_ind(NULL, 5356 ECONNREFUSED, 0); 5357 if (!discon_mp) { 5358 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 5359 TSYSERR, ENOMEM); 5360 return; 5361 } 5362 ok_mp = mi_tpi_ok_ack_alloc(mp); 5363 if (!ok_mp) { 5364 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 5365 TSYSERR, ENOMEM); 5366 return; 5367 } 5368 qreply(q, ok_mp); 5369 qreply(q, discon_mp); /* no flush! */ 5370 } else { 5371 ASSERT(t_error != 0); 5372 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, 5373 sys_error); 5374 } 5375 return; 5376 } 5377 /* 5378 * Success in setting options, the mp option buffer represented 5379 * by OPT_length/offset has been potentially modified and 5380 * contains results of option processing. We copy it in 5381 * another mp to save it for potentially influencing returning 5382 * it in T_CONN_CONN. 
5383 */ 5384 if (tcr->OPT_length != 0) { /* there are resulting options */ 5385 conn_opts_mp = copyb(mp); 5386 if (!conn_opts_mp) { 5387 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 5388 TSYSERR, ENOMEM); 5389 return; 5390 } 5391 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); 5392 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; 5393 /* 5394 * Note: 5395 * These resulting option negotiation can include any 5396 * end-to-end negotiation options but there no such 5397 * thing (yet?) in our TCP/IP. 5398 */ 5399 } 5400 } 5401 5402 /* call the non-TPI version */ 5403 error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid); 5404 if (error < 0) { 5405 mp = mi_tpi_err_ack_alloc(mp, -error, 0); 5406 } else if (error > 0) { 5407 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); 5408 } else { 5409 mp = mi_tpi_ok_ack_alloc(mp); 5410 } 5411 5412 /* 5413 * Note: Code below is the "failure" case 5414 */ 5415 /* return error ack and blow away saved option results if any */ 5416 connect_failed: 5417 if (mp != NULL) 5418 putnext(connp->conn_rq, mp); 5419 else { 5420 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 5421 TSYSERR, ENOMEM); 5422 } 5423 } 5424 5425 /* 5426 * Handle connect to IPv4 destinations, including connections for AF_INET6 5427 * sockets connecting to IPv4 mapped IPv6 destinations. 5428 * Returns zero if OK, a positive errno, or a negative TLI error. 5429 */ 5430 static int 5431 tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, 5432 uint_t srcid) 5433 { 5434 ipaddr_t dstaddr = *dstaddrp; 5435 uint16_t lport; 5436 conn_t *connp = tcp->tcp_connp; 5437 tcp_stack_t *tcps = tcp->tcp_tcps; 5438 int error; 5439 5440 ASSERT(connp->conn_ipversion == IPV4_VERSION); 5441 5442 /* Check for attempt to connect to INADDR_ANY */ 5443 if (dstaddr == INADDR_ANY) { 5444 /* 5445 * SunOS 4.x and 4.3 BSD allow an application 5446 * to connect a TCP socket to INADDR_ANY. 5447 * When they do this, the kernel picks the 5448 * address of one interface and uses it 5449 * instead. The kernel usually ends up 5450 * picking the address of the loopback 5451 * interface. This is an undocumented feature. 5452 * However, we provide the same thing here 5453 * in order to have source and binary 5454 * compatibility with SunOS 4.x. 5455 * Update the T_CONN_REQ (sin/sin6) since it is used to 5456 * generate the T_CONN_CON. 5457 */ 5458 dstaddr = htonl(INADDR_LOOPBACK); 5459 *dstaddrp = dstaddr; 5460 } 5461 5462 /* Handle __sin6_src_id if socket not bound to an IP address */ 5463 if (srcid != 0 && connp->conn_laddr_v4 == INADDR_ANY) { 5464 ip_srcid_find_id(srcid, &connp->conn_laddr_v6, 5465 IPCL_ZONEID(connp), tcps->tcps_netstack); 5466 connp->conn_saddr_v6 = connp->conn_laddr_v6; 5467 } 5468 5469 IN6_IPADDR_TO_V4MAPPED(dstaddr, &connp->conn_faddr_v6); 5470 connp->conn_fport = dstport; 5471 5472 /* 5473 * At this point the remote destination address and remote port fields 5474 * in the tcp-four-tuple have been filled in the tcp structure. Now we 5475 * have to see which state tcp was in so we can take appropriate action. 5476 */ 5477 if (tcp->tcp_state == TCPS_IDLE) { 5478 /* 5479 * We support a quick connect capability here, allowing 5480 * clients to transition directly from IDLE to SYN_SENT 5481 * tcp_bindi will pick an unused port, insert the connection 5482 * in the bind hash and transition to BOUND state. 
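 *
 * In effect an unbound endpoint gets an implicit bind to an ephemeral
 * local port here as part of connect(), rather than requiring an
 * explicit bind first.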
5483 */ 5484 lport = tcp_update_next_port(tcps->tcps_next_port_to_try, 5485 tcp, B_TRUE); 5486 lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, 5487 B_FALSE, B_FALSE); 5488 if (lport == 0) 5489 return (-TNOADDR); 5490 } 5491 5492 /* 5493 * Lookup the route to determine a source address and the uinfo. 5494 * Setup TCP parameters based on the metrics/DCE. 5495 */ 5496 error = tcp_set_destination(tcp); 5497 if (error != 0) 5498 return (error); 5499 5500 /* 5501 * Don't let an endpoint connect to itself. 5502 */ 5503 if (connp->conn_faddr_v4 == connp->conn_laddr_v4 && 5504 connp->conn_fport == connp->conn_lport) 5505 return (-TBADADDR); 5506 5507 tcp->tcp_state = TCPS_SYN_SENT; 5508 5509 return (ipcl_conn_insert_v4(connp)); 5510 } 5511 5512 /* 5513 * Handle connect to IPv6 destinations. 5514 * Returns zero if OK, a positive errno, or a negative TLI error. 5515 */ 5516 static int 5517 tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, 5518 uint32_t flowinfo, uint_t srcid, uint32_t scope_id) 5519 { 5520 uint16_t lport; 5521 conn_t *connp = tcp->tcp_connp; 5522 tcp_stack_t *tcps = tcp->tcp_tcps; 5523 int error; 5524 5525 ASSERT(connp->conn_family == AF_INET6); 5526 5527 /* 5528 * If we're here, it means that the destination address is a native 5529 * IPv6 address. Return an error if conn_ipversion is not IPv6. A 5530 * reason why it might not be IPv6 is if the socket was bound to an 5531 * IPv4-mapped IPv6 address. 5532 */ 5533 if (connp->conn_ipversion != IPV6_VERSION) 5534 return (-TBADADDR); 5535 5536 /* 5537 * Interpret a zero destination to mean loopback. 5538 * Update the T_CONN_REQ (sin/sin6) since it is used to 5539 * generate the T_CONN_CON. 5540 */ 5541 if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) 5542 *dstaddrp = ipv6_loopback; 5543 5544 /* Handle __sin6_src_id if socket not bound to an IP address */ 5545 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { 5546 ip_srcid_find_id(srcid, &connp->conn_laddr_v6, 5547 IPCL_ZONEID(connp), tcps->tcps_netstack); 5548 connp->conn_saddr_v6 = connp->conn_laddr_v6; 5549 } 5550 5551 /* 5552 * Take care of the scope_id now. 5553 */ 5554 if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { 5555 connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; 5556 connp->conn_ixa->ixa_scopeid = scope_id; 5557 } else { 5558 connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 5559 } 5560 5561 connp->conn_flowinfo = flowinfo; 5562 connp->conn_faddr_v6 = *dstaddrp; 5563 connp->conn_fport = dstport; 5564 5565 /* 5566 * At this point the remote destination address and remote port fields 5567 * in the tcp-four-tuple have been filled in the tcp structure. Now we 5568 * have to see which state tcp was in so we can take appropriate action. 5569 */ 5570 if (tcp->tcp_state == TCPS_IDLE) { 5571 /* 5572 * We support a quick connect capability here, allowing 5573 * clients to transition directly from IDLE to SYN_SENT 5574 * tcp_bindi will pick an unused port, insert the connection 5575 * in the bind hash and transition to BOUND state. 5576 */ 5577 lport = tcp_update_next_port(tcps->tcps_next_port_to_try, 5578 tcp, B_TRUE); 5579 lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, 5580 B_FALSE, B_FALSE); 5581 if (lport == 0) 5582 return (-TNOADDR); 5583 } 5584 5585 /* 5586 * Lookup the route to determine a source address and the uinfo. 5587 * Setup TCP parameters based on the metrics/DCE. 
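 *
 * As in the IPv4 path above, this also selects the source address when
 * the endpoint is not yet bound to one.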
5588 */
5589 error = tcp_set_destination(tcp);
5590 if (error != 0)
5591 return (error);
5592
5593 /*
5594 * Don't let an endpoint connect to itself.
5595 */
5596 if (IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &connp->conn_laddr_v6) &&
5597 connp->conn_fport == connp->conn_lport)
5598 return (-TBADADDR);
5599
5600 tcp->tcp_state = TCPS_SYN_SENT;
5601
5602 return (ipcl_conn_insert_v6(connp));
5603 }
5604
5605 /*
5606 * Disconnect
5607 * Note that unlike other functions this returns a positive TLI error
5608 * when it fails; it never returns an errno.
5609 */
5610 static int
5611 tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum)
5612 {
5613 conn_t *lconnp;
5614 tcp_stack_t *tcps = tcp->tcp_tcps;
5615 conn_t *connp = tcp->tcp_connp;
5616
5617 /*
5618 * Right now, upper modules pass down a T_DISCON_REQ to TCP
5619 * when the stream is in BOUND state. Do not send a reset,
5620 * since the destination IP address is not valid, and it can
5621 * be the initialized value of all zeros (broadcast address).
5622 */
5623 if (tcp->tcp_state <= TCPS_BOUND) {
5624 if (connp->conn_debug) {
5625 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
5626 "tcp_disconnect: bad state, %d", tcp->tcp_state);
5627 }
5628 return (TOUTSTATE);
5629 }
5630
5631
5632 if (seqnum == -1 || tcp->tcp_conn_req_max == 0) {
5633
5634 /*
5635 * According to TPI, for non-listeners, ignore seqnum
5636 * and disconnect.
5637 * The following interpretation of a -1 seqnum is historical
5638 * and implied by TPI(?). (TPI only states that for T_CONN_IND,
5639 * a valid seqnum should not be -1.)
5640 *
5641 * A seqnum of -1 means disconnect everything,
5642 * regardless, even on a listener.
5643 */
5644
5645 int old_state = tcp->tcp_state;
5646 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
5647
5648 /*
5649 * The connection can't be on the tcp_time_wait_head list
5650 * since it is not detached.
5651 */
5652 ASSERT(tcp->tcp_time_wait_next == NULL);
5653 ASSERT(tcp->tcp_time_wait_prev == NULL);
5654 ASSERT(tcp->tcp_time_wait_expire == 0);
5655 /*
5656 * If it used to be a listener, check to make sure no one else
5657 * has taken the port before switching back to LISTEN state.
5658 */
5659 if (connp->conn_ipversion == IPV4_VERSION) {
5660 lconnp = ipcl_lookup_listener_v4(connp->conn_lport,
5661 connp->conn_laddr_v4, IPCL_ZONEID(connp), ipst);
5662 } else {
5663 uint_t ifindex = 0;
5664
5665 if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)
5666 ifindex = connp->conn_ixa->ixa_scopeid;
5667
5668 /* Allow conn_bound_if listeners? */
5669 lconnp = ipcl_lookup_listener_v6(connp->conn_lport,
5670 &connp->conn_laddr_v6, ifindex, IPCL_ZONEID(connp),
5671 ipst);
5672 }
5673 if (tcp->tcp_conn_req_max && lconnp == NULL) {
5674 tcp->tcp_state = TCPS_LISTEN;
5675 } else if (old_state > TCPS_BOUND) {
5676 tcp->tcp_conn_req_max = 0;
5677 tcp->tcp_state = TCPS_BOUND;
5678
5679 /*
5680 * If this end point is not going to become a listener,
5681 * decrement the listener connection count if
5682 * necessary. Note that we do not do this if it is
5683 * going to be a listener (the above if case) since
5684 * then it may remove the counter struct.
5685 */ 5686 if (tcp->tcp_listen_cnt != NULL) 5687 TCP_DECR_LISTEN_CNT(tcp); 5688 } 5689 if (lconnp != NULL) 5690 CONN_DEC_REF(lconnp); 5691 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) { 5692 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); 5693 } else if (old_state == TCPS_ESTABLISHED || 5694 old_state == TCPS_CLOSE_WAIT) { 5695 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); 5696 } 5697 5698 if (tcp->tcp_fused) 5699 tcp_unfuse(tcp); 5700 5701 mutex_enter(&tcp->tcp_eager_lock); 5702 if ((tcp->tcp_conn_req_cnt_q0 != 0) || 5703 (tcp->tcp_conn_req_cnt_q != 0)) { 5704 tcp_eager_cleanup(tcp, 0); 5705 } 5706 mutex_exit(&tcp->tcp_eager_lock); 5707 5708 tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt, 5709 tcp->tcp_rnxt, TH_RST | TH_ACK); 5710 5711 tcp_reinit(tcp); 5712 5713 return (0); 5714 } else if (!tcp_eager_blowoff(tcp, seqnum)) { 5715 return (TBADSEQ); 5716 } 5717 return (0); 5718 } 5719 5720 /* 5721 * Our client hereby directs us to reject the connection request 5722 * that tcp_input_listener() marked with 'seqnum'. Rejection consists 5723 * of sending the appropriate RST, not an ICMP error. 5724 */ 5725 static void 5726 tcp_disconnect(tcp_t *tcp, mblk_t *mp) 5727 { 5728 t_scalar_t seqnum; 5729 int error; 5730 conn_t *connp = tcp->tcp_connp; 5731 5732 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 5733 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { 5734 tcp_err_ack(tcp, mp, TPROTO, 0); 5735 return; 5736 } 5737 seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number; 5738 error = tcp_disconnect_common(tcp, seqnum); 5739 if (error != 0) 5740 tcp_err_ack(tcp, mp, error, 0); 5741 else { 5742 if (tcp->tcp_state >= TCPS_ESTABLISHED) { 5743 /* Send M_FLUSH according to TPI */ 5744 (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); 5745 } 5746 mp = mi_tpi_ok_ack_alloc(mp); 5747 if (mp != NULL) 5748 putnext(connp->conn_rq, mp); 5749 } 5750 } 5751 5752 /* 5753 * Diagnostic routine used to return a string associated with the tcp state. 5754 * Note that if the caller does not supply a buffer, it will use an internal 5755 * static string. This means that if multiple threads call this function at 5756 * the same time, output can be corrupted... Note also that this function 5757 * does not check the size of the supplied buffer. The caller has to make 5758 * sure that it is big enough. 
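 *
 * A supplied buffer of at least INET6_ADDRSTRLEN * 2 + 80 bytes (the
 * size of the internal priv_buf below) should be safe for
 * DISP_ADDR_AND_PORT output.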
5759 */ 5760 static char * 5761 tcp_display(tcp_t *tcp, char *sup_buf, char format) 5762 { 5763 char buf1[30]; 5764 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; 5765 char *buf; 5766 char *cp; 5767 in6_addr_t local, remote; 5768 char local_addrbuf[INET6_ADDRSTRLEN]; 5769 char remote_addrbuf[INET6_ADDRSTRLEN]; 5770 conn_t *connp; 5771 5772 if (sup_buf != NULL) 5773 buf = sup_buf; 5774 else 5775 buf = priv_buf; 5776 5777 if (tcp == NULL) 5778 return ("NULL_TCP"); 5779 5780 connp = tcp->tcp_connp; 5781 switch (tcp->tcp_state) { 5782 case TCPS_CLOSED: 5783 cp = "TCP_CLOSED"; 5784 break; 5785 case TCPS_IDLE: 5786 cp = "TCP_IDLE"; 5787 break; 5788 case TCPS_BOUND: 5789 cp = "TCP_BOUND"; 5790 break; 5791 case TCPS_LISTEN: 5792 cp = "TCP_LISTEN"; 5793 break; 5794 case TCPS_SYN_SENT: 5795 cp = "TCP_SYN_SENT"; 5796 break; 5797 case TCPS_SYN_RCVD: 5798 cp = "TCP_SYN_RCVD"; 5799 break; 5800 case TCPS_ESTABLISHED: 5801 cp = "TCP_ESTABLISHED"; 5802 break; 5803 case TCPS_CLOSE_WAIT: 5804 cp = "TCP_CLOSE_WAIT"; 5805 break; 5806 case TCPS_FIN_WAIT_1: 5807 cp = "TCP_FIN_WAIT_1"; 5808 break; 5809 case TCPS_CLOSING: 5810 cp = "TCP_CLOSING"; 5811 break; 5812 case TCPS_LAST_ACK: 5813 cp = "TCP_LAST_ACK"; 5814 break; 5815 case TCPS_FIN_WAIT_2: 5816 cp = "TCP_FIN_WAIT_2"; 5817 break; 5818 case TCPS_TIME_WAIT: 5819 cp = "TCP_TIME_WAIT"; 5820 break; 5821 default: 5822 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 5823 cp = buf1; 5824 break; 5825 } 5826 switch (format) { 5827 case DISP_ADDR_AND_PORT: 5828 if (connp->conn_ipversion == IPV4_VERSION) { 5829 /* 5830 * Note that we use the remote address in the tcp_b 5831 * structure. This means that it will print out 5832 * the real destination address, not the next hop's 5833 * address if source routing is used. 5834 */ 5835 IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local); 5836 IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote); 5837 5838 } else { 5839 local = connp->conn_laddr_v6; 5840 remote = connp->conn_faddr_v6; 5841 } 5842 (void) inet_ntop(AF_INET6, &local, local_addrbuf, 5843 sizeof (local_addrbuf)); 5844 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, 5845 sizeof (remote_addrbuf)); 5846 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", 5847 local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf, 5848 ntohs(connp->conn_fport), cp); 5849 break; 5850 case DISP_PORT_ONLY: 5851 default: 5852 (void) mi_sprintf(buf, "[%u, %u] %s", 5853 ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); 5854 break; 5855 } 5856 5857 return (buf); 5858 } 5859 5860 /* 5861 * Called via squeue to get on to eager's perimeter. It sends a 5862 * TH_RST if eager is in the fanout table. The listener wants the 5863 * eager to disappear either by means of tcp_eager_blowoff() or 5864 * tcp_eager_cleanup() being called. tcp_eager_kill() can also be 5865 * called (via squeue) if the eager cannot be inserted in the 5866 * fanout table in tcp_input_listener(). 5867 */ 5868 /* ARGSUSED */ 5869 void 5870 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 5871 { 5872 conn_t *econnp = (conn_t *)arg; 5873 tcp_t *eager = econnp->conn_tcp; 5874 tcp_t *listener = eager->tcp_listener; 5875 5876 /* 5877 * We could be called because listener is closing. Since 5878 * the eager was using listener's queue's, we avoid 5879 * using the listeners queues from now on. 
5880 */
5881 ASSERT(eager->tcp_detached);
5882 econnp->conn_rq = NULL;
5883 econnp->conn_wq = NULL;
5884
5885 /*
5886 * An eager's conn_fanout will be NULL if it's a duplicate
5887 * of an existing 4-tuple in the conn fanout table.
5888 * We don't want to send an RST out in that case.
5889 */
5890 if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) {
5891 tcp_xmit_ctl("tcp_eager_kill, can't wait",
5892 eager, eager->tcp_snxt, 0, TH_RST);
5893 }
5894
5895 /* We are here because listener wants this eager gone */
5896 if (listener != NULL) {
5897 mutex_enter(&listener->tcp_eager_lock);
5898 tcp_eager_unlink(eager);
5899 if (eager->tcp_tconnind_started) {
5900 /*
5901 * The eager has sent a conn_ind up to the
5902 * listener, but the listener decides to close
5903 * instead. We need to drop the extra ref
5904 * placed on eager in tcp_input_data() before
5905 * sending the conn_ind to the listener.
5906 */
5907 CONN_DEC_REF(econnp);
5908 }
5909 mutex_exit(&listener->tcp_eager_lock);
5910 CONN_DEC_REF(listener->tcp_connp);
5911 }
5912
5913 if (eager->tcp_state != TCPS_CLOSED)
5914 tcp_close_detached(eager);
5915 }
5916
5917 /*
5918 * Reset any eager connection hanging off this listener marked
5919 * with 'seqnum' and then reclaim its resources.
5920 */
5921 static boolean_t
5922 tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum)
5923 {
5924 tcp_t *eager;
5925 mblk_t *mp;
5926 tcp_stack_t *tcps = listener->tcp_tcps;
5927
5928 TCP_STAT(tcps, tcp_eager_blowoff_calls);
5929 eager = listener;
5930 mutex_enter(&listener->tcp_eager_lock);
5931 do {
5932 eager = eager->tcp_eager_next_q;
5933 if (eager == NULL) {
5934 mutex_exit(&listener->tcp_eager_lock);
5935 return (B_FALSE);
5936 }
5937 } while (eager->tcp_conn_req_seqnum != seqnum);
5938
5939 if (eager->tcp_closemp_used) {
5940 mutex_exit(&listener->tcp_eager_lock);
5941 return (B_TRUE);
5942 }
5943 eager->tcp_closemp_used = B_TRUE;
5944 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
5945 CONN_INC_REF(eager->tcp_connp);
5946 mutex_exit(&listener->tcp_eager_lock);
5947 mp = &eager->tcp_closemp;
5948 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
5949 eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF);
5950 return (B_TRUE);
5951 }
5952
5953 /*
5954 * Reset any eager connection hanging off this listener
5955 * and then reclaim its resources.
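 *
 * Each eager is handed to its own squeue via its preallocated
 * tcp_closemp and tcp_eager_kill(); tcp_closemp_used guards against
 * queueing that close mblk twice.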
5956 */ 5957 static void 5958 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) 5959 { 5960 tcp_t *eager; 5961 mblk_t *mp; 5962 tcp_stack_t *tcps = listener->tcp_tcps; 5963 5964 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 5965 5966 if (!q0_only) { 5967 /* First cleanup q */ 5968 TCP_STAT(tcps, tcp_eager_blowoff_q); 5969 eager = listener->tcp_eager_next_q; 5970 while (eager != NULL) { 5971 if (!eager->tcp_closemp_used) { 5972 eager->tcp_closemp_used = B_TRUE; 5973 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); 5974 CONN_INC_REF(eager->tcp_connp); 5975 mp = &eager->tcp_closemp; 5976 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, 5977 tcp_eager_kill, eager->tcp_connp, NULL, 5978 SQ_FILL, SQTAG_TCP_EAGER_CLEANUP); 5979 } 5980 eager = eager->tcp_eager_next_q; 5981 } 5982 } 5983 /* Then cleanup q0 */ 5984 TCP_STAT(tcps, tcp_eager_blowoff_q0); 5985 eager = listener->tcp_eager_next_q0; 5986 while (eager != listener) { 5987 if (!eager->tcp_closemp_used) { 5988 eager->tcp_closemp_used = B_TRUE; 5989 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); 5990 CONN_INC_REF(eager->tcp_connp); 5991 mp = &eager->tcp_closemp; 5992 SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, 5993 tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, 5994 SQTAG_TCP_EAGER_CLEANUP_Q0); 5995 } 5996 eager = eager->tcp_eager_next_q0; 5997 } 5998 } 5999 6000 /* 6001 * If we are an eager connection hanging off a listener that hasn't 6002 * formally accepted the connection yet, get off his list and blow off 6003 * any data that we have accumulated. 6004 */ 6005 static void 6006 tcp_eager_unlink(tcp_t *tcp) 6007 { 6008 tcp_t *listener = tcp->tcp_listener; 6009 6010 ASSERT(listener != NULL); 6011 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 6012 if (tcp->tcp_eager_next_q0 != NULL) { 6013 ASSERT(tcp->tcp_eager_prev_q0 != NULL); 6014 6015 /* Remove the eager tcp from q0 */ 6016 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 6017 tcp->tcp_eager_prev_q0; 6018 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 6019 tcp->tcp_eager_next_q0; 6020 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 6021 listener->tcp_conn_req_cnt_q0--; 6022 6023 tcp->tcp_eager_next_q0 = NULL; 6024 tcp->tcp_eager_prev_q0 = NULL; 6025 6026 /* 6027 * Take the eager out, if it is in the list of droppable 6028 * eagers. 6029 */ 6030 MAKE_UNDROPPABLE(tcp); 6031 6032 if (tcp->tcp_syn_rcvd_timeout != 0) { 6033 /* we have timed out before */ 6034 ASSERT(listener->tcp_syn_rcvd_timeout > 0); 6035 listener->tcp_syn_rcvd_timeout--; 6036 } 6037 } else { 6038 tcp_t **tcpp = &listener->tcp_eager_next_q; 6039 tcp_t *prev = NULL; 6040 6041 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 6042 if (tcpp[0] == tcp) { 6043 if (listener->tcp_eager_last_q == tcp) { 6044 /* 6045 * If we are unlinking the last 6046 * element on the list, adjust 6047 * tail pointer. Set tail pointer 6048 * to nil when list is empty. 6049 */ 6050 ASSERT(tcp->tcp_eager_next_q == NULL); 6051 if (listener->tcp_eager_last_q == 6052 listener->tcp_eager_next_q) { 6053 listener->tcp_eager_last_q = 6054 NULL; 6055 } else { 6056 /* 6057 * We won't get here if there 6058 * is only one eager in the 6059 * list. 
6060 */ 6061 ASSERT(prev != NULL); 6062 listener->tcp_eager_last_q = 6063 prev; 6064 } 6065 } 6066 tcpp[0] = tcp->tcp_eager_next_q; 6067 tcp->tcp_eager_next_q = NULL; 6068 tcp->tcp_eager_last_q = NULL; 6069 ASSERT(listener->tcp_conn_req_cnt_q > 0); 6070 listener->tcp_conn_req_cnt_q--; 6071 break; 6072 } 6073 prev = tcpp[0]; 6074 } 6075 } 6076 tcp->tcp_listener = NULL; 6077 } 6078 6079 /* Shorthand to generate and send TPI error acks to our client */ 6080 static void 6081 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) 6082 { 6083 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 6084 putnext(tcp->tcp_connp->conn_rq, mp); 6085 } 6086 6087 /* Shorthand to generate and send TPI error acks to our client */ 6088 static void 6089 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 6090 int t_error, int sys_error) 6091 { 6092 struct T_error_ack *teackp; 6093 6094 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 6095 M_PCPROTO, T_ERROR_ACK)) != NULL) { 6096 teackp = (struct T_error_ack *)mp->b_rptr; 6097 teackp->ERROR_prim = primitive; 6098 teackp->TLI_error = t_error; 6099 teackp->UNIX_error = sys_error; 6100 putnext(tcp->tcp_connp->conn_rq, mp); 6101 } 6102 } 6103 6104 /* 6105 * Note: No locks are held when inspecting tcp_g_*epriv_ports 6106 * but instead the code relies on: 6107 * - the fact that the address of the array and its size never changes 6108 * - the atomic assignment of the elements of the array 6109 */ 6110 /* ARGSUSED */ 6111 static int 6112 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 6113 { 6114 int i; 6115 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 6116 6117 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 6118 if (tcps->tcps_g_epriv_ports[i] != 0) 6119 (void) mi_mpprintf(mp, "%d ", 6120 tcps->tcps_g_epriv_ports[i]); 6121 } 6122 return (0); 6123 } 6124 6125 /* 6126 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple 6127 * threads from changing it at the same time. 6128 */ 6129 /* ARGSUSED */ 6130 static int 6131 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 6132 cred_t *cr) 6133 { 6134 long new_value; 6135 int i; 6136 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 6137 6138 /* 6139 * Fail the request if the new value does not lie within the 6140 * port number limits. 6141 */ 6142 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 6143 new_value <= 0 || new_value >= 65536) { 6144 return (EINVAL); 6145 } 6146 6147 mutex_enter(&tcps->tcps_epriv_port_lock); 6148 /* Check if the value is already in the list */ 6149 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 6150 if (new_value == tcps->tcps_g_epriv_ports[i]) { 6151 mutex_exit(&tcps->tcps_epriv_port_lock); 6152 return (EEXIST); 6153 } 6154 } 6155 /* Find an empty slot */ 6156 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 6157 if (tcps->tcps_g_epriv_ports[i] == 0) 6158 break; 6159 } 6160 if (i == tcps->tcps_g_num_epriv_ports) { 6161 mutex_exit(&tcps->tcps_epriv_port_lock); 6162 return (EOVERFLOW); 6163 } 6164 /* Set the new value */ 6165 tcps->tcps_g_epriv_ports[i] = (uint16_t)new_value; 6166 mutex_exit(&tcps->tcps_epriv_port_lock); 6167 return (0); 6168 } 6169 6170 /* 6171 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple 6172 * threads from changing it at the same time. 
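 *
 * Readers such as tcp_extra_priv_ports_get() above intentionally take
 * no lock; as noted there, they rely on the atomic assignment of the
 * individual array elements, so only writers serialize here.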
6173 */ 6174 /* ARGSUSED */ 6175 static int 6176 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 6177 cred_t *cr) 6178 { 6179 long new_value; 6180 int i; 6181 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 6182 6183 /* 6184 * Fail the request if the new value does not lie within the 6185 * port number limits. 6186 */ 6187 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 || 6188 new_value >= 65536) { 6189 return (EINVAL); 6190 } 6191 6192 mutex_enter(&tcps->tcps_epriv_port_lock); 6193 /* Check that the value is already in the list */ 6194 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 6195 if (tcps->tcps_g_epriv_ports[i] == new_value) 6196 break; 6197 } 6198 if (i == tcps->tcps_g_num_epriv_ports) { 6199 mutex_exit(&tcps->tcps_epriv_port_lock); 6200 return (ESRCH); 6201 } 6202 /* Clear the value */ 6203 tcps->tcps_g_epriv_ports[i] = 0; 6204 mutex_exit(&tcps->tcps_epriv_port_lock); 6205 return (0); 6206 } 6207 6208 /* Return the TPI/TLI equivalent of our current tcp_state */ 6209 static int 6210 tcp_tpistate(tcp_t *tcp) 6211 { 6212 switch (tcp->tcp_state) { 6213 case TCPS_IDLE: 6214 return (TS_UNBND); 6215 case TCPS_LISTEN: 6216 /* 6217 * Return whether there are outstanding T_CONN_IND waiting 6218 * for the matching T_CONN_RES. Therefore don't count q0. 6219 */ 6220 if (tcp->tcp_conn_req_cnt_q > 0) 6221 return (TS_WRES_CIND); 6222 else 6223 return (TS_IDLE); 6224 case TCPS_BOUND: 6225 return (TS_IDLE); 6226 case TCPS_SYN_SENT: 6227 return (TS_WCON_CREQ); 6228 case TCPS_SYN_RCVD: 6229 /* 6230 * Note: assumption: this has to the active open SYN_RCVD. 6231 * The passive instance is detached in SYN_RCVD stage of 6232 * incoming connection processing so we cannot get request 6233 * for T_info_ack on it. 6234 */ 6235 return (TS_WACK_CRES); 6236 case TCPS_ESTABLISHED: 6237 return (TS_DATA_XFER); 6238 case TCPS_CLOSE_WAIT: 6239 return (TS_WREQ_ORDREL); 6240 case TCPS_FIN_WAIT_1: 6241 return (TS_WIND_ORDREL); 6242 case TCPS_FIN_WAIT_2: 6243 return (TS_WIND_ORDREL); 6244 6245 case TCPS_CLOSING: 6246 case TCPS_LAST_ACK: 6247 case TCPS_TIME_WAIT: 6248 case TCPS_CLOSED: 6249 /* 6250 * Following TS_WACK_DREQ7 is a rendition of "not 6251 * yet TS_IDLE" TPI state. There is no best match to any 6252 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we 6253 * choose a value chosen that will map to TLI/XTI level 6254 * state of TSTATECHNG (state is process of changing) which 6255 * captures what this dummy state represents. 6256 */ 6257 return (TS_WACK_DREQ7); 6258 default: 6259 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", 6260 tcp->tcp_state, tcp_display(tcp, NULL, 6261 DISP_PORT_ONLY)); 6262 return (TS_UNBND); 6263 } 6264 } 6265 6266 static void 6267 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) 6268 { 6269 tcp_stack_t *tcps = tcp->tcp_tcps; 6270 conn_t *connp = tcp->tcp_connp; 6271 6272 if (connp->conn_family == AF_INET6) 6273 *tia = tcp_g_t_info_ack_v6; 6274 else 6275 *tia = tcp_g_t_info_ack; 6276 tia->CURRENT_state = tcp_tpistate(tcp); 6277 tia->OPT_size = tcp_max_optsize; 6278 if (tcp->tcp_mss == 0) { 6279 /* Not yet set - tcp_open does not set mss */ 6280 if (connp->conn_ipversion == IPV4_VERSION) 6281 tia->TIDU_size = tcps->tcps_mss_def_ipv4; 6282 else 6283 tia->TIDU_size = tcps->tcps_mss_def_ipv6; 6284 } else { 6285 tia->TIDU_size = tcp->tcp_mss; 6286 } 6287 /* TODO: Default ETSDU is 1. Is that correct for tcp? 
*/ 6288 } 6289 6290 static void 6291 tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap, 6292 t_uscalar_t cap_bits1) 6293 { 6294 tcap->CAP_bits1 = 0; 6295 6296 if (cap_bits1 & TC1_INFO) { 6297 tcp_copy_info(&tcap->INFO_ack, tcp); 6298 tcap->CAP_bits1 |= TC1_INFO; 6299 } 6300 6301 if (cap_bits1 & TC1_ACCEPTOR_ID) { 6302 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; 6303 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; 6304 } 6305 6306 } 6307 6308 /* 6309 * This routine responds to T_CAPABILITY_REQ messages. It is called by 6310 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from 6311 * tcp_g_t_info_ack. The current state of the stream is copied from 6312 * tcp_state. 6313 */ 6314 static void 6315 tcp_capability_req(tcp_t *tcp, mblk_t *mp) 6316 { 6317 t_uscalar_t cap_bits1; 6318 struct T_capability_ack *tcap; 6319 6320 if (MBLKL(mp) < sizeof (struct T_capability_req)) { 6321 freemsg(mp); 6322 return; 6323 } 6324 6325 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 6326 6327 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 6328 mp->b_datap->db_type, T_CAPABILITY_ACK); 6329 if (mp == NULL) 6330 return; 6331 6332 tcap = (struct T_capability_ack *)mp->b_rptr; 6333 tcp_do_capability_ack(tcp, tcap, cap_bits1); 6334 6335 putnext(tcp->tcp_connp->conn_rq, mp); 6336 } 6337 6338 /* 6339 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. 6340 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. 6341 * The current state of the stream is copied from tcp_state. 6342 */ 6343 static void 6344 tcp_info_req(tcp_t *tcp, mblk_t *mp) 6345 { 6346 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 6347 T_INFO_ACK); 6348 if (!mp) { 6349 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 6350 return; 6351 } 6352 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); 6353 putnext(tcp->tcp_connp->conn_rq, mp); 6354 } 6355 6356 /* Respond to the TPI addr request */ 6357 static void 6358 tcp_addr_req(tcp_t *tcp, mblk_t *mp) 6359 { 6360 struct sockaddr *sa; 6361 mblk_t *ackmp; 6362 struct T_addr_ack *taa; 6363 conn_t *connp = tcp->tcp_connp; 6364 uint_t addrlen; 6365 6366 /* Make it large enough for worst case */ 6367 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 6368 2 * sizeof (sin6_t), 1); 6369 if (ackmp == NULL) { 6370 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 6371 return; 6372 } 6373 6374 taa = (struct T_addr_ack *)ackmp->b_rptr; 6375 6376 bzero(taa, sizeof (struct T_addr_ack)); 6377 ackmp->b_wptr = (uchar_t *)&taa[1]; 6378 6379 taa->PRIM_type = T_ADDR_ACK; 6380 ackmp->b_datap->db_type = M_PCPROTO; 6381 6382 if (connp->conn_family == AF_INET) 6383 addrlen = sizeof (sin_t); 6384 else 6385 addrlen = sizeof (sin6_t); 6386 6387 /* 6388 * Note: Following code assumes 32 bit alignment of basic 6389 * data structures like sin_t and struct T_addr_ack. 
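 *
 * The ack is laid out as the T_addr_ack header, followed by the local
 * sockaddr (if bound) and then the remote sockaddr (if connected), with
 * the LOCADDR/REMADDR offsets recorded in the header; the reallocb()
 * above sized the mblk for the worst case of two sin6_t structures.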
6390 */ 6391 if (tcp->tcp_state >= TCPS_BOUND) { 6392 /* 6393 * Fill in local address first 6394 */ 6395 taa->LOCADDR_offset = sizeof (*taa); 6396 taa->LOCADDR_length = addrlen; 6397 sa = (struct sockaddr *)&taa[1]; 6398 (void) conn_getsockname(connp, sa, &addrlen); 6399 ackmp->b_wptr += addrlen; 6400 } 6401 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 6402 /* 6403 * Fill in Remote address 6404 */ 6405 taa->REMADDR_length = addrlen; 6406 /* assumed 32-bit alignment */ 6407 taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; 6408 sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); 6409 (void) conn_getpeername(connp, sa, &addrlen); 6410 ackmp->b_wptr += addrlen; 6411 } 6412 ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); 6413 putnext(tcp->tcp_connp->conn_rq, ackmp); 6414 } 6415 6416 /* 6417 * Handle reinitialization of a tcp structure. 6418 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE. 6419 */ 6420 static void 6421 tcp_reinit(tcp_t *tcp) 6422 { 6423 mblk_t *mp; 6424 tcp_stack_t *tcps = tcp->tcp_tcps; 6425 conn_t *connp = tcp->tcp_connp; 6426 6427 TCP_STAT(tcps, tcp_reinit_calls); 6428 6429 /* tcp_reinit should never be called for detached tcp_t's */ 6430 ASSERT(tcp->tcp_listener == NULL); 6431 ASSERT((connp->conn_family == AF_INET && 6432 connp->conn_ipversion == IPV4_VERSION) || 6433 (connp->conn_family == AF_INET6 && 6434 (connp->conn_ipversion == IPV4_VERSION || 6435 connp->conn_ipversion == IPV6_VERSION))); 6436 6437 /* Cancel outstanding timers */ 6438 tcp_timers_stop(tcp); 6439 6440 /* 6441 * Reset everything in the state vector, after updating global 6442 * MIB data from instance counters. 6443 */ 6444 UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs); 6445 tcp->tcp_ibsegs = 0; 6446 UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs); 6447 tcp->tcp_obsegs = 0; 6448 6449 tcp_close_mpp(&tcp->tcp_xmit_head); 6450 if (tcp->tcp_snd_zcopy_aware) 6451 tcp_zcopy_notify(tcp); 6452 tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL; 6453 tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; 6454 mutex_enter(&tcp->tcp_non_sq_lock); 6455 if (tcp->tcp_flow_stopped && 6456 TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { 6457 tcp_clrqfull(tcp); 6458 } 6459 mutex_exit(&tcp->tcp_non_sq_lock); 6460 tcp_close_mpp(&tcp->tcp_reass_head); 6461 tcp->tcp_reass_tail = NULL; 6462 if (tcp->tcp_rcv_list != NULL) { 6463 /* Free b_next chain */ 6464 tcp_close_mpp(&tcp->tcp_rcv_list); 6465 tcp->tcp_rcv_last_head = NULL; 6466 tcp->tcp_rcv_last_tail = NULL; 6467 tcp->tcp_rcv_cnt = 0; 6468 } 6469 tcp->tcp_rcv_last_tail = NULL; 6470 6471 if ((mp = tcp->tcp_urp_mp) != NULL) { 6472 freemsg(mp); 6473 tcp->tcp_urp_mp = NULL; 6474 } 6475 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 6476 freemsg(mp); 6477 tcp->tcp_urp_mark_mp = NULL; 6478 } 6479 if (tcp->tcp_fused_sigurg_mp != NULL) { 6480 ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 6481 freeb(tcp->tcp_fused_sigurg_mp); 6482 tcp->tcp_fused_sigurg_mp = NULL; 6483 } 6484 if (tcp->tcp_ordrel_mp != NULL) { 6485 ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 6486 freeb(tcp->tcp_ordrel_mp); 6487 tcp->tcp_ordrel_mp = NULL; 6488 } 6489 6490 /* 6491 * Following is a union with two members which are 6492 * identical types and size so the following cleanup 6493 * is enough. 6494 */ 6495 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 6496 6497 CL_INET_DISCONNECT(connp); 6498 6499 /* 6500 * The connection can't be on the tcp_time_wait_head list 6501 * since it is not detached. 
6502 */ 6503 ASSERT(tcp->tcp_time_wait_next == NULL); 6504 ASSERT(tcp->tcp_time_wait_prev == NULL); 6505 ASSERT(tcp->tcp_time_wait_expire == 0); 6506 6507 if (tcp->tcp_kssl_pending) { 6508 tcp->tcp_kssl_pending = B_FALSE; 6509 6510 /* Don't reset if the initialized by bind. */ 6511 if (tcp->tcp_kssl_ent != NULL) { 6512 kssl_release_ent(tcp->tcp_kssl_ent, NULL, 6513 KSSL_NO_PROXY); 6514 } 6515 } 6516 if (tcp->tcp_kssl_ctx != NULL) { 6517 kssl_release_ctx(tcp->tcp_kssl_ctx); 6518 tcp->tcp_kssl_ctx = NULL; 6519 } 6520 6521 /* 6522 * Reset/preserve other values 6523 */ 6524 tcp_reinit_values(tcp); 6525 ipcl_hash_remove(connp); 6526 ixa_cleanup(connp->conn_ixa); 6527 tcp_ipsec_cleanup(tcp); 6528 6529 connp->conn_laddr_v6 = connp->conn_bound_addr_v6; 6530 connp->conn_saddr_v6 = connp->conn_bound_addr_v6; 6531 6532 if (tcp->tcp_conn_req_max != 0) { 6533 /* 6534 * This is the case when a TLI program uses the same 6535 * transport end point to accept a connection. This 6536 * makes the TCP both a listener and acceptor. When 6537 * this connection is closed, we need to set the state 6538 * back to TCPS_LISTEN. Make sure that the eager list 6539 * is reinitialized. 6540 * 6541 * Note that this stream is still bound to the four 6542 * tuples of the previous connection in IP. If a new 6543 * SYN with different foreign address comes in, IP will 6544 * not find it and will send it to the global queue. In 6545 * the global queue, TCP will do a tcp_lookup_listener() 6546 * to find this stream. This works because this stream 6547 * is only removed from connected hash. 6548 * 6549 */ 6550 tcp->tcp_state = TCPS_LISTEN; 6551 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 6552 tcp->tcp_eager_next_drop_q0 = tcp; 6553 tcp->tcp_eager_prev_drop_q0 = tcp; 6554 /* 6555 * Initially set conn_recv to tcp_input_listener_unbound to try 6556 * to pick a good squeue for the listener when the first SYN 6557 * arrives. tcp_input_listener_unbound sets it to 6558 * tcp_input_listener on that first SYN. 6559 */ 6560 connp->conn_recv = tcp_input_listener_unbound; 6561 6562 connp->conn_proto = IPPROTO_TCP; 6563 connp->conn_faddr_v6 = ipv6_all_zeros; 6564 connp->conn_fport = 0; 6565 6566 (void) ipcl_bind_insert(connp); 6567 } else { 6568 tcp->tcp_state = TCPS_BOUND; 6569 } 6570 6571 /* 6572 * Initialize to default values 6573 */ 6574 tcp_init_values(tcp); 6575 6576 ASSERT(tcp->tcp_ptpbhn != NULL); 6577 tcp->tcp_rwnd = connp->conn_rcvbuf; 6578 tcp->tcp_mss = connp->conn_ipversion != IPV4_VERSION ? 6579 tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4; 6580 } 6581 6582 /* 6583 * Force values to zero that need be zero. 6584 * Do not touch values asociated with the BOUND or LISTEN state 6585 * since the connection will end up in that state after the reinit. 6586 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t 6587 * structure! 6588 */ 6589 static void 6590 tcp_reinit_values(tcp) 6591 tcp_t *tcp; 6592 { 6593 tcp_stack_t *tcps = tcp->tcp_tcps; 6594 conn_t *connp = tcp->tcp_connp; 6595 6596 #ifndef lint 6597 #define DONTCARE(x) 6598 #define PRESERVE(x) 6599 #else 6600 #define DONTCARE(x) ((x) = (x)) 6601 #define PRESERVE(x) ((x) = (x)) 6602 #endif /* lint */ 6603 6604 PRESERVE(tcp->tcp_bind_hash_port); 6605 PRESERVE(tcp->tcp_bind_hash); 6606 PRESERVE(tcp->tcp_ptpbhn); 6607 PRESERVE(tcp->tcp_acceptor_hash); 6608 PRESERVE(tcp->tcp_ptpahn); 6609 6610 /* Should be ASSERT NULL on these with new code! 
*/ 6611 ASSERT(tcp->tcp_time_wait_next == NULL); 6612 ASSERT(tcp->tcp_time_wait_prev == NULL); 6613 ASSERT(tcp->tcp_time_wait_expire == 0); 6614 PRESERVE(tcp->tcp_state); 6615 PRESERVE(connp->conn_rq); 6616 PRESERVE(connp->conn_wq); 6617 6618 ASSERT(tcp->tcp_xmit_head == NULL); 6619 ASSERT(tcp->tcp_xmit_last == NULL); 6620 ASSERT(tcp->tcp_unsent == 0); 6621 ASSERT(tcp->tcp_xmit_tail == NULL); 6622 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 6623 6624 tcp->tcp_snxt = 0; /* Displayed in mib */ 6625 tcp->tcp_suna = 0; /* Displayed in mib */ 6626 tcp->tcp_swnd = 0; 6627 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */ 6628 6629 ASSERT(tcp->tcp_ibsegs == 0); 6630 ASSERT(tcp->tcp_obsegs == 0); 6631 6632 if (connp->conn_ht_iphc != NULL) { 6633 kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); 6634 connp->conn_ht_iphc = NULL; 6635 connp->conn_ht_iphc_allocated = 0; 6636 connp->conn_ht_iphc_len = 0; 6637 connp->conn_ht_ulp = NULL; 6638 connp->conn_ht_ulp_len = 0; 6639 tcp->tcp_ipha = NULL; 6640 tcp->tcp_ip6h = NULL; 6641 tcp->tcp_tcpha = NULL; 6642 } 6643 6644 /* We clear any IP_OPTIONS and extension headers */ 6645 ip_pkt_free(&connp->conn_xmit_ipp); 6646 6647 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */ 6648 DONTCARE(tcp->tcp_ipha); 6649 DONTCARE(tcp->tcp_ip6h); 6650 DONTCARE(tcp->tcp_tcpha); 6651 tcp->tcp_valid_bits = 0; 6652 6653 DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */ 6654 DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */ 6655 tcp->tcp_last_rcv_lbolt = 0; 6656 6657 tcp->tcp_init_cwnd = 0; 6658 6659 tcp->tcp_urp_last_valid = 0; 6660 tcp->tcp_hard_binding = 0; 6661 6662 tcp->tcp_fin_acked = 0; 6663 tcp->tcp_fin_rcvd = 0; 6664 tcp->tcp_fin_sent = 0; 6665 tcp->tcp_ordrel_done = 0; 6666 6667 tcp->tcp_detached = 0; 6668 6669 tcp->tcp_snd_ws_ok = B_FALSE; 6670 tcp->tcp_snd_ts_ok = B_FALSE; 6671 tcp->tcp_zero_win_probe = 0; 6672 6673 tcp->tcp_loopback = 0; 6674 tcp->tcp_localnet = 0; 6675 tcp->tcp_syn_defense = 0; 6676 tcp->tcp_set_timer = 0; 6677 6678 tcp->tcp_active_open = 0; 6679 tcp->tcp_rexmit = B_FALSE; 6680 tcp->tcp_xmit_zc_clean = B_FALSE; 6681 6682 tcp->tcp_snd_sack_ok = B_FALSE; 6683 tcp->tcp_hwcksum = B_FALSE; 6684 6685 DONTCARE(tcp->tcp_maxpsz_multiplier); /* Init in tcp_init_values */ 6686 6687 tcp->tcp_conn_def_q0 = 0; 6688 tcp->tcp_ip_forward_progress = B_FALSE; 6689 tcp->tcp_ecn_ok = B_FALSE; 6690 6691 tcp->tcp_cwr = B_FALSE; 6692 tcp->tcp_ecn_echo_on = B_FALSE; 6693 tcp->tcp_is_wnd_shrnk = B_FALSE; 6694 6695 if (tcp->tcp_sack_info != NULL) { 6696 if (tcp->tcp_notsack_list != NULL) { 6697 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, 6698 tcp); 6699 } 6700 kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info); 6701 tcp->tcp_sack_info = NULL; 6702 } 6703 6704 tcp->tcp_rcv_ws = 0; 6705 tcp->tcp_snd_ws = 0; 6706 tcp->tcp_ts_recent = 0; 6707 tcp->tcp_rnxt = 0; /* Displayed in mib */ 6708 DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */ 6709 tcp->tcp_initial_pmtu = 0; 6710 6711 ASSERT(tcp->tcp_reass_head == NULL); 6712 ASSERT(tcp->tcp_reass_tail == NULL); 6713 6714 tcp->tcp_cwnd_cnt = 0; 6715 6716 ASSERT(tcp->tcp_rcv_list == NULL); 6717 ASSERT(tcp->tcp_rcv_last_head == NULL); 6718 ASSERT(tcp->tcp_rcv_last_tail == NULL); 6719 ASSERT(tcp->tcp_rcv_cnt == 0); 6720 6721 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */ 6722 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */ 6723 tcp->tcp_csuna = 0; 6724 6725 tcp->tcp_rto = 0; /* Displayed in MIB */ 6726 DONTCARE(tcp->tcp_rtt_sa); /* Init in 
tcp_init_values */ 6727 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */ 6728 tcp->tcp_rtt_update = 0; 6729 6730 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 6731 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 6732 6733 tcp->tcp_rack = 0; /* Displayed in mib */ 6734 tcp->tcp_rack_cnt = 0; 6735 tcp->tcp_rack_cur_max = 0; 6736 tcp->tcp_rack_abs_max = 0; 6737 6738 tcp->tcp_max_swnd = 0; 6739 6740 ASSERT(tcp->tcp_listener == NULL); 6741 6742 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */ 6743 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */ 6744 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */ 6745 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */ 6746 6747 ASSERT(tcp->tcp_conn_req_cnt_q == 0); 6748 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0); 6749 PRESERVE(tcp->tcp_conn_req_max); 6750 PRESERVE(tcp->tcp_conn_req_seqnum); 6751 6752 DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */ 6753 DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */ 6754 DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */ 6755 DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */ 6756 6757 DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */ 6758 ASSERT(tcp->tcp_urp_mp == NULL); 6759 ASSERT(tcp->tcp_urp_mark_mp == NULL); 6760 ASSERT(tcp->tcp_fused_sigurg_mp == NULL); 6761 6762 ASSERT(tcp->tcp_eager_next_q == NULL); 6763 ASSERT(tcp->tcp_eager_last_q == NULL); 6764 ASSERT((tcp->tcp_eager_next_q0 == NULL && 6765 tcp->tcp_eager_prev_q0 == NULL) || 6766 tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0); 6767 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 6768 6769 ASSERT((tcp->tcp_eager_next_drop_q0 == NULL && 6770 tcp->tcp_eager_prev_drop_q0 == NULL) || 6771 tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0); 6772 6773 tcp->tcp_client_errno = 0; 6774 6775 DONTCARE(connp->conn_sum); /* Init in tcp_init_values */ 6776 6777 connp->conn_faddr_v6 = ipv6_all_zeros; /* Displayed in MIB */ 6778 6779 PRESERVE(connp->conn_bound_addr_v6); 6780 tcp->tcp_last_sent_len = 0; 6781 tcp->tcp_dupack_cnt = 0; 6782 6783 connp->conn_fport = 0; /* Displayed in MIB */ 6784 PRESERVE(connp->conn_lport); 6785 6786 PRESERVE(tcp->tcp_acceptor_lockp); 6787 6788 ASSERT(tcp->tcp_ordrel_mp == NULL); 6789 PRESERVE(tcp->tcp_acceptor_id); 6790 DONTCARE(tcp->tcp_ipsec_overhead); 6791 6792 PRESERVE(connp->conn_family); 6793 /* Remove any remnants of mapped address binding */ 6794 if (connp->conn_family == AF_INET6) { 6795 connp->conn_ipversion = IPV6_VERSION; 6796 tcp->tcp_mss = tcps->tcps_mss_def_ipv6; 6797 } else { 6798 connp->conn_ipversion = IPV4_VERSION; 6799 tcp->tcp_mss = tcps->tcps_mss_def_ipv4; 6800 } 6801 6802 connp->conn_bound_if = 0; 6803 connp->conn_recv_ancillary.crb_all = 0; 6804 tcp->tcp_recvifindex = 0; 6805 tcp->tcp_recvhops = 0; 6806 tcp->tcp_closed = 0; 6807 tcp->tcp_cleandeathtag = 0; 6808 if (tcp->tcp_hopopts != NULL) { 6809 mi_free(tcp->tcp_hopopts); 6810 tcp->tcp_hopopts = NULL; 6811 tcp->tcp_hopoptslen = 0; 6812 } 6813 ASSERT(tcp->tcp_hopoptslen == 0); 6814 if (tcp->tcp_dstopts != NULL) { 6815 mi_free(tcp->tcp_dstopts); 6816 tcp->tcp_dstopts = NULL; 6817 tcp->tcp_dstoptslen = 0; 6818 } 6819 ASSERT(tcp->tcp_dstoptslen == 0); 6820 if (tcp->tcp_rthdrdstopts != NULL) { 6821 mi_free(tcp->tcp_rthdrdstopts); 6822 tcp->tcp_rthdrdstopts = NULL; 6823 tcp->tcp_rthdrdstoptslen = 0; 6824 } 6825 ASSERT(tcp->tcp_rthdrdstoptslen == 0); 6826 if (tcp->tcp_rthdr != NULL) { 6827 
mi_free(tcp->tcp_rthdr); 6828 tcp->tcp_rthdr = NULL; 6829 tcp->tcp_rthdrlen = 0; 6830 } 6831 ASSERT(tcp->tcp_rthdrlen == 0); 6832 6833 /* Reset fusion-related fields */ 6834 tcp->tcp_fused = B_FALSE; 6835 tcp->tcp_unfusable = B_FALSE; 6836 tcp->tcp_fused_sigurg = B_FALSE; 6837 tcp->tcp_loopback_peer = NULL; 6838 6839 tcp->tcp_lso = B_FALSE; 6840 6841 tcp->tcp_in_ack_unsent = 0; 6842 tcp->tcp_cork = B_FALSE; 6843 tcp->tcp_tconnind_started = B_FALSE; 6844 6845 PRESERVE(tcp->tcp_squeue_bytes); 6846 6847 ASSERT(tcp->tcp_kssl_ctx == NULL); 6848 ASSERT(!tcp->tcp_kssl_pending); 6849 PRESERVE(tcp->tcp_kssl_ent); 6850 6851 tcp->tcp_closemp_used = B_FALSE; 6852 6853 PRESERVE(tcp->tcp_rsrv_mp); 6854 PRESERVE(tcp->tcp_rsrv_mp_lock); 6855 6856 #ifdef DEBUG 6857 DONTCARE(tcp->tcmp_stk[0]); 6858 #endif 6859 6860 PRESERVE(tcp->tcp_connid); 6861 6862 ASSERT(tcp->tcp_listen_cnt == NULL); 6863 ASSERT(tcp->tcp_reass_tid == 0); 6864 6865 #undef DONTCARE 6866 #undef PRESERVE 6867 } 6868 6869 static void 6870 tcp_init_values(tcp_t *tcp) 6871 { 6872 tcp_stack_t *tcps = tcp->tcp_tcps; 6873 conn_t *connp = tcp->tcp_connp; 6874 6875 ASSERT((connp->conn_family == AF_INET && 6876 connp->conn_ipversion == IPV4_VERSION) || 6877 (connp->conn_family == AF_INET6 && 6878 (connp->conn_ipversion == IPV4_VERSION || 6879 connp->conn_ipversion == IPV6_VERSION))); 6880 6881 /* 6882 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 6883 * will be close to tcp_rexmit_interval_initial. By doing this, we 6884 * allow the algorithm to adjust slowly to large fluctuations of RTT 6885 * during first few transmissions of a connection as seen in slow 6886 * links. 6887 */ 6888 tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2; 6889 tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1; 6890 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 6891 tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 6892 tcps->tcps_conn_grace_period; 6893 if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min) 6894 tcp->tcp_rto = tcps->tcps_rexmit_interval_min; 6895 tcp->tcp_timer_backoff = 0; 6896 tcp->tcp_ms_we_have_waited = 0; 6897 tcp->tcp_last_recv_time = ddi_get_lbolt(); 6898 tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_; 6899 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 6900 tcp->tcp_snd_burst = TCP_CWND_INFINITE; 6901 6902 tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier; 6903 6904 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval; 6905 tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval; 6906 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval; 6907 /* 6908 * Fix it to tcp_ip_abort_linterval later if it turns out to be a 6909 * passive open. 6910 */ 6911 tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval; 6912 6913 tcp->tcp_naglim = tcps->tcps_naglim_def; 6914 6915 /* NOTE: ISS is now set in tcp_set_destination(). */ 6916 6917 /* Reset fusion-related fields */ 6918 tcp->tcp_fused = B_FALSE; 6919 tcp->tcp_unfusable = B_FALSE; 6920 tcp->tcp_fused_sigurg = B_FALSE; 6921 tcp->tcp_loopback_peer = NULL; 6922 6923 /* We rebuild the header template on the next connect/conn_request */ 6924 6925 connp->conn_mlp_type = mlptSingle; 6926 6927 /* 6928 * Init the window scale to the max so tcp_rwnd_set() won't pare 6929 * down tcp_rwnd. tcp_set_destination() will set the right value later. 
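 *
 * (Illustrative note added for clarity, not part of the original source:
 * with the 16-bit TCP window field and the maximum shift of
 * TCP_MAX_WINSHIFT (14 per RFC 1323), the largest representable window
 * is roughly 65535 << 14, about 1 GB, so any realistic conn_rcvbuf value
 * passes through tcp_rwnd_set() unclamped until the real shift is
 * negotiated.)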
6930 */ 6931 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 6932 tcp->tcp_rwnd = connp->conn_rcvbuf; 6933 6934 tcp->tcp_cork = B_FALSE; 6935 /* 6936 * Init the tcp_debug option if it wasn't already set. This value 6937 * determines whether TCP 6938 * calls strlog() to print out debug messages. Doing this 6939 * initialization here means that this value is not inherited thru 6940 * tcp_reinit(). 6941 */ 6942 if (!connp->conn_debug) 6943 connp->conn_debug = tcps->tcps_dbg; 6944 6945 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval; 6946 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval; 6947 } 6948 6949 /* At minimum we need 8 bytes in the TCP header for the lookup */ 6950 #define ICMP_MIN_TCP_HDR 8 6951 6952 /* 6953 * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages 6954 * passed up by IP. The message is always received on the correct tcp_t. 6955 * Assumes that IP has pulled up everything up to and including the ICMP header. 6956 */ 6957 /* ARGSUSED2 */ 6958 static void 6959 tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 6960 { 6961 conn_t *connp = (conn_t *)arg1; 6962 icmph_t *icmph; 6963 ipha_t *ipha; 6964 int iph_hdr_length; 6965 tcpha_t *tcpha; 6966 uint32_t seg_seq; 6967 tcp_t *tcp = connp->conn_tcp; 6968 6969 /* Assume IP provides aligned packets */ 6970 ASSERT(OK_32PTR(mp->b_rptr)); 6971 ASSERT((MBLKL(mp) >= sizeof (ipha_t))); 6972 6973 /* 6974 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent 6975 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. 6976 */ 6977 if (!(ira->ira_flags & IRAF_IS_IPV4)) { 6978 tcp_icmp_error_ipv6(tcp, mp, ira); 6979 return; 6980 } 6981 6982 /* Skip past the outer IP and ICMP headers */ 6983 iph_hdr_length = ira->ira_ip_hdr_length; 6984 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 6985 /* 6986 * If we don't have the correct outer IP header length 6987 * or if we don't have a complete inner IP header 6988 * drop it. 6989 */ 6990 if (iph_hdr_length < sizeof (ipha_t) || 6991 (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { 6992 noticmpv4: 6993 freemsg(mp); 6994 return; 6995 } 6996 ipha = (ipha_t *)&icmph[1]; 6997 6998 /* Skip past the inner IP and find the ULP header */ 6999 iph_hdr_length = IPH_HDR_LENGTH(ipha); 7000 tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length); 7001 /* 7002 * If we don't have the correct inner IP header length or if the ULP 7003 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR 7004 * bytes of TCP header, drop it. 7005 */ 7006 if (iph_hdr_length < sizeof (ipha_t) || 7007 ipha->ipha_protocol != IPPROTO_TCP || 7008 (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) { 7009 goto noticmpv4; 7010 } 7011 7012 seg_seq = ntohl(tcpha->tha_seq); 7013 switch (icmph->icmph_type) { 7014 case ICMP_DEST_UNREACHABLE: 7015 switch (icmph->icmph_code) { 7016 case ICMP_FRAGMENTATION_NEEDED: 7017 /* 7018 * Update Path MTU, then try to send something out. 7019 */ 7020 tcp_update_pmtu(tcp, B_TRUE); 7021 tcp_rexmit_after_error(tcp); 7022 break; 7023 case ICMP_PORT_UNREACHABLE: 7024 case ICMP_PROTOCOL_UNREACHABLE: 7025 switch (tcp->tcp_state) { 7026 case TCPS_SYN_SENT: 7027 case TCPS_SYN_RCVD: 7028 /* 7029 * ICMP can snipe away incipient 7030 * TCP connections as long as 7031 * seq number is same as initial 7032 * send seq number. 
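 *
 * (Illustrative example, not from the original source; the numbers are
 * made up: if our SYN carried an ISS of 0x1000 and a stale or forged
 * ICMP port-unreachable quotes sequence 0x2000, then seg_seq != tcp_iss
 * and the error is ignored; only an error quoting 0x1000 tears down the
 * handshake with ECONNREFUSED.)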
7033 */ 7034 if (seg_seq == tcp->tcp_iss) { 7035 (void) tcp_clean_death(tcp, 7036 ECONNREFUSED, 6); 7037 } 7038 break; 7039 } 7040 break; 7041 case ICMP_HOST_UNREACHABLE: 7042 case ICMP_NET_UNREACHABLE: 7043 /* Record the error in case we finally time out. */ 7044 if (icmph->icmph_code == ICMP_HOST_UNREACHABLE) 7045 tcp->tcp_client_errno = EHOSTUNREACH; 7046 else 7047 tcp->tcp_client_errno = ENETUNREACH; 7048 if (tcp->tcp_state == TCPS_SYN_RCVD) { 7049 if (tcp->tcp_listener != NULL && 7050 tcp->tcp_listener->tcp_syn_defense) { 7051 /* 7052 * Ditch the half-open connection if we 7053 * suspect a SYN attack is under way. 7054 */ 7055 (void) tcp_clean_death(tcp, 7056 tcp->tcp_client_errno, 7); 7057 } 7058 } 7059 break; 7060 default: 7061 break; 7062 } 7063 break; 7064 case ICMP_SOURCE_QUENCH: { 7065 /* 7066 * use a global boolean to control 7067 * whether TCP should respond to ICMP_SOURCE_QUENCH. 7068 * The default is false. 7069 */ 7070 if (tcp_icmp_source_quench) { 7071 /* 7072 * Reduce the sending rate as if we got a 7073 * retransmit timeout 7074 */ 7075 uint32_t npkt; 7076 7077 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / 7078 tcp->tcp_mss; 7079 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss; 7080 tcp->tcp_cwnd = tcp->tcp_mss; 7081 tcp->tcp_cwnd_cnt = 0; 7082 } 7083 break; 7084 } 7085 } 7086 freemsg(mp); 7087 } 7088 7089 /* 7090 * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might 7091 * change. But it can refer to fields like tcp_suna and tcp_snxt. 7092 * 7093 * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP 7094 * error messages received by IP. The message is always received on the correct 7095 * tcp_t. 7096 */ 7097 /* ARGSUSED */ 7098 static boolean_t 7099 tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6, 7100 ip_recv_attr_t *ira) 7101 { 7102 tcpha_t *tcpha = (tcpha_t *)arg2; 7103 uint32_t seq = ntohl(tcpha->tha_seq); 7104 tcp_t *tcp = connp->conn_tcp; 7105 7106 /* 7107 * TCP sequence number contained in payload of the ICMP error message 7108 * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise, 7109 * the message is either a stale ICMP error, or an attack from the 7110 * network. Fail the verification. 7111 */ 7112 if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt)) 7113 return (B_FALSE); 7114 7115 /* For "too big" we also check the ignore flag */ 7116 if (ira->ira_flags & IRAF_IS_IPV4) { 7117 ASSERT(icmph != NULL); 7118 if (icmph->icmph_type == ICMP_DEST_UNREACHABLE && 7119 icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED && 7120 tcp->tcp_tcps->tcps_ignore_path_mtu) 7121 return (B_FALSE); 7122 } else { 7123 ASSERT(icmp6 != NULL); 7124 if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG && 7125 tcp->tcp_tcps->tcps_ignore_path_mtu) 7126 return (B_FALSE); 7127 } 7128 return (B_TRUE); 7129 } 7130 7131 /* 7132 * Update the TCP connection according to change of PMTU. 7133 * 7134 * Path MTU might have changed by either increase or decrease, so need to 7135 * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny 7136 * or negative MSS, since tcp_mss_set() will do it. 
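 *
 * (Illustrative example with hypothetical numbers, added editorially:
 * if ip_get_pmtu() now reports 1280 for an IPv4 connection whose
 * conn_ht_iphc_len is 40 bytes (20-byte IPv4 + 20-byte TCP template)
 * and which carries no IPsec overhead, the new MSS works out to
 * 1280 - 40 = 1240, and tcp_mss_set() below re-derives cwnd, naglim
 * and the send buffer sizes from it.)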
7137 */ 7138 static void 7139 tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) 7140 { 7141 uint32_t pmtu; 7142 int32_t mss; 7143 conn_t *connp = tcp->tcp_connp; 7144 ip_xmit_attr_t *ixa = connp->conn_ixa; 7145 iaflags_t ixaflags; 7146 7147 if (tcp->tcp_tcps->tcps_ignore_path_mtu) 7148 return; 7149 7150 if (tcp->tcp_state < TCPS_ESTABLISHED) 7151 return; 7152 7153 /* 7154 * Always call ip_get_pmtu() to make sure that IP has updated 7155 * ixa_flags properly. 7156 */ 7157 pmtu = ip_get_pmtu(ixa); 7158 ixaflags = ixa->ixa_flags; 7159 7160 /* 7161 * Calculate the MSS by decreasing the PMTU by conn_ht_iphc_len and 7162 * IPsec overhead if applied. Make sure to use the most recent 7163 * IPsec information. 7164 */ 7165 mss = pmtu - connp->conn_ht_iphc_len - conn_ipsec_length(connp); 7166 7167 /* 7168 * Nothing to change, so just return. 7169 */ 7170 if (mss == tcp->tcp_mss) 7171 return; 7172 7173 /* 7174 * Currently, for ICMP errors, only PMTU decrease is handled. 7175 */ 7176 if (mss > tcp->tcp_mss && decrease_only) 7177 return; 7178 7179 DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss); 7180 7181 /* 7182 * Update ixa_fragsize and ixa_pmtu. 7183 */ 7184 ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu; 7185 7186 /* 7187 * Adjust MSS and all relevant variables. 7188 */ 7189 tcp_mss_set(tcp, mss); 7190 7191 /* 7192 * If the PMTU is below the min size maintained by IP, then ip_get_pmtu 7193 * has set IXAF_PMTU_TOO_SMALL and cleared IXAF_PMTU_IPV4_DF. Since TCP 7194 * has a (potentially different) min size we do the same. Make sure to 7195 * clear IXAF_DONTFRAG, which is used by IP to decide whether to 7196 * fragment the packet. 7197 * 7198 * LSO over IPv6 can not be fragmented. So need to disable LSO 7199 * when IPv6 fragmentation is needed. 7200 */ 7201 if (mss < tcp->tcp_tcps->tcps_mss_min) 7202 ixaflags |= IXAF_PMTU_TOO_SMALL; 7203 7204 if (ixaflags & IXAF_PMTU_TOO_SMALL) 7205 ixaflags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); 7206 7207 if ((connp->conn_ipversion == IPV4_VERSION) && 7208 !(ixaflags & IXAF_PMTU_IPV4_DF)) { 7209 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; 7210 } 7211 ixa->ixa_flags = ixaflags; 7212 } 7213 7214 /* 7215 * Do slow start retransmission after ICMP errors of PMTU changes. 7216 */ 7217 static void 7218 tcp_rexmit_after_error(tcp_t *tcp) 7219 { 7220 /* 7221 * All sent data has been acknowledged or no data left to send, just 7222 * to return. 7223 */ 7224 if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) || 7225 (tcp->tcp_xmit_head == NULL)) 7226 return; 7227 7228 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0)) 7229 tcp->tcp_rexmit_max = tcp->tcp_fss; 7230 else 7231 tcp->tcp_rexmit_max = tcp->tcp_snxt; 7232 7233 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 7234 tcp->tcp_rexmit = B_TRUE; 7235 tcp->tcp_dupack_cnt = 0; 7236 tcp->tcp_snd_burst = TCP_CWND_SS; 7237 tcp_ss_rexmit(tcp); 7238 } 7239 7240 /* 7241 * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6 7242 * error messages passed up by IP. 7243 * Assumes that IP has pulled up all the extension headers as well 7244 * as the ICMPv6 header. 7245 */ 7246 static void 7247 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira) 7248 { 7249 icmp6_t *icmp6; 7250 ip6_t *ip6h; 7251 uint16_t iph_hdr_length = ira->ira_ip_hdr_length; 7252 tcpha_t *tcpha; 7253 uint8_t *nexthdrp; 7254 uint32_t seg_seq; 7255 7256 /* 7257 * Verify that we have a complete IP header. 
7258 */ 7259 ASSERT((MBLKL(mp) >= sizeof (ip6_t))); 7260 7261 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 7262 ip6h = (ip6_t *)&icmp6[1]; 7263 /* 7264 * Verify if we have a complete ICMP and inner IP header. 7265 */ 7266 if ((uchar_t *)&ip6h[1] > mp->b_wptr) { 7267 noticmpv6: 7268 freemsg(mp); 7269 return; 7270 } 7271 7272 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) 7273 goto noticmpv6; 7274 tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length); 7275 /* 7276 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't 7277 * have at least ICMP_MIN_TCP_HDR bytes of TCP header drop the 7278 * packet. 7279 */ 7280 if ((*nexthdrp != IPPROTO_TCP) || 7281 ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) { 7282 goto noticmpv6; 7283 } 7284 7285 seg_seq = ntohl(tcpha->tha_seq); 7286 switch (icmp6->icmp6_type) { 7287 case ICMP6_PACKET_TOO_BIG: 7288 /* 7289 * Update Path MTU, then try to send something out. 7290 */ 7291 tcp_update_pmtu(tcp, B_TRUE); 7292 tcp_rexmit_after_error(tcp); 7293 break; 7294 case ICMP6_DST_UNREACH: 7295 switch (icmp6->icmp6_code) { 7296 case ICMP6_DST_UNREACH_NOPORT: 7297 if (((tcp->tcp_state == TCPS_SYN_SENT) || 7298 (tcp->tcp_state == TCPS_SYN_RCVD)) && 7299 (seg_seq == tcp->tcp_iss)) { 7300 (void) tcp_clean_death(tcp, 7301 ECONNREFUSED, 8); 7302 } 7303 break; 7304 case ICMP6_DST_UNREACH_ADMIN: 7305 case ICMP6_DST_UNREACH_NOROUTE: 7306 case ICMP6_DST_UNREACH_BEYONDSCOPE: 7307 case ICMP6_DST_UNREACH_ADDR: 7308 /* Record the error in case we finally time out. */ 7309 tcp->tcp_client_errno = EHOSTUNREACH; 7310 if (((tcp->tcp_state == TCPS_SYN_SENT) || 7311 (tcp->tcp_state == TCPS_SYN_RCVD)) && 7312 (seg_seq == tcp->tcp_iss)) { 7313 if (tcp->tcp_listener != NULL && 7314 tcp->tcp_listener->tcp_syn_defense) { 7315 /* 7316 * Ditch the half-open connection if we 7317 * suspect a SYN attack is under way. 7318 */ 7319 (void) tcp_clean_death(tcp, 7320 tcp->tcp_client_errno, 9); 7321 } 7322 } 7323 7324 7325 break; 7326 default: 7327 break; 7328 } 7329 break; 7330 case ICMP6_PARAM_PROB: 7331 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ 7332 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && 7333 (uchar_t *)ip6h + icmp6->icmp6_pptr == 7334 (uchar_t *)nexthdrp) { 7335 if (tcp->tcp_state == TCPS_SYN_SENT || 7336 tcp->tcp_state == TCPS_SYN_RCVD) { 7337 (void) tcp_clean_death(tcp, 7338 ECONNREFUSED, 10); 7339 } 7340 break; 7341 } 7342 break; 7343 7344 case ICMP6_TIME_EXCEEDED: 7345 default: 7346 break; 7347 } 7348 freemsg(mp); 7349 } 7350 7351 /* 7352 * Notify IP that we are having trouble with this connection. IP should 7353 * make note so it can potentially use a different IRE. 7354 */ 7355 static void 7356 tcp_ip_notify(tcp_t *tcp) 7357 { 7358 conn_t *connp = tcp->tcp_connp; 7359 ire_t *ire; 7360 7361 /* 7362 * Note: in the case of source routing we want to blow away the 7363 * route to the first source route hop. 7364 */ 7365 ire = connp->conn_ixa->ixa_ire; 7366 if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 7367 if (ire->ire_ipversion == IPV4_VERSION) { 7368 /* 7369 * As per RFC 1122, we send an RTM_LOSING to inform 7370 * routing protocols. 7371 */ 7372 ip_rts_change(RTM_LOSING, ire->ire_addr, 7373 ire->ire_gateway_addr, ire->ire_mask, 7374 connp->conn_laddr_v4, 0, 0, 0, 7375 (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), 7376 ire->ire_ipst); 7377 } 7378 (void) ire_no_good(ire); 7379 } 7380 } 7381 7382 #pragma inline(tcp_send_data) 7383 7384 /* 7385 * Timer callback routine for keepalive probe. 
We do a fake resend of 7386 * last ACKed byte. Then set a timer using RTO. When the timer expires, 7387 * check to see if we have heard anything from the other end for the last 7388 * RTO period. If we have, set the timer to expire for another 7389 * tcp_keepalive_intrvl and check again. If we have not, set a timer using 7390 * RTO << 1 and check again when it expires. Keep exponentially increasing 7391 * the timeout if we have not heard from the other side. If for more than 7392 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything, 7393 * kill the connection unless the keepalive abort threshold is 0. In 7394 * that case, we will probe "forever." 7395 */ 7396 static void 7397 tcp_keepalive_killer(void *arg) 7398 { 7399 mblk_t *mp; 7400 conn_t *connp = (conn_t *)arg; 7401 tcp_t *tcp = connp->conn_tcp; 7402 int32_t firetime; 7403 int32_t idletime; 7404 int32_t ka_intrvl; 7405 tcp_stack_t *tcps = tcp->tcp_tcps; 7406 7407 tcp->tcp_ka_tid = 0; 7408 7409 if (tcp->tcp_fused) 7410 return; 7411 7412 BUMP_MIB(&tcps->tcps_mib, tcpTimKeepalive); 7413 ka_intrvl = tcp->tcp_ka_interval; 7414 7415 /* 7416 * Keepalive probe should only be sent if the application has not 7417 * done a close on the connection. 7418 */ 7419 if (tcp->tcp_state > TCPS_CLOSE_WAIT) { 7420 return; 7421 } 7422 /* Timer fired too early, restart it. */ 7423 if (tcp->tcp_state < TCPS_ESTABLISHED) { 7424 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 7425 MSEC_TO_TICK(ka_intrvl)); 7426 return; 7427 } 7428 7429 idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time); 7430 /* 7431 * If we have not heard from the other side for a long 7432 * time, kill the connection unless the keepalive abort 7433 * threshold is 0. In that case, we will probe "forever." 7434 */ 7435 if (tcp->tcp_ka_abort_thres != 0 && 7436 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { 7437 BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveDrop); 7438 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? 7439 tcp->tcp_client_errno : ETIMEDOUT, 11); 7440 return; 7441 } 7442 7443 if (tcp->tcp_snxt == tcp->tcp_suna && 7444 idletime >= ka_intrvl) { 7445 /* Fake resend of last ACKed byte. */ 7446 mblk_t *mp1 = allocb(1, BPRI_LO); 7447 7448 if (mp1 != NULL) { 7449 *mp1->b_wptr++ = '\0'; 7450 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL, 7451 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE); 7452 freeb(mp1); 7453 /* 7454 * if allocation failed, fall through to start the 7455 * timer back. 7456 */ 7457 if (mp != NULL) { 7458 tcp_send_data(tcp, mp); 7459 BUMP_MIB(&tcps->tcps_mib, 7460 tcpTimKeepaliveProbe); 7461 if (tcp->tcp_ka_last_intrvl != 0) { 7462 int max; 7463 /* 7464 * We should probe again at least 7465 * in ka_intrvl, but not more than 7466 * tcp_rexmit_interval_max. 
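 * (Illustrative example with made-up values, added editorially:
 * if ka_intrvl is 7200000 ms and the previous probe interval was
 * 4000 ms, the next firetime is MIN(7200000 - 1, 4000 << 1) =
 * 8000 ms, then clamped to tcps_rexmit_interval_max; each further
 * unanswered probe doubles the interval again until the abort
 * threshold is reached.)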
7467 */ 7468 max = tcps->tcps_rexmit_interval_max; 7469 firetime = MIN(ka_intrvl - 1, 7470 tcp->tcp_ka_last_intrvl << 1); 7471 if (firetime > max) 7472 firetime = max; 7473 } else { 7474 firetime = tcp->tcp_rto; 7475 } 7476 tcp->tcp_ka_tid = TCP_TIMER(tcp, 7477 tcp_keepalive_killer, 7478 MSEC_TO_TICK(firetime)); 7479 tcp->tcp_ka_last_intrvl = firetime; 7480 return; 7481 } 7482 } 7483 } else { 7484 tcp->tcp_ka_last_intrvl = 0; 7485 } 7486 7487 /* firetime can be negative if (mp1 == NULL || mp == NULL) */ 7488 if ((firetime = ka_intrvl - idletime) < 0) { 7489 firetime = ka_intrvl; 7490 } 7491 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 7492 MSEC_TO_TICK(firetime)); 7493 } 7494 7495 int 7496 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) 7497 { 7498 conn_t *connp = tcp->tcp_connp; 7499 queue_t *q = connp->conn_rq; 7500 int32_t mss = tcp->tcp_mss; 7501 int maxpsz; 7502 7503 if (TCP_IS_DETACHED(tcp)) 7504 return (mss); 7505 if (tcp->tcp_fused) { 7506 maxpsz = tcp_fuse_maxpsz(tcp); 7507 mss = INFPSZ; 7508 } else if (tcp->tcp_maxpsz_multiplier == 0) { 7509 /* 7510 * Set the sd_qn_maxpsz according to the socket send buffer 7511 * size, and sd_maxblk to INFPSZ (-1). This will essentially 7512 * instruct the stream head to copyin user data into contiguous 7513 * kernel-allocated buffers without breaking it up into smaller 7514 * chunks. We round up the buffer size to the nearest SMSS. 7515 */ 7516 maxpsz = MSS_ROUNDUP(connp->conn_sndbuf, mss); 7517 if (tcp->tcp_kssl_ctx == NULL) 7518 mss = INFPSZ; 7519 else 7520 mss = SSL3_MAX_RECORD_LEN; 7521 } else { 7522 /* 7523 * Set sd_qn_maxpsz to approx half the (receivers) buffer 7524 * (and a multiple of the mss). This instructs the stream 7525 * head to break down larger than SMSS writes into SMSS- 7526 * size mblks, up to tcp_maxpsz_multiplier mblks at a time. 7527 */ 7528 maxpsz = tcp->tcp_maxpsz_multiplier * mss; 7529 if (maxpsz > connp->conn_sndbuf / 2) { 7530 maxpsz = connp->conn_sndbuf / 2; 7531 /* Round up to nearest mss */ 7532 maxpsz = MSS_ROUNDUP(maxpsz, mss); 7533 } 7534 } 7535 7536 (void) proto_set_maxpsz(q, connp, maxpsz); 7537 if (!(IPCL_IS_NONSTR(connp))) 7538 connp->conn_wq->q_maxpsz = maxpsz; 7539 if (set_maxblk) 7540 (void) proto_set_tx_maxblk(q, connp, mss); 7541 return (mss); 7542 } 7543 7544 /* 7545 * Extract option values from a tcp header. We put any found values into the 7546 * tcpopt struct and return a bitmask saying which options were found. 
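 *
 * A typical use by a caller looks roughly like the following editorial
 * sketch (not a quote from this file; variable names are illustrative):
 *
 *	tcp_opt_t options;
 *	int found;
 *
 *	options.tcp = NULL;		(skip SACK block processing)
 *	found = tcp_parse_options(tcpha, &options);
 *	if (found & TCP_OPT_MSS_PRESENT)
 *		mss = options.tcp_opt_mss;
 *	if (found & TCP_OPT_WSCALE_PRESENT)
 *		snd_ws = options.tcp_opt_wscale;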
7547 */ 7548 static int 7549 tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt) 7550 { 7551 uchar_t *endp; 7552 int len; 7553 uint32_t mss; 7554 uchar_t *up = (uchar_t *)tcpha; 7555 int found = 0; 7556 int32_t sack_len; 7557 tcp_seq sack_begin, sack_end; 7558 tcp_t *tcp; 7559 7560 endp = up + TCP_HDR_LENGTH(tcpha); 7561 up += TCP_MIN_HEADER_LENGTH; 7562 while (up < endp) { 7563 len = endp - up; 7564 switch (*up) { 7565 case TCPOPT_EOL: 7566 break; 7567 7568 case TCPOPT_NOP: 7569 up++; 7570 continue; 7571 7572 case TCPOPT_MAXSEG: 7573 if (len < TCPOPT_MAXSEG_LEN || 7574 up[1] != TCPOPT_MAXSEG_LEN) 7575 break; 7576 7577 mss = BE16_TO_U16(up+2); 7578 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 7579 tcpopt->tcp_opt_mss = mss; 7580 found |= TCP_OPT_MSS_PRESENT; 7581 7582 up += TCPOPT_MAXSEG_LEN; 7583 continue; 7584 7585 case TCPOPT_WSCALE: 7586 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 7587 break; 7588 7589 if (up[2] > TCP_MAX_WINSHIFT) 7590 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 7591 else 7592 tcpopt->tcp_opt_wscale = up[2]; 7593 found |= TCP_OPT_WSCALE_PRESENT; 7594 7595 up += TCPOPT_WS_LEN; 7596 continue; 7597 7598 case TCPOPT_SACK_PERMITTED: 7599 if (len < TCPOPT_SACK_OK_LEN || 7600 up[1] != TCPOPT_SACK_OK_LEN) 7601 break; 7602 found |= TCP_OPT_SACK_OK_PRESENT; 7603 up += TCPOPT_SACK_OK_LEN; 7604 continue; 7605 7606 case TCPOPT_SACK: 7607 if (len <= 2 || up[1] <= 2 || len < up[1]) 7608 break; 7609 7610 /* If TCP is not interested in SACK blks... */ 7611 if ((tcp = tcpopt->tcp) == NULL) { 7612 up += up[1]; 7613 continue; 7614 } 7615 sack_len = up[1] - TCPOPT_HEADER_LEN; 7616 up += TCPOPT_HEADER_LEN; 7617 7618 /* 7619 * If the list is empty, allocate one and assume 7620 * nothing is sack'ed. 7621 */ 7622 ASSERT(tcp->tcp_sack_info != NULL); 7623 if (tcp->tcp_notsack_list == NULL) { 7624 tcp_notsack_update(&(tcp->tcp_notsack_list), 7625 tcp->tcp_suna, tcp->tcp_snxt, 7626 &(tcp->tcp_num_notsack_blk), 7627 &(tcp->tcp_cnt_notsack_list)); 7628 7629 /* 7630 * Make sure tcp_notsack_list is not NULL. 7631 * This happens when kmem_alloc(KM_NOSLEEP) 7632 * returns NULL. 7633 */ 7634 if (tcp->tcp_notsack_list == NULL) { 7635 up += sack_len; 7636 continue; 7637 } 7638 tcp->tcp_fack = tcp->tcp_suna; 7639 } 7640 7641 while (sack_len > 0) { 7642 if (up + 8 > endp) { 7643 up = endp; 7644 break; 7645 } 7646 sack_begin = BE32_TO_U32(up); 7647 up += 4; 7648 sack_end = BE32_TO_U32(up); 7649 up += 4; 7650 sack_len -= 8; 7651 /* 7652 * Bounds checking. Make sure the SACK 7653 * info is within tcp_suna and tcp_snxt. 7654 * If this SACK blk is out of bound, ignore 7655 * it but continue to parse the following 7656 * blks. 
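 * (Illustrative example with hypothetical numbers, added
 * editorially: with tcp_suna = 1000 and tcp_snxt = 5000, a block
 * [4000, 6000) is dropped because its end lies beyond snxt, while
 * a later block [2000, 3000) in the same option is still recorded
 * via tcp_notsack_insert().)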
7657 */
7658 if (SEQ_LEQ(sack_end, sack_begin) ||
7659 SEQ_LT(sack_begin, tcp->tcp_suna) ||
7660 SEQ_GT(sack_end, tcp->tcp_snxt)) {
7661 continue;
7662 }
7663 tcp_notsack_insert(&(tcp->tcp_notsack_list),
7664 sack_begin, sack_end,
7665 &(tcp->tcp_num_notsack_blk),
7666 &(tcp->tcp_cnt_notsack_list));
7667 if (SEQ_GT(sack_end, tcp->tcp_fack)) {
7668 tcp->tcp_fack = sack_end;
7669 }
7670 }
7671 found |= TCP_OPT_SACK_PRESENT;
7672 continue;
7673
7674 case TCPOPT_TSTAMP:
7675 if (len < TCPOPT_TSTAMP_LEN ||
7676 up[1] != TCPOPT_TSTAMP_LEN)
7677 break;
7678
7679 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
7680 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);
7681
7682 found |= TCP_OPT_TSTAMP_PRESENT;
7683
7684 up += TCPOPT_TSTAMP_LEN;
7685 continue;
7686
7687 default:
7688 if (len <= 1 || len < (int)up[1] || up[1] == 0)
7689 break;
7690 up += up[1];
7691 continue;
7692 }
7693 break;
7694 }
7695 return (found);
7696 }
7697
7698 /*
7699 * Set the MSS associated with a particular tcp based on its current value,
7700 * and a new one passed in. Observe minimums and maximums, and reset other
7701 * state variables that we want to view as multiples of MSS.
7702 *
7703 * The value of MSS could be either increased or decreased.
7704 */
7705 static void
7706 tcp_mss_set(tcp_t *tcp, uint32_t mss)
7707 {
7708 uint32_t mss_max;
7709 tcp_stack_t *tcps = tcp->tcp_tcps;
7710 conn_t *connp = tcp->tcp_connp;
7711
7712 if (connp->conn_ipversion == IPV4_VERSION)
7713 mss_max = tcps->tcps_mss_max_ipv4;
7714 else
7715 mss_max = tcps->tcps_mss_max_ipv6;
7716
7717 if (mss < tcps->tcps_mss_min)
7718 mss = tcps->tcps_mss_min;
7719 if (mss > mss_max)
7720 mss = mss_max;
7721 /*
7722 * Unless naglim has been set by our client to
7723 * a non-mss value, force naglim to track mss.
7724 * This can help to aggregate small writes.
7725 */
7726 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
7727 tcp->tcp_naglim = mss;
7728 /*
7729 * TCP should be able to buffer at least 4 MSS of data for obvious
7730 * performance reasons.
7731 */
7732 if ((mss << 2) > connp->conn_sndbuf)
7733 connp->conn_sndbuf = mss << 2;
7734
7735 /*
7736 * Set the send lowater to at least twice the MSS.
7737 */
7738 if ((mss << 1) > connp->conn_sndlowat)
7739 connp->conn_sndlowat = mss << 1;
7740
7741 /*
7742 * Update tcp_cwnd according to the new value of MSS. Keep the
7743 * previous ratio to preserve the transmit rate.
7744 */
7745 tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss;
7746 tcp->tcp_cwnd_cnt = 0;
7747
7748 tcp->tcp_mss = mss;
7749 (void) tcp_maxpsz_set(tcp, B_TRUE);
7750 }
7751
7752 /* For /dev/tcp aka AF_INET open */
7753 static int
7754 tcp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
7755 {
7756 return (tcp_open(q, devp, flag, sflag, credp, B_FALSE));
7757 }
7758
7759 /* For /dev/tcp6 aka AF_INET6 open */
7760 static int
7761 tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
7762 {
7763 return (tcp_open(q, devp, flag, sflag, credp, B_TRUE));
7764 }
7765
7766 static conn_t *
7767 tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket,
7768 int *errorp)
7769 {
7770 tcp_t *tcp = NULL;
7771 conn_t *connp;
7772 zoneid_t zoneid;
7773 tcp_stack_t *tcps;
7774 squeue_t *sqp;
7775
7776 ASSERT(errorp != NULL);
7777 /*
7778 * Find the proper zoneid and netstack.
7779 */
7780 /*
7781 * Special case for install: miniroot needs to be able to
7782 * access files via NFS as though it were always in the
7783 * global zone.
7784 */ 7785 if (credp == kcred && nfs_global_client_only != 0) { 7786 zoneid = GLOBAL_ZONEID; 7787 tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)-> 7788 netstack_tcp; 7789 ASSERT(tcps != NULL); 7790 } else { 7791 netstack_t *ns; 7792 7793 ns = netstack_find_by_cred(credp); 7794 ASSERT(ns != NULL); 7795 tcps = ns->netstack_tcp; 7796 ASSERT(tcps != NULL); 7797 7798 /* 7799 * For exclusive stacks we set the zoneid to zero 7800 * to make TCP operate as if in the global zone. 7801 */ 7802 if (tcps->tcps_netstack->netstack_stackid != 7803 GLOBAL_NETSTACKID) 7804 zoneid = GLOBAL_ZONEID; 7805 else 7806 zoneid = crgetzoneid(credp); 7807 } 7808 7809 sqp = IP_SQUEUE_GET((uint_t)gethrtime()); 7810 connp = (conn_t *)tcp_get_conn(sqp, tcps); 7811 /* 7812 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt, 7813 * so we drop it by one. 7814 */ 7815 netstack_rele(tcps->tcps_netstack); 7816 if (connp == NULL) { 7817 *errorp = ENOSR; 7818 return (NULL); 7819 } 7820 ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto); 7821 7822 connp->conn_sqp = sqp; 7823 connp->conn_initial_sqp = connp->conn_sqp; 7824 connp->conn_ixa->ixa_sqp = connp->conn_sqp; 7825 tcp = connp->conn_tcp; 7826 7827 /* 7828 * Besides asking IP to set the checksum for us, have conn_ip_output 7829 * to do the following checks when necessary: 7830 * 7831 * IXAF_VERIFY_SOURCE: drop packets when our outer source goes invalid 7832 * IXAF_VERIFY_PMTU: verify PMTU changes 7833 * IXAF_VERIFY_LSO: verify LSO capability changes 7834 */ 7835 connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | 7836 IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO; 7837 7838 if (!tcps->tcps_dev_flow_ctl) 7839 connp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; 7840 7841 if (isv6) { 7842 connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT; 7843 connp->conn_ipversion = IPV6_VERSION; 7844 connp->conn_family = AF_INET6; 7845 tcp->tcp_mss = tcps->tcps_mss_def_ipv6; 7846 connp->conn_default_ttl = tcps->tcps_ipv6_hoplimit; 7847 } else { 7848 connp->conn_ipversion = IPV4_VERSION; 7849 connp->conn_family = AF_INET; 7850 tcp->tcp_mss = tcps->tcps_mss_def_ipv4; 7851 connp->conn_default_ttl = tcps->tcps_ipv4_ttl; 7852 } 7853 connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; 7854 7855 crhold(credp); 7856 connp->conn_cred = credp; 7857 connp->conn_cpid = curproc->p_pid; 7858 connp->conn_open_time = ddi_get_lbolt64(); 7859 7860 connp->conn_zoneid = zoneid; 7861 /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ 7862 connp->conn_ixa->ixa_zoneid = zoneid; 7863 connp->conn_mlp_type = mlptSingle; 7864 ASSERT(connp->conn_netstack == tcps->tcps_netstack); 7865 ASSERT(tcp->tcp_tcps == tcps); 7866 7867 /* 7868 * If the caller has the process-wide flag set, then default to MAC 7869 * exempt mode. This allows read-down to unlabeled hosts. 
7870 */ 7871 if (getpflags(NET_MAC_AWARE, credp) != 0) 7872 connp->conn_mac_mode = CONN_MAC_AWARE; 7873 7874 connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); 7875 7876 if (issocket) { 7877 tcp->tcp_issocket = 1; 7878 } 7879 7880 connp->conn_rcvbuf = tcps->tcps_recv_hiwat; 7881 connp->conn_sndbuf = tcps->tcps_xmit_hiwat; 7882 connp->conn_sndlowat = tcps->tcps_xmit_lowat; 7883 connp->conn_so_type = SOCK_STREAM; 7884 connp->conn_wroff = connp->conn_ht_iphc_allocated + 7885 tcps->tcps_wroff_xtra; 7886 7887 SOCK_CONNID_INIT(tcp->tcp_connid); 7888 tcp->tcp_state = TCPS_IDLE; 7889 tcp_init_values(tcp); 7890 return (connp); 7891 } 7892 7893 static int 7894 tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 7895 boolean_t isv6) 7896 { 7897 tcp_t *tcp = NULL; 7898 conn_t *connp = NULL; 7899 int err; 7900 vmem_t *minor_arena = NULL; 7901 dev_t conn_dev; 7902 boolean_t issocket; 7903 7904 if (q->q_ptr != NULL) 7905 return (0); 7906 7907 if (sflag == MODOPEN) 7908 return (EINVAL); 7909 7910 if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && 7911 ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) { 7912 minor_arena = ip_minor_arena_la; 7913 } else { 7914 /* 7915 * Either minor numbers in the large arena were exhausted 7916 * or a non socket application is doing the open. 7917 * Try to allocate from the small arena. 7918 */ 7919 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 7920 return (EBUSY); 7921 } 7922 minor_arena = ip_minor_arena_sa; 7923 } 7924 7925 ASSERT(minor_arena != NULL); 7926 7927 *devp = makedevice(getmajor(*devp), (minor_t)conn_dev); 7928 7929 if (flag & SO_FALLBACK) { 7930 /* 7931 * Non streams socket needs a stream to fallback to 7932 */ 7933 RD(q)->q_ptr = (void *)conn_dev; 7934 WR(q)->q_qinfo = &tcp_fallback_sock_winit; 7935 WR(q)->q_ptr = (void *)minor_arena; 7936 qprocson(q); 7937 return (0); 7938 } else if (flag & SO_ACCEPTOR) { 7939 q->q_qinfo = &tcp_acceptor_rinit; 7940 /* 7941 * the conn_dev and minor_arena will be subsequently used by 7942 * tcp_tli_accept() and tcp_tpi_close_accept() to figure out 7943 * the minor device number for this connection from the q_ptr. 7944 */ 7945 RD(q)->q_ptr = (void *)conn_dev; 7946 WR(q)->q_qinfo = &tcp_acceptor_winit; 7947 WR(q)->q_ptr = (void *)minor_arena; 7948 qprocson(q); 7949 return (0); 7950 } 7951 7952 issocket = flag & SO_SOCKSTR; 7953 connp = tcp_create_common(credp, isv6, issocket, &err); 7954 7955 if (connp == NULL) { 7956 inet_minor_free(minor_arena, conn_dev); 7957 q->q_ptr = WR(q)->q_ptr = NULL; 7958 return (err); 7959 } 7960 7961 connp->conn_rq = q; 7962 connp->conn_wq = WR(q); 7963 q->q_ptr = WR(q)->q_ptr = connp; 7964 7965 connp->conn_dev = conn_dev; 7966 connp->conn_minor_arena = minor_arena; 7967 7968 ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6); 7969 ASSERT(WR(q)->q_qinfo == &tcp_winit); 7970 7971 tcp = connp->conn_tcp; 7972 7973 if (issocket) { 7974 WR(q)->q_qinfo = &tcp_sock_winit; 7975 } else { 7976 #ifdef _ILP32 7977 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); 7978 #else 7979 tcp->tcp_acceptor_id = conn_dev; 7980 #endif /* _ILP32 */ 7981 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 7982 } 7983 7984 /* 7985 * Put the ref for TCP. Ref for IP was already put 7986 * by ipcl_conn_create. 
Also Make the conn_t globally 7987 * visible to walkers 7988 */ 7989 mutex_enter(&connp->conn_lock); 7990 CONN_INC_REF_LOCKED(connp); 7991 ASSERT(connp->conn_ref == 2); 7992 connp->conn_state_flags &= ~CONN_INCIPIENT; 7993 mutex_exit(&connp->conn_lock); 7994 7995 qprocson(q); 7996 return (0); 7997 } 7998 7999 /* 8000 * Some TCP options can be "set" by requesting them in the option 8001 * buffer. This is needed for XTI feature test though we do not 8002 * allow it in general. We interpret that this mechanism is more 8003 * applicable to OSI protocols and need not be allowed in general. 8004 * This routine filters out options for which it is not allowed (most) 8005 * and lets through those (few) for which it is. [ The XTI interface 8006 * test suite specifics will imply that any XTI_GENERIC level XTI_* if 8007 * ever implemented will have to be allowed here ]. 8008 */ 8009 static boolean_t 8010 tcp_allow_connopt_set(int level, int name) 8011 { 8012 8013 switch (level) { 8014 case IPPROTO_TCP: 8015 switch (name) { 8016 case TCP_NODELAY: 8017 return (B_TRUE); 8018 default: 8019 return (B_FALSE); 8020 } 8021 /*NOTREACHED*/ 8022 default: 8023 return (B_FALSE); 8024 } 8025 /*NOTREACHED*/ 8026 } 8027 8028 /* 8029 * This routine gets default values of certain options whose default 8030 * values are maintained by protocol specific code 8031 */ 8032 /* ARGSUSED */ 8033 int 8034 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 8035 { 8036 int32_t *i1 = (int32_t *)ptr; 8037 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 8038 8039 switch (level) { 8040 case IPPROTO_TCP: 8041 switch (name) { 8042 case TCP_NOTIFY_THRESHOLD: 8043 *i1 = tcps->tcps_ip_notify_interval; 8044 break; 8045 case TCP_ABORT_THRESHOLD: 8046 *i1 = tcps->tcps_ip_abort_interval; 8047 break; 8048 case TCP_CONN_NOTIFY_THRESHOLD: 8049 *i1 = tcps->tcps_ip_notify_cinterval; 8050 break; 8051 case TCP_CONN_ABORT_THRESHOLD: 8052 *i1 = tcps->tcps_ip_abort_cinterval; 8053 break; 8054 default: 8055 return (-1); 8056 } 8057 break; 8058 case IPPROTO_IP: 8059 switch (name) { 8060 case IP_TTL: 8061 *i1 = tcps->tcps_ipv4_ttl; 8062 break; 8063 default: 8064 return (-1); 8065 } 8066 break; 8067 case IPPROTO_IPV6: 8068 switch (name) { 8069 case IPV6_UNICAST_HOPS: 8070 *i1 = tcps->tcps_ipv6_hoplimit; 8071 break; 8072 default: 8073 return (-1); 8074 } 8075 break; 8076 default: 8077 return (-1); 8078 } 8079 return (sizeof (int)); 8080 } 8081 8082 /* 8083 * TCP routine to get the values of options. 8084 */ 8085 static int 8086 tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) 8087 { 8088 int *i1 = (int *)ptr; 8089 tcp_t *tcp = connp->conn_tcp; 8090 conn_opt_arg_t coas; 8091 int retval; 8092 8093 coas.coa_connp = connp; 8094 coas.coa_ixa = connp->conn_ixa; 8095 coas.coa_ipp = &connp->conn_xmit_ipp; 8096 coas.coa_ancillary = B_FALSE; 8097 coas.coa_changed = 0; 8098 8099 switch (level) { 8100 case SOL_SOCKET: 8101 switch (name) { 8102 case SO_SND_COPYAVOID: 8103 *i1 = tcp->tcp_snd_zcopy_on ? 8104 SO_SND_COPYAVOID : 0; 8105 return (sizeof (int)); 8106 case SO_ACCEPTCONN: 8107 *i1 = (tcp->tcp_state == TCPS_LISTEN); 8108 return (sizeof (int)); 8109 } 8110 break; 8111 case IPPROTO_TCP: 8112 switch (name) { 8113 case TCP_NODELAY: 8114 *i1 = (tcp->tcp_naglim == 1) ? 
TCP_NODELAY : 0; 8115 return (sizeof (int)); 8116 case TCP_MAXSEG: 8117 *i1 = tcp->tcp_mss; 8118 return (sizeof (int)); 8119 case TCP_NOTIFY_THRESHOLD: 8120 *i1 = (int)tcp->tcp_first_timer_threshold; 8121 return (sizeof (int)); 8122 case TCP_ABORT_THRESHOLD: 8123 *i1 = tcp->tcp_second_timer_threshold; 8124 return (sizeof (int)); 8125 case TCP_CONN_NOTIFY_THRESHOLD: 8126 *i1 = tcp->tcp_first_ctimer_threshold; 8127 return (sizeof (int)); 8128 case TCP_CONN_ABORT_THRESHOLD: 8129 *i1 = tcp->tcp_second_ctimer_threshold; 8130 return (sizeof (int)); 8131 case TCP_INIT_CWND: 8132 *i1 = tcp->tcp_init_cwnd; 8133 return (sizeof (int)); 8134 case TCP_KEEPALIVE_THRESHOLD: 8135 *i1 = tcp->tcp_ka_interval; 8136 return (sizeof (int)); 8137 case TCP_KEEPALIVE_ABORT_THRESHOLD: 8138 *i1 = tcp->tcp_ka_abort_thres; 8139 return (sizeof (int)); 8140 case TCP_CORK: 8141 *i1 = tcp->tcp_cork; 8142 return (sizeof (int)); 8143 } 8144 break; 8145 case IPPROTO_IP: 8146 if (connp->conn_family != AF_INET) 8147 return (-1); 8148 switch (name) { 8149 case IP_OPTIONS: 8150 case T_IP_OPTIONS: 8151 /* Caller ensures enough space */ 8152 return (ip_opt_get_user(connp, ptr)); 8153 default: 8154 break; 8155 } 8156 break; 8157 8158 case IPPROTO_IPV6: 8159 /* 8160 * IPPROTO_IPV6 options are only supported for sockets 8161 * that are using IPv6 on the wire. 8162 */ 8163 if (connp->conn_ipversion != IPV6_VERSION) { 8164 return (-1); 8165 } 8166 switch (name) { 8167 case IPV6_PATHMTU: 8168 if (tcp->tcp_state < TCPS_ESTABLISHED) 8169 return (-1); 8170 break; 8171 } 8172 break; 8173 } 8174 mutex_enter(&connp->conn_lock); 8175 retval = conn_opt_get(&coas, level, name, ptr); 8176 mutex_exit(&connp->conn_lock); 8177 return (retval); 8178 } 8179 8180 /* 8181 * TCP routine to get the values of options. 8182 */ 8183 int 8184 tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 8185 { 8186 return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr)); 8187 } 8188 8189 /* returns UNIX error, the optlen is a value-result arg */ 8190 int 8191 tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, 8192 void *optvalp, socklen_t *optlen, cred_t *cr) 8193 { 8194 conn_t *connp = (conn_t *)proto_handle; 8195 squeue_t *sqp = connp->conn_sqp; 8196 int error; 8197 t_uscalar_t max_optbuf_len; 8198 void *optvalp_buf; 8199 int len; 8200 8201 ASSERT(connp->conn_upper_handle != NULL); 8202 8203 error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, 8204 tcp_opt_obj.odb_opt_des_arr, 8205 tcp_opt_obj.odb_opt_arr_cnt, 8206 B_FALSE, B_TRUE, cr); 8207 if (error != 0) { 8208 if (error < 0) { 8209 error = proto_tlitosyserr(-error); 8210 } 8211 return (error); 8212 } 8213 8214 optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); 8215 8216 error = squeue_synch_enter(sqp, connp, NULL); 8217 if (error == ENOMEM) { 8218 kmem_free(optvalp_buf, max_optbuf_len); 8219 return (ENOMEM); 8220 } 8221 8222 len = tcp_opt_get(connp, level, option_name, optvalp_buf); 8223 squeue_synch_exit(sqp, connp); 8224 8225 if (len == -1) { 8226 kmem_free(optvalp_buf, max_optbuf_len); 8227 return (EINVAL); 8228 } 8229 8230 /* 8231 * update optlen and copy option value 8232 */ 8233 t_uscalar_t size = MIN(len, *optlen); 8234 8235 bcopy(optvalp_buf, optvalp, size); 8236 bcopy(&size, optlen, sizeof (size)); 8237 8238 kmem_free(optvalp_buf, max_optbuf_len); 8239 return (0); 8240 } 8241 8242 /* 8243 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 8244 * Parameters are assumed to be verified by the caller. 
8245 */ 8246 /* ARGSUSED */ 8247 int 8248 tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, 8249 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 8250 void *thisdg_attrs, cred_t *cr) 8251 { 8252 tcp_t *tcp = connp->conn_tcp; 8253 int *i1 = (int *)invalp; 8254 boolean_t onoff = (*i1 == 0) ? 0 : 1; 8255 boolean_t checkonly; 8256 int reterr; 8257 tcp_stack_t *tcps = tcp->tcp_tcps; 8258 conn_opt_arg_t coas; 8259 8260 coas.coa_connp = connp; 8261 coas.coa_ixa = connp->conn_ixa; 8262 coas.coa_ipp = &connp->conn_xmit_ipp; 8263 coas.coa_ancillary = B_FALSE; 8264 coas.coa_changed = 0; 8265 8266 switch (optset_context) { 8267 case SETFN_OPTCOM_CHECKONLY: 8268 checkonly = B_TRUE; 8269 /* 8270 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 8271 * inlen != 0 implies value supplied and 8272 * we have to "pretend" to set it. 8273 * inlen == 0 implies that there is no 8274 * value part in T_CHECK request and just validation 8275 * done elsewhere should be enough, we just return here. 8276 */ 8277 if (inlen == 0) { 8278 *outlenp = 0; 8279 return (0); 8280 } 8281 break; 8282 case SETFN_OPTCOM_NEGOTIATE: 8283 checkonly = B_FALSE; 8284 break; 8285 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 8286 case SETFN_CONN_NEGOTIATE: 8287 checkonly = B_FALSE; 8288 /* 8289 * Negotiating local and "association-related" options 8290 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 8291 * primitives is allowed by XTI, but we choose 8292 * to not implement this style negotiation for Internet 8293 * protocols (We interpret it is a must for OSI world but 8294 * optional for Internet protocols) for all options. 8295 * [ Will do only for the few options that enable test 8296 * suites that our XTI implementation of this feature 8297 * works for transports that do allow it ] 8298 */ 8299 if (!tcp_allow_connopt_set(level, name)) { 8300 *outlenp = 0; 8301 return (EINVAL); 8302 } 8303 break; 8304 default: 8305 /* 8306 * We should never get here 8307 */ 8308 *outlenp = 0; 8309 return (EINVAL); 8310 } 8311 8312 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 8313 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 8314 8315 /* 8316 * For TCP, we should have no ancillary data sent down 8317 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 8318 * has to be zero. 8319 */ 8320 ASSERT(thisdg_attrs == NULL); 8321 8322 /* 8323 * For fixed length options, no sanity check 8324 * of passed in length is done. It is assumed *_optcom_req() 8325 * routines do the right thing. 
8326 */ 8327 switch (level) { 8328 case SOL_SOCKET: 8329 switch (name) { 8330 case SO_KEEPALIVE: 8331 if (checkonly) { 8332 /* check only case */ 8333 break; 8334 } 8335 8336 if (!onoff) { 8337 if (connp->conn_keepalive) { 8338 if (tcp->tcp_ka_tid != 0) { 8339 (void) TCP_TIMER_CANCEL(tcp, 8340 tcp->tcp_ka_tid); 8341 tcp->tcp_ka_tid = 0; 8342 } 8343 connp->conn_keepalive = 0; 8344 } 8345 break; 8346 } 8347 if (!connp->conn_keepalive) { 8348 /* Crank up the keepalive timer */ 8349 tcp->tcp_ka_last_intrvl = 0; 8350 tcp->tcp_ka_tid = TCP_TIMER(tcp, 8351 tcp_keepalive_killer, 8352 MSEC_TO_TICK(tcp->tcp_ka_interval)); 8353 connp->conn_keepalive = 1; 8354 } 8355 break; 8356 case SO_SNDBUF: { 8357 if (*i1 > tcps->tcps_max_buf) { 8358 *outlenp = 0; 8359 return (ENOBUFS); 8360 } 8361 if (checkonly) 8362 break; 8363 8364 connp->conn_sndbuf = *i1; 8365 if (tcps->tcps_snd_lowat_fraction != 0) { 8366 connp->conn_sndlowat = connp->conn_sndbuf / 8367 tcps->tcps_snd_lowat_fraction; 8368 } 8369 (void) tcp_maxpsz_set(tcp, B_TRUE); 8370 /* 8371 * If we are flow-controlled, recheck the condition. 8372 * There are apps that increase SO_SNDBUF size when 8373 * flow-controlled (EWOULDBLOCK), and expect the flow 8374 * control condition to be lifted right away. 8375 */ 8376 mutex_enter(&tcp->tcp_non_sq_lock); 8377 if (tcp->tcp_flow_stopped && 8378 TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { 8379 tcp_clrqfull(tcp); 8380 } 8381 mutex_exit(&tcp->tcp_non_sq_lock); 8382 *outlenp = inlen; 8383 return (0); 8384 } 8385 case SO_RCVBUF: 8386 if (*i1 > tcps->tcps_max_buf) { 8387 *outlenp = 0; 8388 return (ENOBUFS); 8389 } 8390 /* Silently ignore zero */ 8391 if (!checkonly && *i1 != 0) { 8392 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 8393 (void) tcp_rwnd_set(tcp, *i1); 8394 } 8395 /* 8396 * XXX should we return the rwnd here 8397 * and tcp_opt_get ? 8398 */ 8399 *outlenp = inlen; 8400 return (0); 8401 case SO_SND_COPYAVOID: 8402 if (!checkonly) { 8403 if (tcp->tcp_loopback || 8404 (tcp->tcp_kssl_ctx != NULL) || 8405 (onoff != 1) || !tcp_zcopy_check(tcp)) { 8406 *outlenp = 0; 8407 return (EOPNOTSUPP); 8408 } 8409 tcp->tcp_snd_zcopy_aware = 1; 8410 } 8411 *outlenp = inlen; 8412 return (0); 8413 } 8414 break; 8415 case IPPROTO_TCP: 8416 switch (name) { 8417 case TCP_NODELAY: 8418 if (!checkonly) 8419 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss; 8420 break; 8421 case TCP_NOTIFY_THRESHOLD: 8422 if (!checkonly) 8423 tcp->tcp_first_timer_threshold = *i1; 8424 break; 8425 case TCP_ABORT_THRESHOLD: 8426 if (!checkonly) 8427 tcp->tcp_second_timer_threshold = *i1; 8428 break; 8429 case TCP_CONN_NOTIFY_THRESHOLD: 8430 if (!checkonly) 8431 tcp->tcp_first_ctimer_threshold = *i1; 8432 break; 8433 case TCP_CONN_ABORT_THRESHOLD: 8434 if (!checkonly) 8435 tcp->tcp_second_ctimer_threshold = *i1; 8436 break; 8437 case TCP_RECVDSTADDR: 8438 if (tcp->tcp_state > TCPS_LISTEN) { 8439 *outlenp = 0; 8440 return (EOPNOTSUPP); 8441 } 8442 /* Setting done in conn_opt_set */ 8443 break; 8444 case TCP_INIT_CWND: { 8445 uint32_t init_cwnd = *((uint32_t *)invalp); 8446 8447 if (checkonly) 8448 break; 8449 8450 /* 8451 * Only allow socket with network configuration 8452 * privilege to set the initial cwnd to be larger 8453 * than allowed by RFC 3390. 
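 *
 * (Illustrative arithmetic for the unprivileged ceiling computed
 * below, added editorially: with an MSS of 1460,
 * MIN(4, MAX(2, 4380 / 1460)) = 3 segments; with an MSS of 536 it
 * is 4. Larger requests require secpolicy_ip_config() and are still
 * capped at TCP_MAX_INIT_CWND.)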
8454 */ 8455 if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 8456 tcp->tcp_init_cwnd = init_cwnd; 8457 break; 8458 } 8459 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) { 8460 *outlenp = 0; 8461 return (reterr); 8462 } 8463 if (init_cwnd > TCP_MAX_INIT_CWND) { 8464 *outlenp = 0; 8465 return (EINVAL); 8466 } 8467 tcp->tcp_init_cwnd = init_cwnd; 8468 break; 8469 } 8470 case TCP_KEEPALIVE_THRESHOLD: 8471 if (checkonly) 8472 break; 8473 8474 if (*i1 < tcps->tcps_keepalive_interval_low || 8475 *i1 > tcps->tcps_keepalive_interval_high) { 8476 *outlenp = 0; 8477 return (EINVAL); 8478 } 8479 if (*i1 != tcp->tcp_ka_interval) { 8480 tcp->tcp_ka_interval = *i1; 8481 /* 8482 * Check if we need to restart the 8483 * keepalive timer. 8484 */ 8485 if (tcp->tcp_ka_tid != 0) { 8486 ASSERT(connp->conn_keepalive); 8487 (void) TCP_TIMER_CANCEL(tcp, 8488 tcp->tcp_ka_tid); 8489 tcp->tcp_ka_last_intrvl = 0; 8490 tcp->tcp_ka_tid = TCP_TIMER(tcp, 8491 tcp_keepalive_killer, 8492 MSEC_TO_TICK(tcp->tcp_ka_interval)); 8493 } 8494 } 8495 break; 8496 case TCP_KEEPALIVE_ABORT_THRESHOLD: 8497 if (!checkonly) { 8498 if (*i1 < 8499 tcps->tcps_keepalive_abort_interval_low || 8500 *i1 > 8501 tcps->tcps_keepalive_abort_interval_high) { 8502 *outlenp = 0; 8503 return (EINVAL); 8504 } 8505 tcp->tcp_ka_abort_thres = *i1; 8506 } 8507 break; 8508 case TCP_CORK: 8509 if (!checkonly) { 8510 /* 8511 * if tcp->tcp_cork was set and is now 8512 * being unset, we have to make sure that 8513 * the remaining data gets sent out. Also 8514 * unset tcp->tcp_cork so that tcp_wput_data() 8515 * can send data even if it is less than mss 8516 */ 8517 if (tcp->tcp_cork && onoff == 0 && 8518 tcp->tcp_unsent > 0) { 8519 tcp->tcp_cork = B_FALSE; 8520 tcp_wput_data(tcp, NULL, B_FALSE); 8521 } 8522 tcp->tcp_cork = onoff; 8523 } 8524 break; 8525 default: 8526 break; 8527 } 8528 break; 8529 case IPPROTO_IP: 8530 if (connp->conn_family != AF_INET) { 8531 *outlenp = 0; 8532 return (EINVAL); 8533 } 8534 switch (name) { 8535 case IP_SEC_OPT: 8536 /* 8537 * We should not allow policy setting after 8538 * we start listening for connections. 8539 */ 8540 if (tcp->tcp_state == TCPS_LISTEN) { 8541 return (EINVAL); 8542 } 8543 break; 8544 } 8545 break; 8546 case IPPROTO_IPV6: 8547 /* 8548 * IPPROTO_IPV6 options are only supported for sockets 8549 * that are using IPv6 on the wire. 8550 */ 8551 if (connp->conn_ipversion != IPV6_VERSION) { 8552 *outlenp = 0; 8553 return (EINVAL); 8554 } 8555 8556 switch (name) { 8557 case IPV6_RECVPKTINFO: 8558 if (!checkonly) { 8559 /* Force it to be sent up with the next msg */ 8560 tcp->tcp_recvifindex = 0; 8561 } 8562 break; 8563 case IPV6_RECVTCLASS: 8564 if (!checkonly) { 8565 /* Force it to be sent up with the next msg */ 8566 tcp->tcp_recvtclass = 0xffffffffU; 8567 } 8568 break; 8569 case IPV6_RECVHOPLIMIT: 8570 if (!checkonly) { 8571 /* Force it to be sent up with the next msg */ 8572 tcp->tcp_recvhops = 0xffffffffU; 8573 } 8574 break; 8575 case IPV6_PKTINFO: 8576 /* This is an extra check for TCP */ 8577 if (inlen == sizeof (struct in6_pktinfo)) { 8578 struct in6_pktinfo *pkti; 8579 8580 pkti = (struct in6_pktinfo *)invalp; 8581 /* 8582 * RFC 3542 states that ipi6_addr must be 8583 * the unspecified address when setting the 8584 * IPV6_PKTINFO sticky socket option on a 8585 * TCP socket. 8586 */ 8587 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 8588 return (EINVAL); 8589 } 8590 break; 8591 case IPV6_SEC_OPT: 8592 /* 8593 * We should not allow policy setting after 8594 * we start listening for connections. 
8595 */
8596 if (tcp->tcp_state == TCPS_LISTEN) {
8597 return (EINVAL);
8598 }
8599 break;
8600 }
8601 break;
8602 }
8603 reterr = conn_opt_set(&coas, level, name, inlen, invalp,
8604 checkonly, cr);
8605 if (reterr != 0) {
8606 *outlenp = 0;
8607 return (reterr);
8608 }
8609
8610 /*
8611 * Common case of OK return with outval same as inval
8612 */
8613 if (invalp != outvalp) {
8614 /* don't trust bcopy for identical src/dst */
8615 (void) bcopy(invalp, outvalp, inlen);
8616 }
8617 *outlenp = inlen;
8618
8619 if (coas.coa_changed & COA_HEADER_CHANGED) {
8620 /* If we are connected we rebuild the headers */
8621 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
8622 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
8623 reterr = tcp_build_hdrs(tcp);
8624 if (reterr != 0)
8625 return (reterr);
8626 }
8627 }
8628 if (coas.coa_changed & COA_ROUTE_CHANGED) {
8629 in6_addr_t nexthop;
8630
8631 /*
8632 * If we are connected we re-cache the information.
8633 * We ignore errors to preserve BSD behavior.
8634 * Note that we don't redo IPsec policy lookup here
8635 * since the final destination (or source) didn't change.
8636 */
8637 ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
8638 &connp->conn_faddr_v6, &nexthop);
8639
8640 if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
8641 !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
8642 (void) ip_attr_connect(connp, connp->conn_ixa,
8643 &connp->conn_laddr_v6, &connp->conn_faddr_v6,
8644 &nexthop, connp->conn_fport, NULL, NULL,
8645 IPDF_VERIFY_DST);
8646 }
8647 }
8648 if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
8649 connp->conn_wq->q_hiwat = connp->conn_sndbuf;
8650 }
8651 if (coas.coa_changed & COA_WROFF_CHANGED) {
8652 connp->conn_wroff = connp->conn_ht_iphc_allocated +
8653 tcps->tcps_wroff_xtra;
8654 (void) proto_set_tx_wroff(connp->conn_rq, connp,
8655 connp->conn_wroff);
8656 }
8657 if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
8658 if (IPCL_IS_NONSTR(connp))
8659 proto_set_rx_oob_opt(connp, onoff);
8660 }
8661 return (0);
8662 }
8663
8664 /* ARGSUSED */
8665 int
8666 tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
8667 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
8668 void *thisdg_attrs, cred_t *cr)
8669 {
8670 conn_t *connp = Q_TO_CONN(q);
8671
8672 return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
8673 outlenp, outvalp, thisdg_attrs, cr));
8674 }
8675
8676 int
8677 tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name,
8678 const void *optvalp, socklen_t optlen, cred_t *cr)
8679 {
8680 conn_t *connp = (conn_t *)proto_handle;
8681 squeue_t *sqp = connp->conn_sqp;
8682 int error;
8683
8684 ASSERT(connp->conn_upper_handle != NULL);
8685 /*
8686 * Entering the squeue synchronously can result in a context switch,
8687 * which can cause a rather severe performance degradation. So we try to
8688 * handle whatever options we can without entering the squeue.
8689 */
8690 if (level == IPPROTO_TCP) {
8691 switch (option_name) {
8692 case TCP_NODELAY:
8693 if (optlen != sizeof (int32_t))
8694 return (EINVAL);
8695 mutex_enter(&connp->conn_tcp->tcp_non_sq_lock);
8696 connp->conn_tcp->tcp_naglim = *(int *)optvalp ?
1 : 8697 connp->conn_tcp->tcp_mss; 8698 mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); 8699 return (0); 8700 default: 8701 break; 8702 } 8703 } 8704 8705 error = squeue_synch_enter(sqp, connp, NULL); 8706 if (error == ENOMEM) { 8707 return (ENOMEM); 8708 } 8709 8710 error = proto_opt_check(level, option_name, optlen, NULL, 8711 tcp_opt_obj.odb_opt_des_arr, 8712 tcp_opt_obj.odb_opt_arr_cnt, 8713 B_TRUE, B_FALSE, cr); 8714 8715 if (error != 0) { 8716 if (error < 0) { 8717 error = proto_tlitosyserr(-error); 8718 } 8719 squeue_synch_exit(sqp, connp); 8720 return (error); 8721 } 8722 8723 error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, 8724 optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, 8725 NULL, cr); 8726 squeue_synch_exit(sqp, connp); 8727 8728 ASSERT(error >= 0); 8729 8730 return (error); 8731 } 8732 8733 /* 8734 * Build/update the tcp header template (in conn_ht_iphc) based on 8735 * conn_xmit_ipp. The headers include ip6_t, any extension 8736 * headers, and the maximum size tcp header (to avoid reallocation 8737 * on the fly for additional tcp options). 8738 * 8739 * Assumes the caller has already set conn_{faddr,laddr,fport,lport,flowinfo}. 8740 * Returns failure if can't allocate memory. 8741 */ 8742 static int 8743 tcp_build_hdrs(tcp_t *tcp) 8744 { 8745 tcp_stack_t *tcps = tcp->tcp_tcps; 8746 conn_t *connp = tcp->tcp_connp; 8747 char buf[TCP_MAX_HDR_LENGTH]; 8748 uint_t buflen; 8749 uint_t ulplen = TCP_MIN_HEADER_LENGTH; 8750 uint_t extralen = TCP_MAX_TCP_OPTIONS_LENGTH; 8751 tcpha_t *tcpha; 8752 uint32_t cksum; 8753 int error; 8754 8755 /* 8756 * We might be called after the connection is set up, and we might 8757 * have TS options already in the TCP header. Thus we save any 8758 * existing tcp header. 8759 */ 8760 buflen = connp->conn_ht_ulp_len; 8761 if (buflen != 0) { 8762 bcopy(connp->conn_ht_ulp, buf, buflen); 8763 extralen -= buflen - ulplen; 8764 ulplen = buflen; 8765 } 8766 8767 /* Grab lock to satisfy ASSERT; TCP is serialized using squeue */ 8768 mutex_enter(&connp->conn_lock); 8769 error = conn_build_hdr_template(connp, ulplen, extralen, 8770 &connp->conn_laddr_v6, &connp->conn_faddr_v6, connp->conn_flowinfo); 8771 mutex_exit(&connp->conn_lock); 8772 if (error != 0) 8773 return (error); 8774 8775 /* 8776 * Any routing header/option has been massaged. The checksum difference 8777 * is stored in conn_sum for later use. 8778 */ 8779 tcpha = (tcpha_t *)connp->conn_ht_ulp; 8780 tcp->tcp_tcpha = tcpha; 8781 8782 /* restore any old tcp header */ 8783 if (buflen != 0) { 8784 bcopy(buf, connp->conn_ht_ulp, buflen); 8785 } else { 8786 tcpha->tha_sum = 0; 8787 tcpha->tha_offset_and_reserved = (5 << 4); 8788 tcpha->tha_lport = connp->conn_lport; 8789 tcpha->tha_fport = connp->conn_fport; 8790 } 8791 8792 /* 8793 * IP wants our header length in the checksum field to 8794 * allow it to perform a single pseudo-header+checksum 8795 * calculation on behalf of TCP. 8796 * Include the adjustment for a source route once IP_OPTIONS is set. 
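 *
 * (Illustrative folding example, added editorially; the conn_sum
 * value is made up: assuming the usual 20-byte sizeof (tcpha_t) and
 * a conn_sum of 0xFFF0, the sum below is 0x10004, which folds to
 * 0x0004 + 0x0001 = 0x0005 before being stored with htons().)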
8797 */ 8798 cksum = sizeof (tcpha_t) + connp->conn_sum; 8799 cksum = (cksum >> 16) + (cksum & 0xFFFF); 8800 ASSERT(cksum < 0x10000); 8801 tcpha->tha_sum = htons(cksum); 8802 8803 if (connp->conn_ipversion == IPV4_VERSION) 8804 tcp->tcp_ipha = (ipha_t *)connp->conn_ht_iphc; 8805 else 8806 tcp->tcp_ip6h = (ip6_t *)connp->conn_ht_iphc; 8807 8808 if (connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra > 8809 connp->conn_wroff) { 8810 connp->conn_wroff = connp->conn_ht_iphc_allocated + 8811 tcps->tcps_wroff_xtra; 8812 (void) proto_set_tx_wroff(connp->conn_rq, connp, 8813 connp->conn_wroff); 8814 } 8815 return (0); 8816 } 8817 8818 /* Get callback routine passed to nd_load by tcp_param_register */ 8819 /* ARGSUSED */ 8820 static int 8821 tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 8822 { 8823 tcpparam_t *tcppa = (tcpparam_t *)cp; 8824 8825 (void) mi_mpprintf(mp, "%u", tcppa->tcp_param_val); 8826 return (0); 8827 } 8828 8829 /* 8830 * Walk through the param array specified registering each element with the 8831 * named dispatch handler. 8832 */ 8833 static boolean_t 8834 tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps) 8835 { 8836 for (; cnt-- > 0; tcppa++) { 8837 if (tcppa->tcp_param_name && tcppa->tcp_param_name[0]) { 8838 if (!nd_load(ndp, tcppa->tcp_param_name, 8839 tcp_param_get, tcp_param_set, 8840 (caddr_t)tcppa)) { 8841 nd_free(ndp); 8842 return (B_FALSE); 8843 } 8844 } 8845 } 8846 tcps->tcps_wroff_xtra_param = kmem_zalloc(sizeof (tcpparam_t), 8847 KM_SLEEP); 8848 bcopy(&lcl_tcp_wroff_xtra_param, tcps->tcps_wroff_xtra_param, 8849 sizeof (tcpparam_t)); 8850 if (!nd_load(ndp, tcps->tcps_wroff_xtra_param->tcp_param_name, 8851 tcp_param_get, tcp_param_set_aligned, 8852 (caddr_t)tcps->tcps_wroff_xtra_param)) { 8853 nd_free(ndp); 8854 return (B_FALSE); 8855 } 8856 if (!nd_load(ndp, "tcp_extra_priv_ports", 8857 tcp_extra_priv_ports_get, NULL, NULL)) { 8858 nd_free(ndp); 8859 return (B_FALSE); 8860 } 8861 if (!nd_load(ndp, "tcp_extra_priv_ports_add", 8862 NULL, tcp_extra_priv_ports_add, NULL)) { 8863 nd_free(ndp); 8864 return (B_FALSE); 8865 } 8866 if (!nd_load(ndp, "tcp_extra_priv_ports_del", 8867 NULL, tcp_extra_priv_ports_del, NULL)) { 8868 nd_free(ndp); 8869 return (B_FALSE); 8870 } 8871 if (!nd_load(ndp, "tcp_1948_phrase", NULL, 8872 tcp_1948_phrase_set, NULL)) { 8873 nd_free(ndp); 8874 return (B_FALSE); 8875 } 8876 8877 8878 if (!nd_load(ndp, "tcp_listener_limit_conf", 8879 tcp_listener_conf_get, NULL, NULL)) { 8880 nd_free(ndp); 8881 return (B_FALSE); 8882 } 8883 if (!nd_load(ndp, "tcp_listener_limit_conf_add", 8884 NULL, tcp_listener_conf_add, NULL)) { 8885 nd_free(ndp); 8886 return (B_FALSE); 8887 } 8888 if (!nd_load(ndp, "tcp_listener_limit_conf_del", 8889 NULL, tcp_listener_conf_del, NULL)) { 8890 nd_free(ndp); 8891 return (B_FALSE); 8892 } 8893 8894 /* 8895 * Dummy ndd variables - only to convey obsolescence information 8896 * through printing of their name (no get or set routines) 8897 * XXX Remove in future releases ? 8898 */ 8899 if (!nd_load(ndp, 8900 "tcp_close_wait_interval(obsoleted - " 8901 "use tcp_time_wait_interval)", NULL, NULL, NULL)) { 8902 nd_free(ndp); 8903 return (B_FALSE); 8904 } 8905 return (B_TRUE); 8906 } 8907 8908 /* ndd set routine for tcp_wroff_xtra. 
*/ 8909 /* ARGSUSED */ 8910 static int 8911 tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 8912 cred_t *cr) 8913 { 8914 long new_value; 8915 tcpparam_t *tcppa = (tcpparam_t *)cp; 8916 8917 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 8918 new_value < tcppa->tcp_param_min || 8919 new_value > tcppa->tcp_param_max) { 8920 return (EINVAL); 8921 } 8922 /* 8923 * Need to make sure new_value is a multiple of 4. If it is not, 8924 * round it up. For future 64 bit requirement, we actually make it 8925 * a multiple of 8. 8926 */ 8927 if (new_value & 0x7) { 8928 new_value = (new_value & ~0x7) + 0x8; 8929 } 8930 tcppa->tcp_param_val = new_value; 8931 return (0); 8932 } 8933 8934 /* Set callback routine passed to nd_load by tcp_param_register */ 8935 /* ARGSUSED */ 8936 static int 8937 tcp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 8938 { 8939 long new_value; 8940 tcpparam_t *tcppa = (tcpparam_t *)cp; 8941 8942 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 8943 new_value < tcppa->tcp_param_min || 8944 new_value > tcppa->tcp_param_max) { 8945 return (EINVAL); 8946 } 8947 tcppa->tcp_param_val = new_value; 8948 return (0); 8949 } 8950 8951 static void 8952 tcp_reass_timer(void *arg) 8953 { 8954 conn_t *connp = (conn_t *)arg; 8955 tcp_t *tcp = connp->conn_tcp; 8956 8957 tcp->tcp_reass_tid = 0; 8958 if (tcp->tcp_reass_head == NULL) 8959 return; 8960 ASSERT(tcp->tcp_reass_tail != NULL); 8961 tcp_sack_remove(tcp->tcp_sack_list, TCP_REASS_END(tcp->tcp_reass_tail), 8962 &tcp->tcp_num_sack_blk); 8963 tcp_close_mpp(&tcp->tcp_reass_head); 8964 tcp->tcp_reass_tail = NULL; 8965 } 8966 8967 /* 8968 * Add a new piece to the tcp reassembly queue. If the gap at the beginning 8969 * is filled, return as much as we can. The message passed in may be 8970 * multi-part, chained using b_cont. "start" is the starting sequence 8971 * number for this piece. 8972 */ 8973 static mblk_t * 8974 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 8975 { 8976 uint32_t end; 8977 mblk_t *mp1; 8978 mblk_t *mp2; 8979 mblk_t *next_mp; 8980 uint32_t u1; 8981 tcp_stack_t *tcps = tcp->tcp_tcps; 8982 8983 8984 /* Walk through all the new pieces. */ 8985 do { 8986 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 8987 (uintptr_t)INT_MAX); 8988 end = start + (int)(mp->b_wptr - mp->b_rptr); 8989 next_mp = mp->b_cont; 8990 if (start == end) { 8991 /* Empty. Blast it. */ 8992 freeb(mp); 8993 continue; 8994 } 8995 mp->b_cont = NULL; 8996 TCP_REASS_SET_SEQ(mp, start); 8997 TCP_REASS_SET_END(mp, end); 8998 mp1 = tcp->tcp_reass_tail; 8999 if (!mp1) { 9000 tcp->tcp_reass_tail = mp; 9001 tcp->tcp_reass_head = mp; 9002 BUMP_MIB(&tcps->tcps_mib, tcpInDataUnorderSegs); 9003 UPDATE_MIB(&tcps->tcps_mib, 9004 tcpInDataUnorderBytes, end - start); 9005 continue; 9006 } 9007 /* New stuff completely beyond tail? */ 9008 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 9009 /* Link it on end. */ 9010 mp1->b_cont = mp; 9011 tcp->tcp_reass_tail = mp; 9012 BUMP_MIB(&tcps->tcps_mib, tcpInDataUnorderSegs); 9013 UPDATE_MIB(&tcps->tcps_mib, 9014 tcpInDataUnorderBytes, end - start); 9015 continue; 9016 } 9017 mp1 = tcp->tcp_reass_head; 9018 u1 = TCP_REASS_SEQ(mp1); 9019 /* New stuff at the front? */ 9020 if (SEQ_LT(start, u1)) { 9021 /* Yes... Check for overlap. */ 9022 mp->b_cont = mp1; 9023 tcp->tcp_reass_head = mp; 9024 tcp_reass_elim_overlap(tcp, mp); 9025 continue; 9026 } 9027 /* 9028 * The new piece fits somewhere between the head and tail. 9029 * We find our slot, where mp1 precedes us and mp2 trails. 
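 * For example (sequence numbers are hypothetical): with queued pieces
 * covering [100, 200) and [300, 400), a new piece [250, 350) breaks out
 * of the loop below with mp1 = [100, 200) and mp2 = [300, 400); it is
 * linked between them, tcp_reass_elim_overlap(tcp, mp) then trims its
 * tail back to 300 where it overlaps mp2, and the preceding piece needs
 * no trimming since it ends before 250.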
9030 */ 9031 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 9032 u1 = TCP_REASS_SEQ(mp2); 9033 if (SEQ_LEQ(start, u1)) 9034 break; 9035 } 9036 /* Link ourselves in */ 9037 mp->b_cont = mp2; 9038 mp1->b_cont = mp; 9039 9040 /* Trim overlap with following mblk(s) first */ 9041 tcp_reass_elim_overlap(tcp, mp); 9042 9043 /* Trim overlap with preceding mblk */ 9044 tcp_reass_elim_overlap(tcp, mp1); 9045 9046 } while (start = end, mp = next_mp); 9047 mp1 = tcp->tcp_reass_head; 9048 /* Anything ready to go? */ 9049 if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 9050 return (NULL); 9051 /* Eat what we can off the queue */ 9052 for (;;) { 9053 mp = mp1->b_cont; 9054 end = TCP_REASS_END(mp1); 9055 TCP_REASS_SET_SEQ(mp1, 0); 9056 TCP_REASS_SET_END(mp1, 0); 9057 if (!mp) { 9058 tcp->tcp_reass_tail = NULL; 9059 break; 9060 } 9061 if (end != TCP_REASS_SEQ(mp)) { 9062 mp1->b_cont = NULL; 9063 break; 9064 } 9065 mp1 = mp; 9066 } 9067 mp1 = tcp->tcp_reass_head; 9068 tcp->tcp_reass_head = mp; 9069 return (mp1); 9070 } 9071 9072 /* Eliminate any overlap that mp may have over later mblks */ 9073 static void 9074 tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 9075 { 9076 uint32_t end; 9077 mblk_t *mp1; 9078 uint32_t u1; 9079 tcp_stack_t *tcps = tcp->tcp_tcps; 9080 9081 end = TCP_REASS_END(mp); 9082 while ((mp1 = mp->b_cont) != NULL) { 9083 u1 = TCP_REASS_SEQ(mp1); 9084 if (!SEQ_GT(end, u1)) 9085 break; 9086 if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 9087 mp->b_wptr -= end - u1; 9088 TCP_REASS_SET_END(mp, u1); 9089 BUMP_MIB(&tcps->tcps_mib, tcpInDataPartDupSegs); 9090 UPDATE_MIB(&tcps->tcps_mib, 9091 tcpInDataPartDupBytes, end - u1); 9092 break; 9093 } 9094 mp->b_cont = mp1->b_cont; 9095 TCP_REASS_SET_SEQ(mp1, 0); 9096 TCP_REASS_SET_END(mp1, 0); 9097 freeb(mp1); 9098 BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); 9099 UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, end - u1); 9100 } 9101 if (!mp1) 9102 tcp->tcp_reass_tail = mp; 9103 } 9104 9105 static uint_t 9106 tcp_rwnd_reopen(tcp_t *tcp) 9107 { 9108 uint_t ret = 0; 9109 uint_t thwin; 9110 conn_t *connp = tcp->tcp_connp; 9111 9112 /* Learn the latest rwnd information that we sent to the other side. */ 9113 thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win)) 9114 << tcp->tcp_rcv_ws; 9115 /* This is peer's calculated send window (our receive window). */ 9116 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 9117 /* 9118 * Increase the receive window to max. But we need to do receiver 9119 * SWS avoidance. This means that we need to check the increase of 9120 * of receive window is at least 1 MSS. 9121 */ 9122 if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) { 9123 /* 9124 * If the window that the other side knows is less than max 9125 * deferred acks segments, send an update immediately. 9126 */ 9127 if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { 9128 BUMP_MIB(&tcp->tcp_tcps->tcps_mib, tcpOutWinUpdate); 9129 ret = TH_ACK_NEEDED; 9130 } 9131 tcp->tcp_rwnd = connp->conn_rcvbuf; 9132 } 9133 return (ret); 9134 } 9135 9136 /* 9137 * Send up all messages queued on tcp_rcv_list. 9138 */ 9139 static uint_t 9140 tcp_rcv_drain(tcp_t *tcp) 9141 { 9142 mblk_t *mp; 9143 uint_t ret = 0; 9144 #ifdef DEBUG 9145 uint_t cnt = 0; 9146 #endif 9147 queue_t *q = tcp->tcp_connp->conn_rq; 9148 9149 /* Can't drain on an eager connection */ 9150 if (tcp->tcp_listener != NULL) 9151 return (ret); 9152 9153 /* Can't be a non-STREAMS connection */ 9154 ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 9155 9156 /* No need for the push timer now. 
*/ 9157 if (tcp->tcp_push_tid != 0) { 9158 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 9159 tcp->tcp_push_tid = 0; 9160 } 9161 9162 /* 9163 * Handle two cases here: we are currently fused or we were 9164 * previously fused and have some urgent data to be delivered 9165 * upstream. The latter happens because we either ran out of 9166 * memory or were detached and therefore sending the SIGURG was 9167 * deferred until this point. In either case we pass control 9168 * over to tcp_fuse_rcv_drain() since it may need to complete 9169 * some work. 9170 */ 9171 if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { 9172 ASSERT(IPCL_IS_NONSTR(tcp->tcp_connp) || 9173 tcp->tcp_fused_sigurg_mp != NULL); 9174 if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : 9175 &tcp->tcp_fused_sigurg_mp)) 9176 return (ret); 9177 } 9178 9179 while ((mp = tcp->tcp_rcv_list) != NULL) { 9180 tcp->tcp_rcv_list = mp->b_next; 9181 mp->b_next = NULL; 9182 #ifdef DEBUG 9183 cnt += msgdsize(mp); 9184 #endif 9185 /* Does this need SSL processing first? */ 9186 if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { 9187 DTRACE_PROBE1(kssl_mblk__ksslinput_rcvdrain, 9188 mblk_t *, mp); 9189 tcp_kssl_input(tcp, mp, NULL); 9190 continue; 9191 } 9192 putnext(q, mp); 9193 } 9194 #ifdef DEBUG 9195 ASSERT(cnt == tcp->tcp_rcv_cnt); 9196 #endif 9197 tcp->tcp_rcv_last_head = NULL; 9198 tcp->tcp_rcv_last_tail = NULL; 9199 tcp->tcp_rcv_cnt = 0; 9200 9201 if (canputnext(q)) 9202 return (tcp_rwnd_reopen(tcp)); 9203 9204 return (ret); 9205 } 9206 9207 /* 9208 * Queue data on tcp_rcv_list which is a b_next chain. 9209 * tcp_rcv_last_head/tail is the last element of this chain. 9210 * Each element of the chain is a b_cont chain. 9211 * 9212 * M_DATA messages are added to the current element. 9213 * Other messages are added as new (b_next) elements. 9214 */ 9215 void 9216 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr) 9217 { 9218 ASSERT(seg_len == msgdsize(mp)); 9219 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); 9220 9221 if (is_system_labeled()) { 9222 ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); 9223 /* 9224 * Provide for protocols above TCP such as RPC. NOPID leaves 9225 * db_cpid unchanged. 9226 * The cred could have already been set. 9227 */ 9228 if (cr != NULL) 9229 mblk_setcred(mp, cr, NOPID); 9230 } 9231 9232 if (tcp->tcp_rcv_list == NULL) { 9233 ASSERT(tcp->tcp_rcv_last_head == NULL); 9234 tcp->tcp_rcv_list = mp; 9235 tcp->tcp_rcv_last_head = mp; 9236 } else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) { 9237 tcp->tcp_rcv_last_tail->b_cont = mp; 9238 } else { 9239 tcp->tcp_rcv_last_head->b_next = mp; 9240 tcp->tcp_rcv_last_head = mp; 9241 } 9242 9243 while (mp->b_cont) 9244 mp = mp->b_cont; 9245 9246 tcp->tcp_rcv_last_tail = mp; 9247 tcp->tcp_rcv_cnt += seg_len; 9248 tcp->tcp_rwnd -= seg_len; 9249 } 9250 9251 /* The minimum of smoothed mean deviation in RTO calculation. */ 9252 #define TCP_SD_MIN 400 9253 9254 /* 9255 * Set RTO for this connection. The formula is from Jacobson and Karels' 9256 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 9257 * are the same as those in Appendix A.2 of that paper. 9258 * 9259 * m = new measurement 9260 * sa = smoothed RTT average (8 * average estimates). 9261 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). 
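 *
 * A worked example with made-up numbers: suppose sa = 800 (a 100 ms
 * smoothed RTT kept as 8x) and sv = 480, and a new measurement m of
 * 200 ms arrives. The error is 200 - (sa >> 3) = 100, so sa becomes
 * 900 (112.5 ms) and sv becomes 480 + (100 - (480 >> 2)) = 460. The
 * resulting rto = (sa >> 3) + sv + tcps_rexmit_interval_extra +
 * (sa >> 5) = 112 + 460 + 0 + 28 = 600 ms (assuming the extra tunable
 * is left at its default of 0), which is then clamped to the
 * [tcps_rexmit_interval_min, tcps_rexmit_interval_max] range below.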
9262 */ 9263 static void 9264 tcp_set_rto(tcp_t *tcp, clock_t rtt) 9265 { 9266 long m = TICK_TO_MSEC(rtt); 9267 clock_t sa = tcp->tcp_rtt_sa; 9268 clock_t sv = tcp->tcp_rtt_sd; 9269 clock_t rto; 9270 tcp_stack_t *tcps = tcp->tcp_tcps; 9271 9272 BUMP_MIB(&tcps->tcps_mib, tcpRttUpdate); 9273 tcp->tcp_rtt_update++; 9274 9275 /* A non-zero tcp_rtt_sa means there is an existing estimate to update. */ 9276 if (sa != 0) { 9277 /* 9278 * Update average estimator: 9279 * new rtt = 7/8 old rtt + 1/8 Error 9280 */ 9281 9282 /* m is now Error in estimate. */ 9283 m -= sa >> 3; 9284 if ((sa += m) <= 0) { 9285 /* 9286 * Don't allow the smoothed average to be negative. 9287 * We use 0 to denote reinitialization of the 9288 * variables. 9289 */ 9290 sa = 1; 9291 } 9292 9293 /* 9294 * Update deviation estimator: 9295 * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev) 9296 */ 9297 if (m < 0) 9298 m = -m; 9299 m -= sv >> 2; 9300 sv += m; 9301 } else { 9302 /* 9303 * This follows BSD's implementation. So the reinitialized 9304 * RTO is 3 * m. We cannot go less than 2 because if the 9305 * link is bandwidth dominated, doubling the window size 9306 * during slow start means doubling the RTT. We want to be 9307 * more conservative when we reinitialize our estimates. 3 9308 * is just a convenient number. 9309 */ 9310 sa = m << 3; 9311 sv = m << 1; 9312 } 9313 if (sv < TCP_SD_MIN) { 9314 /* 9315 * We do not know whether sa captures the delayed ACK 9316 * effect; in a long train of segments, a receiver 9317 * does not delay its ACKs. So set the minimum of sv 9318 * to TCP_SD_MIN, which defaults to 400 ms, twice 9319 * the BSD DATO. That means the minimum of the mean 9320 * deviation is 100 ms. 9321 * 9322 */ 9323 sv = TCP_SD_MIN; 9324 } 9325 tcp->tcp_rtt_sa = sa; 9326 tcp->tcp_rtt_sd = sv; 9327 /* 9328 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv) 9329 * 9330 * Add tcp_rexmit_interval_extra in case of an extreme environment 9331 * where the algorithm fails to work. The default value of 9332 * tcp_rexmit_interval_extra should be 0. 9333 * 9334 * As we use a finer grained clock than BSD and update 9335 * RTO for every ACK, add in another .25 of RTT to the 9336 * deviation of RTO to accommodate burstiness of 1/4 of 9337 * window size. 9338 */ 9339 rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5); 9340 9341 if (rto > tcps->tcps_rexmit_interval_max) { 9342 tcp->tcp_rto = tcps->tcps_rexmit_interval_max; 9343 } else if (rto < tcps->tcps_rexmit_interval_min) { 9344 tcp->tcp_rto = tcps->tcps_rexmit_interval_min; 9345 } else { 9346 tcp->tcp_rto = rto; 9347 } 9348 9349 /* Now, we can reset tcp_timer_backoff to use the new RTO... */ 9350 tcp->tcp_timer_backoff = 0; 9351 } 9352 9353 /* 9354 * tcp_get_seg_mp() is called to get the pointer to a segment in the 9355 * send queue which starts at the given sequence number. If the given 9356 * sequence number is equal to the last valid sequence number (tcp_snxt), 9357 * the returned mblk is the last valid mblk, and off is set to the length 9358 * of that mblk. 9359 * 9360 * 9361 * 9362 * Parameters: 9363 * tcp_t *tcp: the tcp instance pointer. 9364 * uint32_t seq: the starting seq. no of the requested segment. 9365 * int32_t *off: after the execution, *off will be the offset to 9366 * the returned mblk which points to the requested seq no. 9367 * It is the caller's responsibility to send in a non-null off. 9368 * 9369 * Return: 9370 * A mblk_t pointer pointing to the requested segment in send queue.
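 *
 * For example (hypothetical values): with tcp_suna = 1000 and a send
 * queue whose first two mblks hold 500 and 300 bytes, a request for
 * seq 1600 walks past the first mblk and returns the second one with
 * *off = 100, i.e. the requested byte is 100 bytes into that mblk.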
9371 */ 9372 static mblk_t * 9373 tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 9374 { 9375 int32_t cnt; 9376 mblk_t *mp; 9377 9378 /* Defensive coding. Make sure we don't send incorrect data. */ 9379 if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GT(seq, tcp->tcp_snxt)) 9380 return (NULL); 9381 9382 cnt = seq - tcp->tcp_suna; 9383 mp = tcp->tcp_xmit_head; 9384 while (cnt > 0 && mp != NULL) { 9385 cnt -= mp->b_wptr - mp->b_rptr; 9386 if (cnt <= 0) { 9387 cnt += mp->b_wptr - mp->b_rptr; 9388 break; 9389 } 9390 mp = mp->b_cont; 9391 } 9392 ASSERT(mp != NULL); 9393 *off = cnt; 9394 return (mp); 9395 } 9396 9397 /* 9398 * This function handles all retransmissions if SACK is enabled for this 9399 * connection. First it calculates how many segments can be retransmitted 9400 * based on tcp_pipe. Then it goes thru the notsack list to find eligible 9401 * segments. A segment is eligible if sack_cnt for that segment is greater 9402 * than or equal tcp_dupack_fast_retransmit. After it has retransmitted 9403 * all eligible segments, it checks to see if TCP can send some new segments 9404 * (fast recovery). If it can, set the appropriate flag for tcp_input_data(). 9405 * 9406 * Parameters: 9407 * tcp_t *tcp: the tcp structure of the connection. 9408 * uint_t *flags: in return, appropriate value will be set for 9409 * tcp_input_data(). 9410 */ 9411 static void 9412 tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) 9413 { 9414 notsack_blk_t *notsack_blk; 9415 int32_t usable_swnd; 9416 int32_t mss; 9417 uint32_t seg_len; 9418 mblk_t *xmit_mp; 9419 tcp_stack_t *tcps = tcp->tcp_tcps; 9420 9421 ASSERT(tcp->tcp_sack_info != NULL); 9422 ASSERT(tcp->tcp_notsack_list != NULL); 9423 ASSERT(tcp->tcp_rexmit == B_FALSE); 9424 9425 /* Defensive coding in case there is a bug... */ 9426 if (tcp->tcp_notsack_list == NULL) { 9427 return; 9428 } 9429 notsack_blk = tcp->tcp_notsack_list; 9430 mss = tcp->tcp_mss; 9431 9432 /* 9433 * Limit the num of outstanding data in the network to be 9434 * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 9435 */ 9436 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 9437 9438 /* At least retransmit 1 MSS of data. */ 9439 if (usable_swnd <= 0) { 9440 usable_swnd = mss; 9441 } 9442 9443 /* Make sure no new RTT samples will be taken. */ 9444 tcp->tcp_csuna = tcp->tcp_snxt; 9445 9446 notsack_blk = tcp->tcp_notsack_list; 9447 while (usable_swnd > 0) { 9448 mblk_t *snxt_mp, *tmp_mp; 9449 tcp_seq begin = tcp->tcp_sack_snxt; 9450 tcp_seq end; 9451 int32_t off; 9452 9453 for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { 9454 if (SEQ_GT(notsack_blk->end, begin) && 9455 (notsack_blk->sack_cnt >= 9456 tcps->tcps_dupack_fast_retransmit)) { 9457 end = notsack_blk->end; 9458 if (SEQ_LT(begin, notsack_blk->begin)) { 9459 begin = notsack_blk->begin; 9460 } 9461 break; 9462 } 9463 } 9464 /* 9465 * All holes are filled. Manipulate tcp_cwnd to send more 9466 * if we can. Note that after the SACK recovery, tcp_cwnd is 9467 * set to tcp_cwnd_ssthresh. 
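 * For example (hypothetical numbers): with tcp_cwnd_ssthresh worth
 * 10 MSS and tcp_pipe at 7 MSS, usable_swnd below is 3 MSS; if there
 * is unsent data, tcp_cwnd is opened to (snxt - suna) + 3 MSS and
 * TH_XMIT_NEEDED is flagged, while with no unsent data tcp_cwnd simply
 * collapses to the amount currently outstanding (snxt - suna).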
9468 */ 9469 if (notsack_blk == NULL) { 9470 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 9471 if (usable_swnd <= 0 || tcp->tcp_unsent == 0) { 9472 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; 9473 ASSERT(tcp->tcp_cwnd > 0); 9474 return; 9475 } else { 9476 usable_swnd = usable_swnd / mss; 9477 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + 9478 MAX(usable_swnd * mss, mss); 9479 *flags |= TH_XMIT_NEEDED; 9480 return; 9481 } 9482 } 9483 9484 /* 9485 * Note that we may send more than usable_swnd allows here 9486 * because of round off, but no more than 1 MSS of data. 9487 */ 9488 seg_len = end - begin; 9489 if (seg_len > mss) 9490 seg_len = mss; 9491 snxt_mp = tcp_get_seg_mp(tcp, begin, &off); 9492 ASSERT(snxt_mp != NULL); 9493 /* This should not happen. Defensive coding again... */ 9494 if (snxt_mp == NULL) { 9495 return; 9496 } 9497 9498 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, 9499 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); 9500 if (xmit_mp == NULL) 9501 return; 9502 9503 usable_swnd -= seg_len; 9504 tcp->tcp_pipe += seg_len; 9505 tcp->tcp_sack_snxt = begin + seg_len; 9506 9507 tcp_send_data(tcp, xmit_mp); 9508 9509 /* 9510 * Update the send timestamp to avoid false retransmission. 9511 */ 9512 snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); 9513 9514 BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); 9515 UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, seg_len); 9516 BUMP_MIB(&tcps->tcps_mib, tcpOutSackRetransSegs); 9517 /* 9518 * Update tcp_rexmit_max to extend this SACK recovery phase. 9519 * This happens when new data sent during fast recovery is 9520 * also lost. If TCP retransmits those new data, it needs 9521 * to extend SACK recover phase to avoid starting another 9522 * fast retransmit/recovery unnecessarily. 9523 */ 9524 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { 9525 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; 9526 } 9527 } 9528 } 9529 9530 /* 9531 * tcp_ss_rexmit() is called to do slow start retransmission after a timeout 9532 * or ICMP errors. 9533 * 9534 * To limit the number of duplicate segments, we limit the number of segment 9535 * to be sent in one time to tcp_snd_burst, the burst variable. 9536 */ 9537 static void 9538 tcp_ss_rexmit(tcp_t *tcp) 9539 { 9540 uint32_t snxt; 9541 uint32_t smax; 9542 int32_t win; 9543 int32_t mss; 9544 int32_t off; 9545 int32_t burst = tcp->tcp_snd_burst; 9546 mblk_t *snxt_mp; 9547 tcp_stack_t *tcps = tcp->tcp_tcps; 9548 9549 /* 9550 * Note that tcp_rexmit can be set even though TCP has retransmitted 9551 * all unack'ed segments. 9552 */ 9553 if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) { 9554 smax = tcp->tcp_rexmit_max; 9555 snxt = tcp->tcp_rexmit_nxt; 9556 if (SEQ_LT(snxt, tcp->tcp_suna)) { 9557 snxt = tcp->tcp_suna; 9558 } 9559 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd); 9560 win -= snxt - tcp->tcp_suna; 9561 mss = tcp->tcp_mss; 9562 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off); 9563 9564 while (SEQ_LT(snxt, smax) && (win > 0) && 9565 (burst > 0) && (snxt_mp != NULL)) { 9566 mblk_t *xmit_mp; 9567 mblk_t *old_snxt_mp = snxt_mp; 9568 uint32_t cnt = mss; 9569 9570 if (win < cnt) { 9571 cnt = win; 9572 } 9573 if (SEQ_GT(snxt + cnt, smax)) { 9574 cnt = smax - snxt; 9575 } 9576 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off, 9577 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE); 9578 if (xmit_mp == NULL) 9579 return; 9580 9581 tcp_send_data(tcp, xmit_mp); 9582 9583 snxt += cnt; 9584 win -= cnt; 9585 /* 9586 * Update the send timestamp to avoid false 9587 * retransmission. 
9588 */ 9589 old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); 9590 BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); 9591 UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, cnt); 9592 9593 tcp->tcp_rexmit_nxt = snxt; 9594 burst--; 9595 } 9596 /* 9597 * If we have transmitted all we have at the time 9598 * we started the retranmission, we can leave 9599 * the rest of the job to tcp_wput_data(). But we 9600 * need to check the send window first. If the 9601 * win is not 0, go on with tcp_wput_data(). 9602 */ 9603 if (SEQ_LT(snxt, smax) || win == 0) { 9604 return; 9605 } 9606 } 9607 /* Only call tcp_wput_data() if there is data to be sent. */ 9608 if (tcp->tcp_unsent) { 9609 tcp_wput_data(tcp, NULL, B_FALSE); 9610 } 9611 } 9612 9613 /* 9614 * Process all TCP option in SYN segment. Note that this function should 9615 * be called after tcp_set_destination() is called so that the necessary info 9616 * from IRE is already set in the tcp structure. 9617 * 9618 * This function sets up the correct tcp_mss value according to the 9619 * MSS option value and our header size. It also sets up the window scale 9620 * and timestamp values, and initialize SACK info blocks. But it does not 9621 * change receive window size after setting the tcp_mss value. The caller 9622 * should do the appropriate change. 9623 */ 9624 void 9625 tcp_process_options(tcp_t *tcp, tcpha_t *tcpha) 9626 { 9627 int options; 9628 tcp_opt_t tcpopt; 9629 uint32_t mss_max; 9630 char *tmp_tcph; 9631 tcp_stack_t *tcps = tcp->tcp_tcps; 9632 conn_t *connp = tcp->tcp_connp; 9633 9634 tcpopt.tcp = NULL; 9635 options = tcp_parse_options(tcpha, &tcpopt); 9636 9637 /* 9638 * Process MSS option. Note that MSS option value does not account 9639 * for IP or TCP options. This means that it is equal to MTU - minimum 9640 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 9641 * IPv6. 9642 */ 9643 if (!(options & TCP_OPT_MSS_PRESENT)) { 9644 if (connp->conn_ipversion == IPV4_VERSION) 9645 tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; 9646 else 9647 tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; 9648 } else { 9649 if (connp->conn_ipversion == IPV4_VERSION) 9650 mss_max = tcps->tcps_mss_max_ipv4; 9651 else 9652 mss_max = tcps->tcps_mss_max_ipv6; 9653 if (tcpopt.tcp_opt_mss < tcps->tcps_mss_min) 9654 tcpopt.tcp_opt_mss = tcps->tcps_mss_min; 9655 else if (tcpopt.tcp_opt_mss > mss_max) 9656 tcpopt.tcp_opt_mss = mss_max; 9657 } 9658 9659 /* Process Window Scale option. */ 9660 if (options & TCP_OPT_WSCALE_PRESENT) { 9661 tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 9662 tcp->tcp_snd_ws_ok = B_TRUE; 9663 } else { 9664 tcp->tcp_snd_ws = B_FALSE; 9665 tcp->tcp_snd_ws_ok = B_FALSE; 9666 tcp->tcp_rcv_ws = B_FALSE; 9667 } 9668 9669 /* Process Timestamp option. */ 9670 if ((options & TCP_OPT_TSTAMP_PRESENT) && 9671 (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { 9672 tmp_tcph = (char *)tcp->tcp_tcpha; 9673 9674 tcp->tcp_snd_ts_ok = B_TRUE; 9675 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 9676 tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64(); 9677 ASSERT(OK_32PTR(tmp_tcph)); 9678 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); 9679 9680 /* Fill in our template header with basic timestamp option. 
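 * The 12 bytes appended below follow the standard RFC 1323 layout: two
 * NOPs for alignment, then kind 8 (TSTAMP) and length 10, leaving room
 * for the 4-byte TSval and TSecr fields; the TCP data offset grows by
 * three 32-bit words, hence the (3 << 4) added to
 * tha_offset_and_reserved.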
*/ 9681 tmp_tcph += connp->conn_ht_ulp_len; 9682 tmp_tcph[0] = TCPOPT_NOP; 9683 tmp_tcph[1] = TCPOPT_NOP; 9684 tmp_tcph[2] = TCPOPT_TSTAMP; 9685 tmp_tcph[3] = TCPOPT_TSTAMP_LEN; 9686 connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN; 9687 connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN; 9688 tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4); 9689 } else { 9690 tcp->tcp_snd_ts_ok = B_FALSE; 9691 } 9692 9693 /* 9694 * Process SACK options. If SACK is enabled for this connection, 9695 * then allocate the SACK info structure. Note the following ways 9696 * when tcp_snd_sack_ok is set to true. 9697 * 9698 * For active connection: in tcp_set_destination() called in 9699 * tcp_connect(). 9700 * 9701 * For passive connection: in tcp_set_destination() called in 9702 * tcp_input_listener(). 9703 * 9704 * That's the reason why the extra TCP_IS_DETACHED() check is there. 9705 * That check makes sure that if we did not send a SACK OK option, 9706 * we will not enable SACK for this connection even though the other 9707 * side sends us SACK OK option. For active connection, the SACK 9708 * info structure has already been allocated. So we need to free 9709 * it if SACK is disabled. 9710 */ 9711 if ((options & TCP_OPT_SACK_OK_PRESENT) && 9712 (tcp->tcp_snd_sack_ok || 9713 (tcps->tcps_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) { 9714 /* This should be true only in the passive case. */ 9715 if (tcp->tcp_sack_info == NULL) { 9716 ASSERT(TCP_IS_DETACHED(tcp)); 9717 tcp->tcp_sack_info = 9718 kmem_cache_alloc(tcp_sack_info_cache, KM_NOSLEEP); 9719 } 9720 if (tcp->tcp_sack_info == NULL) { 9721 tcp->tcp_snd_sack_ok = B_FALSE; 9722 } else { 9723 tcp->tcp_snd_sack_ok = B_TRUE; 9724 if (tcp->tcp_snd_ts_ok) { 9725 tcp->tcp_max_sack_blk = 3; 9726 } else { 9727 tcp->tcp_max_sack_blk = 4; 9728 } 9729 } 9730 } else { 9731 /* 9732 * Resetting tcp_snd_sack_ok to B_FALSE so that 9733 * no SACK info will be used for this 9734 * connection. This assumes that SACK usage 9735 * permission is negotiated. This may need 9736 * to be changed once this is clarified. 9737 */ 9738 if (tcp->tcp_sack_info != NULL) { 9739 ASSERT(tcp->tcp_notsack_list == NULL); 9740 kmem_cache_free(tcp_sack_info_cache, 9741 tcp->tcp_sack_info); 9742 tcp->tcp_sack_info = NULL; 9743 } 9744 tcp->tcp_snd_sack_ok = B_FALSE; 9745 } 9746 9747 /* 9748 * Now we know the exact TCP/IP header length, subtract 9749 * that from tcp_mss to get our side's MSS. 9750 */ 9751 tcp->tcp_mss -= connp->conn_ht_iphc_len; 9752 9753 /* 9754 * Here we assume that the other side's header size will be equal to 9755 * our header size. We calculate the real MSS accordingly. Need to 9756 * take into additional stuffs IPsec puts in. 9757 * 9758 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) 9759 */ 9760 tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len + 9761 tcp->tcp_ipsec_overhead - 9762 ((connp->conn_ipversion == IPV4_VERSION ? 9763 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); 9764 9765 /* 9766 * Set MSS to the smaller one of both ends of the connection. 9767 * We should not have called tcp_mss_set() before, but our 9768 * side of the MSS should have been set to a proper value 9769 * by tcp_set_destination(). tcp_mss_set() will also set up the 9770 * STREAM head parameters properly. 9771 * 9772 * If we have a larger-than-16-bit window but the other side 9773 * didn't want to do window scale, tcp_rwnd_set() will take 9774 * care of that. 9775 */ 9776 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); 9777 9778 /* 9779 * Initialize tcp_cwnd value. 
After tcp_mss_set(), tcp_mss has been 9780 * updated properly. 9781 */ 9782 SET_TCP_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); 9783 } 9784 9785 /* 9786 * Sends the T_CONN_IND to the listener. The caller calls this 9787 * function via squeue to get inside the listener's perimeter; 9788 * once the 3 way handshake is done, a T_CONN_IND needs to be 9789 * sent. As an optimization, the caller can call this directly 9790 * if the listener's perimeter is the same as the eager's. 9791 */ 9792 /* ARGSUSED */ 9793 void 9794 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) 9795 { 9796 conn_t *lconnp = (conn_t *)arg; 9797 tcp_t *listener = lconnp->conn_tcp; 9798 tcp_t *tcp; 9799 struct T_conn_ind *conn_ind; 9800 ipaddr_t *addr_cache; 9801 boolean_t need_send_conn_ind = B_FALSE; 9802 tcp_stack_t *tcps = listener->tcp_tcps; 9803 9804 /* retrieve the eager */ 9805 conn_ind = (struct T_conn_ind *)mp->b_rptr; 9806 ASSERT(conn_ind->OPT_offset != 0 && 9807 conn_ind->OPT_length == sizeof (intptr_t)); 9808 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 9809 conn_ind->OPT_length); 9810 9811 /* 9812 * TLI/XTI applications will get confused by 9813 * sending the eager as an option since it violates 9814 * the option semantics. So remove the eager as an 9815 * option since a TLI/XTI app doesn't need it anyway. 9816 */ 9817 if (!TCP_IS_SOCKET(listener)) { 9818 conn_ind->OPT_length = 0; 9819 conn_ind->OPT_offset = 0; 9820 } 9821 if (listener->tcp_state != TCPS_LISTEN) { 9822 /* 9823 * If the listener has closed, it would have caused 9824 * a cleanup/blowoff to happen for the eager. We 9825 * just need to return. 9826 */ 9827 freemsg(mp); 9828 return; 9829 } 9830 9831 9832 /* 9833 * If the conn_req_q is full, defer passing up the 9834 * T_CONN_IND until space is available after t_accept() 9835 * processing. 9836 */ 9837 mutex_enter(&listener->tcp_eager_lock); 9838 9839 /* 9840 * Take the eager out if it is in the list of droppable eagers, 9841 * as we are here because the 3W handshake is over. 9842 */ 9843 MAKE_UNDROPPABLE(tcp); 9844 9845 if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) { 9846 tcp_t *tail; 9847 9848 /* 9849 * The eager already has an extra ref put in tcp_input_data 9850 * so that it stays until accept comes back even though it 9851 * might get into TCPS_CLOSED as a result of a TH_RST etc. 9852 */ 9853 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 9854 listener->tcp_conn_req_cnt_q0--; 9855 listener->tcp_conn_req_cnt_q++; 9856 9857 /* Move from SYN_RCVD to ESTABLISHED list */ 9858 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 9859 tcp->tcp_eager_prev_q0; 9860 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 9861 tcp->tcp_eager_next_q0; 9862 tcp->tcp_eager_prev_q0 = NULL; 9863 tcp->tcp_eager_next_q0 = NULL; 9864 9865 /* 9866 * Insert at the end of the queue because sockfs 9867 * sends down T_CONN_RES in chronological 9868 * order. Leaving the older conn indications 9869 * at the front of the queue helps reduce search 9870 * time. 9871 */ 9872 tail = listener->tcp_eager_last_q; 9873 if (tail != NULL) 9874 tail->tcp_eager_next_q = tcp; 9875 else 9876 listener->tcp_eager_next_q = tcp; 9877 listener->tcp_eager_last_q = tcp; 9878 tcp->tcp_eager_next_q = NULL; 9879 /* 9880 * Delay sending up the T_conn_ind until we are 9881 * done with the eager. Once we have sent up 9882 * the T_conn_ind, the accept can potentially complete 9883 * any time and release the refhold we have on the eager.
9884 */ 9885 need_send_conn_ind = B_TRUE; 9886 } else { 9887 /* 9888 * Defer connection on q0 and set deferred 9889 * connection bit true 9890 */ 9891 tcp->tcp_conn_def_q0 = B_TRUE; 9892 9893 /* take tcp out of q0 ... */ 9894 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 9895 tcp->tcp_eager_next_q0; 9896 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 9897 tcp->tcp_eager_prev_q0; 9898 9899 /* ... and place it at the end of q0 */ 9900 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 9901 tcp->tcp_eager_next_q0 = listener; 9902 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 9903 listener->tcp_eager_prev_q0 = tcp; 9904 tcp->tcp_conn.tcp_eager_conn_ind = mp; 9905 } 9906 9907 /* we have timed out before */ 9908 if (tcp->tcp_syn_rcvd_timeout != 0) { 9909 tcp->tcp_syn_rcvd_timeout = 0; 9910 listener->tcp_syn_rcvd_timeout--; 9911 if (listener->tcp_syn_defense && 9912 listener->tcp_syn_rcvd_timeout <= 9913 (tcps->tcps_conn_req_max_q0 >> 5) && 9914 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - 9915 listener->tcp_last_rcv_lbolt)) { 9916 /* 9917 * Turn off the defense mode if we 9918 * believe the SYN attack is over. 9919 */ 9920 listener->tcp_syn_defense = B_FALSE; 9921 if (listener->tcp_ip_addr_cache) { 9922 kmem_free((void *)listener->tcp_ip_addr_cache, 9923 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 9924 listener->tcp_ip_addr_cache = NULL; 9925 } 9926 } 9927 } 9928 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 9929 if (addr_cache != NULL) { 9930 /* 9931 * We have finished a 3-way handshake with this 9932 * remote host. This proves the IP addr is good. 9933 * Cache it! 9934 */ 9935 addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = 9936 tcp->tcp_connp->conn_faddr_v4; 9937 } 9938 mutex_exit(&listener->tcp_eager_lock); 9939 if (need_send_conn_ind) 9940 tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); 9941 } 9942 9943 /* 9944 * Send the newconn notification to ulp. The eager is blown off if the 9945 * notification fails. 9946 */ 9947 static void 9948 tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) 9949 { 9950 if (IPCL_IS_NONSTR(lconnp)) { 9951 cred_t *cr; 9952 pid_t cpid = NOPID; 9953 9954 ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp); 9955 ASSERT(econnp->conn_tcp->tcp_saved_listener == 9956 lconnp->conn_tcp); 9957 9958 cr = msg_getcred(mp, &cpid); 9959 9960 /* Keep the message around in case of a fallback to TPI */ 9961 econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp; 9962 /* 9963 * Notify the ULP about the newconn. It is guaranteed that no 9964 * tcp_accept() call will be made for the eager if the 9965 * notification fails, so it's safe to blow it off in that 9966 * case. 9967 * 9968 * The upper handle will be assigned when tcp_accept() is 9969 * called. 9970 */ 9971 if ((*lconnp->conn_upcalls->su_newconn) 9972 (lconnp->conn_upper_handle, 9973 (sock_lower_handle_t)econnp, 9974 &sock_tcp_downcalls, cr, cpid, 9975 &econnp->conn_upcalls) == NULL) { 9976 /* Failed to allocate a socket */ 9977 BUMP_MIB(&lconnp->conn_tcp->tcp_tcps->tcps_mib, 9978 tcpEstabResets); 9979 (void) tcp_eager_blowoff(lconnp->conn_tcp, 9980 econnp->conn_tcp->tcp_conn_req_seqnum); 9981 } 9982 } else { 9983 putnext(lconnp->conn_rq, mp); 9984 } 9985 } 9986 9987 /* 9988 * Handle a packet that has been reclassified by TCP. 9989 * This function drops the ref on connp that the caller had. 
9990 */ 9991 static void 9992 tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) 9993 { 9994 ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; 9995 9996 if (connp->conn_incoming_ifindex != 0 && 9997 connp->conn_incoming_ifindex != ira->ira_ruifindex) { 9998 freemsg(mp); 9999 CONN_DEC_REF(connp); 10000 return; 10001 } 10002 10003 if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || 10004 (ira->ira_flags & IRAF_IPSEC_SECURE)) { 10005 ip6_t *ip6h; 10006 ipha_t *ipha; 10007 10008 if (ira->ira_flags & IRAF_IS_IPV4) { 10009 ipha = (ipha_t *)mp->b_rptr; 10010 ip6h = NULL; 10011 } else { 10012 ipha = NULL; 10013 ip6h = (ip6_t *)mp->b_rptr; 10014 } 10015 mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira); 10016 if (mp == NULL) { 10017 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 10018 /* Note that mp is NULL */ 10019 ip_drop_input("ipIfStatsInDiscards", mp, NULL); 10020 CONN_DEC_REF(connp); 10021 return; 10022 } 10023 } 10024 10025 if (IPCL_IS_TCP(connp)) { 10026 /* 10027 * do not drain, certain use cases can blow 10028 * the stack 10029 */ 10030 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 10031 connp->conn_recv, connp, ira, 10032 SQ_NODRAIN, SQTAG_IP_TCP_INPUT); 10033 } else { 10034 /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ 10035 (connp->conn_recv)(connp, mp, NULL, 10036 ira); 10037 CONN_DEC_REF(connp); 10038 } 10039 10040 } 10041 10042 boolean_t tcp_outbound_squeue_switch = B_FALSE; 10043 10044 /* 10045 * Handle M_DATA messages from IP. Its called directly from IP via 10046 * squeue for received IP packets. 10047 * 10048 * The first argument is always the connp/tcp to which the mp belongs. 10049 * There are no exceptions to this rule. The caller has already put 10050 * a reference on this connp/tcp and once tcp_input_data() returns, 10051 * the squeue will do the refrele. 10052 * 10053 * The TH_SYN for the listener directly go to tcp_input_listener via 10054 * squeue. ICMP errors go directly to tcp_icmp_input(). 10055 * 10056 * sqp: NULL = recursive, sqp != NULL means called from squeue 10057 */ 10058 void 10059 tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) 10060 { 10061 int32_t bytes_acked; 10062 int32_t gap; 10063 mblk_t *mp1; 10064 uint_t flags; 10065 uint32_t new_swnd = 0; 10066 uchar_t *iphdr; 10067 uchar_t *rptr; 10068 int32_t rgap; 10069 uint32_t seg_ack; 10070 int seg_len; 10071 uint_t ip_hdr_len; 10072 uint32_t seg_seq; 10073 tcpha_t *tcpha; 10074 int urp; 10075 tcp_opt_t tcpopt; 10076 ip_pkt_t ipp; 10077 boolean_t ofo_seg = B_FALSE; /* Out of order segment */ 10078 uint32_t cwnd; 10079 uint32_t add; 10080 int npkt; 10081 int mss; 10082 conn_t *connp = (conn_t *)arg; 10083 squeue_t *sqp = (squeue_t *)arg2; 10084 tcp_t *tcp = connp->conn_tcp; 10085 tcp_stack_t *tcps = tcp->tcp_tcps; 10086 10087 /* 10088 * RST from fused tcp loopback peer should trigger an unfuse. 10089 */ 10090 if (tcp->tcp_fused) { 10091 TCP_STAT(tcps, tcp_fusion_aborted); 10092 tcp_unfuse(tcp); 10093 } 10094 10095 iphdr = mp->b_rptr; 10096 rptr = mp->b_rptr; 10097 ASSERT(OK_32PTR(rptr)); 10098 10099 ip_hdr_len = ira->ira_ip_hdr_length; 10100 if (connp->conn_recv_ancillary.crb_all != 0) { 10101 /* 10102 * Record packet information in the ip_pkt_t 10103 */ 10104 ipp.ipp_fields = 0; 10105 if (ira->ira_flags & IRAF_IS_IPV4) { 10106 (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, 10107 B_FALSE); 10108 } else { 10109 uint8_t nexthdrp; 10110 10111 /* 10112 * IPv6 packets can only be received by applications 10113 * that are prepared to receive IPv6 addresses. 
10114 * The IP fanout must ensure this. 10115 */ 10116 ASSERT(connp->conn_family == AF_INET6); 10117 10118 (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, 10119 &nexthdrp); 10120 ASSERT(nexthdrp == IPPROTO_TCP); 10121 10122 /* Could have caused a pullup? */ 10123 iphdr = mp->b_rptr; 10124 rptr = mp->b_rptr; 10125 } 10126 } 10127 ASSERT(DB_TYPE(mp) == M_DATA); 10128 ASSERT(mp->b_next == NULL); 10129 10130 tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 10131 seg_seq = ntohl(tcpha->tha_seq); 10132 seg_ack = ntohl(tcpha->tha_ack); 10133 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 10134 seg_len = (int)(mp->b_wptr - rptr) - 10135 (ip_hdr_len + TCP_HDR_LENGTH(tcpha)); 10136 if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { 10137 do { 10138 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 10139 (uintptr_t)INT_MAX); 10140 seg_len += (int)(mp1->b_wptr - mp1->b_rptr); 10141 } while ((mp1 = mp1->b_cont) != NULL && 10142 mp1->b_datap->db_type == M_DATA); 10143 } 10144 10145 if (tcp->tcp_state == TCPS_TIME_WAIT) { 10146 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 10147 seg_len, tcpha, ira); 10148 return; 10149 } 10150 10151 if (sqp != NULL) { 10152 /* 10153 * This is the correct place to update tcp_last_recv_time. Note 10154 * that it is also updated for tcp structure that belongs to 10155 * global and listener queues which do not really need updating. 10156 * But that should not cause any harm. And it is updated for 10157 * all kinds of incoming segments, not only for data segments. 10158 */ 10159 tcp->tcp_last_recv_time = LBOLT_FASTPATH; 10160 } 10161 10162 flags = (unsigned int)tcpha->tha_flags & 0xFF; 10163 10164 BUMP_LOCAL(tcp->tcp_ibsegs); 10165 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); 10166 10167 if ((flags & TH_URG) && sqp != NULL) { 10168 /* 10169 * TCP can't handle urgent pointers that arrive before 10170 * the connection has been accept()ed since it can't 10171 * buffer OOB data. Discard segment if this happens. 10172 * 10173 * We can't just rely on a non-null tcp_listener to indicate 10174 * that the accept() has completed since unlinking of the 10175 * eager and completion of the accept are not atomic. 10176 * tcp_detached, when it is not set (B_FALSE) indicates 10177 * that the accept() has completed. 10178 * 10179 * Nor can it reassemble urgent pointers, so discard 10180 * if it's not the next segment expected. 10181 * 10182 * Otherwise, collapse chain into one mblk (discard if 10183 * that fails). This makes sure the headers, retransmitted 10184 * data, and new data all are in the same mblk. 10185 */ 10186 ASSERT(mp != NULL); 10187 if (tcp->tcp_detached || !pullupmsg(mp, -1)) { 10188 freemsg(mp); 10189 return; 10190 } 10191 /* Update pointers into message */ 10192 iphdr = rptr = mp->b_rptr; 10193 tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 10194 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { 10195 /* 10196 * Since we can't handle any data with this urgent 10197 * pointer that is out of sequence, we expunge 10198 * the data. This allows us to still register 10199 * the urgent mark and generate the M_PCSIG, 10200 * which we can do. 
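 * (The two statements just below do the expunging: b_wptr is pulled
 * back to the end of the TCP header and seg_len is zeroed, so only the
 * header survives for urgent-pointer and ACK processing.)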
10201 */ 10202 mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); 10203 seg_len = 0; 10204 } 10205 } 10206 10207 switch (tcp->tcp_state) { 10208 case TCPS_SYN_SENT: 10209 if (connp->conn_final_sqp == NULL && 10210 tcp_outbound_squeue_switch && sqp != NULL) { 10211 ASSERT(connp->conn_initial_sqp == connp->conn_sqp); 10212 connp->conn_final_sqp = sqp; 10213 if (connp->conn_final_sqp != connp->conn_sqp) { 10214 DTRACE_PROBE1(conn__final__sqp__switch, 10215 conn_t *, connp); 10216 CONN_INC_REF(connp); 10217 SQUEUE_SWITCH(connp, connp->conn_final_sqp); 10218 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, 10219 tcp_input_data, connp, ira, ip_squeue_flag, 10220 SQTAG_CONNECT_FINISH); 10221 return; 10222 } 10223 DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp); 10224 } 10225 if (flags & TH_ACK) { 10226 /* 10227 * Note that our stack cannot send data before a 10228 * connection is established, therefore the 10229 * following check is valid. Otherwise, it has 10230 * to be changed. 10231 */ 10232 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 10233 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 10234 freemsg(mp); 10235 if (flags & TH_RST) 10236 return; 10237 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 10238 tcp, seg_ack, 0, TH_RST); 10239 return; 10240 } 10241 ASSERT(tcp->tcp_suna + 1 == seg_ack); 10242 } 10243 if (flags & TH_RST) { 10244 freemsg(mp); 10245 if (flags & TH_ACK) 10246 (void) tcp_clean_death(tcp, 10247 ECONNREFUSED, 13); 10248 return; 10249 } 10250 if (!(flags & TH_SYN)) { 10251 freemsg(mp); 10252 return; 10253 } 10254 10255 /* Process all TCP options. */ 10256 tcp_process_options(tcp, tcpha); 10257 /* 10258 * The following changes our rwnd to be a multiple of the 10259 * MIN(peer MSS, our MSS) for performance reason. 10260 */ 10261 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf, 10262 tcp->tcp_mss)); 10263 10264 /* Is the other end ECN capable? */ 10265 if (tcp->tcp_ecn_ok) { 10266 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 10267 tcp->tcp_ecn_ok = B_FALSE; 10268 } 10269 } 10270 /* 10271 * Clear ECN flags because it may interfere with later 10272 * processing. 10273 */ 10274 flags &= ~(TH_ECE|TH_CWR); 10275 10276 tcp->tcp_irs = seg_seq; 10277 tcp->tcp_rack = seg_seq; 10278 tcp->tcp_rnxt = seg_seq + 1; 10279 tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); 10280 if (!TCP_IS_DETACHED(tcp)) { 10281 /* Allocate room for SACK options if needed. */ 10282 connp->conn_wroff = connp->conn_ht_iphc_len; 10283 if (tcp->tcp_snd_sack_ok) 10284 connp->conn_wroff += TCPOPT_MAX_SACK_LEN; 10285 if (!tcp->tcp_loopback) 10286 connp->conn_wroff += tcps->tcps_wroff_xtra; 10287 10288 (void) proto_set_tx_wroff(connp->conn_rq, connp, 10289 connp->conn_wroff); 10290 } 10291 if (flags & TH_ACK) { 10292 /* 10293 * If we can't get the confirmation upstream, pretend 10294 * we didn't even see this one. 10295 * 10296 * XXX: how can we pretend we didn't see it if we 10297 * have updated rnxt et. al. 10298 * 10299 * For loopback we defer sending up the T_CONN_CON 10300 * until after some checks below. 10301 */ 10302 mp1 = NULL; 10303 /* 10304 * tcp_sendmsg() checks tcp_state without entering 10305 * the squeue so tcp_state should be updated before 10306 * sending up connection confirmation 10307 */ 10308 tcp->tcp_state = TCPS_ESTABLISHED; 10309 if (!tcp_conn_con(tcp, iphdr, mp, 10310 tcp->tcp_loopback ? 
&mp1 : NULL, ira)) { 10311 tcp->tcp_state = TCPS_SYN_SENT; 10312 freemsg(mp); 10313 return; 10314 } 10315 /* SYN was acked - making progress */ 10316 tcp->tcp_ip_forward_progress = B_TRUE; 10317 10318 /* One for the SYN */ 10319 tcp->tcp_suna = tcp->tcp_iss + 1; 10320 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 10321 10322 /* 10323 * If SYN was retransmitted, need to reset all 10324 * retransmission info. This is because this 10325 * segment will be treated as a dup ACK. 10326 */ 10327 if (tcp->tcp_rexmit) { 10328 tcp->tcp_rexmit = B_FALSE; 10329 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 10330 tcp->tcp_rexmit_max = tcp->tcp_snxt; 10331 tcp->tcp_snd_burst = tcp->tcp_localnet ? 10332 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 10333 tcp->tcp_ms_we_have_waited = 0; 10334 10335 /* 10336 * Set tcp_cwnd back to 1 MSS, per 10337 * recommendation from 10338 * draft-floyd-incr-init-win-01.txt, 10339 * Increasing TCP's Initial Window. 10340 */ 10341 tcp->tcp_cwnd = tcp->tcp_mss; 10342 } 10343 10344 tcp->tcp_swl1 = seg_seq; 10345 tcp->tcp_swl2 = seg_ack; 10346 10347 new_swnd = ntohs(tcpha->tha_win); 10348 tcp->tcp_swnd = new_swnd; 10349 if (new_swnd > tcp->tcp_max_swnd) 10350 tcp->tcp_max_swnd = new_swnd; 10351 10352 /* 10353 * Always send the three-way handshake ack immediately 10354 * in order to make the connection complete as soon as 10355 * possible on the accepting host. 10356 */ 10357 flags |= TH_ACK_NEEDED; 10358 10359 /* 10360 * Special case for loopback. At this point we have 10361 * received SYN-ACK from the remote endpoint. In 10362 * order to ensure that both endpoints reach the 10363 * fused state prior to any data exchange, the final 10364 * ACK needs to be sent before we indicate T_CONN_CON 10365 * to the module upstream. 10366 */ 10367 if (tcp->tcp_loopback) { 10368 mblk_t *ack_mp; 10369 10370 ASSERT(!tcp->tcp_unfusable); 10371 ASSERT(mp1 != NULL); 10372 /* 10373 * For loopback, we always get a pure SYN-ACK 10374 * and only need to send back the final ACK 10375 * with no data (this is because the other 10376 * tcp is ours and we don't do T/TCP). This 10377 * final ACK triggers the passive side to 10378 * perform fusion in ESTABLISHED state. 10379 */ 10380 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) { 10381 if (tcp->tcp_ack_tid != 0) { 10382 (void) TCP_TIMER_CANCEL(tcp, 10383 tcp->tcp_ack_tid); 10384 tcp->tcp_ack_tid = 0; 10385 } 10386 tcp_send_data(tcp, ack_mp); 10387 BUMP_LOCAL(tcp->tcp_obsegs); 10388 BUMP_MIB(&tcps->tcps_mib, tcpOutAck); 10389 10390 if (!IPCL_IS_NONSTR(connp)) { 10391 /* Send up T_CONN_CON */ 10392 if (ira->ira_cred != NULL) { 10393 mblk_setcred(mp1, 10394 ira->ira_cred, 10395 ira->ira_cpid); 10396 } 10397 putnext(connp->conn_rq, mp1); 10398 } else { 10399 (*connp->conn_upcalls-> 10400 su_connected) 10401 (connp->conn_upper_handle, 10402 tcp->tcp_connid, 10403 ira->ira_cred, 10404 ira->ira_cpid); 10405 freemsg(mp1); 10406 } 10407 10408 freemsg(mp); 10409 return; 10410 } 10411 /* 10412 * Forget fusion; we need to handle more 10413 * complex cases below. Send the deferred 10414 * T_CONN_CON message upstream and proceed 10415 * as usual. Mark this tcp as not capable 10416 * of fusion. 
10417 */ 10418 TCP_STAT(tcps, tcp_fusion_unfusable); 10419 tcp->tcp_unfusable = B_TRUE; 10420 if (!IPCL_IS_NONSTR(connp)) { 10421 if (ira->ira_cred != NULL) { 10422 mblk_setcred(mp1, ira->ira_cred, 10423 ira->ira_cpid); 10424 } 10425 putnext(connp->conn_rq, mp1); 10426 } else { 10427 (*connp->conn_upcalls->su_connected) 10428 (connp->conn_upper_handle, 10429 tcp->tcp_connid, ira->ira_cred, 10430 ira->ira_cpid); 10431 freemsg(mp1); 10432 } 10433 } 10434 10435 /* 10436 * Check to see if there is data to be sent. If 10437 * yes, set the transmit flag. Then check to see 10438 * if received data processing needs to be done. 10439 * If not, go straight to xmit_check. This short 10440 * cut is OK as we don't support T/TCP. 10441 */ 10442 if (tcp->tcp_unsent) 10443 flags |= TH_XMIT_NEEDED; 10444 10445 if (seg_len == 0 && !(flags & TH_URG)) { 10446 freemsg(mp); 10447 goto xmit_check; 10448 } 10449 10450 flags &= ~TH_SYN; 10451 seg_seq++; 10452 break; 10453 } 10454 tcp->tcp_state = TCPS_SYN_RCVD; 10455 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 10456 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 10457 if (mp1 != NULL) { 10458 tcp_send_data(tcp, mp1); 10459 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 10460 } 10461 freemsg(mp); 10462 return; 10463 case TCPS_SYN_RCVD: 10464 if (flags & TH_ACK) { 10465 /* 10466 * In this state, a SYN|ACK packet is either bogus 10467 * because the other side must be ACKing our SYN which 10468 * indicates it has seen the ACK for their SYN and 10469 * shouldn't retransmit it or we're crossing SYNs 10470 * on active open. 10471 */ 10472 if ((flags & TH_SYN) && !tcp->tcp_active_open) { 10473 freemsg(mp); 10474 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn", 10475 tcp, seg_ack, 0, TH_RST); 10476 return; 10477 } 10478 /* 10479 * NOTE: RFC 793 pg. 72 says this should be 10480 * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt 10481 * but that would mean we have an ack that ignored 10482 * our SYN. 10483 */ 10484 if (SEQ_LEQ(seg_ack, tcp->tcp_suna) || 10485 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 10486 freemsg(mp); 10487 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 10488 tcp, seg_ack, 0, TH_RST); 10489 return; 10490 } 10491 /* 10492 * No sane TCP stack will send such a small window 10493 * without receiving any data. Just drop this invalid 10494 * ACK. We also shorten the abort timeout in case 10495 * this is an attack. 10496 */ 10497 if ((ntohs(tcpha->tha_win) << tcp->tcp_snd_ws) < 10498 (tcp->tcp_mss >> tcp_init_wnd_shft)) { 10499 freemsg(mp); 10500 TCP_STAT(tcps, tcp_zwin_ack_syn); 10501 tcp->tcp_second_ctimer_threshold = 10502 tcp_early_abort * SECONDS; 10503 return; 10504 } 10505 } 10506 break; 10507 case TCPS_LISTEN: 10508 /* 10509 * Only a TLI listener can come through this path when a 10510 * acceptor is going back to be a listener and a packet 10511 * for the acceptor hits the classifier. For a socket 10512 * listener, this can never happen because a listener 10513 * can never accept connection on itself and hence a 10514 * socket acceptor can not go back to being a listener. 10515 */ 10516 ASSERT(!TCP_IS_SOCKET(tcp)); 10517 /*FALLTHRU*/ 10518 case TCPS_CLOSED: 10519 case TCPS_BOUND: { 10520 conn_t *new_connp; 10521 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 10522 10523 /* 10524 * Don't accept any input on a closed tcp as this TCP logically 10525 * does not exist on the system. Don't proceed further with 10526 * this TCP. For instance, this packet could trigger another 10527 * close of this tcp which would be disastrous for tcp_refcnt. 
10528 * tcp_close_detached / tcp_clean_death / tcp_closei_local must 10529 * be called at most once on a TCP. In this case we need to 10530 * refeed the packet into the classifier and figure out where 10531 * the packet should go. 10532 */ 10533 new_connp = ipcl_classify(mp, ira, ipst); 10534 if (new_connp != NULL) { 10535 /* Drops ref on new_connp */ 10536 tcp_reinput(new_connp, mp, ira, ipst); 10537 return; 10538 } 10539 /* We failed to classify. For now just drop the packet */ 10540 freemsg(mp); 10541 return; 10542 } 10543 case TCPS_IDLE: 10544 /* 10545 * Handle the case where the tcp_clean_death() has happened 10546 * on a connection (application hasn't closed yet) but a packet 10547 * was already queued on squeue before tcp_clean_death() 10548 * was processed. Calling tcp_clean_death() twice on same 10549 * connection can result in weird behaviour. 10550 */ 10551 freemsg(mp); 10552 return; 10553 default: 10554 break; 10555 } 10556 10557 /* 10558 * Already on the correct queue/perimeter. 10559 * If this is a detached connection and not an eager 10560 * connection hanging off a listener then new data 10561 * (past the FIN) will cause a reset. 10562 * We do a special check here where it 10563 * is out of the main line, rather than check 10564 * if we are detached every time we see new 10565 * data down below. 10566 */ 10567 if (TCP_IS_DETACHED_NONEAGER(tcp) && 10568 (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) { 10569 BUMP_MIB(&tcps->tcps_mib, tcpInClosed); 10570 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); 10571 10572 freemsg(mp); 10573 /* 10574 * This could be an SSL closure alert. We're detached so just 10575 * acknowledge it this last time. 10576 */ 10577 if (tcp->tcp_kssl_ctx != NULL) { 10578 kssl_release_ctx(tcp->tcp_kssl_ctx); 10579 tcp->tcp_kssl_ctx = NULL; 10580 10581 tcp->tcp_rnxt += seg_len; 10582 tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); 10583 flags |= TH_ACK_NEEDED; 10584 goto ack_check; 10585 } 10586 10587 tcp_xmit_ctl("new data when detached", tcp, 10588 tcp->tcp_snxt, 0, TH_RST); 10589 (void) tcp_clean_death(tcp, EPROTO, 12); 10590 return; 10591 } 10592 10593 mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); 10594 urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION; 10595 new_swnd = ntohs(tcpha->tha_win) << 10596 ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); 10597 10598 if (tcp->tcp_snd_ts_ok) { 10599 if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { 10600 /* 10601 * This segment is not acceptable. 10602 * Drop it and send back an ACK. 10603 */ 10604 freemsg(mp); 10605 flags |= TH_ACK_NEEDED; 10606 goto ack_check; 10607 } 10608 } else if (tcp->tcp_snd_sack_ok) { 10609 ASSERT(tcp->tcp_sack_info != NULL); 10610 tcpopt.tcp = tcp; 10611 /* 10612 * SACK info in already updated in tcp_parse_options. Ignore 10613 * all other TCP options... 10614 */ 10615 (void) tcp_parse_options(tcpha, &tcpopt); 10616 } 10617 try_again:; 10618 mss = tcp->tcp_mss; 10619 gap = seg_seq - tcp->tcp_rnxt; 10620 rgap = tcp->tcp_rwnd - (gap + seg_len); 10621 /* 10622 * gap is the amount of sequence space between what we expect to see 10623 * and what we got for seg_seq. A positive value for gap means 10624 * something got lost. A negative value means we got some old stuff. 10625 */ 10626 if (gap < 0) { 10627 /* Old stuff present. Is the SYN in there? */ 10628 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && 10629 (seg_len != 0)) { 10630 flags &= ~TH_SYN; 10631 seg_seq++; 10632 urp--; 10633 /* Recompute the gaps after noting the SYN. 
*/ 10634 goto try_again; 10635 } 10636 BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); 10637 UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, 10638 (seg_len > -gap ? -gap : seg_len)); 10639 /* Remove the old stuff from seg_len. */ 10640 seg_len += gap; 10641 /* 10642 * Anything left? 10643 * Make sure to check for unack'd FIN when rest of data 10644 * has been previously ack'd. 10645 */ 10646 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 10647 /* 10648 * Resets are only valid if they lie within our offered 10649 * window. If the RST bit is set, we just ignore this 10650 * segment. 10651 */ 10652 if (flags & TH_RST) { 10653 freemsg(mp); 10654 return; 10655 } 10656 10657 /* 10658 * The arriving of dup data packets indicate that we 10659 * may have postponed an ack for too long, or the other 10660 * side's RTT estimate is out of shape. Start acking 10661 * more often. 10662 */ 10663 if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) && 10664 tcp->tcp_rack_cnt >= 1 && 10665 tcp->tcp_rack_abs_max > 2) { 10666 tcp->tcp_rack_abs_max--; 10667 } 10668 tcp->tcp_rack_cur_max = 1; 10669 10670 /* 10671 * This segment is "unacceptable". None of its 10672 * sequence space lies within our advertized window. 10673 * 10674 * Adjust seg_len to the original value for tracing. 10675 */ 10676 seg_len -= gap; 10677 if (connp->conn_debug) { 10678 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 10679 "tcp_rput: unacceptable, gap %d, rgap %d, " 10680 "flags 0x%x, seg_seq %u, seg_ack %u, " 10681 "seg_len %d, rnxt %u, snxt %u, %s", 10682 gap, rgap, flags, seg_seq, seg_ack, 10683 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 10684 tcp_display(tcp, NULL, 10685 DISP_ADDR_AND_PORT)); 10686 } 10687 10688 /* 10689 * Arrange to send an ACK in response to the 10690 * unacceptable segment per RFC 793 page 69. There 10691 * is only one small difference between ours and the 10692 * acceptability test in the RFC - we accept ACK-only 10693 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 10694 * will be generated. 10695 * 10696 * Note that we have to ACK an ACK-only packet at least 10697 * for stacks that send 0-length keep-alives with 10698 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 10699 * section 4.2.3.6. As long as we don't ever generate 10700 * an unacceptable packet in response to an incoming 10701 * packet that is unacceptable, it should not cause 10702 * "ACK wars". 10703 */ 10704 flags |= TH_ACK_NEEDED; 10705 10706 /* 10707 * Continue processing this segment in order to use the 10708 * ACK information it contains, but skip all other 10709 * sequence-number processing. Processing the ACK 10710 * information is necessary in order to 10711 * re-synchronize connections that may have lost 10712 * synchronization. 10713 * 10714 * We clear seg_len and flag fields related to 10715 * sequence number processing as they are not 10716 * to be trusted for an unacceptable segment. 10717 */ 10718 seg_len = 0; 10719 flags &= ~(TH_SYN | TH_FIN | TH_URG); 10720 goto process_ack; 10721 } 10722 10723 /* Fix seg_seq, and chew the gap off the front. 
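 * For example (hypothetical sizes): if gap is -100 and the segment's
 * first two data mblks hold 40 and 160 bytes, the loop below frees the
 * 40-byte mblk (gap becomes -60) and then advances b_rptr in the
 * 160-byte mblk so only its last 100 bytes remain; the 100 bytes we
 * had already received are dropped and delivery resumes at tcp_rnxt.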
*/ 10724 seg_seq = tcp->tcp_rnxt; 10725 urp += gap; 10726 do { 10727 mblk_t *mp2; 10728 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 10729 (uintptr_t)UINT_MAX); 10730 gap += (uint_t)(mp->b_wptr - mp->b_rptr); 10731 if (gap > 0) { 10732 mp->b_rptr = mp->b_wptr - gap; 10733 break; 10734 } 10735 mp2 = mp; 10736 mp = mp->b_cont; 10737 freeb(mp2); 10738 } while (gap < 0); 10739 /* 10740 * If the urgent data has already been acknowledged, we 10741 * should ignore TH_URG below 10742 */ 10743 if (urp < 0) 10744 flags &= ~TH_URG; 10745 } 10746 /* 10747 * rgap is the amount of stuff received out of window. A negative 10748 * value is the amount out of window. 10749 */ 10750 if (rgap < 0) { 10751 mblk_t *mp2; 10752 10753 if (tcp->tcp_rwnd == 0) { 10754 BUMP_MIB(&tcps->tcps_mib, tcpInWinProbe); 10755 } else { 10756 BUMP_MIB(&tcps->tcps_mib, tcpInDataPastWinSegs); 10757 UPDATE_MIB(&tcps->tcps_mib, 10758 tcpInDataPastWinBytes, -rgap); 10759 } 10760 10761 /* 10762 * seg_len does not include the FIN, so if more than 10763 * just the FIN is out of window, we act like we don't 10764 * see it. (If just the FIN is out of window, rgap 10765 * will be zero and we will go ahead and acknowledge 10766 * the FIN.) 10767 */ 10768 flags &= ~TH_FIN; 10769 10770 /* Fix seg_len and make sure there is something left. */ 10771 seg_len += rgap; 10772 if (seg_len <= 0) { 10773 /* 10774 * Resets are only valid if they lie within our offered 10775 * window. If the RST bit is set, we just ignore this 10776 * segment. 10777 */ 10778 if (flags & TH_RST) { 10779 freemsg(mp); 10780 return; 10781 } 10782 10783 /* Per RFC 793, we need to send back an ACK. */ 10784 flags |= TH_ACK_NEEDED; 10785 10786 /* 10787 * Send SIGURG as soon as possible i.e. even 10788 * if the TH_URG was delivered in a window probe 10789 * packet (which will be unacceptable). 10790 * 10791 * We generate a signal if none has been generated 10792 * for this connection or if this is a new urgent 10793 * byte. Also send a zero-length "unmarked" message 10794 * to inform SIOCATMARK that this is not the mark. 10795 * 10796 * tcp_urp_last_valid is cleared when the T_exdata_ind 10797 * is sent up. This plus the check for old data 10798 * (gap >= 0) handles the wraparound of the sequence 10799 * number space without having to always track the 10800 * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks 10801 * this max in its rcv_up variable). 10802 * 10803 * This prevents duplicate SIGURGS due to a "late" 10804 * zero-window probe when the T_EXDATA_IND has already 10805 * been sent up. 10806 */ 10807 if ((flags & TH_URG) && 10808 (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, 10809 tcp->tcp_urp_last))) { 10810 if (IPCL_IS_NONSTR(connp)) { 10811 if (!TCP_IS_DETACHED(tcp)) { 10812 (*connp->conn_upcalls-> 10813 su_signal_oob) 10814 (connp->conn_upper_handle, 10815 urp); 10816 } 10817 } else { 10818 mp1 = allocb(0, BPRI_MED); 10819 if (mp1 == NULL) { 10820 freemsg(mp); 10821 return; 10822 } 10823 if (!TCP_IS_DETACHED(tcp) && 10824 !putnextctl1(connp->conn_rq, 10825 M_PCSIG, SIGURG)) { 10826 /* Try again on the rexmit. */ 10827 freemsg(mp1); 10828 freemsg(mp); 10829 return; 10830 } 10831 /* 10832 * If the next byte would be the mark 10833 * then mark with MARKNEXT else mark 10834 * with NOTMARKNEXT. 
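* (Clarifying note, not part of the original source: gap == 0 and
* urp == 0 means the urgent byte is exactly the next byte the
* application will read, so the zero-length message is flagged
* MSGMARKNEXT; any other combination gets MSGNOTMARKNEXT so that
* SIOCATMARK can report that the read pointer is not yet at the mark.)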
10835 */ 10836 if (gap == 0 && urp == 0) 10837 mp1->b_flag |= MSGMARKNEXT; 10838 else 10839 mp1->b_flag |= MSGNOTMARKNEXT; 10840 freemsg(tcp->tcp_urp_mark_mp); 10841 tcp->tcp_urp_mark_mp = mp1; 10842 flags |= TH_SEND_URP_MARK; 10843 } 10844 tcp->tcp_urp_last_valid = B_TRUE; 10845 tcp->tcp_urp_last = urp + seg_seq; 10846 } 10847 /* 10848 * If this is a zero window probe, continue to 10849 * process the ACK part. But we need to set seg_len 10850 * to 0 to avoid data processing. Otherwise just 10851 * drop the segment and send back an ACK. 10852 */ 10853 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 10854 flags &= ~(TH_SYN | TH_URG); 10855 seg_len = 0; 10856 goto process_ack; 10857 } else { 10858 freemsg(mp); 10859 goto ack_check; 10860 } 10861 } 10862 /* Pitch out of window stuff off the end. */ 10863 rgap = seg_len; 10864 mp2 = mp; 10865 do { 10866 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 10867 (uintptr_t)INT_MAX); 10868 rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 10869 if (rgap < 0) { 10870 mp2->b_wptr += rgap; 10871 if ((mp1 = mp2->b_cont) != NULL) { 10872 mp2->b_cont = NULL; 10873 freemsg(mp1); 10874 } 10875 break; 10876 } 10877 } while ((mp2 = mp2->b_cont) != NULL); 10878 } 10879 ok:; 10880 /* 10881 * TCP should check ECN info for segments inside the window only. 10882 * Therefore the check should be done here. 10883 */ 10884 if (tcp->tcp_ecn_ok) { 10885 if (flags & TH_CWR) { 10886 tcp->tcp_ecn_echo_on = B_FALSE; 10887 } 10888 /* 10889 * Note that both ECN_CE and CWR can be set in the 10890 * same segment. In this case, we once again turn 10891 * on ECN_ECHO. 10892 */ 10893 if (connp->conn_ipversion == IPV4_VERSION) { 10894 uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; 10895 10896 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 10897 tcp->tcp_ecn_echo_on = B_TRUE; 10898 } 10899 } else { 10900 uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf; 10901 10902 if ((vcf & htonl(IPH_ECN_CE << 20)) == 10903 htonl(IPH_ECN_CE << 20)) { 10904 tcp->tcp_ecn_echo_on = B_TRUE; 10905 } 10906 } 10907 } 10908 10909 /* 10910 * Check whether we can update tcp_ts_recent. This test is 10911 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 10912 * Extensions for High Performance: An Update", Internet Draft. 10913 */ 10914 if (tcp->tcp_snd_ts_ok && 10915 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 10916 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 10917 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 10918 tcp->tcp_last_rcv_lbolt = LBOLT_FASTPATH64; 10919 } 10920 10921 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 10922 /* 10923 * FIN in an out of order segment. We record this in 10924 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. 10925 * Clear the FIN so that any check on FIN flag will fail. 10926 * Remember that FIN also counts in the sequence number 10927 * space. So we need to ack out of order FIN only segments. 10928 */ 10929 if (flags & TH_FIN) { 10930 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 10931 tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 10932 flags &= ~TH_FIN; 10933 flags |= TH_ACK_NEEDED; 10934 } 10935 if (seg_len > 0) { 10936 /* Fill in the SACK blk list. */ 10937 if (tcp->tcp_snd_sack_ok) { 10938 ASSERT(tcp->tcp_sack_info != NULL); 10939 tcp_sack_insert(tcp->tcp_sack_list, 10940 seg_seq, seg_seq + seg_len, 10941 &(tcp->tcp_num_sack_blk)); 10942 } 10943 10944 /* 10945 * Attempt reassembly and see if we have something 10946 * ready to go. 
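* (Sketch of the contract assumed here, not part of the original
* source: tcp_reass() queues the out-of-order segment on
* tcp_reass_head and only returns a chain once the hole at tcp_rnxt
* has been filled; when it returns NULL the data stays queued and only
* the ACK portion of this segment is processed further below.)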
10947 */ 10948 mp = tcp_reass(tcp, mp, seg_seq); 10949 /* Always ack out of order packets */ 10950 flags |= TH_ACK_NEEDED | TH_PUSH; 10951 if (mp) { 10952 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 10953 (uintptr_t)INT_MAX); 10954 seg_len = mp->b_cont ? msgdsize(mp) : 10955 (int)(mp->b_wptr - mp->b_rptr); 10956 seg_seq = tcp->tcp_rnxt; 10957 /* 10958 * A gap is filled and the seq num and len 10959 * of the gap match that of a previously 10960 * received FIN, put the FIN flag back in. 10961 */ 10962 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 10963 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 10964 flags |= TH_FIN; 10965 tcp->tcp_valid_bits &= 10966 ~TCP_OFO_FIN_VALID; 10967 } 10968 if (tcp->tcp_reass_tid != 0) { 10969 (void) TCP_TIMER_CANCEL(tcp, 10970 tcp->tcp_reass_tid); 10971 /* 10972 * Restart the timer if there is still 10973 * data in the reassembly queue. 10974 */ 10975 if (tcp->tcp_reass_head != NULL) { 10976 tcp->tcp_reass_tid = TCP_TIMER( 10977 tcp, tcp_reass_timer, 10978 MSEC_TO_TICK( 10979 tcps->tcps_reass_timeout)); 10980 } else { 10981 tcp->tcp_reass_tid = 0; 10982 } 10983 } 10984 } else { 10985 /* 10986 * Keep going even with NULL mp. 10987 * There may be a useful ACK or something else 10988 * we don't want to miss. 10989 * 10990 * But TCP should not perform fast retransmit 10991 * because of the ack number. TCP uses 10992 * seg_len == 0 to determine if it is a pure 10993 * ACK. And this is not a pure ACK. 10994 */ 10995 seg_len = 0; 10996 ofo_seg = B_TRUE; 10997 10998 if (tcps->tcps_reass_timeout != 0 && 10999 tcp->tcp_reass_tid == 0) { 11000 tcp->tcp_reass_tid = TCP_TIMER(tcp, 11001 tcp_reass_timer, MSEC_TO_TICK( 11002 tcps->tcps_reass_timeout)); 11003 } 11004 } 11005 } 11006 } else if (seg_len > 0) { 11007 BUMP_MIB(&tcps->tcps_mib, tcpInDataInorderSegs); 11008 UPDATE_MIB(&tcps->tcps_mib, tcpInDataInorderBytes, seg_len); 11009 /* 11010 * If an out of order FIN was received before, and the seq 11011 * num and len of the new segment match that of the FIN, 11012 * put the FIN flag back in. 11013 */ 11014 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 11015 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 11016 flags |= TH_FIN; 11017 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 11018 } 11019 } 11020 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 11021 if (flags & TH_RST) { 11022 freemsg(mp); 11023 switch (tcp->tcp_state) { 11024 case TCPS_SYN_RCVD: 11025 (void) tcp_clean_death(tcp, ECONNREFUSED, 14); 11026 break; 11027 case TCPS_ESTABLISHED: 11028 case TCPS_FIN_WAIT_1: 11029 case TCPS_FIN_WAIT_2: 11030 case TCPS_CLOSE_WAIT: 11031 (void) tcp_clean_death(tcp, ECONNRESET, 15); 11032 break; 11033 case TCPS_CLOSING: 11034 case TCPS_LAST_ACK: 11035 (void) tcp_clean_death(tcp, 0, 16); 11036 break; 11037 default: 11038 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 11039 (void) tcp_clean_death(tcp, ENXIO, 17); 11040 break; 11041 } 11042 return; 11043 } 11044 if (flags & TH_SYN) { 11045 /* 11046 * See RFC 793, Page 71 11047 * 11048 * The seq number must be in the window as it should 11049 * be "fixed" above. If it is outside window, it should 11050 * be already rejected. Note that we allow seg_seq to be 11051 * rnxt + rwnd because we want to accept 0 window probe. 11052 */ 11053 ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 11054 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 11055 freemsg(mp); 11056 /* 11057 * If the ACK flag is not set, just use our snxt as the 11058 * seq number of the RST segment. 
11059 */ 11060 if (!(flags & TH_ACK)) { 11061 seg_ack = tcp->tcp_snxt; 11062 } 11063 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 11064 TH_RST|TH_ACK); 11065 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 11066 (void) tcp_clean_death(tcp, ECONNRESET, 18); 11067 return; 11068 } 11069 /* 11070 * urp could be -1 when the urp field in the packet is 0 11071 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent 11072 * byte was at seg_seq - 1, in which case we ignore the urgent flag. 11073 */ 11074 if (flags & TH_URG && urp >= 0) { 11075 if (!tcp->tcp_urp_last_valid || 11076 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { 11077 /* 11078 * Non-STREAMS sockets handle the urgent data a litte 11079 * differently from STREAMS based sockets. There is no 11080 * need to mark any mblks with the MSG{NOT,}MARKNEXT 11081 * flags to keep SIOCATMARK happy. Instead a 11082 * su_signal_oob upcall is made to update the mark. 11083 * Neither is a T_EXDATA_IND mblk needed to be 11084 * prepended to the urgent data. The urgent data is 11085 * delivered using the su_recv upcall, where we set 11086 * the MSG_OOB flag to indicate that it is urg data. 11087 * 11088 * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED 11089 * are used by non-STREAMS sockets. 11090 */ 11091 if (IPCL_IS_NONSTR(connp)) { 11092 if (!TCP_IS_DETACHED(tcp)) { 11093 (*connp->conn_upcalls->su_signal_oob) 11094 (connp->conn_upper_handle, urp); 11095 } 11096 } else { 11097 /* 11098 * If we haven't generated the signal yet for 11099 * this urgent pointer value, do it now. Also, 11100 * send up a zero-length M_DATA indicating 11101 * whether or not this is the mark. The latter 11102 * is not needed when a T_EXDATA_IND is sent up. 11103 * However, if there are allocation failures 11104 * this code relies on the sender retransmitting 11105 * and the socket code for determining the mark 11106 * should not block waiting for the peer to 11107 * transmit. Thus, for simplicity we always 11108 * send up the mark indication. 11109 */ 11110 mp1 = allocb(0, BPRI_MED); 11111 if (mp1 == NULL) { 11112 freemsg(mp); 11113 return; 11114 } 11115 if (!TCP_IS_DETACHED(tcp) && 11116 !putnextctl1(connp->conn_rq, M_PCSIG, 11117 SIGURG)) { 11118 /* Try again on the rexmit. */ 11119 freemsg(mp1); 11120 freemsg(mp); 11121 return; 11122 } 11123 /* 11124 * Mark with NOTMARKNEXT for now. 11125 * The code below will change this to MARKNEXT 11126 * if we are at the mark. 11127 * 11128 * If there are allocation failures (e.g. in 11129 * dupmsg below) the next time tcp_input_data 11130 * sees the urgent segment it will send up the 11131 * MSGMARKNEXT message. 11132 */ 11133 mp1->b_flag |= MSGNOTMARKNEXT; 11134 freemsg(tcp->tcp_urp_mark_mp); 11135 tcp->tcp_urp_mark_mp = mp1; 11136 flags |= TH_SEND_URP_MARK; 11137 #ifdef DEBUG 11138 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 11139 "tcp_rput: sent M_PCSIG 2 seq %x urp %x " 11140 "last %x, %s", 11141 seg_seq, urp, tcp->tcp_urp_last, 11142 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 11143 #endif /* DEBUG */ 11144 } 11145 tcp->tcp_urp_last_valid = B_TRUE; 11146 tcp->tcp_urp_last = urp + seg_seq; 11147 } else if (tcp->tcp_urp_mark_mp != NULL) { 11148 /* 11149 * An allocation failure prevented the previous 11150 * tcp_input_data from sending up the allocated 11151 * MSG*MARKNEXT message - send it up this time 11152 * around. 11153 */ 11154 flags |= TH_SEND_URP_MARK; 11155 } 11156 11157 /* 11158 * If the urgent byte is in this segment, make sure that it is 11159 * all by itself. 
This makes it much easier to deal with the 11160 * possibility of an allocation failure on the T_exdata_ind. 11161 * Note that seg_len is the number of bytes in the segment, and 11162 * urp is the offset into the segment of the urgent byte. 11163 * urp < seg_len means that the urgent byte is in this segment. 11164 */ 11165 if (urp < seg_len) { 11166 if (seg_len != 1) { 11167 uint32_t tmp_rnxt; 11168 /* 11169 * Break it up and feed it back in. 11170 * Re-attach the IP header. 11171 */ 11172 mp->b_rptr = iphdr; 11173 if (urp > 0) { 11174 /* 11175 * There is stuff before the urgent 11176 * byte. 11177 */ 11178 mp1 = dupmsg(mp); 11179 if (!mp1) { 11180 /* 11181 * Trim from urgent byte on. 11182 * The rest will come back. 11183 */ 11184 (void) adjmsg(mp, 11185 urp - seg_len); 11186 tcp_input_data(connp, 11187 mp, NULL, ira); 11188 return; 11189 } 11190 (void) adjmsg(mp1, urp - seg_len); 11191 /* Feed this piece back in. */ 11192 tmp_rnxt = tcp->tcp_rnxt; 11193 tcp_input_data(connp, mp1, NULL, ira); 11194 /* 11195 * If the data passed back in was not 11196 * processed (ie: bad ACK) sending 11197 * the remainder back in will cause a 11198 * loop. In this case, drop the 11199 * packet and let the sender try 11200 * sending a good packet. 11201 */ 11202 if (tmp_rnxt == tcp->tcp_rnxt) { 11203 freemsg(mp); 11204 return; 11205 } 11206 } 11207 if (urp != seg_len - 1) { 11208 uint32_t tmp_rnxt; 11209 /* 11210 * There is stuff after the urgent 11211 * byte. 11212 */ 11213 mp1 = dupmsg(mp); 11214 if (!mp1) { 11215 /* 11216 * Trim everything beyond the 11217 * urgent byte. The rest will 11218 * come back. 11219 */ 11220 (void) adjmsg(mp, 11221 urp + 1 - seg_len); 11222 tcp_input_data(connp, 11223 mp, NULL, ira); 11224 return; 11225 } 11226 (void) adjmsg(mp1, urp + 1 - seg_len); 11227 tmp_rnxt = tcp->tcp_rnxt; 11228 tcp_input_data(connp, mp1, NULL, ira); 11229 /* 11230 * If the data passed back in was not 11231 * processed (ie: bad ACK) sending 11232 * the remainder back in will cause a 11233 * loop. In this case, drop the 11234 * packet and let the sender try 11235 * sending a good packet. 11236 */ 11237 if (tmp_rnxt == tcp->tcp_rnxt) { 11238 freemsg(mp); 11239 return; 11240 } 11241 } 11242 tcp_input_data(connp, mp, NULL, ira); 11243 return; 11244 } 11245 /* 11246 * This segment contains only the urgent byte. We 11247 * have to allocate the T_exdata_ind, if we can. 11248 */ 11249 if (IPCL_IS_NONSTR(connp)) { 11250 int error; 11251 11252 (*connp->conn_upcalls->su_recv) 11253 (connp->conn_upper_handle, mp, seg_len, 11254 MSG_OOB, &error, NULL); 11255 /* 11256 * We should never be in middle of a 11257 * fallback, the squeue guarantees that. 11258 */ 11259 ASSERT(error != EOPNOTSUPP); 11260 mp = NULL; 11261 goto update_ack; 11262 } else if (!tcp->tcp_urp_mp) { 11263 struct T_exdata_ind *tei; 11264 mp1 = allocb(sizeof (struct T_exdata_ind), 11265 BPRI_MED); 11266 if (!mp1) { 11267 /* 11268 * Sigh... It'll be back. 11269 * Generate any MSG*MARK message now. 
11270 */ 11271 freemsg(mp); 11272 seg_len = 0; 11273 if (flags & TH_SEND_URP_MARK) { 11274 11275 11276 ASSERT(tcp->tcp_urp_mark_mp); 11277 tcp->tcp_urp_mark_mp->b_flag &= 11278 ~MSGNOTMARKNEXT; 11279 tcp->tcp_urp_mark_mp->b_flag |= 11280 MSGMARKNEXT; 11281 } 11282 goto ack_check; 11283 } 11284 mp1->b_datap->db_type = M_PROTO; 11285 tei = (struct T_exdata_ind *)mp1->b_rptr; 11286 tei->PRIM_type = T_EXDATA_IND; 11287 tei->MORE_flag = 0; 11288 mp1->b_wptr = (uchar_t *)&tei[1]; 11289 tcp->tcp_urp_mp = mp1; 11290 #ifdef DEBUG 11291 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 11292 "tcp_rput: allocated exdata_ind %s", 11293 tcp_display(tcp, NULL, 11294 DISP_PORT_ONLY)); 11295 #endif /* DEBUG */ 11296 /* 11297 * There is no need to send a separate MSG*MARK 11298 * message since the T_EXDATA_IND will be sent 11299 * now. 11300 */ 11301 flags &= ~TH_SEND_URP_MARK; 11302 freemsg(tcp->tcp_urp_mark_mp); 11303 tcp->tcp_urp_mark_mp = NULL; 11304 } 11305 /* 11306 * Now we are all set. On the next putnext upstream, 11307 * tcp_urp_mp will be non-NULL and will get prepended 11308 * to what has to be this piece containing the urgent 11309 * byte. If for any reason we abort this segment below, 11310 * if it comes back, we will have this ready, or it 11311 * will get blown off in close. 11312 */ 11313 } else if (urp == seg_len) { 11314 /* 11315 * The urgent byte is the next byte after this sequence 11316 * number. If this endpoint is non-STREAMS, then there 11317 * is nothing to do here since the socket has already 11318 * been notified about the urg pointer by the 11319 * su_signal_oob call above. 11320 * 11321 * In case of STREAMS, some more work might be needed. 11322 * If there is data it is marked with MSGMARKNEXT and 11323 * and any tcp_urp_mark_mp is discarded since it is not 11324 * needed. Otherwise, if the code above just allocated 11325 * a zero-length tcp_urp_mark_mp message, that message 11326 * is tagged with MSGMARKNEXT. Sending up these 11327 * MSGMARKNEXT messages makes SIOCATMARK work correctly 11328 * even though the T_EXDATA_IND will not be sent up 11329 * until the urgent byte arrives. 
11330 */ 11331 if (!IPCL_IS_NONSTR(tcp->tcp_connp)) { 11332 if (seg_len != 0) { 11333 flags |= TH_MARKNEXT_NEEDED; 11334 freemsg(tcp->tcp_urp_mark_mp); 11335 tcp->tcp_urp_mark_mp = NULL; 11336 flags &= ~TH_SEND_URP_MARK; 11337 } else if (tcp->tcp_urp_mark_mp != NULL) { 11338 flags |= TH_SEND_URP_MARK; 11339 tcp->tcp_urp_mark_mp->b_flag &= 11340 ~MSGNOTMARKNEXT; 11341 tcp->tcp_urp_mark_mp->b_flag |= 11342 MSGMARKNEXT; 11343 } 11344 } 11345 #ifdef DEBUG 11346 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 11347 "tcp_rput: AT MARK, len %d, flags 0x%x, %s", 11348 seg_len, flags, 11349 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 11350 #endif /* DEBUG */ 11351 } 11352 #ifdef DEBUG 11353 else { 11354 /* Data left until we hit mark */ 11355 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 11356 "tcp_rput: URP %d bytes left, %s", 11357 urp - seg_len, tcp_display(tcp, NULL, 11358 DISP_PORT_ONLY)); 11359 } 11360 #endif /* DEBUG */ 11361 } 11362 11363 process_ack: 11364 if (!(flags & TH_ACK)) { 11365 freemsg(mp); 11366 goto xmit_check; 11367 } 11368 } 11369 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 11370 11371 if (bytes_acked > 0) 11372 tcp->tcp_ip_forward_progress = B_TRUE; 11373 if (tcp->tcp_state == TCPS_SYN_RCVD) { 11374 if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) && 11375 ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) { 11376 /* 3-way handshake complete - pass up the T_CONN_IND */ 11377 tcp_t *listener = tcp->tcp_listener; 11378 mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind; 11379 11380 tcp->tcp_tconnind_started = B_TRUE; 11381 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 11382 /* 11383 * We are here means eager is fine but it can 11384 * get a TH_RST at any point between now and till 11385 * accept completes and disappear. We need to 11386 * ensure that reference to eager is valid after 11387 * we get out of eager's perimeter. So we do 11388 * an extra refhold. 11389 */ 11390 CONN_INC_REF(connp); 11391 11392 /* 11393 * The listener also exists because of the refhold 11394 * done in tcp_input_listener. Its possible that it 11395 * might have closed. We will check that once we 11396 * get inside listeners context. 11397 */ 11398 CONN_INC_REF(listener->tcp_connp); 11399 if (listener->tcp_connp->conn_sqp == 11400 connp->conn_sqp) { 11401 /* 11402 * We optimize by not calling an SQUEUE_ENTER 11403 * on the listener since we know that the 11404 * listener and eager squeues are the same. 11405 * We are able to make this check safely only 11406 * because neither the eager nor the listener 11407 * can change its squeue. Only an active connect 11408 * can change its squeue 11409 */ 11410 tcp_send_conn_ind(listener->tcp_connp, mp, 11411 listener->tcp_connp->conn_sqp); 11412 CONN_DEC_REF(listener->tcp_connp); 11413 } else if (!tcp->tcp_loopback) { 11414 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, 11415 mp, tcp_send_conn_ind, 11416 listener->tcp_connp, NULL, SQ_FILL, 11417 SQTAG_TCP_CONN_IND); 11418 } else { 11419 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, 11420 mp, tcp_send_conn_ind, 11421 listener->tcp_connp, NULL, SQ_PROCESS, 11422 SQTAG_TCP_CONN_IND); 11423 } 11424 } 11425 11426 /* 11427 * We are seeing the final ack in the three way 11428 * hand shake of a active open'ed connection 11429 * so we must send up a T_CONN_CON 11430 * 11431 * tcp_sendmsg() checks tcp_state without entering 11432 * the squeue so tcp_state should be updated before 11433 * sending up connection confirmation. 
11434 */ 11435 tcp->tcp_state = TCPS_ESTABLISHED; 11436 if (tcp->tcp_active_open) { 11437 if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) { 11438 freemsg(mp); 11439 tcp->tcp_state = TCPS_SYN_RCVD; 11440 return; 11441 } 11442 /* 11443 * Don't fuse the loopback endpoints for 11444 * simultaneous active opens. 11445 */ 11446 if (tcp->tcp_loopback) { 11447 TCP_STAT(tcps, tcp_fusion_unfusable); 11448 tcp->tcp_unfusable = B_TRUE; 11449 } 11450 } 11451 11452 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 11453 bytes_acked--; 11454 /* SYN was acked - making progress */ 11455 tcp->tcp_ip_forward_progress = B_TRUE; 11456 11457 /* 11458 * If SYN was retransmitted, need to reset all 11459 * retransmission info as this segment will be 11460 * treated as a dup ACK. 11461 */ 11462 if (tcp->tcp_rexmit) { 11463 tcp->tcp_rexmit = B_FALSE; 11464 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 11465 tcp->tcp_rexmit_max = tcp->tcp_snxt; 11466 tcp->tcp_snd_burst = tcp->tcp_localnet ? 11467 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 11468 tcp->tcp_ms_we_have_waited = 0; 11469 tcp->tcp_cwnd = mss; 11470 } 11471 11472 /* 11473 * We set the send window to zero here. 11474 * This is needed if there is data to be 11475 * processed already on the queue. 11476 * Later (at swnd_update label), the 11477 * "new_swnd > tcp_swnd" condition is satisfied 11478 * the XMIT_NEEDED flag is set in the current 11479 * (SYN_RCVD) state. This ensures tcp_wput_data() is 11480 * called if there is already data on queue in 11481 * this state. 11482 */ 11483 tcp->tcp_swnd = 0; 11484 11485 if (new_swnd > tcp->tcp_max_swnd) 11486 tcp->tcp_max_swnd = new_swnd; 11487 tcp->tcp_swl1 = seg_seq; 11488 tcp->tcp_swl2 = seg_ack; 11489 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 11490 11491 /* Fuse when both sides are in ESTABLISHED state */ 11492 if (tcp->tcp_loopback && do_tcp_fusion) 11493 tcp_fuse(tcp, iphdr, tcpha); 11494 11495 } 11496 /* This code follows 4.4BSD-Lite2 mostly. */ 11497 if (bytes_acked < 0) 11498 goto est; 11499 11500 /* 11501 * If TCP is ECN capable and the congestion experience bit is 11502 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 11503 * done once per window (or more loosely, per RTT). 11504 */ 11505 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 11506 tcp->tcp_cwr = B_FALSE; 11507 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 11508 if (!tcp->tcp_cwr) { 11509 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss; 11510 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 11511 tcp->tcp_cwnd = npkt * mss; 11512 /* 11513 * If the cwnd is 0, use the timer to clock out 11514 * new segments. This is required by the ECN spec. 11515 */ 11516 if (npkt == 0) { 11517 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 11518 /* 11519 * This makes sure that when the ACK comes 11520 * back, we will increase tcp_cwnd by 1 MSS. 11521 */ 11522 tcp->tcp_cwnd_cnt = 0; 11523 } 11524 tcp->tcp_cwr = B_TRUE; 11525 /* 11526 * This marks the end of the current window of in 11527 * flight data. That is why we don't use 11528 * tcp_suna + tcp_swnd. Only data in flight can 11529 * provide ECN info. 11530 */ 11531 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 11532 tcp->tcp_ecn_cwr_sent = B_FALSE; 11533 } 11534 } 11535 11536 mp1 = tcp->tcp_xmit_head; 11537 if (bytes_acked == 0) { 11538 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 11539 int dupack_cnt; 11540 11541 BUMP_MIB(&tcps->tcps_mib, tcpInDupAck); 11542 /* 11543 * Fast retransmit. 
When we have seen exactly three 11544 * identical ACKs while we have unacked data 11545 * outstanding we take it as a hint that our peer 11546 * dropped something. 11547 * 11548 * If TCP is retransmitting, don't do fast retransmit. 11549 */ 11550 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt && 11551 ! tcp->tcp_rexmit) { 11552 /* Do Limited Transmit */ 11553 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 11554 tcps->tcps_dupack_fast_retransmit) { 11555 /* 11556 * RFC 3042 11557 * 11558 * What we need to do is temporarily 11559 * increase tcp_cwnd so that new 11560 * data can be sent if it is allowed 11561 * by the receive window (tcp_rwnd). 11562 * tcp_wput_data() will take care of 11563 * the rest. 11564 * 11565 * If the connection is SACK capable, 11566 * only do limited xmit when there 11567 * is SACK info. 11568 * 11569 * Note how tcp_cwnd is incremented. 11570 * The first dup ACK will increase 11571 * it by 1 MSS. The second dup ACK 11572 * will increase it by 2 MSS. This 11573 * means that only 1 new segment will 11574 * be sent for each dup ACK. 11575 */ 11576 if (tcp->tcp_unsent > 0 && 11577 (!tcp->tcp_snd_sack_ok || 11578 (tcp->tcp_snd_sack_ok && 11579 tcp->tcp_notsack_list != NULL))) { 11580 tcp->tcp_cwnd += mss << 11581 (tcp->tcp_dupack_cnt - 1); 11582 flags |= TH_LIMIT_XMIT; 11583 } 11584 } else if (dupack_cnt == 11585 tcps->tcps_dupack_fast_retransmit) { 11586 11587 /* 11588 * If we have reduced tcp_ssthresh 11589 * because of ECN, do not reduce it again 11590 * unless it is already one window of data 11591 * away. After one window of data, tcp_cwr 11592 * should then be cleared. Note that 11593 * for non ECN capable connection, tcp_cwr 11594 * should always be false. 11595 * 11596 * Adjust cwnd since the duplicate 11597 * ack indicates that a packet was 11598 * dropped (due to congestion.) 11599 */ 11600 if (!tcp->tcp_cwr) { 11601 npkt = ((tcp->tcp_snxt - 11602 tcp->tcp_suna) >> 1) / mss; 11603 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * 11604 mss; 11605 tcp->tcp_cwnd = (npkt + 11606 tcp->tcp_dupack_cnt) * mss; 11607 } 11608 if (tcp->tcp_ecn_ok) { 11609 tcp->tcp_cwr = B_TRUE; 11610 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 11611 tcp->tcp_ecn_cwr_sent = B_FALSE; 11612 } 11613 11614 /* 11615 * We do Hoe's algorithm. Refer to her 11616 * paper "Improving the Start-up Behavior 11617 * of a Congestion Control Scheme for TCP," 11618 * appeared in SIGCOMM'96. 11619 * 11620 * Save highest seq no we have sent so far. 11621 * Be careful about the invisible FIN byte. 11622 */ 11623 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 11624 (tcp->tcp_unsent == 0)) { 11625 tcp->tcp_rexmit_max = tcp->tcp_fss; 11626 } else { 11627 tcp->tcp_rexmit_max = tcp->tcp_snxt; 11628 } 11629 11630 /* 11631 * Do not allow bursty traffic during. 11632 * fast recovery. Refer to Fall and Floyd's 11633 * paper "Simulation-based Comparisons of 11634 * Tahoe, Reno and SACK TCP" (in CCR?) 11635 * This is a best current practise. 11636 */ 11637 tcp->tcp_snd_burst = TCP_CWND_SS; 11638 11639 /* 11640 * For SACK: 11641 * Calculate tcp_pipe, which is the 11642 * estimated number of bytes in 11643 * network. 11644 * 11645 * tcp_fack is the highest sack'ed seq num 11646 * TCP has received. 11647 * 11648 * tcp_pipe is explained in the above quoted 11649 * Fall and Floyd's paper. tcp_fack is 11650 * explained in Mathis and Mahdavi's 11651 * "Forward Acknowledgment: Refining TCP 11652 * Congestion Control" in SIGCOMM '96. 
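* (Illustrative numbers, not part of the original source: if
* tcp_snxt - tcp_suna is 20 * mss and the highest SACKed byte puts
* tcp_fack at tcp_suna + 12 * mss, then tcp_pipe = tcp_snxt - tcp_fack
* works out to 8 * mss, i.e. roughly eight segments are still assumed
* to be in the network.)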
11653 */ 11654 if (tcp->tcp_snd_sack_ok) { 11655 ASSERT(tcp->tcp_sack_info != NULL); 11656 if (tcp->tcp_notsack_list != NULL) { 11657 tcp->tcp_pipe = tcp->tcp_snxt - 11658 tcp->tcp_fack; 11659 tcp->tcp_sack_snxt = seg_ack; 11660 flags |= TH_NEED_SACK_REXMIT; 11661 } else { 11662 /* 11663 * Always initialize tcp_pipe 11664 * even though we don't have 11665 * any SACK info. If later 11666 * we get SACK info and 11667 * tcp_pipe is not initialized, 11668 * funny things will happen. 11669 */ 11670 tcp->tcp_pipe = 11671 tcp->tcp_cwnd_ssthresh; 11672 } 11673 } else { 11674 flags |= TH_REXMIT_NEEDED; 11675 } /* tcp_snd_sack_ok */ 11676 11677 } else { 11678 /* 11679 * Here we perform congestion 11680 * avoidance, but NOT slow start. 11681 * This is known as the Fast 11682 * Recovery Algorithm. 11683 */ 11684 if (tcp->tcp_snd_sack_ok && 11685 tcp->tcp_notsack_list != NULL) { 11686 flags |= TH_NEED_SACK_REXMIT; 11687 tcp->tcp_pipe -= mss; 11688 if (tcp->tcp_pipe < 0) 11689 tcp->tcp_pipe = 0; 11690 } else { 11691 /* 11692 * We know that one more packet has 11693 * left the pipe thus we can update 11694 * cwnd. 11695 */ 11696 cwnd = tcp->tcp_cwnd + mss; 11697 if (cwnd > tcp->tcp_cwnd_max) 11698 cwnd = tcp->tcp_cwnd_max; 11699 tcp->tcp_cwnd = cwnd; 11700 if (tcp->tcp_unsent > 0) 11701 flags |= TH_XMIT_NEEDED; 11702 } 11703 } 11704 } 11705 } else if (tcp->tcp_zero_win_probe) { 11706 /* 11707 * If the window has opened, need to arrange 11708 * to send additional data. 11709 */ 11710 if (new_swnd != 0) { 11711 /* tcp_suna != tcp_snxt */ 11712 /* Packet contains a window update */ 11713 BUMP_MIB(&tcps->tcps_mib, tcpInWinUpdate); 11714 tcp->tcp_zero_win_probe = 0; 11715 tcp->tcp_timer_backoff = 0; 11716 tcp->tcp_ms_we_have_waited = 0; 11717 11718 /* 11719 * Transmit starting with tcp_suna since 11720 * the one byte probe is not ack'ed. 11721 * If TCP has sent more than one identical 11722 * probe, tcp_rexmit will be set. That means 11723 * tcp_ss_rexmit() will send out the one 11724 * byte along with new data. Otherwise, 11725 * fake the retransmission. 11726 */ 11727 flags |= TH_XMIT_NEEDED; 11728 if (!tcp->tcp_rexmit) { 11729 tcp->tcp_rexmit = B_TRUE; 11730 tcp->tcp_dupack_cnt = 0; 11731 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 11732 tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 11733 } 11734 } 11735 } 11736 goto swnd_update; 11737 } 11738 11739 /* 11740 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. 11741 * If the ACK value acks something that we have not yet sent, it might 11742 * be an old duplicate segment. Send an ACK to re-synchronize the 11743 * other side. 11744 * Note: reset in response to unacceptable ACK in SYN_RECEIVE 11745 * state is handled above, so we can always just drop the segment and 11746 * send an ACK here. 11747 * 11748 * In the case where the peer shrinks the window, we see the new window 11749 * update, but all the data sent previously is queued up by the peer. 11750 * To account for this, in tcp_process_shrunk_swnd(), the sequence 11751 * number, which was already sent, and within window, is recorded. 11752 * tcp_snxt is then updated. 11753 * 11754 * If the window has previously shrunk, and an ACK for data not yet 11755 * sent, according to tcp_snxt is recieved, it may still be valid. If 11756 * the ACK is for data within the window at the time the window was 11757 * shrunk, then the ACK is acceptable. In this case tcp_snxt is set to 11758 * the sequence number ACK'ed. 
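* (Illustrative, not part of the original source: suppose data up to
* sequence 5000 had been sent when the peer shrank its window and
* tcp_snxt was pulled back to 3000, leaving tcp_snxt_shrunk at 5000.
* An ACK for 4000 then satisfies SEQ_GT(seg_ack, tcp_snxt) but is
* still acceptable, and tcp_snxt is simply advanced to 4000.)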
11759 * 11760 * If the ACK covers all the data sent at the time the window was 11761 * shrunk, we can now set tcp_is_wnd_shrnk to B_FALSE. 11762 * 11763 * Should we send ACKs in response to ACK only segments? 11764 */ 11765 11766 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 11767 if ((tcp->tcp_is_wnd_shrnk) && 11768 (SEQ_LEQ(seg_ack, tcp->tcp_snxt_shrunk))) { 11769 uint32_t data_acked_ahead_snxt; 11770 11771 data_acked_ahead_snxt = seg_ack - tcp->tcp_snxt; 11772 tcp_update_xmit_tail(tcp, seg_ack); 11773 tcp->tcp_unsent -= data_acked_ahead_snxt; 11774 } else { 11775 BUMP_MIB(&tcps->tcps_mib, tcpInAckUnsent); 11776 /* drop the received segment */ 11777 freemsg(mp); 11778 11779 /* 11780 * Send back an ACK. If tcp_drop_ack_unsent_cnt is 11781 * greater than 0, check if the number of such 11782 * bogus ACks is greater than that count. If yes, 11783 * don't send back any ACK. This prevents TCP from 11784 * getting into an ACK storm if somehow an attacker 11785 * successfully spoofs an acceptable segment to our 11786 * peer. If this continues (count > 2 X threshold), 11787 * we should abort this connection. 11788 */ 11789 if (tcp_drop_ack_unsent_cnt > 0 && 11790 ++tcp->tcp_in_ack_unsent > 11791 tcp_drop_ack_unsent_cnt) { 11792 TCP_STAT(tcps, tcp_in_ack_unsent_drop); 11793 if (tcp->tcp_in_ack_unsent > 2 * 11794 tcp_drop_ack_unsent_cnt) { 11795 (void) tcp_clean_death(tcp, EPROTO, 20); 11796 } 11797 return; 11798 } 11799 mp = tcp_ack_mp(tcp); 11800 if (mp != NULL) { 11801 BUMP_LOCAL(tcp->tcp_obsegs); 11802 BUMP_MIB(&tcps->tcps_mib, tcpOutAck); 11803 tcp_send_data(tcp, mp); 11804 } 11805 return; 11806 } 11807 } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack, 11808 tcp->tcp_snxt_shrunk)) { 11809 tcp->tcp_is_wnd_shrnk = B_FALSE; 11810 } 11811 11812 /* 11813 * TCP gets a new ACK, update the notsack'ed list to delete those 11814 * blocks that are covered by this ACK. 11815 */ 11816 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 11817 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, 11818 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); 11819 } 11820 11821 /* 11822 * If we got an ACK after fast retransmit, check to see 11823 * if it is a partial ACK. If it is not and the congestion 11824 * window was inflated to account for the other side's 11825 * cached packets, retract it. If it is, do Hoe's algorithm. 11826 */ 11827 if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) { 11828 ASSERT(tcp->tcp_rexmit == B_FALSE); 11829 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { 11830 tcp->tcp_dupack_cnt = 0; 11831 /* 11832 * Restore the orig tcp_cwnd_ssthresh after 11833 * fast retransmit phase. 11834 */ 11835 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { 11836 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; 11837 } 11838 tcp->tcp_rexmit_max = seg_ack; 11839 tcp->tcp_cwnd_cnt = 0; 11840 tcp->tcp_snd_burst = tcp->tcp_localnet ? 11841 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 11842 11843 /* 11844 * Remove all notsack info to avoid confusion with 11845 * the next fast retrasnmit/recovery phase. 11846 */ 11847 if (tcp->tcp_snd_sack_ok && 11848 tcp->tcp_notsack_list != NULL) { 11849 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, 11850 tcp); 11851 } 11852 } else { 11853 if (tcp->tcp_snd_sack_ok && 11854 tcp->tcp_notsack_list != NULL) { 11855 flags |= TH_NEED_SACK_REXMIT; 11856 tcp->tcp_pipe -= mss; 11857 if (tcp->tcp_pipe < 0) 11858 tcp->tcp_pipe = 0; 11859 } else { 11860 /* 11861 * Hoe's algorithm: 11862 * 11863 * Retransmit the unack'ed segment and 11864 * restart fast recovery. 
Note that we 11865 * need to scale back tcp_cwnd to the 11866 * original value when we started fast 11867 * recovery. This is to prevent overly 11868 * aggressive behaviour in sending new 11869 * segments. 11870 */ 11871 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + 11872 tcps->tcps_dupack_fast_retransmit * mss; 11873 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; 11874 flags |= TH_REXMIT_NEEDED; 11875 } 11876 } 11877 } else { 11878 tcp->tcp_dupack_cnt = 0; 11879 if (tcp->tcp_rexmit) { 11880 /* 11881 * TCP is retranmitting. If the ACK ack's all 11882 * outstanding data, update tcp_rexmit_max and 11883 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt 11884 * to the correct value. 11885 * 11886 * Note that SEQ_LEQ() is used. This is to avoid 11887 * unnecessary fast retransmit caused by dup ACKs 11888 * received when TCP does slow start retransmission 11889 * after a time out. During this phase, TCP may 11890 * send out segments which are already received. 11891 * This causes dup ACKs to be sent back. 11892 */ 11893 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 11894 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 11895 tcp->tcp_rexmit_nxt = seg_ack; 11896 } 11897 if (seg_ack != tcp->tcp_rexmit_max) { 11898 flags |= TH_XMIT_NEEDED; 11899 } 11900 } else { 11901 tcp->tcp_rexmit = B_FALSE; 11902 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 11903 tcp->tcp_snd_burst = tcp->tcp_localnet ? 11904 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 11905 } 11906 tcp->tcp_ms_we_have_waited = 0; 11907 } 11908 } 11909 11910 BUMP_MIB(&tcps->tcps_mib, tcpInAckSegs); 11911 UPDATE_MIB(&tcps->tcps_mib, tcpInAckBytes, bytes_acked); 11912 tcp->tcp_suna = seg_ack; 11913 if (tcp->tcp_zero_win_probe != 0) { 11914 tcp->tcp_zero_win_probe = 0; 11915 tcp->tcp_timer_backoff = 0; 11916 } 11917 11918 /* 11919 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 11920 * Note that it cannot be the SYN being ack'ed. The code flow 11921 * will not reach here. 11922 */ 11923 if (mp1 == NULL) { 11924 goto fin_acked; 11925 } 11926 11927 /* 11928 * Update the congestion window. 11929 * 11930 * If TCP is not ECN capable or TCP is ECN capable but the 11931 * congestion experience bit is not set, increase the tcp_cwnd as 11932 * usual. 11933 */ 11934 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 11935 cwnd = tcp->tcp_cwnd; 11936 add = mss; 11937 11938 if (cwnd >= tcp->tcp_cwnd_ssthresh) { 11939 /* 11940 * This is to prevent an increase of less than 1 MSS of 11941 * tcp_cwnd. With partial increase, tcp_wput_data() 11942 * may send out tinygrams in order to preserve mblk 11943 * boundaries. 11944 * 11945 * By initializing tcp_cwnd_cnt to new tcp_cwnd and 11946 * decrementing it by 1 MSS for every ACKs, tcp_cwnd is 11947 * increased by 1 MSS for every RTTs. 11948 */ 11949 if (tcp->tcp_cwnd_cnt <= 0) { 11950 tcp->tcp_cwnd_cnt = cwnd + add; 11951 } else { 11952 tcp->tcp_cwnd_cnt -= add; 11953 add = 0; 11954 } 11955 } 11956 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); 11957 } 11958 11959 /* See if the latest urgent data has been acknowledged */ 11960 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && 11961 SEQ_GT(seg_ack, tcp->tcp_urg)) 11962 tcp->tcp_valid_bits &= ~TCP_URG_VALID; 11963 11964 /* Can we update the RTT estimates? */ 11965 if (tcp->tcp_snd_ts_ok) { 11966 /* Ignore zero timestamp echo-reply. */ 11967 if (tcpopt.tcp_opt_ts_ecr != 0) { 11968 tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - 11969 (int32_t)tcpopt.tcp_opt_ts_ecr); 11970 } 11971 11972 /* If needed, restart the timer. 
*/ 11973 if (tcp->tcp_set_timer == 1) { 11974 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 11975 tcp->tcp_set_timer = 0; 11976 } 11977 /* 11978 * Update tcp_csuna in case the other side stops sending 11979 * us timestamps. 11980 */ 11981 tcp->tcp_csuna = tcp->tcp_snxt; 11982 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 11983 /* 11984 * An ACK sequence we haven't seen before, so get the RTT 11985 * and update the RTO. But first check if the timestamp is 11986 * valid to use. 11987 */ 11988 if ((mp1->b_next != NULL) && 11989 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) 11990 tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - 11991 (int32_t)(intptr_t)mp1->b_prev); 11992 else 11993 BUMP_MIB(&tcps->tcps_mib, tcpRttNoUpdate); 11994 11995 /* Remeber the last sequence to be ACKed */ 11996 tcp->tcp_csuna = seg_ack; 11997 if (tcp->tcp_set_timer == 1) { 11998 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 11999 tcp->tcp_set_timer = 0; 12000 } 12001 } else { 12002 BUMP_MIB(&tcps->tcps_mib, tcpRttNoUpdate); 12003 } 12004 12005 /* Eat acknowledged bytes off the xmit queue. */ 12006 for (;;) { 12007 mblk_t *mp2; 12008 uchar_t *wptr; 12009 12010 wptr = mp1->b_wptr; 12011 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); 12012 bytes_acked -= (int)(wptr - mp1->b_rptr); 12013 if (bytes_acked < 0) { 12014 mp1->b_rptr = wptr + bytes_acked; 12015 /* 12016 * Set a new timestamp if all the bytes timed by the 12017 * old timestamp have been ack'ed. 12018 */ 12019 if (SEQ_GT(seg_ack, 12020 (uint32_t)(uintptr_t)(mp1->b_next))) { 12021 mp1->b_prev = 12022 (mblk_t *)(uintptr_t)LBOLT_FASTPATH; 12023 mp1->b_next = NULL; 12024 } 12025 break; 12026 } 12027 mp1->b_next = NULL; 12028 mp1->b_prev = NULL; 12029 mp2 = mp1; 12030 mp1 = mp1->b_cont; 12031 12032 /* 12033 * This notification is required for some zero-copy 12034 * clients to maintain a copy semantic. After the data 12035 * is ack'ed, client is safe to modify or reuse the buffer. 12036 */ 12037 if (tcp->tcp_snd_zcopy_aware && 12038 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 12039 tcp_zcopy_notify(tcp); 12040 freeb(mp2); 12041 if (bytes_acked == 0) { 12042 if (mp1 == NULL) { 12043 /* Everything is ack'ed, clear the tail. */ 12044 tcp->tcp_xmit_tail = NULL; 12045 /* 12046 * Cancel the timer unless we are still 12047 * waiting for an ACK for the FIN packet. 12048 */ 12049 if (tcp->tcp_timer_tid != 0 && 12050 tcp->tcp_snxt == tcp->tcp_suna) { 12051 (void) TCP_TIMER_CANCEL(tcp, 12052 tcp->tcp_timer_tid); 12053 tcp->tcp_timer_tid = 0; 12054 } 12055 goto pre_swnd_update; 12056 } 12057 if (mp2 != tcp->tcp_xmit_tail) 12058 break; 12059 tcp->tcp_xmit_tail = mp1; 12060 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 12061 (uintptr_t)INT_MAX); 12062 tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr - 12063 mp1->b_rptr); 12064 break; 12065 } 12066 if (mp1 == NULL) { 12067 /* 12068 * More was acked but there is nothing more 12069 * outstanding. This means that the FIN was 12070 * just acked or that we're talking to a clown. 
12071 */ 12072 fin_acked: 12073 ASSERT(tcp->tcp_fin_sent); 12074 tcp->tcp_xmit_tail = NULL; 12075 if (tcp->tcp_fin_sent) { 12076 /* FIN was acked - making progress */ 12077 if (!tcp->tcp_fin_acked) 12078 tcp->tcp_ip_forward_progress = B_TRUE; 12079 tcp->tcp_fin_acked = B_TRUE; 12080 if (tcp->tcp_linger_tid != 0 && 12081 TCP_TIMER_CANCEL(tcp, 12082 tcp->tcp_linger_tid) >= 0) { 12083 tcp_stop_lingering(tcp); 12084 freemsg(mp); 12085 mp = NULL; 12086 } 12087 } else { 12088 /* 12089 * We should never get here because 12090 * we have already checked that the 12091 * number of bytes ack'ed should be 12092 * smaller than or equal to what we 12093 * have sent so far (it is the 12094 * acceptability check of the ACK). 12095 * We can only get here if the send 12096 * queue is corrupted. 12097 * 12098 * Terminate the connection and 12099 * panic the system. It is better 12100 * for us to panic instead of 12101 * continuing to avoid other disaster. 12102 */ 12103 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 12104 tcp->tcp_rnxt, TH_RST|TH_ACK); 12105 panic("Memory corruption " 12106 "detected for connection %s.", 12107 tcp_display(tcp, NULL, 12108 DISP_ADDR_AND_PORT)); 12109 /*NOTREACHED*/ 12110 } 12111 goto pre_swnd_update; 12112 } 12113 ASSERT(mp2 != tcp->tcp_xmit_tail); 12114 } 12115 if (tcp->tcp_unsent) { 12116 flags |= TH_XMIT_NEEDED; 12117 } 12118 pre_swnd_update: 12119 tcp->tcp_xmit_head = mp1; 12120 swnd_update: 12121 /* 12122 * The following check is different from most other implementations. 12123 * For bi-directional transfer, when segments are dropped, the 12124 * "normal" check will not accept a window update in those 12125 * retransmitted segemnts. Failing to do that, TCP may send out 12126 * segments which are outside receiver's window. As TCP accepts 12127 * the ack in those retransmitted segments, if the window update in 12128 * the same segment is not accepted, TCP will incorrectly calculates 12129 * that it can send more segments. This can create a deadlock 12130 * with the receiver if its window becomes zero. 12131 */ 12132 if (SEQ_LT(tcp->tcp_swl2, seg_ack) || 12133 SEQ_LT(tcp->tcp_swl1, seg_seq) || 12134 (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) { 12135 /* 12136 * The criteria for update is: 12137 * 12138 * 1. the segment acknowledges some data. Or 12139 * 2. the segment is new, i.e. it has a higher seq num. Or 12140 * 3. the segment is not old and the advertised window is 12141 * larger than the previous advertised window. 12142 */ 12143 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd) 12144 flags |= TH_XMIT_NEEDED; 12145 tcp->tcp_swnd = new_swnd; 12146 if (new_swnd > tcp->tcp_max_swnd) 12147 tcp->tcp_max_swnd = new_swnd; 12148 tcp->tcp_swl1 = seg_seq; 12149 tcp->tcp_swl2 = seg_ack; 12150 } 12151 est: 12152 if (tcp->tcp_state > TCPS_ESTABLISHED) { 12153 12154 switch (tcp->tcp_state) { 12155 case TCPS_FIN_WAIT_1: 12156 if (tcp->tcp_fin_acked) { 12157 tcp->tcp_state = TCPS_FIN_WAIT_2; 12158 /* 12159 * We implement the non-standard BSD/SunOS 12160 * FIN_WAIT_2 flushing algorithm. 12161 * If there is no user attached to this 12162 * TCP endpoint, then this TCP struct 12163 * could hang around forever in FIN_WAIT_2 12164 * state if the peer forgets to send us 12165 * a FIN. To prevent this, we wait only 12166 * 2*MSL (a convenient time value) for 12167 * the FIN to arrive. If it doesn't show up, 12168 * we flush the TCP endpoint. This algorithm, 12169 * though a violation of RFC-793, has worked 12170 * for over 10 years in BSD systems. 
12171 * Note: SunOS 4.x waits 675 seconds before 12172 * flushing the FIN_WAIT_2 connection. 12173 */ 12174 TCP_TIMER_RESTART(tcp, 12175 tcps->tcps_fin_wait_2_flush_interval); 12176 } 12177 break; 12178 case TCPS_FIN_WAIT_2: 12179 break; /* Shutdown hook? */ 12180 case TCPS_LAST_ACK: 12181 freemsg(mp); 12182 if (tcp->tcp_fin_acked) { 12183 (void) tcp_clean_death(tcp, 0, 19); 12184 return; 12185 } 12186 goto xmit_check; 12187 case TCPS_CLOSING: 12188 if (tcp->tcp_fin_acked) 12189 SET_TIME_WAIT(tcps, tcp, connp); 12190 /*FALLTHRU*/ 12191 case TCPS_CLOSE_WAIT: 12192 freemsg(mp); 12193 goto xmit_check; 12194 default: 12195 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 12196 break; 12197 } 12198 } 12199 if (flags & TH_FIN) { 12200 /* Make sure we ack the fin */ 12201 flags |= TH_ACK_NEEDED; 12202 if (!tcp->tcp_fin_rcvd) { 12203 tcp->tcp_fin_rcvd = B_TRUE; 12204 tcp->tcp_rnxt++; 12205 tcpha = tcp->tcp_tcpha; 12206 tcpha->tha_ack = htonl(tcp->tcp_rnxt); 12207 12208 /* 12209 * Generate the ordrel_ind at the end unless we 12210 * are an eager guy. 12211 * In the eager case tcp_rsrv will do this when run 12212 * after tcp_accept is done. 12213 */ 12214 if (tcp->tcp_listener == NULL && 12215 !TCP_IS_DETACHED(tcp) && !tcp->tcp_hard_binding) 12216 flags |= TH_ORDREL_NEEDED; 12217 switch (tcp->tcp_state) { 12218 case TCPS_SYN_RCVD: 12219 case TCPS_ESTABLISHED: 12220 tcp->tcp_state = TCPS_CLOSE_WAIT; 12221 /* Keepalive? */ 12222 break; 12223 case TCPS_FIN_WAIT_1: 12224 if (!tcp->tcp_fin_acked) { 12225 tcp->tcp_state = TCPS_CLOSING; 12226 break; 12227 } 12228 /* FALLTHRU */ 12229 case TCPS_FIN_WAIT_2: 12230 SET_TIME_WAIT(tcps, tcp, connp); 12231 if (seg_len) { 12232 /* 12233 * implies data piggybacked on FIN. 12234 * break to handle data. 12235 */ 12236 break; 12237 } 12238 freemsg(mp); 12239 goto ack_check; 12240 } 12241 } 12242 } 12243 if (mp == NULL) 12244 goto xmit_check; 12245 if (seg_len == 0) { 12246 freemsg(mp); 12247 goto xmit_check; 12248 } 12249 if (mp->b_rptr == mp->b_wptr) { 12250 /* 12251 * The header has been consumed, so we remove the 12252 * zero-length mblk here. 12253 */ 12254 mp1 = mp; 12255 mp = mp->b_cont; 12256 freeb(mp1); 12257 } 12258 update_ack: 12259 tcpha = tcp->tcp_tcpha; 12260 tcp->tcp_rack_cnt++; 12261 { 12262 uint32_t cur_max; 12263 12264 cur_max = tcp->tcp_rack_cur_max; 12265 if (tcp->tcp_rack_cnt >= cur_max) { 12266 /* 12267 * We have more unacked data than we should - send 12268 * an ACK now. 12269 */ 12270 flags |= TH_ACK_NEEDED; 12271 cur_max++; 12272 if (cur_max > tcp->tcp_rack_abs_max) 12273 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 12274 else 12275 tcp->tcp_rack_cur_max = cur_max; 12276 } else if (TCP_IS_DETACHED(tcp)) { 12277 /* We don't have an ACK timer for detached TCP. */ 12278 flags |= TH_ACK_NEEDED; 12279 } else if (seg_len < mss) { 12280 /* 12281 * If we get a segment that is less than an mss, and we 12282 * already have unacknowledged data, and the amount 12283 * unacknowledged is not a multiple of mss, then we 12284 * better generate an ACK now. Otherwise, this may be 12285 * the tail piece of a transaction, and we would rather 12286 * wait for the response. 
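* (Illustrative, not part of the original source: with mss = 1460 and
* udif = tcp_rnxt - tcp_rack = 2190, udif % mss is 730, so
* TH_ACK_NEEDED is set and the ACK goes out immediately; had udif been
* exactly 2920, a multiple of the mss, the delayed-ack timer would be
* used instead.)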
12287 */ 12288 uint32_t udif; 12289 ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <= 12290 (uintptr_t)INT_MAX); 12291 udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack); 12292 if (udif && (udif % mss)) 12293 flags |= TH_ACK_NEEDED; 12294 else 12295 flags |= TH_ACK_TIMER_NEEDED; 12296 } else { 12297 /* Start delayed ack timer */ 12298 flags |= TH_ACK_TIMER_NEEDED; 12299 } 12300 } 12301 tcp->tcp_rnxt += seg_len; 12302 tcpha->tha_ack = htonl(tcp->tcp_rnxt); 12303 12304 if (mp == NULL) 12305 goto xmit_check; 12306 12307 /* Update SACK list */ 12308 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 12309 tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, 12310 &(tcp->tcp_num_sack_blk)); 12311 } 12312 12313 if (tcp->tcp_urp_mp) { 12314 tcp->tcp_urp_mp->b_cont = mp; 12315 mp = tcp->tcp_urp_mp; 12316 tcp->tcp_urp_mp = NULL; 12317 /* Ready for a new signal. */ 12318 tcp->tcp_urp_last_valid = B_FALSE; 12319 #ifdef DEBUG 12320 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 12321 "tcp_rput: sending exdata_ind %s", 12322 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 12323 #endif /* DEBUG */ 12324 } 12325 12326 /* 12327 * Check for ancillary data changes compared to last segment. 12328 */ 12329 if (connp->conn_recv_ancillary.crb_all != 0) { 12330 mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira); 12331 if (mp == NULL) 12332 return; 12333 } 12334 12335 if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) { 12336 /* 12337 * Side queue inbound data until the accept happens. 12338 * tcp_accept/tcp_rput drains this when the accept happens. 12339 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or 12340 * T_EXDATA_IND) it is queued on b_next. 12341 * XXX Make urgent data use this. Requires: 12342 * Removing tcp_listener check for TH_URG 12343 * Making M_PCPROTO and MARK messages skip the eager case 12344 */ 12345 12346 if (tcp->tcp_kssl_pending) { 12347 DTRACE_PROBE1(kssl_mblk__ksslinput_pending, 12348 mblk_t *, mp); 12349 tcp_kssl_input(tcp, mp, ira->ira_cred); 12350 } else { 12351 tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); 12352 } 12353 } else if (IPCL_IS_NONSTR(connp)) { 12354 /* 12355 * Non-STREAMS socket 12356 * 12357 * Note that no KSSL processing is done here, because 12358 * KSSL is not supported for non-STREAMS sockets. 12359 */ 12360 boolean_t push = flags & (TH_PUSH|TH_FIN); 12361 int error; 12362 12363 if ((*connp->conn_upcalls->su_recv)( 12364 connp->conn_upper_handle, 12365 mp, seg_len, 0, &error, &push) <= 0) { 12366 /* 12367 * We should never be in middle of a 12368 * fallback, the squeue guarantees that. 12369 */ 12370 ASSERT(error != EOPNOTSUPP); 12371 if (error == ENOSPC) 12372 tcp->tcp_rwnd -= seg_len; 12373 } else if (push) { 12374 /* PUSH bit set and sockfs is not flow controlled */ 12375 flags |= tcp_rwnd_reopen(tcp); 12376 } 12377 } else { 12378 /* STREAMS socket */ 12379 if (mp->b_datap->db_type != M_DATA || 12380 (flags & TH_MARKNEXT_NEEDED)) { 12381 if (tcp->tcp_rcv_list != NULL) { 12382 flags |= tcp_rcv_drain(tcp); 12383 } 12384 ASSERT(tcp->tcp_rcv_list == NULL || 12385 tcp->tcp_fused_sigurg); 12386 12387 if (flags & TH_MARKNEXT_NEEDED) { 12388 #ifdef DEBUG 12389 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 12390 "tcp_rput: sending MSGMARKNEXT %s", 12391 tcp_display(tcp, NULL, 12392 DISP_PORT_ONLY)); 12393 #endif /* DEBUG */ 12394 mp->b_flag |= MSGMARKNEXT; 12395 flags &= ~TH_MARKNEXT_NEEDED; 12396 } 12397 12398 /* Does this need SSL processing first? 
*/ 12399 if ((tcp->tcp_kssl_ctx != NULL) && 12400 (DB_TYPE(mp) == M_DATA)) { 12401 DTRACE_PROBE1(kssl_mblk__ksslinput_data1, 12402 mblk_t *, mp); 12403 tcp_kssl_input(tcp, mp, ira->ira_cred); 12404 } else { 12405 if (is_system_labeled()) 12406 tcp_setcred_data(mp, ira); 12407 12408 putnext(connp->conn_rq, mp); 12409 if (!canputnext(connp->conn_rq)) 12410 tcp->tcp_rwnd -= seg_len; 12411 } 12412 } else if ((tcp->tcp_kssl_ctx != NULL) && 12413 (DB_TYPE(mp) == M_DATA)) { 12414 /* Does this need SSL processing first? */ 12415 DTRACE_PROBE1(kssl_mblk__ksslinput_data2, mblk_t *, mp); 12416 tcp_kssl_input(tcp, mp, ira->ira_cred); 12417 } else if ((flags & (TH_PUSH|TH_FIN)) || 12418 tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) { 12419 if (tcp->tcp_rcv_list != NULL) { 12420 /* 12421 * Enqueue the new segment first and then 12422 * call tcp_rcv_drain() to send all data 12423 * up. The other way to do this is to 12424 * send all queued data up and then call 12425 * putnext() to send the new segment up. 12426 * This way can remove the else part later 12427 * on. 12428 * 12429 * We don't do this to avoid one more call to 12430 * canputnext() as tcp_rcv_drain() needs to 12431 * call canputnext(). 12432 */ 12433 tcp_rcv_enqueue(tcp, mp, seg_len, 12434 ira->ira_cred); 12435 flags |= tcp_rcv_drain(tcp); 12436 } else { 12437 if (is_system_labeled()) 12438 tcp_setcred_data(mp, ira); 12439 12440 putnext(connp->conn_rq, mp); 12441 if (!canputnext(connp->conn_rq)) 12442 tcp->tcp_rwnd -= seg_len; 12443 } 12444 } else { 12445 /* 12446 * Enqueue all packets when processing an mblk 12447 * from the co queue and also enqueue normal packets. 12448 */ 12449 tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); 12450 } 12451 /* 12452 * Make sure the timer is running if we have data waiting 12453 * for a push bit. This provides resiliency against 12454 * implementations that do not correctly generate push bits. 12455 */ 12456 if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) { 12457 /* 12458 * The connection may be closed at this point, so don't 12459 * do anything for a detached tcp. 12460 */ 12461 if (!TCP_IS_DETACHED(tcp)) 12462 tcp->tcp_push_tid = TCP_TIMER(tcp, 12463 tcp_push_timer, 12464 MSEC_TO_TICK( 12465 tcps->tcps_push_timer_interval)); 12466 } 12467 } 12468 12469 xmit_check: 12470 /* Is there anything left to do? */ 12471 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 12472 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| 12473 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED| 12474 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 12475 goto done; 12476 12477 /* Any transmit work to do and a non-zero window? 
*/ 12478 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| 12479 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { 12480 if (flags & TH_REXMIT_NEEDED) { 12481 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; 12482 12483 BUMP_MIB(&tcps->tcps_mib, tcpOutFastRetrans); 12484 if (snd_size > mss) 12485 snd_size = mss; 12486 if (snd_size > tcp->tcp_swnd) 12487 snd_size = tcp->tcp_swnd; 12488 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, 12489 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, 12490 B_TRUE); 12491 12492 if (mp1 != NULL) { 12493 tcp->tcp_xmit_head->b_prev = 12494 (mblk_t *)LBOLT_FASTPATH; 12495 tcp->tcp_csuna = tcp->tcp_snxt; 12496 BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); 12497 UPDATE_MIB(&tcps->tcps_mib, 12498 tcpRetransBytes, snd_size); 12499 tcp_send_data(tcp, mp1); 12500 } 12501 } 12502 if (flags & TH_NEED_SACK_REXMIT) { 12503 tcp_sack_rxmit(tcp, &flags); 12504 } 12505 /* 12506 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send 12507 * out new segment. Note that tcp_rexmit should not be 12508 * set, otherwise TH_LIMIT_XMIT should not be set. 12509 */ 12510 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { 12511 if (!tcp->tcp_rexmit) { 12512 tcp_wput_data(tcp, NULL, B_FALSE); 12513 } else { 12514 tcp_ss_rexmit(tcp); 12515 } 12516 } 12517 /* 12518 * Adjust tcp_cwnd back to normal value after sending 12519 * new data segments. 12520 */ 12521 if (flags & TH_LIMIT_XMIT) { 12522 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); 12523 /* 12524 * This will restart the timer. Restarting the 12525 * timer is used to avoid a timeout before the 12526 * limited transmitted segment's ACK gets back. 12527 */ 12528 if (tcp->tcp_xmit_head != NULL) 12529 tcp->tcp_xmit_head->b_prev = 12530 (mblk_t *)LBOLT_FASTPATH; 12531 } 12532 12533 /* Anything more to do? */ 12534 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| 12535 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 12536 goto done; 12537 } 12538 ack_check: 12539 if (flags & TH_SEND_URP_MARK) { 12540 ASSERT(tcp->tcp_urp_mark_mp); 12541 ASSERT(!IPCL_IS_NONSTR(connp)); 12542 /* 12543 * Send up any queued data and then send the mark message 12544 */ 12545 if (tcp->tcp_rcv_list != NULL) { 12546 flags |= tcp_rcv_drain(tcp); 12547 12548 } 12549 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 12550 mp1 = tcp->tcp_urp_mark_mp; 12551 tcp->tcp_urp_mark_mp = NULL; 12552 if (is_system_labeled()) 12553 tcp_setcred_data(mp1, ira); 12554 12555 putnext(connp->conn_rq, mp1); 12556 #ifdef DEBUG 12557 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 12558 "tcp_rput: sending zero-length %s %s", 12559 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" : 12560 "MSGNOTMARKNEXT"), 12561 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 12562 #endif /* DEBUG */ 12563 flags &= ~TH_SEND_URP_MARK; 12564 } 12565 if (flags & TH_ACK_NEEDED) { 12566 /* 12567 * Time to send an ack for some reason. 12568 */ 12569 mp1 = tcp_ack_mp(tcp); 12570 12571 if (mp1 != NULL) { 12572 tcp_send_data(tcp, mp1); 12573 BUMP_LOCAL(tcp->tcp_obsegs); 12574 BUMP_MIB(&tcps->tcps_mib, tcpOutAck); 12575 } 12576 if (tcp->tcp_ack_tid != 0) { 12577 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 12578 tcp->tcp_ack_tid = 0; 12579 } 12580 } 12581 if (flags & TH_ACK_TIMER_NEEDED) { 12582 /* 12583 * Arrange for deferred ACK or push wait timeout. 12584 * Start timer if it is not already running. 12585 */ 12586 if (tcp->tcp_ack_tid == 0) { 12587 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer, 12588 MSEC_TO_TICK(tcp->tcp_localnet ? 
12589 (clock_t)tcps->tcps_local_dack_interval :
12590 (clock_t)tcps->tcps_deferred_ack_interval));
12591 }
12592 }
12593 if (flags & TH_ORDREL_NEEDED) {
12594 /*
12595 * Send up the ordrel_ind unless we are an eager guy.
12596 * In the eager case tcp_rsrv will do this when run
12597 * after tcp_accept is done.
12598 */
12599 ASSERT(tcp->tcp_listener == NULL);
12600 ASSERT(!tcp->tcp_detached);
12601
12602 if (IPCL_IS_NONSTR(connp)) {
12603 ASSERT(tcp->tcp_ordrel_mp == NULL);
12604 tcp->tcp_ordrel_done = B_TRUE;
12605 (*connp->conn_upcalls->su_opctl)
12606 (connp->conn_upper_handle, SOCK_OPCTL_SHUT_RECV, 0);
12607 goto done;
12608 }
12609
12610 if (tcp->tcp_rcv_list != NULL) {
12611 /*
12612 * Push any mblk(s) enqueued from co processing.
12613 */
12614 flags |= tcp_rcv_drain(tcp);
12615 }
12616 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
12617
12618 mp1 = tcp->tcp_ordrel_mp;
12619 tcp->tcp_ordrel_mp = NULL;
12620 tcp->tcp_ordrel_done = B_TRUE;
12621 putnext(connp->conn_rq, mp1);
12622 }
12623 done:
12624 ASSERT(!(flags & TH_MARKNEXT_NEEDED));
12625 }
12626
12627 /*
12628 * This routine adjusts next-to-send sequence number variables, in the
12629 * case where the receiver has shrunk its window.
12630 */
12631 static void
12632 tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt)
12633 {
12634 mblk_t *xmit_tail;
12635 int32_t offset;
12636
12637 tcp->tcp_snxt = snxt;
12638
12639 /* Get the mblk, and the offset in it, as per the shrunk window */
12640 xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset);
12641 ASSERT(xmit_tail != NULL);
12642 tcp->tcp_xmit_tail = xmit_tail;
12643 tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr -
12644 xmit_tail->b_rptr - offset;
12645 }
12646
12647 /*
12648 * This function does the PAWS protection check. Returns B_TRUE if the
12649 * segment passes the PAWS test, else returns B_FALSE.
12650 */
12651 boolean_t
12652 tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp)
12653 {
12654 uint8_t flags;
12655 int options;
12656 uint8_t *up;
12657 conn_t *connp = tcp->tcp_connp;
12658
12659 flags = (unsigned int)tcpha->tha_flags & 0xFF;
12660 /*
12661 * If the timestamp option is aligned nicely, get values inline,
12662 * otherwise call the general routine to parse. Only do that
12663 * if timestamp is the only option.
12664 */
12665 if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH +
12666 TCPOPT_REAL_TS_LEN &&
12667 OK_32PTR((up = ((uint8_t *)tcpha) +
12668 TCP_MIN_HEADER_LENGTH)) &&
12669 *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) {
12670 tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4));
12671 tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8));
12672
12673 options = TCP_OPT_TSTAMP_PRESENT;
12674 } else {
12675 if (tcp->tcp_snd_sack_ok) {
12676 tcpoptp->tcp = tcp;
12677 } else {
12678 tcpoptp->tcp = NULL;
12679 }
12680 options = tcp_parse_options(tcpha, tcpoptp);
12681 }
12682
12683 if (options & TCP_OPT_TSTAMP_PRESENT) {
12684 /*
12685 * Do PAWS per RFC 1323 section 4.2. Accept RST
12686 * regardless of the timestamp, page 18 RFC 1323.bis.
12687 */
12688 if ((flags & TH_RST) == 0 &&
12689 TSTMP_LT(tcpoptp->tcp_opt_ts_val,
12690 tcp->tcp_ts_recent)) {
12691 if (TSTMP_LT(LBOLT_FASTPATH64,
12692 tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) {
12693 /* This segment is not acceptable. */
12694 return (B_FALSE);
12695 } else {
12696 /*
12697 * Connection has been idle for
12698 * too long. Reset the timestamp
12699 * and assume the segment is valid.
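 * (So a timestamp that looks "too old" only rejects the segment
 * while the connection has been idle for less than PAWS_TIMEOUT;
 * past that point the cached tcp_ts_recent is treated as stale and
 * refreshed from this segment instead, per the RFC 1323 PAWS rules.)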
12700 */ 12701 tcp->tcp_ts_recent = 12702 tcpoptp->tcp_opt_ts_val; 12703 } 12704 } 12705 } else { 12706 /* 12707 * If we don't get a timestamp on every packet, we 12708 * figure we can't really trust 'em, so we stop sending 12709 * and parsing them. 12710 */ 12711 tcp->tcp_snd_ts_ok = B_FALSE; 12712 12713 connp->conn_ht_iphc_len -= TCPOPT_REAL_TS_LEN; 12714 connp->conn_ht_ulp_len -= TCPOPT_REAL_TS_LEN; 12715 tcp->tcp_tcpha->tha_offset_and_reserved -= (3 << 4); 12716 /* 12717 * Adjust the tcp_mss and tcp_cwnd accordingly. We avoid 12718 * doing a slow start here so as to not to lose on the 12719 * transfer rate built up so far. 12720 */ 12721 tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); 12722 if (tcp->tcp_snd_sack_ok) { 12723 ASSERT(tcp->tcp_sack_info != NULL); 12724 tcp->tcp_max_sack_blk = 4; 12725 } 12726 } 12727 return (B_TRUE); 12728 } 12729 12730 /* 12731 * Attach ancillary data to a received TCP segments for the 12732 * ancillary pieces requested by the application that are 12733 * different than they were in the previous data segment. 12734 * 12735 * Save the "current" values once memory allocation is ok so that 12736 * when memory allocation fails we can just wait for the next data segment. 12737 */ 12738 static mblk_t * 12739 tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, 12740 ip_recv_attr_t *ira) 12741 { 12742 struct T_optdata_ind *todi; 12743 int optlen; 12744 uchar_t *optptr; 12745 struct T_opthdr *toh; 12746 crb_t addflag; /* Which pieces to add */ 12747 mblk_t *mp1; 12748 conn_t *connp = tcp->tcp_connp; 12749 12750 optlen = 0; 12751 addflag.crb_all = 0; 12752 /* If app asked for pktinfo and the index has changed ... */ 12753 if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && 12754 ira->ira_ruifindex != tcp->tcp_recvifindex) { 12755 optlen += sizeof (struct T_opthdr) + 12756 sizeof (struct in6_pktinfo); 12757 addflag.crb_ip_recvpktinfo = 1; 12758 } 12759 /* If app asked for hoplimit and it has changed ... */ 12760 if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit && 12761 ipp->ipp_hoplimit != tcp->tcp_recvhops) { 12762 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 12763 addflag.crb_ipv6_recvhoplimit = 1; 12764 } 12765 /* If app asked for tclass and it has changed ... */ 12766 if (connp->conn_recv_ancillary.crb_ipv6_recvtclass && 12767 ipp->ipp_tclass != tcp->tcp_recvtclass) { 12768 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 12769 addflag.crb_ipv6_recvtclass = 1; 12770 } 12771 /* 12772 * If app asked for hopbyhop headers and it has changed ... 12773 * For security labels, note that (1) security labels can't change on 12774 * a connected socket at all, (2) we're connected to at most one peer, 12775 * (3) if anything changes, then it must be some other extra option. 12776 */ 12777 if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts && 12778 ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, 12779 (ipp->ipp_fields & IPPF_HOPOPTS), 12780 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { 12781 optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen; 12782 addflag.crb_ipv6_recvhopopts = 1; 12783 if (!ip_allocbuf((void **)&tcp->tcp_hopopts, 12784 &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), 12785 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) 12786 return (mp); 12787 } 12788 /* If app asked for dst headers before routing headers ... 
*/ 12789 if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts && 12790 ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen, 12791 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), 12792 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) { 12793 optlen += sizeof (struct T_opthdr) + 12794 ipp->ipp_rthdrdstoptslen; 12795 addflag.crb_ipv6_recvrthdrdstopts = 1; 12796 if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts, 12797 &tcp->tcp_rthdrdstoptslen, 12798 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), 12799 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) 12800 return (mp); 12801 } 12802 /* If app asked for routing headers and it has changed ... */ 12803 if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr && 12804 ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, 12805 (ipp->ipp_fields & IPPF_RTHDR), 12806 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { 12807 optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; 12808 addflag.crb_ipv6_recvrthdr = 1; 12809 if (!ip_allocbuf((void **)&tcp->tcp_rthdr, 12810 &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), 12811 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) 12812 return (mp); 12813 } 12814 /* If app asked for dest headers and it has changed ... */ 12815 if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts || 12816 connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) && 12817 ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, 12818 (ipp->ipp_fields & IPPF_DSTOPTS), 12819 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { 12820 optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; 12821 addflag.crb_ipv6_recvdstopts = 1; 12822 if (!ip_allocbuf((void **)&tcp->tcp_dstopts, 12823 &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), 12824 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) 12825 return (mp); 12826 } 12827 12828 if (optlen == 0) { 12829 /* Nothing to add */ 12830 return (mp); 12831 } 12832 mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED); 12833 if (mp1 == NULL) { 12834 /* 12835 * Defer sending ancillary data until the next TCP segment 12836 * arrives. 12837 */ 12838 return (mp); 12839 } 12840 mp1->b_cont = mp; 12841 mp = mp1; 12842 mp->b_wptr += sizeof (*todi) + optlen; 12843 mp->b_datap->db_type = M_PROTO; 12844 todi = (struct T_optdata_ind *)mp->b_rptr; 12845 todi->PRIM_type = T_OPTDATA_IND; 12846 todi->DATA_flag = 1; /* MORE data */ 12847 todi->OPT_length = optlen; 12848 todi->OPT_offset = sizeof (*todi); 12849 optptr = (uchar_t *)&todi[1]; 12850 /* 12851 * If app asked for pktinfo and the index has changed ... 12852 * Note that the local address never changes for the connection. 12853 */ 12854 if (addflag.crb_ip_recvpktinfo) { 12855 struct in6_pktinfo *pkti; 12856 uint_t ifindex; 12857 12858 ifindex = ira->ira_ruifindex; 12859 toh = (struct T_opthdr *)optptr; 12860 toh->level = IPPROTO_IPV6; 12861 toh->name = IPV6_PKTINFO; 12862 toh->len = sizeof (*toh) + sizeof (*pkti); 12863 toh->status = 0; 12864 optptr += sizeof (*toh); 12865 pkti = (struct in6_pktinfo *)optptr; 12866 pkti->ipi6_addr = connp->conn_laddr_v6; 12867 pkti->ipi6_ifindex = ifindex; 12868 optptr += sizeof (*pkti); 12869 ASSERT(OK_32PTR(optptr)); 12870 /* Save as "last" value */ 12871 tcp->tcp_recvifindex = ifindex; 12872 } 12873 /* If app asked for hoplimit and it has changed ... 
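 * (Each item emitted below uses the same TPI layout as the pktinfo
 * entry above: a struct T_opthdr giving the IPPROTO_IPV6 level, the
 * option name and the total length, immediately followed by the
 * option value, all kept 32-bit aligned so the ASSERTs on optptr
 * hold.)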
*/ 12874 if (addflag.crb_ipv6_recvhoplimit) { 12875 toh = (struct T_opthdr *)optptr; 12876 toh->level = IPPROTO_IPV6; 12877 toh->name = IPV6_HOPLIMIT; 12878 toh->len = sizeof (*toh) + sizeof (uint_t); 12879 toh->status = 0; 12880 optptr += sizeof (*toh); 12881 *(uint_t *)optptr = ipp->ipp_hoplimit; 12882 optptr += sizeof (uint_t); 12883 ASSERT(OK_32PTR(optptr)); 12884 /* Save as "last" value */ 12885 tcp->tcp_recvhops = ipp->ipp_hoplimit; 12886 } 12887 /* If app asked for tclass and it has changed ... */ 12888 if (addflag.crb_ipv6_recvtclass) { 12889 toh = (struct T_opthdr *)optptr; 12890 toh->level = IPPROTO_IPV6; 12891 toh->name = IPV6_TCLASS; 12892 toh->len = sizeof (*toh) + sizeof (uint_t); 12893 toh->status = 0; 12894 optptr += sizeof (*toh); 12895 *(uint_t *)optptr = ipp->ipp_tclass; 12896 optptr += sizeof (uint_t); 12897 ASSERT(OK_32PTR(optptr)); 12898 /* Save as "last" value */ 12899 tcp->tcp_recvtclass = ipp->ipp_tclass; 12900 } 12901 if (addflag.crb_ipv6_recvhopopts) { 12902 toh = (struct T_opthdr *)optptr; 12903 toh->level = IPPROTO_IPV6; 12904 toh->name = IPV6_HOPOPTS; 12905 toh->len = sizeof (*toh) + ipp->ipp_hopoptslen; 12906 toh->status = 0; 12907 optptr += sizeof (*toh); 12908 bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen); 12909 optptr += ipp->ipp_hopoptslen; 12910 ASSERT(OK_32PTR(optptr)); 12911 /* Save as last value */ 12912 ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, 12913 (ipp->ipp_fields & IPPF_HOPOPTS), 12914 ipp->ipp_hopopts, ipp->ipp_hopoptslen); 12915 } 12916 if (addflag.crb_ipv6_recvrthdrdstopts) { 12917 toh = (struct T_opthdr *)optptr; 12918 toh->level = IPPROTO_IPV6; 12919 toh->name = IPV6_RTHDRDSTOPTS; 12920 toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen; 12921 toh->status = 0; 12922 optptr += sizeof (*toh); 12923 bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen); 12924 optptr += ipp->ipp_rthdrdstoptslen; 12925 ASSERT(OK_32PTR(optptr)); 12926 /* Save as last value */ 12927 ip_savebuf((void **)&tcp->tcp_rthdrdstopts, 12928 &tcp->tcp_rthdrdstoptslen, 12929 (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), 12930 ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); 12931 } 12932 if (addflag.crb_ipv6_recvrthdr) { 12933 toh = (struct T_opthdr *)optptr; 12934 toh->level = IPPROTO_IPV6; 12935 toh->name = IPV6_RTHDR; 12936 toh->len = sizeof (*toh) + ipp->ipp_rthdrlen; 12937 toh->status = 0; 12938 optptr += sizeof (*toh); 12939 bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen); 12940 optptr += ipp->ipp_rthdrlen; 12941 ASSERT(OK_32PTR(optptr)); 12942 /* Save as last value */ 12943 ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, 12944 (ipp->ipp_fields & IPPF_RTHDR), 12945 ipp->ipp_rthdr, ipp->ipp_rthdrlen); 12946 } 12947 if (addflag.crb_ipv6_recvdstopts) { 12948 toh = (struct T_opthdr *)optptr; 12949 toh->level = IPPROTO_IPV6; 12950 toh->name = IPV6_DSTOPTS; 12951 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen; 12952 toh->status = 0; 12953 optptr += sizeof (*toh); 12954 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen); 12955 optptr += ipp->ipp_dstoptslen; 12956 ASSERT(OK_32PTR(optptr)); 12957 /* Save as last value */ 12958 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, 12959 (ipp->ipp_fields & IPPF_DSTOPTS), 12960 ipp->ipp_dstopts, ipp->ipp_dstoptslen); 12961 } 12962 ASSERT(optptr == mp->b_wptr); 12963 return (mp); 12964 } 12965 12966 /* ARGSUSED */ 12967 static void 12968 tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 12969 { 12970 conn_t *connp = (conn_t *)arg; 12971 tcp_t *tcp = 
connp->conn_tcp; 12972 queue_t *q = connp->conn_rq; 12973 tcp_stack_t *tcps = tcp->tcp_tcps; 12974 12975 ASSERT(!IPCL_IS_NONSTR(connp)); 12976 mutex_enter(&tcp->tcp_rsrv_mp_lock); 12977 tcp->tcp_rsrv_mp = mp; 12978 mutex_exit(&tcp->tcp_rsrv_mp_lock); 12979 12980 TCP_STAT(tcps, tcp_rsrv_calls); 12981 12982 if (TCP_IS_DETACHED(tcp) || q == NULL) { 12983 return; 12984 } 12985 12986 if (tcp->tcp_fused) { 12987 tcp_fuse_backenable(tcp); 12988 return; 12989 } 12990 12991 if (canputnext(q)) { 12992 /* Not flow-controlled, open rwnd */ 12993 tcp->tcp_rwnd = connp->conn_rcvbuf; 12994 12995 /* 12996 * Send back a window update immediately if TCP is above 12997 * ESTABLISHED state and the increase of the rcv window 12998 * that the other side knows is at least 1 MSS after flow 12999 * control is lifted. 13000 */ 13001 if (tcp->tcp_state >= TCPS_ESTABLISHED && 13002 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 13003 tcp_xmit_ctl(NULL, tcp, 13004 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 13005 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 13006 } 13007 } 13008 } 13009 13010 /* 13011 * The read side service routine is called mostly when we get back-enabled as a 13012 * result of flow control relief. Since we don't actually queue anything in 13013 * TCP, we have no data to send out of here. What we do is clear the receive 13014 * window, and send out a window update. 13015 */ 13016 static void 13017 tcp_rsrv(queue_t *q) 13018 { 13019 conn_t *connp = Q_TO_CONN(q); 13020 tcp_t *tcp = connp->conn_tcp; 13021 mblk_t *mp; 13022 13023 /* No code does a putq on the read side */ 13024 ASSERT(q->q_first == NULL); 13025 13026 /* 13027 * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already 13028 * been run. So just return. 13029 */ 13030 mutex_enter(&tcp->tcp_rsrv_mp_lock); 13031 if ((mp = tcp->tcp_rsrv_mp) == NULL) { 13032 mutex_exit(&tcp->tcp_rsrv_mp_lock); 13033 return; 13034 } 13035 tcp->tcp_rsrv_mp = NULL; 13036 mutex_exit(&tcp->tcp_rsrv_mp_lock); 13037 13038 CONN_INC_REF(connp); 13039 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp, 13040 NULL, SQ_PROCESS, SQTAG_TCP_RSRV); 13041 } 13042 13043 /* 13044 * tcp_rwnd_set() is called to adjust the receive window to a desired value. 13045 * We do not allow the receive window to shrink. After setting rwnd, 13046 * set the flow control hiwat of the stream. 13047 * 13048 * This function is called in 2 cases: 13049 * 13050 * 1) Before data transfer begins, in tcp_input_listener() for accepting a 13051 * connection (passive open) and in tcp_input_data() for active connect. 13052 * This is called after tcp_mss_set() when the desired MSS value is known. 13053 * This makes sure that our window size is a mutiple of the other side's 13054 * MSS. 13055 * 2) Handling SO_RCVBUF option. 13056 * 13057 * It is ASSUMED that the requested size is a multiple of the current MSS. 13058 * 13059 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the 13060 * user requests so. 13061 */ 13062 int 13063 tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) 13064 { 13065 uint32_t mss = tcp->tcp_mss; 13066 uint32_t old_max_rwnd; 13067 uint32_t max_transmittable_rwnd; 13068 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 13069 tcp_stack_t *tcps = tcp->tcp_tcps; 13070 conn_t *connp = tcp->tcp_connp; 13071 13072 /* 13073 * Insist on a receive window that is at least 13074 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid 13075 * funny TCP interactions of Nagle algorithm, SWS avoidance 13076 * and delayed acknowledgement. 
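 * For illustration, with the default tcp_recv_hiwat_minmss of 4 and
 * an Ethernet-sized MSS of 1460 bytes, any requested receive buffer
 * smaller than 5840 bytes is rounded up to 5840 by the MAX() below.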
13077 */ 13078 rwnd = MAX(rwnd, tcps->tcps_recv_hiwat_minmss * mss); 13079 13080 if (tcp->tcp_fused) { 13081 size_t sth_hiwat; 13082 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 13083 13084 ASSERT(peer_tcp != NULL); 13085 sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); 13086 if (!tcp_detached) { 13087 (void) proto_set_rx_hiwat(connp->conn_rq, connp, 13088 sth_hiwat); 13089 tcp_set_recv_threshold(tcp, sth_hiwat >> 3); 13090 } 13091 13092 /* Caller could have changed tcp_rwnd; update tha_win */ 13093 if (tcp->tcp_tcpha != NULL) { 13094 tcp->tcp_tcpha->tha_win = 13095 htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 13096 } 13097 if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 13098 tcp->tcp_cwnd_max = rwnd; 13099 13100 /* 13101 * In the fusion case, the maxpsz stream head value of 13102 * our peer is set according to its send buffer size 13103 * and our receive buffer size; since the latter may 13104 * have changed we need to update the peer's maxpsz. 13105 */ 13106 (void) tcp_maxpsz_set(peer_tcp, B_TRUE); 13107 return (sth_hiwat); 13108 } 13109 13110 if (tcp_detached) 13111 old_max_rwnd = tcp->tcp_rwnd; 13112 else 13113 old_max_rwnd = connp->conn_rcvbuf; 13114 13115 13116 /* 13117 * If window size info has already been exchanged, TCP should not 13118 * shrink the window. Shrinking window is doable if done carefully. 13119 * We may add that support later. But so far there is not a real 13120 * need to do that. 13121 */ 13122 if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) { 13123 /* MSS may have changed, do a round up again. */ 13124 rwnd = MSS_ROUNDUP(old_max_rwnd, mss); 13125 } 13126 13127 /* 13128 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check 13129 * can be applied even before the window scale option is decided. 13130 */ 13131 max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws; 13132 if (rwnd > max_transmittable_rwnd) { 13133 rwnd = max_transmittable_rwnd - 13134 (max_transmittable_rwnd % mss); 13135 if (rwnd < mss) 13136 rwnd = max_transmittable_rwnd; 13137 /* 13138 * If we're over the limit we may have to back down tcp_rwnd. 13139 * The increment below won't work for us. So we set all three 13140 * here and the increment below will have no effect. 13141 */ 13142 tcp->tcp_rwnd = old_max_rwnd = rwnd; 13143 } 13144 if (tcp->tcp_localnet) { 13145 tcp->tcp_rack_abs_max = 13146 MIN(tcps->tcps_local_dacks_max, rwnd / mss / 2); 13147 } else { 13148 /* 13149 * For a remote host on a different subnet (through a router), 13150 * we ack every other packet to be conforming to RFC1122. 13151 * tcp_deferred_acks_max is default to 2. 13152 */ 13153 tcp->tcp_rack_abs_max = 13154 MIN(tcps->tcps_deferred_acks_max, rwnd / mss / 2); 13155 } 13156 if (tcp->tcp_rack_cur_max > tcp->tcp_rack_abs_max) 13157 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 13158 else 13159 tcp->tcp_rack_cur_max = 0; 13160 /* 13161 * Increment the current rwnd by the amount the maximum grew (we 13162 * can not overwrite it since we might be in the middle of a 13163 * connection.) 13164 */ 13165 tcp->tcp_rwnd += rwnd - old_max_rwnd; 13166 connp->conn_rcvbuf = rwnd; 13167 13168 /* Are we already connected? 
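 * (If a TCP header template already exists, refresh the advertised
 * window in it as well, scaled down by tcp_rcv_ws, so the next
 * segment sent carries the new window.)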
*/ 13169 if (tcp->tcp_tcpha != NULL) { 13170 tcp->tcp_tcpha->tha_win = 13171 htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 13172 } 13173 13174 if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 13175 tcp->tcp_cwnd_max = rwnd; 13176 13177 if (tcp_detached) 13178 return (rwnd); 13179 13180 tcp_set_recv_threshold(tcp, rwnd >> 3); 13181 13182 (void) proto_set_rx_hiwat(connp->conn_rq, connp, rwnd); 13183 return (rwnd); 13184 } 13185 13186 /* 13187 * Return SNMP stuff in buffer in mpdata. 13188 */ 13189 mblk_t * 13190 tcp_snmp_get(queue_t *q, mblk_t *mpctl) 13191 { 13192 mblk_t *mpdata; 13193 mblk_t *mp_conn_ctl = NULL; 13194 mblk_t *mp_conn_tail; 13195 mblk_t *mp_attr_ctl = NULL; 13196 mblk_t *mp_attr_tail; 13197 mblk_t *mp6_conn_ctl = NULL; 13198 mblk_t *mp6_conn_tail; 13199 mblk_t *mp6_attr_ctl = NULL; 13200 mblk_t *mp6_attr_tail; 13201 struct opthdr *optp; 13202 mib2_tcpConnEntry_t tce; 13203 mib2_tcp6ConnEntry_t tce6; 13204 mib2_transportMLPEntry_t mlp; 13205 connf_t *connfp; 13206 int i; 13207 boolean_t ispriv; 13208 zoneid_t zoneid; 13209 int v4_conn_idx; 13210 int v6_conn_idx; 13211 conn_t *connp = Q_TO_CONN(q); 13212 tcp_stack_t *tcps; 13213 ip_stack_t *ipst; 13214 mblk_t *mp2ctl; 13215 13216 /* 13217 * make a copy of the original message 13218 */ 13219 mp2ctl = copymsg(mpctl); 13220 13221 if (mpctl == NULL || 13222 (mpdata = mpctl->b_cont) == NULL || 13223 (mp_conn_ctl = copymsg(mpctl)) == NULL || 13224 (mp_attr_ctl = copymsg(mpctl)) == NULL || 13225 (mp6_conn_ctl = copymsg(mpctl)) == NULL || 13226 (mp6_attr_ctl = copymsg(mpctl)) == NULL) { 13227 freemsg(mp_conn_ctl); 13228 freemsg(mp_attr_ctl); 13229 freemsg(mp6_conn_ctl); 13230 freemsg(mp6_attr_ctl); 13231 freemsg(mpctl); 13232 freemsg(mp2ctl); 13233 return (NULL); 13234 } 13235 13236 ipst = connp->conn_netstack->netstack_ip; 13237 tcps = connp->conn_netstack->netstack_tcp; 13238 13239 /* build table of connections -- need count in fixed part */ 13240 SET_MIB(tcps->tcps_mib.tcpRtoAlgorithm, 4); /* vanj */ 13241 SET_MIB(tcps->tcps_mib.tcpRtoMin, tcps->tcps_rexmit_interval_min); 13242 SET_MIB(tcps->tcps_mib.tcpRtoMax, tcps->tcps_rexmit_interval_max); 13243 SET_MIB(tcps->tcps_mib.tcpMaxConn, -1); 13244 SET_MIB(tcps->tcps_mib.tcpCurrEstab, 0); 13245 13246 ispriv = 13247 secpolicy_ip_config((Q_TO_CONN(q))->conn_cred, B_TRUE) == 0; 13248 zoneid = Q_TO_CONN(q)->conn_zoneid; 13249 13250 v4_conn_idx = v6_conn_idx = 0; 13251 mp_conn_tail = mp_attr_tail = mp6_conn_tail = mp6_attr_tail = NULL; 13252 13253 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 13254 ipst = tcps->tcps_netstack->netstack_ip; 13255 13256 connfp = &ipst->ips_ipcl_globalhash_fanout[i]; 13257 13258 connp = NULL; 13259 13260 while ((connp = 13261 ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { 13262 tcp_t *tcp; 13263 boolean_t needattr; 13264 13265 if (connp->conn_zoneid != zoneid) 13266 continue; /* not in this zone */ 13267 13268 tcp = connp->conn_tcp; 13269 UPDATE_MIB(&tcps->tcps_mib, 13270 tcpHCInSegs, tcp->tcp_ibsegs); 13271 tcp->tcp_ibsegs = 0; 13272 UPDATE_MIB(&tcps->tcps_mib, 13273 tcpHCOutSegs, tcp->tcp_obsegs); 13274 tcp->tcp_obsegs = 0; 13275 13276 tce6.tcp6ConnState = tce.tcpConnState = 13277 tcp_snmp_state(tcp); 13278 if (tce.tcpConnState == MIB2_TCP_established || 13279 tce.tcpConnState == MIB2_TCP_closeWait) 13280 BUMP_MIB(&tcps->tcps_mib, tcpCurrEstab); 13281 13282 needattr = B_FALSE; 13283 bzero(&mlp, sizeof (mlp)); 13284 if (connp->conn_mlp_type != mlptSingle) { 13285 if (connp->conn_mlp_type == mlptShared || 13286 connp->conn_mlp_type == mlptBoth) 13287 
mlp.tme_flags |= MIB2_TMEF_SHARED; 13288 if (connp->conn_mlp_type == mlptPrivate || 13289 connp->conn_mlp_type == mlptBoth) 13290 mlp.tme_flags |= MIB2_TMEF_PRIVATE; 13291 needattr = B_TRUE; 13292 } 13293 if (connp->conn_anon_mlp) { 13294 mlp.tme_flags |= MIB2_TMEF_ANONMLP; 13295 needattr = B_TRUE; 13296 } 13297 switch (connp->conn_mac_mode) { 13298 case CONN_MAC_DEFAULT: 13299 break; 13300 case CONN_MAC_AWARE: 13301 mlp.tme_flags |= MIB2_TMEF_MACEXEMPT; 13302 needattr = B_TRUE; 13303 break; 13304 case CONN_MAC_IMPLICIT: 13305 mlp.tme_flags |= MIB2_TMEF_MACIMPLICIT; 13306 needattr = B_TRUE; 13307 break; 13308 } 13309 if (connp->conn_ixa->ixa_tsl != NULL) { 13310 ts_label_t *tsl; 13311 13312 tsl = connp->conn_ixa->ixa_tsl; 13313 mlp.tme_flags |= MIB2_TMEF_IS_LABELED; 13314 mlp.tme_doi = label2doi(tsl); 13315 mlp.tme_label = *label2bslabel(tsl); 13316 needattr = B_TRUE; 13317 } 13318 13319 /* Create a message to report on IPv6 entries */ 13320 if (connp->conn_ipversion == IPV6_VERSION) { 13321 tce6.tcp6ConnLocalAddress = connp->conn_laddr_v6; 13322 tce6.tcp6ConnRemAddress = connp->conn_faddr_v6; 13323 tce6.tcp6ConnLocalPort = ntohs(connp->conn_lport); 13324 tce6.tcp6ConnRemPort = ntohs(connp->conn_fport); 13325 if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) { 13326 tce6.tcp6ConnIfIndex = 13327 connp->conn_ixa->ixa_scopeid; 13328 } else { 13329 tce6.tcp6ConnIfIndex = connp->conn_bound_if; 13330 } 13331 /* Don't want just anybody seeing these... */ 13332 if (ispriv) { 13333 tce6.tcp6ConnEntryInfo.ce_snxt = 13334 tcp->tcp_snxt; 13335 tce6.tcp6ConnEntryInfo.ce_suna = 13336 tcp->tcp_suna; 13337 tce6.tcp6ConnEntryInfo.ce_rnxt = 13338 tcp->tcp_rnxt; 13339 tce6.tcp6ConnEntryInfo.ce_rack = 13340 tcp->tcp_rack; 13341 } else { 13342 /* 13343 * Netstat, unfortunately, uses this to 13344 * get send/receive queue sizes. How to fix? 13345 * Why not compute the difference only? 13346 */ 13347 tce6.tcp6ConnEntryInfo.ce_snxt = 13348 tcp->tcp_snxt - tcp->tcp_suna; 13349 tce6.tcp6ConnEntryInfo.ce_suna = 0; 13350 tce6.tcp6ConnEntryInfo.ce_rnxt = 13351 tcp->tcp_rnxt - tcp->tcp_rack; 13352 tce6.tcp6ConnEntryInfo.ce_rack = 0; 13353 } 13354 13355 tce6.tcp6ConnEntryInfo.ce_swnd = tcp->tcp_swnd; 13356 tce6.tcp6ConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; 13357 tce6.tcp6ConnEntryInfo.ce_rto = tcp->tcp_rto; 13358 tce6.tcp6ConnEntryInfo.ce_mss = tcp->tcp_mss; 13359 tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; 13360 13361 tce6.tcp6ConnCreationProcess = 13362 (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : 13363 connp->conn_cpid; 13364 tce6.tcp6ConnCreationTime = connp->conn_open_time; 13365 13366 (void) snmp_append_data2(mp6_conn_ctl->b_cont, 13367 &mp6_conn_tail, (char *)&tce6, sizeof (tce6)); 13368 13369 mlp.tme_connidx = v6_conn_idx++; 13370 if (needattr) 13371 (void) snmp_append_data2(mp6_attr_ctl->b_cont, 13372 &mp6_attr_tail, (char *)&mlp, sizeof (mlp)); 13373 } 13374 /* 13375 * Create an IPv4 table entry for IPv4 entries and also 13376 * for IPv6 entries which are bound to in6addr_any 13377 * but don't have IPV6_V6ONLY set. 13378 * (i.e. 
anything an IPv4 peer could connect to) 13379 */ 13380 if (connp->conn_ipversion == IPV4_VERSION || 13381 (tcp->tcp_state <= TCPS_LISTEN && 13382 !connp->conn_ipv6_v6only && 13383 IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6))) { 13384 if (connp->conn_ipversion == IPV6_VERSION) { 13385 tce.tcpConnRemAddress = INADDR_ANY; 13386 tce.tcpConnLocalAddress = INADDR_ANY; 13387 } else { 13388 tce.tcpConnRemAddress = 13389 connp->conn_faddr_v4; 13390 tce.tcpConnLocalAddress = 13391 connp->conn_laddr_v4; 13392 } 13393 tce.tcpConnLocalPort = ntohs(connp->conn_lport); 13394 tce.tcpConnRemPort = ntohs(connp->conn_fport); 13395 /* Don't want just anybody seeing these... */ 13396 if (ispriv) { 13397 tce.tcpConnEntryInfo.ce_snxt = 13398 tcp->tcp_snxt; 13399 tce.tcpConnEntryInfo.ce_suna = 13400 tcp->tcp_suna; 13401 tce.tcpConnEntryInfo.ce_rnxt = 13402 tcp->tcp_rnxt; 13403 tce.tcpConnEntryInfo.ce_rack = 13404 tcp->tcp_rack; 13405 } else { 13406 /* 13407 * Netstat, unfortunately, uses this to 13408 * get send/receive queue sizes. How 13409 * to fix? 13410 * Why not compute the difference only? 13411 */ 13412 tce.tcpConnEntryInfo.ce_snxt = 13413 tcp->tcp_snxt - tcp->tcp_suna; 13414 tce.tcpConnEntryInfo.ce_suna = 0; 13415 tce.tcpConnEntryInfo.ce_rnxt = 13416 tcp->tcp_rnxt - tcp->tcp_rack; 13417 tce.tcpConnEntryInfo.ce_rack = 0; 13418 } 13419 13420 tce.tcpConnEntryInfo.ce_swnd = tcp->tcp_swnd; 13421 tce.tcpConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; 13422 tce.tcpConnEntryInfo.ce_rto = tcp->tcp_rto; 13423 tce.tcpConnEntryInfo.ce_mss = tcp->tcp_mss; 13424 tce.tcpConnEntryInfo.ce_state = 13425 tcp->tcp_state; 13426 13427 tce.tcpConnCreationProcess = 13428 (connp->conn_cpid < 0) ? 13429 MIB2_UNKNOWN_PROCESS : 13430 connp->conn_cpid; 13431 tce.tcpConnCreationTime = connp->conn_open_time; 13432 13433 (void) snmp_append_data2(mp_conn_ctl->b_cont, 13434 &mp_conn_tail, (char *)&tce, sizeof (tce)); 13435 13436 mlp.tme_connidx = v4_conn_idx++; 13437 if (needattr) 13438 (void) snmp_append_data2( 13439 mp_attr_ctl->b_cont, 13440 &mp_attr_tail, (char *)&mlp, 13441 sizeof (mlp)); 13442 } 13443 } 13444 } 13445 13446 /* fixed length structure for IPv4 and IPv6 counters */ 13447 SET_MIB(tcps->tcps_mib.tcpConnTableSize, sizeof (mib2_tcpConnEntry_t)); 13448 SET_MIB(tcps->tcps_mib.tcp6ConnTableSize, 13449 sizeof (mib2_tcp6ConnEntry_t)); 13450 /* synchronize 32- and 64-bit counters */ 13451 SYNC32_MIB(&tcps->tcps_mib, tcpInSegs, tcpHCInSegs); 13452 SYNC32_MIB(&tcps->tcps_mib, tcpOutSegs, tcpHCOutSegs); 13453 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 13454 optp->level = MIB2_TCP; 13455 optp->name = 0; 13456 (void) snmp_append_data(mpdata, (char *)&tcps->tcps_mib, 13457 sizeof (tcps->tcps_mib)); 13458 optp->len = msgdsize(mpdata); 13459 qreply(q, mpctl); 13460 13461 /* table of connections... */ 13462 optp = (struct opthdr *)&mp_conn_ctl->b_rptr[ 13463 sizeof (struct T_optmgmt_ack)]; 13464 optp->level = MIB2_TCP; 13465 optp->name = MIB2_TCP_CONN; 13466 optp->len = msgdsize(mp_conn_ctl->b_cont); 13467 qreply(q, mp_conn_ctl); 13468 13469 /* table of MLP attributes... */ 13470 optp = (struct opthdr *)&mp_attr_ctl->b_rptr[ 13471 sizeof (struct T_optmgmt_ack)]; 13472 optp->level = MIB2_TCP; 13473 optp->name = EXPER_XPORT_MLP; 13474 optp->len = msgdsize(mp_attr_ctl->b_cont); 13475 if (optp->len == 0) 13476 freemsg(mp_attr_ctl); 13477 else 13478 qreply(q, mp_attr_ctl); 13479 13480 /* table of IPv6 connections... 
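 * (Same pattern as the IPv4 tables above: point optp at the opthdr
 * embedded in this copy of the control message, tag it with the
 * MIB2_TCP6 level, the table name and the accumulated data length,
 * then qreply() it upstream as its own reply.)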
*/ 13481 optp = (struct opthdr *)&mp6_conn_ctl->b_rptr[ 13482 sizeof (struct T_optmgmt_ack)]; 13483 optp->level = MIB2_TCP6; 13484 optp->name = MIB2_TCP6_CONN; 13485 optp->len = msgdsize(mp6_conn_ctl->b_cont); 13486 qreply(q, mp6_conn_ctl); 13487 13488 /* table of IPv6 MLP attributes... */ 13489 optp = (struct opthdr *)&mp6_attr_ctl->b_rptr[ 13490 sizeof (struct T_optmgmt_ack)]; 13491 optp->level = MIB2_TCP6; 13492 optp->name = EXPER_XPORT_MLP; 13493 optp->len = msgdsize(mp6_attr_ctl->b_cont); 13494 if (optp->len == 0) 13495 freemsg(mp6_attr_ctl); 13496 else 13497 qreply(q, mp6_attr_ctl); 13498 return (mp2ctl); 13499 } 13500 13501 /* Return 0 if invalid set request, 1 otherwise, including non-tcp requests */ 13502 /* ARGSUSED */ 13503 int 13504 tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) 13505 { 13506 mib2_tcpConnEntry_t *tce = (mib2_tcpConnEntry_t *)ptr; 13507 13508 switch (level) { 13509 case MIB2_TCP: 13510 switch (name) { 13511 case 13: 13512 if (tce->tcpConnState != MIB2_TCP_deleteTCB) 13513 return (0); 13514 /* TODO: delete entry defined by tce */ 13515 return (1); 13516 default: 13517 return (0); 13518 } 13519 default: 13520 return (1); 13521 } 13522 } 13523 13524 /* Translate TCP state to MIB2 TCP state. */ 13525 static int 13526 tcp_snmp_state(tcp_t *tcp) 13527 { 13528 if (tcp == NULL) 13529 return (0); 13530 13531 switch (tcp->tcp_state) { 13532 case TCPS_CLOSED: 13533 case TCPS_IDLE: /* RFC1213 doesn't have analogue for IDLE & BOUND */ 13534 case TCPS_BOUND: 13535 return (MIB2_TCP_closed); 13536 case TCPS_LISTEN: 13537 return (MIB2_TCP_listen); 13538 case TCPS_SYN_SENT: 13539 return (MIB2_TCP_synSent); 13540 case TCPS_SYN_RCVD: 13541 return (MIB2_TCP_synReceived); 13542 case TCPS_ESTABLISHED: 13543 return (MIB2_TCP_established); 13544 case TCPS_CLOSE_WAIT: 13545 return (MIB2_TCP_closeWait); 13546 case TCPS_FIN_WAIT_1: 13547 return (MIB2_TCP_finWait1); 13548 case TCPS_CLOSING: 13549 return (MIB2_TCP_closing); 13550 case TCPS_LAST_ACK: 13551 return (MIB2_TCP_lastAck); 13552 case TCPS_FIN_WAIT_2: 13553 return (MIB2_TCP_finWait2); 13554 case TCPS_TIME_WAIT: 13555 return (MIB2_TCP_timeWait); 13556 default: 13557 return (0); 13558 } 13559 } 13560 13561 /* 13562 * tcp_timer is the timer service routine. It handles the retransmission, 13563 * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out 13564 * from the state of the tcp instance what kind of action needs to be done 13565 * at the time it is called. 
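 * A rough worked example of the retransmission backoff computed near
 * the end of this routine: assuming a base RTO of about 400 ms
 * (roughly tcp_rtt_sa/8 + tcp_rtt_sd plus the configured extra) that
 * already exceeds tcps_rexmit_interval_min, the interval after the
 * first timeout doubles to about 800 ms, then 1.6 s, and so on until
 * it is clamped at tcps_rexmit_interval_max.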
13566 */ 13567 static void 13568 tcp_timer(void *arg) 13569 { 13570 mblk_t *mp; 13571 clock_t first_threshold; 13572 clock_t second_threshold; 13573 clock_t ms; 13574 uint32_t mss; 13575 conn_t *connp = (conn_t *)arg; 13576 tcp_t *tcp = connp->conn_tcp; 13577 tcp_stack_t *tcps = tcp->tcp_tcps; 13578 13579 tcp->tcp_timer_tid = 0; 13580 13581 if (tcp->tcp_fused) 13582 return; 13583 13584 first_threshold = tcp->tcp_first_timer_threshold; 13585 second_threshold = tcp->tcp_second_timer_threshold; 13586 switch (tcp->tcp_state) { 13587 case TCPS_IDLE: 13588 case TCPS_BOUND: 13589 case TCPS_LISTEN: 13590 return; 13591 case TCPS_SYN_RCVD: { 13592 tcp_t *listener = tcp->tcp_listener; 13593 13594 if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) { 13595 /* it's our first timeout */ 13596 tcp->tcp_syn_rcvd_timeout = 1; 13597 mutex_enter(&listener->tcp_eager_lock); 13598 listener->tcp_syn_rcvd_timeout++; 13599 if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) { 13600 /* 13601 * Make this eager available for drop if we 13602 * need to drop one to accomodate a new 13603 * incoming SYN request. 13604 */ 13605 MAKE_DROPPABLE(listener, tcp); 13606 } 13607 if (!listener->tcp_syn_defense && 13608 (listener->tcp_syn_rcvd_timeout > 13609 (tcps->tcps_conn_req_max_q0 >> 2)) && 13610 (tcps->tcps_conn_req_max_q0 > 200)) { 13611 /* We may be under attack. Put on a defense. */ 13612 listener->tcp_syn_defense = B_TRUE; 13613 cmn_err(CE_WARN, "High TCP connect timeout " 13614 "rate! System (port %d) may be under a " 13615 "SYN flood attack!", 13616 ntohs(listener->tcp_connp->conn_lport)); 13617 13618 listener->tcp_ip_addr_cache = kmem_zalloc( 13619 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t), 13620 KM_NOSLEEP); 13621 } 13622 mutex_exit(&listener->tcp_eager_lock); 13623 } else if (listener != NULL) { 13624 mutex_enter(&listener->tcp_eager_lock); 13625 tcp->tcp_syn_rcvd_timeout++; 13626 if (tcp->tcp_syn_rcvd_timeout > 1 && 13627 !tcp->tcp_closemp_used) { 13628 /* 13629 * This is our second timeout. Put the tcp in 13630 * the list of droppable eagers to allow it to 13631 * be dropped, if needed. We don't check 13632 * whether tcp_dontdrop is set or not to 13633 * protect ourselve from a SYN attack where a 13634 * remote host can spoof itself as one of the 13635 * good IP source and continue to hold 13636 * resources too long. 13637 */ 13638 MAKE_DROPPABLE(listener, tcp); 13639 } 13640 mutex_exit(&listener->tcp_eager_lock); 13641 } 13642 } 13643 /* FALLTHRU */ 13644 case TCPS_SYN_SENT: 13645 first_threshold = tcp->tcp_first_ctimer_threshold; 13646 second_threshold = tcp->tcp_second_ctimer_threshold; 13647 break; 13648 case TCPS_ESTABLISHED: 13649 case TCPS_FIN_WAIT_1: 13650 case TCPS_CLOSING: 13651 case TCPS_CLOSE_WAIT: 13652 case TCPS_LAST_ACK: 13653 /* If we have data to rexmit */ 13654 if (tcp->tcp_suna != tcp->tcp_snxt) { 13655 clock_t time_to_wait; 13656 13657 BUMP_MIB(&tcps->tcps_mib, tcpTimRetrans); 13658 if (!tcp->tcp_xmit_head) 13659 break; 13660 time_to_wait = ddi_get_lbolt() - 13661 (clock_t)tcp->tcp_xmit_head->b_prev; 13662 time_to_wait = tcp->tcp_rto - 13663 TICK_TO_MSEC(time_to_wait); 13664 /* 13665 * If the timer fires too early, 1 clock tick earlier, 13666 * restart the timer. 13667 */ 13668 if (time_to_wait > msec_per_tick) { 13669 TCP_STAT(tcps, tcp_timer_fire_early); 13670 TCP_TIMER_RESTART(tcp, time_to_wait); 13671 return; 13672 } 13673 /* 13674 * When we probe zero windows, we force the swnd open. 13675 * If our peer acks with a closed window swnd will be 13676 * set to zero by tcp_rput(). 
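 * (The forced-open window is a single byte: the swnd++ further down
 * bumps a zero window to 1, so each probe moves at most one byte of
 * new data.)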
As long as we are 13677 * receiving acks tcp_rput will 13678 * reset 'tcp_ms_we_have_waited' so as not to trip the 13679 * first and second interval actions. NOTE: the timer 13680 * interval is allowed to continue its exponential 13681 * backoff. 13682 */ 13683 if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { 13684 if (connp->conn_debug) { 13685 (void) strlog(TCP_MOD_ID, 0, 1, 13686 SL_TRACE, "tcp_timer: zero win"); 13687 } 13688 } else { 13689 /* 13690 * After retransmission, we need to do 13691 * slow start. Set the ssthresh to one 13692 * half of current effective window and 13693 * cwnd to one MSS. Also reset 13694 * tcp_cwnd_cnt. 13695 * 13696 * Note that if tcp_ssthresh is reduced because 13697 * of ECN, do not reduce it again unless it is 13698 * already one window of data away (tcp_cwr 13699 * should then be cleared) or this is a 13700 * timeout for a retransmitted segment. 13701 */ 13702 uint32_t npkt; 13703 13704 if (!tcp->tcp_cwr || tcp->tcp_rexmit) { 13705 npkt = ((tcp->tcp_timer_backoff ? 13706 tcp->tcp_cwnd_ssthresh : 13707 tcp->tcp_snxt - 13708 tcp->tcp_suna) >> 1) / tcp->tcp_mss; 13709 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * 13710 tcp->tcp_mss; 13711 } 13712 tcp->tcp_cwnd = tcp->tcp_mss; 13713 tcp->tcp_cwnd_cnt = 0; 13714 if (tcp->tcp_ecn_ok) { 13715 tcp->tcp_cwr = B_TRUE; 13716 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 13717 tcp->tcp_ecn_cwr_sent = B_FALSE; 13718 } 13719 } 13720 break; 13721 } 13722 /* 13723 * We have something to send yet we cannot send. The 13724 * reason can be: 13725 * 13726 * 1. Zero send window: we need to do zero window probe. 13727 * 2. Zero cwnd: because of ECN, we need to "clock out 13728 * segments. 13729 * 3. SWS avoidance: receiver may have shrunk window, 13730 * reset our knowledge. 13731 * 13732 * Note that condition 2 can happen with either 1 or 13733 * 3. But 1 and 3 are exclusive. 13734 */ 13735 if (tcp->tcp_unsent != 0) { 13736 /* 13737 * Should not hold the zero-copy messages for too long. 13738 */ 13739 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) 13740 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, 13741 tcp->tcp_xmit_head, B_TRUE); 13742 13743 if (tcp->tcp_cwnd == 0) { 13744 /* 13745 * Set tcp_cwnd to 1 MSS so that a 13746 * new segment can be sent out. We 13747 * are "clocking out" new data when 13748 * the network is really congested. 13749 */ 13750 ASSERT(tcp->tcp_ecn_ok); 13751 tcp->tcp_cwnd = tcp->tcp_mss; 13752 } 13753 if (tcp->tcp_swnd == 0) { 13754 /* Extend window for zero window probe */ 13755 tcp->tcp_swnd++; 13756 tcp->tcp_zero_win_probe = B_TRUE; 13757 BUMP_MIB(&tcps->tcps_mib, tcpOutWinProbe); 13758 } else { 13759 /* 13760 * Handle timeout from sender SWS avoidance. 13761 * Reset our knowledge of the max send window 13762 * since the receiver might have reduced its 13763 * receive buffer. Avoid setting tcp_max_swnd 13764 * to one since that will essentially disable 13765 * the SWS checks. 13766 * 13767 * Note that since we don't have a SWS 13768 * state variable, if the timeout is set 13769 * for ECN but not for SWS, this 13770 * code will also be executed. This is 13771 * fine as tcp_max_swnd is updated 13772 * constantly and it will not affect 13773 * anything. 13774 */ 13775 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2); 13776 } 13777 tcp_wput_data(tcp, NULL, B_FALSE); 13778 return; 13779 } 13780 /* Is there a FIN that needs to be to re retransmitted? */ 13781 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 13782 !tcp->tcp_fin_acked) 13783 break; 13784 /* Nothing to do, return without restarting timer. 
*/ 13785 TCP_STAT(tcps, tcp_timer_fire_miss); 13786 return; 13787 case TCPS_FIN_WAIT_2: 13788 /* 13789 * User closed the TCP endpoint and peer ACK'ed our FIN. 13790 * We waited some time for for peer's FIN, but it hasn't 13791 * arrived. We flush the connection now to avoid 13792 * case where the peer has rebooted. 13793 */ 13794 if (TCP_IS_DETACHED(tcp)) { 13795 (void) tcp_clean_death(tcp, 0, 23); 13796 } else { 13797 TCP_TIMER_RESTART(tcp, 13798 tcps->tcps_fin_wait_2_flush_interval); 13799 } 13800 return; 13801 case TCPS_TIME_WAIT: 13802 (void) tcp_clean_death(tcp, 0, 24); 13803 return; 13804 default: 13805 if (connp->conn_debug) { 13806 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 13807 "tcp_timer: strange state (%d) %s", 13808 tcp->tcp_state, tcp_display(tcp, NULL, 13809 DISP_PORT_ONLY)); 13810 } 13811 return; 13812 } 13813 13814 /* 13815 * If the system is under memory pressure or the max number of 13816 * connections have been established for the listener, be more 13817 * aggressive in aborting connections. 13818 */ 13819 if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL && 13820 tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) { 13821 second_threshold = tcp_early_abort * SECONDS; 13822 } 13823 13824 if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { 13825 /* 13826 * Should not hold the zero-copy messages for too long. 13827 */ 13828 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) 13829 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, 13830 tcp->tcp_xmit_head, B_TRUE); 13831 13832 /* 13833 * For zero window probe, we need to send indefinitely, 13834 * unless we have not heard from the other side for some 13835 * time... 13836 */ 13837 if ((tcp->tcp_zero_win_probe == 0) || 13838 (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) > 13839 second_threshold)) { 13840 BUMP_MIB(&tcps->tcps_mib, tcpTimRetransDrop); 13841 /* 13842 * If TCP is in SYN_RCVD state, send back a 13843 * RST|ACK as BSD does. Note that tcp_zero_win_probe 13844 * should be zero in TCPS_SYN_RCVD state. 13845 */ 13846 if (tcp->tcp_state == TCPS_SYN_RCVD) { 13847 tcp_xmit_ctl("tcp_timer: RST sent on timeout " 13848 "in SYN_RCVD", 13849 tcp, tcp->tcp_snxt, 13850 tcp->tcp_rnxt, TH_RST | TH_ACK); 13851 } 13852 (void) tcp_clean_death(tcp, 13853 tcp->tcp_client_errno ? 13854 tcp->tcp_client_errno : ETIMEDOUT, 25); 13855 return; 13856 } else { 13857 /* 13858 * If the system is under memory pressure, we also 13859 * abort connection in zero window probing. 13860 */ 13861 if (tcps->tcps_reclaim) { 13862 (void) tcp_clean_death(tcp, 13863 tcp->tcp_client_errno ? 13864 tcp->tcp_client_errno : ETIMEDOUT, 25); 13865 return; 13866 } 13867 /* 13868 * Set tcp_ms_we_have_waited to second_threshold 13869 * so that in next timeout, we will do the above 13870 * check (ddi_get_lbolt() - tcp_last_recv_time). 13871 * This is also to avoid overflow. 13872 * 13873 * We don't need to decrement tcp_timer_backoff 13874 * to avoid overflow because it will be decremented 13875 * later if new timeout value is greater than 13876 * tcp_rexmit_interval_max. In the case when 13877 * tcp_rexmit_interval_max is greater than 13878 * second_threshold, it means that we will wait 13879 * longer than second_threshold to send the next 13880 * window probe. 13881 */ 13882 tcp->tcp_ms_we_have_waited = second_threshold; 13883 } 13884 } else if (ms > first_threshold) { 13885 /* 13886 * Should not hold the zero-copy messages for too long. 
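 * (As in the earlier timeout branches, tcp_zcopy_backoff() replaces
 * any zero-copy mblks on the transmit list with plain copies,
 * presumably so that buffers loaned by the application do not stay
 * pinned while we sit through a long retransmission backoff.)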
13887 */ 13888 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) 13889 tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, 13890 tcp->tcp_xmit_head, B_TRUE); 13891 13892 /* 13893 * We have been retransmitting for too long... The RTT 13894 * we calculated is probably incorrect. Reinitialize it. 13895 * Need to compensate for 0 tcp_rtt_sa. Reset 13896 * tcp_rtt_update so that we won't accidentally cache a 13897 * bad value. But only do this if this is not a zero 13898 * window probe. 13899 */ 13900 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { 13901 tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + 13902 (tcp->tcp_rtt_sa >> 5); 13903 tcp->tcp_rtt_sa = 0; 13904 tcp_ip_notify(tcp); 13905 tcp->tcp_rtt_update = 0; 13906 } 13907 } 13908 tcp->tcp_timer_backoff++; 13909 if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 13910 tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < 13911 tcps->tcps_rexmit_interval_min) { 13912 /* 13913 * This means the original RTO is tcp_rexmit_interval_min. 13914 * So we will use tcp_rexmit_interval_min as the RTO value 13915 * and do the backoff. 13916 */ 13917 ms = tcps->tcps_rexmit_interval_min << tcp->tcp_timer_backoff; 13918 } else { 13919 ms <<= tcp->tcp_timer_backoff; 13920 } 13921 if (ms > tcps->tcps_rexmit_interval_max) { 13922 ms = tcps->tcps_rexmit_interval_max; 13923 /* 13924 * ms is at max, decrement tcp_timer_backoff to avoid 13925 * overflow. 13926 */ 13927 tcp->tcp_timer_backoff--; 13928 } 13929 tcp->tcp_ms_we_have_waited += ms; 13930 if (tcp->tcp_zero_win_probe == 0) { 13931 tcp->tcp_rto = ms; 13932 } 13933 TCP_TIMER_RESTART(tcp, ms); 13934 /* 13935 * This is after a timeout and tcp_rto is backed off. Set 13936 * tcp_set_timer to 1 so that next time RTO is updated, we will 13937 * restart the timer with a correct value. 13938 */ 13939 tcp->tcp_set_timer = 1; 13940 mss = tcp->tcp_snxt - tcp->tcp_suna; 13941 if (mss > tcp->tcp_mss) 13942 mss = tcp->tcp_mss; 13943 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) 13944 mss = tcp->tcp_swnd; 13945 13946 if ((mp = tcp->tcp_xmit_head) != NULL) 13947 mp->b_prev = (mblk_t *)ddi_get_lbolt(); 13948 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, 13949 B_TRUE); 13950 13951 /* 13952 * When slow start after retransmission begins, start with 13953 * this seq no. tcp_rexmit_max marks the end of special slow 13954 * start phase. tcp_snd_burst controls how many segments 13955 * can be sent because of an ack. 13956 */ 13957 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 13958 tcp->tcp_snd_burst = TCP_CWND_SS; 13959 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 13960 (tcp->tcp_unsent == 0)) { 13961 tcp->tcp_rexmit_max = tcp->tcp_fss; 13962 } else { 13963 tcp->tcp_rexmit_max = tcp->tcp_snxt; 13964 } 13965 tcp->tcp_rexmit = B_TRUE; 13966 tcp->tcp_dupack_cnt = 0; 13967 13968 /* 13969 * Remove all rexmit SACK blk to start from fresh. 
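 * (The retransmission that follows restarts from tcp_suna with
 * tcp_rexmit_nxt and tcp_rexmit_max freshly set above, so any
 * notsack bookkeeping recorded before the timeout no longer
 * describes what is actually outstanding.)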
13970 */ 13971 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) 13972 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); 13973 if (mp == NULL) { 13974 return; 13975 } 13976 13977 tcp->tcp_csuna = tcp->tcp_snxt; 13978 BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); 13979 UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, mss); 13980 tcp_send_data(tcp, mp); 13981 13982 } 13983 13984 static int 13985 tcp_do_unbind(conn_t *connp) 13986 { 13987 tcp_t *tcp = connp->conn_tcp; 13988 13989 switch (tcp->tcp_state) { 13990 case TCPS_BOUND: 13991 case TCPS_LISTEN: 13992 break; 13993 default: 13994 return (-TOUTSTATE); 13995 } 13996 13997 /* 13998 * Need to clean up all the eagers since after the unbind, segments 13999 * will no longer be delivered to this listener stream. 14000 */ 14001 mutex_enter(&tcp->tcp_eager_lock); 14002 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 14003 tcp_eager_cleanup(tcp, 0); 14004 } 14005 mutex_exit(&tcp->tcp_eager_lock); 14006 14007 /* Clean up the listener connection counter if necessary. */ 14008 if (tcp->tcp_listen_cnt != NULL) 14009 TCP_DECR_LISTEN_CNT(tcp); 14010 connp->conn_laddr_v6 = ipv6_all_zeros; 14011 connp->conn_saddr_v6 = ipv6_all_zeros; 14012 tcp_bind_hash_remove(tcp); 14013 tcp->tcp_state = TCPS_IDLE; 14014 14015 ip_unbind(connp); 14016 bzero(&connp->conn_ports, sizeof (connp->conn_ports)); 14017 14018 return (0); 14019 } 14020 14021 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ 14022 static void 14023 tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) 14024 { 14025 conn_t *connp = tcp->tcp_connp; 14026 int error; 14027 14028 error = tcp_do_unbind(connp); 14029 if (error > 0) { 14030 tcp_err_ack(tcp, mp, TSYSERR, error); 14031 } else if (error < 0) { 14032 tcp_err_ack(tcp, mp, -error, 0); 14033 } else { 14034 /* Send M_FLUSH according to TPI */ 14035 (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); 14036 14037 mp = mi_tpi_ok_ack_alloc(mp); 14038 if (mp != NULL) 14039 putnext(connp->conn_rq, mp); 14040 } 14041 } 14042 14043 /* 14044 * Don't let port fall into the privileged range. 14045 * Since the extra privileged ports can be arbitrary we also 14046 * ensure that we exclude those from consideration. 14047 * tcp_g_epriv_ports is not sorted thus we loop over it until 14048 * there are no changes. 14049 * 14050 * Note: No locks are held when inspecting tcp_g_*epriv_ports 14051 * but instead the code relies on: 14052 * - the fact that the address of the array and its size never changes 14053 * - the atomic assignment of the elements of the array 14054 * 14055 * Returns 0 if there are no more ports available. 14056 * 14057 * TS note: skip multilevel ports. 14058 */ 14059 static in_port_t 14060 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) 14061 { 14062 int i; 14063 boolean_t restart = B_FALSE; 14064 tcp_stack_t *tcps = tcp->tcp_tcps; 14065 14066 if (random && tcp_random_anon_port != 0) { 14067 (void) random_get_pseudo_bytes((uint8_t *)&port, 14068 sizeof (in_port_t)); 14069 /* 14070 * Unless changed by a sys admin, the smallest anon port 14071 * is 32768 and the largest anon port is 65535. It is 14072 * very likely (50%) for the random port to be smaller 14073 * than the smallest anon port. When that happens, 14074 * add port % (anon port range) to the smallest anon 14075 * port to get the random port. It should fall into the 14076 * valid anon port range. 
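 * For example, assuming the default range of [32768, 65535]: if the
 * random 16-bit value comes back as 1000, the mapping below yields
 * 32768 + (1000 % 32767) = 33768, which is inside the anon range.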
14077 */ 14078 if (port < tcps->tcps_smallest_anon_port) { 14079 port = tcps->tcps_smallest_anon_port + 14080 port % (tcps->tcps_largest_anon_port - 14081 tcps->tcps_smallest_anon_port); 14082 } 14083 } 14084 14085 retry: 14086 if (port < tcps->tcps_smallest_anon_port) 14087 port = (in_port_t)tcps->tcps_smallest_anon_port; 14088 14089 if (port > tcps->tcps_largest_anon_port) { 14090 if (restart) 14091 return (0); 14092 restart = B_TRUE; 14093 port = (in_port_t)tcps->tcps_smallest_anon_port; 14094 } 14095 14096 if (port < tcps->tcps_smallest_nonpriv_port) 14097 port = (in_port_t)tcps->tcps_smallest_nonpriv_port; 14098 14099 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 14100 if (port == tcps->tcps_g_epriv_ports[i]) { 14101 port++; 14102 /* 14103 * Make sure whether the port is in the 14104 * valid range. 14105 */ 14106 goto retry; 14107 } 14108 } 14109 if (is_system_labeled() && 14110 (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port, 14111 IPPROTO_TCP, B_TRUE)) != 0) { 14112 port = i; 14113 goto retry; 14114 } 14115 return (port); 14116 } 14117 14118 /* 14119 * Return the next anonymous port in the privileged port range for 14120 * bind checking. It starts at IPPORT_RESERVED - 1 and goes 14121 * downwards. This is the same behavior as documented in the userland 14122 * library call rresvport(3N). 14123 * 14124 * TS note: skip multilevel ports. 14125 */ 14126 static in_port_t 14127 tcp_get_next_priv_port(const tcp_t *tcp) 14128 { 14129 static in_port_t next_priv_port = IPPORT_RESERVED - 1; 14130 in_port_t nextport; 14131 boolean_t restart = B_FALSE; 14132 tcp_stack_t *tcps = tcp->tcp_tcps; 14133 retry: 14134 if (next_priv_port < tcps->tcps_min_anonpriv_port || 14135 next_priv_port >= IPPORT_RESERVED) { 14136 next_priv_port = IPPORT_RESERVED - 1; 14137 if (restart) 14138 return (0); 14139 restart = B_TRUE; 14140 } 14141 if (is_system_labeled() && 14142 (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), 14143 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { 14144 next_priv_port = nextport; 14145 goto retry; 14146 } 14147 return (next_priv_port--); 14148 } 14149 14150 /* The write side r/w procedure. */ 14151 14152 #if CCS_STATS 14153 struct { 14154 struct { 14155 int64_t count, bytes; 14156 } tot, hit; 14157 } wrw_stats; 14158 #endif 14159 14160 /* 14161 * Call by tcp_wput() to handle all non data, except M_PROTO and M_PCPROTO, 14162 * messages. 14163 */ 14164 /* ARGSUSED */ 14165 static void 14166 tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 14167 { 14168 conn_t *connp = (conn_t *)arg; 14169 tcp_t *tcp = connp->conn_tcp; 14170 14171 ASSERT(DB_TYPE(mp) != M_IOCTL); 14172 /* 14173 * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close. 14174 * Once the close starts, streamhead and sockfs will not let any data 14175 * packets come down (close ensures that there are no threads using the 14176 * queue and no new threads will come down) but since qprocsoff() 14177 * hasn't happened yet, a M_FLUSH or some non data message might 14178 * get reflected back (in response to our own FLUSHRW) and get 14179 * processed after tcp_close() is done. The conn would still be valid 14180 * because a ref would have added but we need to check the state 14181 * before actually processing the packet. 
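 * (Hence the TCP_IS_DETACHED / TCPS_CLOSED check right below: a
 * message that arrives after the close has effectively completed is
 * simply freed rather than processed.)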
14182 */ 14183 if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) { 14184 freemsg(mp); 14185 return; 14186 } 14187 14188 switch (DB_TYPE(mp)) { 14189 case M_IOCDATA: 14190 tcp_wput_iocdata(tcp, mp); 14191 break; 14192 case M_FLUSH: 14193 tcp_wput_flush(tcp, mp); 14194 break; 14195 default: 14196 ip_wput_nondata(connp->conn_wq, mp); 14197 break; 14198 } 14199 } 14200 14201 /* 14202 * The TCP fast path write put procedure. 14203 * NOTE: the logic of the fast path is duplicated from tcp_wput_data() 14204 */ 14205 /* ARGSUSED */ 14206 void 14207 tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 14208 { 14209 int len; 14210 int hdrlen; 14211 int plen; 14212 mblk_t *mp1; 14213 uchar_t *rptr; 14214 uint32_t snxt; 14215 tcpha_t *tcpha; 14216 struct datab *db; 14217 uint32_t suna; 14218 uint32_t mss; 14219 ipaddr_t *dst; 14220 ipaddr_t *src; 14221 uint32_t sum; 14222 int usable; 14223 conn_t *connp = (conn_t *)arg; 14224 tcp_t *tcp = connp->conn_tcp; 14225 uint32_t msize; 14226 tcp_stack_t *tcps = tcp->tcp_tcps; 14227 ip_xmit_attr_t *ixa; 14228 clock_t now; 14229 14230 /* 14231 * Try and ASSERT the minimum possible references on the 14232 * conn early enough. Since we are executing on write side, 14233 * the connection is obviously not detached and that means 14234 * there is a ref each for TCP and IP. Since we are behind 14235 * the squeue, the minimum references needed are 3. If the 14236 * conn is in classifier hash list, there should be an 14237 * extra ref for that (we check both the possibilities). 14238 */ 14239 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 14240 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 14241 14242 ASSERT(DB_TYPE(mp) == M_DATA); 14243 msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 14244 14245 mutex_enter(&tcp->tcp_non_sq_lock); 14246 tcp->tcp_squeue_bytes -= msize; 14247 mutex_exit(&tcp->tcp_non_sq_lock); 14248 14249 /* Bypass tcp protocol for fused tcp loopback */ 14250 if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 14251 return; 14252 14253 mss = tcp->tcp_mss; 14254 /* 14255 * If ZEROCOPY has turned off, try not to send any zero-copy message 14256 * down. Do backoff, now. 14257 */ 14258 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on) 14259 mp = tcp_zcopy_backoff(tcp, mp, B_FALSE); 14260 14261 14262 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 14263 len = (int)(mp->b_wptr - mp->b_rptr); 14264 14265 /* 14266 * Criteria for fast path: 14267 * 14268 * 1. no unsent data 14269 * 2. single mblk in request 14270 * 3. connection established 14271 * 4. data in mblk 14272 * 5. len <= mss 14273 * 6. 
no tcp_valid bits 14274 */ 14275 if ((tcp->tcp_unsent != 0) || 14276 (tcp->tcp_cork) || 14277 (mp->b_cont != NULL) || 14278 (tcp->tcp_state != TCPS_ESTABLISHED) || 14279 (len == 0) || 14280 (len > mss) || 14281 (tcp->tcp_valid_bits != 0)) { 14282 tcp_wput_data(tcp, mp, B_FALSE); 14283 return; 14284 } 14285 14286 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 14287 ASSERT(tcp->tcp_fin_sent == 0); 14288 14289 /* queue new packet onto retransmission queue */ 14290 if (tcp->tcp_xmit_head == NULL) { 14291 tcp->tcp_xmit_head = mp; 14292 } else { 14293 tcp->tcp_xmit_last->b_cont = mp; 14294 } 14295 tcp->tcp_xmit_last = mp; 14296 tcp->tcp_xmit_tail = mp; 14297 14298 /* find out how much we can send */ 14299 /* BEGIN CSTYLED */ 14300 /* 14301 * un-acked usable 14302 * |--------------|-----------------| 14303 * tcp_suna tcp_snxt tcp_suna+tcp_swnd 14304 */ 14305 /* END CSTYLED */ 14306 14307 /* start sending from tcp_snxt */ 14308 snxt = tcp->tcp_snxt; 14309 14310 /* 14311 * Check to see if this connection has been idled for some 14312 * time and no ACK is expected. If it is, we need to slow 14313 * start again to get back the connection's "self-clock" as 14314 * described in VJ's paper. 14315 * 14316 * Reinitialize tcp_cwnd after idle. 14317 */ 14318 now = LBOLT_FASTPATH; 14319 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 14320 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 14321 SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); 14322 } 14323 14324 usable = tcp->tcp_swnd; /* tcp window size */ 14325 if (usable > tcp->tcp_cwnd) 14326 usable = tcp->tcp_cwnd; /* congestion window smaller */ 14327 usable -= snxt; /* subtract stuff already sent */ 14328 suna = tcp->tcp_suna; 14329 usable += suna; 14330 /* usable can be < 0 if the congestion window is smaller */ 14331 if (len > usable) { 14332 /* Can't send complete M_DATA in one shot */ 14333 goto slow; 14334 } 14335 14336 mutex_enter(&tcp->tcp_non_sq_lock); 14337 if (tcp->tcp_flow_stopped && 14338 TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { 14339 tcp_clrqfull(tcp); 14340 } 14341 mutex_exit(&tcp->tcp_non_sq_lock); 14342 14343 /* 14344 * determine if anything to send (Nagle). 14345 * 14346 * 1. len < tcp_mss (i.e. small) 14347 * 2. unacknowledged data present 14348 * 3. len < nagle limit 14349 * 4. last packet sent < nagle limit (previous packet sent) 14350 */ 14351 if ((len < mss) && (snxt != suna) && 14352 (len < (int)tcp->tcp_naglim) && 14353 (tcp->tcp_last_sent_len < tcp->tcp_naglim)) { 14354 /* 14355 * This was the first unsent packet and normally 14356 * mss < xmit_hiwater so there is no need to worry 14357 * about flow control. The next packet will go 14358 * through the flow control check in tcp_wput_data(). 
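 * To illustrate the Nagle test above: a 100 byte write issued while
 * earlier data is still unacknowledged (snxt != suna) and while the
 * previously sent segment was also small is parked here as unsent,
 * whereas a full-MSS write, or one issued with nothing outstanding,
 * falls through and is transmitted immediately.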
14359 */ 14360 /* leftover work from above */ 14361 tcp->tcp_unsent = len; 14362 tcp->tcp_xmit_tail_unsent = len; 14363 14364 return; 14365 } 14366 14367 /* len <= tcp->tcp_mss && len == unsent so no silly window */ 14368 14369 if (snxt == suna) { 14370 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 14371 } 14372 14373 /* we have always sent something */ 14374 tcp->tcp_rack_cnt = 0; 14375 14376 tcp->tcp_snxt = snxt + len; 14377 tcp->tcp_rack = tcp->tcp_rnxt; 14378 14379 if ((mp1 = dupb(mp)) == 0) 14380 goto no_memory; 14381 mp->b_prev = (mblk_t *)(uintptr_t)now; 14382 mp->b_next = (mblk_t *)(uintptr_t)snxt; 14383 14384 /* adjust tcp header information */ 14385 tcpha = tcp->tcp_tcpha; 14386 tcpha->tha_flags = (TH_ACK|TH_PUSH); 14387 14388 sum = len + connp->conn_ht_ulp_len + connp->conn_sum; 14389 sum = (sum >> 16) + (sum & 0xFFFF); 14390 tcpha->tha_sum = htons(sum); 14391 14392 tcpha->tha_seq = htonl(snxt); 14393 14394 BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); 14395 UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); 14396 BUMP_LOCAL(tcp->tcp_obsegs); 14397 14398 /* Update the latest receive window size in TCP header. */ 14399 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 14400 14401 tcp->tcp_last_sent_len = (ushort_t)len; 14402 14403 plen = len + connp->conn_ht_iphc_len; 14404 14405 ixa = connp->conn_ixa; 14406 ixa->ixa_pktlen = plen; 14407 14408 if (ixa->ixa_flags & IXAF_IS_IPV4) { 14409 tcp->tcp_ipha->ipha_length = htons(plen); 14410 } else { 14411 tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN); 14412 } 14413 14414 /* see if we need to allocate a mblk for the headers */ 14415 hdrlen = connp->conn_ht_iphc_len; 14416 rptr = mp1->b_rptr - hdrlen; 14417 db = mp1->b_datap; 14418 if ((db->db_ref != 2) || rptr < db->db_base || 14419 (!OK_32PTR(rptr))) { 14420 /* NOTE: we assume allocb returns an OK_32PTR */ 14421 mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED); 14422 if (!mp) { 14423 freemsg(mp1); 14424 goto no_memory; 14425 } 14426 mp->b_cont = mp1; 14427 mp1 = mp; 14428 /* Leave room for Link Level header */ 14429 rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra]; 14430 mp1->b_wptr = &rptr[hdrlen]; 14431 } 14432 mp1->b_rptr = rptr; 14433 14434 /* Fill in the timestamp option. */ 14435 if (tcp->tcp_snd_ts_ok) { 14436 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 14437 14438 U32_TO_BE32(llbolt, 14439 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 14440 U32_TO_BE32(tcp->tcp_ts_recent, 14441 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 14442 } else { 14443 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); 14444 } 14445 14446 /* copy header into outgoing packet */ 14447 dst = (ipaddr_t *)rptr; 14448 src = (ipaddr_t *)connp->conn_ht_iphc; 14449 dst[0] = src[0]; 14450 dst[1] = src[1]; 14451 dst[2] = src[2]; 14452 dst[3] = src[3]; 14453 dst[4] = src[4]; 14454 dst[5] = src[5]; 14455 dst[6] = src[6]; 14456 dst[7] = src[7]; 14457 dst[8] = src[8]; 14458 dst[9] = src[9]; 14459 if (hdrlen -= 40) { 14460 hdrlen >>= 2; 14461 dst += 10; 14462 src += 10; 14463 do { 14464 *dst++ = *src++; 14465 } while (--hdrlen); 14466 } 14467 14468 /* 14469 * Set the ECN info in the TCP header. Note that this 14470 * is not the template header. 
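 * For an ECN-capable connection we mark the IP header of this data
 * segment as ECN-capable (ECT), echo any pending congestion signal to
 * the peer with ECE, and if we are in a congestion-window-reduced
 * period advertise that once with CWR (tracked by tcp_ecn_cwr_sent).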
14471 */ 14472 if (tcp->tcp_ecn_ok) { 14473 SET_ECT(tcp, rptr); 14474 14475 tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length); 14476 if (tcp->tcp_ecn_echo_on) 14477 tcpha->tha_flags |= TH_ECE; 14478 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 14479 tcpha->tha_flags |= TH_CWR; 14480 tcp->tcp_ecn_cwr_sent = B_TRUE; 14481 } 14482 } 14483 14484 if (tcp->tcp_ip_forward_progress) { 14485 tcp->tcp_ip_forward_progress = B_FALSE; 14486 connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; 14487 } else { 14488 connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; 14489 } 14490 tcp_send_data(tcp, mp1); 14491 return; 14492 14493 /* 14494 * If we ran out of memory, we pretend to have sent the packet 14495 * and that it was lost on the wire. 14496 */ 14497 no_memory: 14498 return; 14499 14500 slow: 14501 /* leftover work from above */ 14502 tcp->tcp_unsent = len; 14503 tcp->tcp_xmit_tail_unsent = len; 14504 tcp_wput_data(tcp, NULL, B_FALSE); 14505 } 14506 14507 /* 14508 * This runs at the tail end of accept processing on the squeue of the 14509 * new connection. 14510 */ 14511 /* ARGSUSED */ 14512 void 14513 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 14514 { 14515 conn_t *connp = (conn_t *)arg; 14516 tcp_t *tcp = connp->conn_tcp; 14517 queue_t *q = connp->conn_rq; 14518 tcp_stack_t *tcps = tcp->tcp_tcps; 14519 /* socket options */ 14520 struct sock_proto_props sopp; 14521 14522 /* We should just receive a single mblk that fits a T_discon_ind */ 14523 ASSERT(mp->b_cont == NULL); 14524 14525 /* 14526 * Drop the eager's ref on the listener, that was placed when 14527 * this eager began life in tcp_input_listener. 14528 */ 14529 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 14530 if (IPCL_IS_NONSTR(connp)) { 14531 /* Safe to free conn_ind message */ 14532 freemsg(tcp->tcp_conn.tcp_eager_conn_ind); 14533 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 14534 } 14535 14536 tcp->tcp_detached = B_FALSE; 14537 14538 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { 14539 /* 14540 * Someone blewoff the eager before we could finish 14541 * the accept. 14542 * 14543 * The only reason eager exists it because we put in 14544 * a ref on it when conn ind went up. We need to send 14545 * a disconnect indication up while the last reference 14546 * on the eager will be dropped by the squeue when we 14547 * return. 14548 */ 14549 ASSERT(tcp->tcp_listener == NULL); 14550 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { 14551 if (IPCL_IS_NONSTR(connp)) { 14552 ASSERT(tcp->tcp_issocket); 14553 (*connp->conn_upcalls->su_disconnected)( 14554 connp->conn_upper_handle, tcp->tcp_connid, 14555 ECONNREFUSED); 14556 freemsg(mp); 14557 } else { 14558 struct T_discon_ind *tdi; 14559 14560 (void) putnextctl1(q, M_FLUSH, FLUSHRW); 14561 /* 14562 * Let us reuse the incoming mblk to avoid 14563 * memory allocation failure problems. We know 14564 * that the size of the incoming mblk i.e. 14565 * stroptions is greater than sizeof 14566 * T_discon_ind. 
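 * (This mblk is the one pre-allocated in tcp_accept_common() with room
 * for the larger of T_discon_ind and stroptions, so either reuse is
 * safe.)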
14567 */ 14568 ASSERT(DB_REF(mp) == 1); 14569 ASSERT(MBLKSIZE(mp) >= 14570 sizeof (struct T_discon_ind)); 14571 14572 DB_TYPE(mp) = M_PROTO; 14573 ((union T_primitives *)mp->b_rptr)->type = 14574 T_DISCON_IND; 14575 tdi = (struct T_discon_ind *)mp->b_rptr; 14576 if (tcp->tcp_issocket) { 14577 tdi->DISCON_reason = ECONNREFUSED; 14578 tdi->SEQ_number = 0; 14579 } else { 14580 tdi->DISCON_reason = ENOPROTOOPT; 14581 tdi->SEQ_number = 14582 tcp->tcp_conn_req_seqnum; 14583 } 14584 mp->b_wptr = mp->b_rptr + 14585 sizeof (struct T_discon_ind); 14586 putnext(q, mp); 14587 } 14588 } 14589 tcp->tcp_hard_binding = B_FALSE; 14590 return; 14591 } 14592 14593 /* 14594 * This is the first time we run on the correct 14595 * queue after tcp_accept. So fix all the q parameters 14596 * here. 14597 */ 14598 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; 14599 sopp.sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 14600 14601 sopp.sopp_rxhiwat = tcp->tcp_fused ? 14602 tcp_fuse_set_rcv_hiwat(tcp, connp->conn_rcvbuf) : 14603 connp->conn_rcvbuf; 14604 14605 /* 14606 * Determine what write offset value to use depending on SACK and 14607 * whether the endpoint is fused or not. 14608 */ 14609 if (tcp->tcp_fused) { 14610 ASSERT(tcp->tcp_loopback); 14611 ASSERT(tcp->tcp_loopback_peer != NULL); 14612 /* 14613 * For fused tcp loopback, set the stream head's write 14614 * offset value to zero since we won't be needing any room 14615 * for TCP/IP headers. This would also improve performance 14616 * since it would reduce the amount of work done by kmem. 14617 * Non-fused tcp loopback case is handled separately below. 14618 */ 14619 sopp.sopp_wroff = 0; 14620 /* 14621 * Update the peer's transmit parameters according to 14622 * our recently calculated high water mark value. 14623 */ 14624 (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); 14625 } else if (tcp->tcp_snd_sack_ok) { 14626 sopp.sopp_wroff = connp->conn_ht_iphc_allocated + 14627 (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); 14628 } else { 14629 sopp.sopp_wroff = connp->conn_ht_iphc_len + 14630 (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); 14631 } 14632 14633 /* 14634 * If this is endpoint is handling SSL, then reserve extra 14635 * offset and space at the end. 14636 * Also have the stream head allocate SSL3_MAX_RECORD_LEN packets, 14637 * overriding the previous setting. The extra cost of signing and 14638 * encrypting multiple MSS-size records (12 of them with Ethernet), 14639 * instead of a single contiguous one by the stream head 14640 * largely outweighs the statistical reduction of ACKs, when 14641 * applicable. The peer will also save on decryption and verification 14642 * costs. 
14643 */ 14644 if (tcp->tcp_kssl_ctx != NULL) { 14645 sopp.sopp_wroff += SSL3_WROFFSET; 14646 14647 sopp.sopp_flags |= SOCKOPT_TAIL; 14648 sopp.sopp_tail = SSL3_MAX_TAIL_LEN; 14649 14650 sopp.sopp_flags |= SOCKOPT_ZCOPY; 14651 sopp.sopp_zcopyflag = ZCVMUNSAFE; 14652 14653 sopp.sopp_maxblk = SSL3_MAX_RECORD_LEN; 14654 } 14655 14656 /* Send the options up */ 14657 if (IPCL_IS_NONSTR(connp)) { 14658 if (sopp.sopp_flags & SOCKOPT_TAIL) { 14659 ASSERT(tcp->tcp_kssl_ctx != NULL); 14660 ASSERT(sopp.sopp_flags & SOCKOPT_ZCOPY); 14661 } 14662 if (tcp->tcp_loopback) { 14663 sopp.sopp_flags |= SOCKOPT_LOOPBACK; 14664 sopp.sopp_loopback = B_TRUE; 14665 } 14666 (*connp->conn_upcalls->su_set_proto_props) 14667 (connp->conn_upper_handle, &sopp); 14668 freemsg(mp); 14669 } else { 14670 /* 14671 * Let us reuse the incoming mblk to avoid 14672 * memory allocation failure problems. We know 14673 * that the size of the incoming mblk is at least 14674 * stroptions 14675 */ 14676 struct stroptions *stropt; 14677 14678 ASSERT(DB_REF(mp) == 1); 14679 ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions)); 14680 14681 DB_TYPE(mp) = M_SETOPTS; 14682 stropt = (struct stroptions *)mp->b_rptr; 14683 mp->b_wptr = mp->b_rptr + sizeof (struct stroptions); 14684 stropt = (struct stroptions *)mp->b_rptr; 14685 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 14686 stropt->so_hiwat = sopp.sopp_rxhiwat; 14687 stropt->so_wroff = sopp.sopp_wroff; 14688 stropt->so_maxblk = sopp.sopp_maxblk; 14689 14690 if (sopp.sopp_flags & SOCKOPT_TAIL) { 14691 ASSERT(tcp->tcp_kssl_ctx != NULL); 14692 14693 stropt->so_flags |= SO_TAIL | SO_COPYOPT; 14694 stropt->so_tail = sopp.sopp_tail; 14695 stropt->so_copyopt = sopp.sopp_zcopyflag; 14696 } 14697 14698 /* Send the options up */ 14699 putnext(q, mp); 14700 } 14701 14702 /* 14703 * Pass up any data and/or a fin that has been received. 14704 * 14705 * Adjust receive window in case it had decreased 14706 * (because there is data <=> tcp_rcv_list != NULL) 14707 * while the connection was detached. Note that 14708 * in case the eager was flow-controlled, w/o this 14709 * code, the rwnd may never open up again! 14710 */ 14711 if (tcp->tcp_rcv_list != NULL) { 14712 if (IPCL_IS_NONSTR(connp)) { 14713 mblk_t *mp; 14714 int space_left; 14715 int error; 14716 boolean_t push = B_TRUE; 14717 14718 if (!tcp->tcp_fused && (*connp->conn_upcalls->su_recv) 14719 (connp->conn_upper_handle, NULL, 0, 0, &error, 14720 &push) >= 0) { 14721 tcp->tcp_rwnd = connp->conn_rcvbuf; 14722 if (tcp->tcp_state >= TCPS_ESTABLISHED && 14723 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 14724 tcp_xmit_ctl(NULL, 14725 tcp, (tcp->tcp_swnd == 0) ? 14726 tcp->tcp_suna : tcp->tcp_snxt, 14727 tcp->tcp_rnxt, TH_ACK); 14728 } 14729 } 14730 while ((mp = tcp->tcp_rcv_list) != NULL) { 14731 push = B_TRUE; 14732 tcp->tcp_rcv_list = mp->b_next; 14733 mp->b_next = NULL; 14734 space_left = (*connp->conn_upcalls->su_recv) 14735 (connp->conn_upper_handle, mp, msgdsize(mp), 14736 0, &error, &push); 14737 if (space_left < 0) { 14738 /* 14739 * We should never be in middle of a 14740 * fallback, the squeue guarantees that. 
14741 */ 14742 ASSERT(error != EOPNOTSUPP); 14743 } 14744 } 14745 tcp->tcp_rcv_last_head = NULL; 14746 tcp->tcp_rcv_last_tail = NULL; 14747 tcp->tcp_rcv_cnt = 0; 14748 } else { 14749 /* We drain directly in case of fused tcp loopback */ 14750 14751 if (!tcp->tcp_fused && canputnext(q)) { 14752 tcp->tcp_rwnd = connp->conn_rcvbuf; 14753 if (tcp->tcp_state >= TCPS_ESTABLISHED && 14754 tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { 14755 tcp_xmit_ctl(NULL, 14756 tcp, (tcp->tcp_swnd == 0) ? 14757 tcp->tcp_suna : tcp->tcp_snxt, 14758 tcp->tcp_rnxt, TH_ACK); 14759 } 14760 } 14761 14762 (void) tcp_rcv_drain(tcp); 14763 } 14764 14765 /* 14766 * For fused tcp loopback, back-enable peer endpoint 14767 * if it's currently flow-controlled. 14768 */ 14769 if (tcp->tcp_fused) { 14770 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 14771 14772 ASSERT(peer_tcp != NULL); 14773 ASSERT(peer_tcp->tcp_fused); 14774 14775 mutex_enter(&peer_tcp->tcp_non_sq_lock); 14776 if (peer_tcp->tcp_flow_stopped) { 14777 tcp_clrqfull(peer_tcp); 14778 TCP_STAT(tcps, tcp_fusion_backenabled); 14779 } 14780 mutex_exit(&peer_tcp->tcp_non_sq_lock); 14781 } 14782 } 14783 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 14784 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 14785 tcp->tcp_ordrel_done = B_TRUE; 14786 if (IPCL_IS_NONSTR(connp)) { 14787 ASSERT(tcp->tcp_ordrel_mp == NULL); 14788 (*connp->conn_upcalls->su_opctl)( 14789 connp->conn_upper_handle, 14790 SOCK_OPCTL_SHUT_RECV, 0); 14791 } else { 14792 mp = tcp->tcp_ordrel_mp; 14793 tcp->tcp_ordrel_mp = NULL; 14794 putnext(q, mp); 14795 } 14796 } 14797 tcp->tcp_hard_binding = B_FALSE; 14798 14799 if (connp->conn_keepalive) { 14800 tcp->tcp_ka_last_intrvl = 0; 14801 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 14802 MSEC_TO_TICK(tcp->tcp_ka_interval)); 14803 } 14804 14805 /* 14806 * At this point, eager is fully established and will 14807 * have the following references - 14808 * 14809 * 2 references for connection to exist (1 for TCP and 1 for IP). 14810 * 1 reference for the squeue which will be dropped by the squeue as 14811 * soon as this function returns. 14812 * There will be 1 additonal reference for being in classifier 14813 * hash list provided something bad hasn't happened. 14814 */ 14815 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 14816 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 14817 } 14818 14819 /* 14820 * The function called through squeue to get behind listener's perimeter to 14821 * send a deferred conn_ind. 14822 */ 14823 /* ARGSUSED */ 14824 void 14825 tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 14826 { 14827 conn_t *lconnp = (conn_t *)arg; 14828 tcp_t *listener = lconnp->conn_tcp; 14829 struct T_conn_ind *conn_ind; 14830 tcp_t *tcp; 14831 14832 conn_ind = (struct T_conn_ind *)mp->b_rptr; 14833 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 14834 conn_ind->OPT_length); 14835 14836 if (listener->tcp_state != TCPS_LISTEN) { 14837 /* 14838 * If listener has closed, it would have caused a 14839 * a cleanup/blowoff to happen for the eager, so 14840 * we don't need to do anything more. 14841 */ 14842 freemsg(mp); 14843 return; 14844 } 14845 14846 tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); 14847 } 14848 14849 /* 14850 * Common to TPI and sockfs accept code. 
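 * tcp_accept_common() pre-allocates the mblk later reused by
 * tcp_accept_finish() for either stroptions or a T_discon_ind, puts
 * the IP reference on the eager, pushes out at most one older deferred
 * conn_ind from the listener's q0, unlinks the eager from the
 * listener's queues and finally queues tcp_accept_finish() on the
 * eager's squeue.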
14851 */ 14852 /* ARGSUSED2 */ 14853 static int 14854 tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) 14855 { 14856 tcp_t *listener, *eager; 14857 mblk_t *discon_mp; 14858 14859 listener = lconnp->conn_tcp; 14860 ASSERT(listener->tcp_state == TCPS_LISTEN); 14861 eager = econnp->conn_tcp; 14862 ASSERT(eager->tcp_listener != NULL); 14863 14864 /* 14865 * Pre allocate the discon_ind mblk also. tcp_accept_finish will 14866 * use it if something failed. 14867 */ 14868 discon_mp = allocb(MAX(sizeof (struct T_discon_ind), 14869 sizeof (struct stroptions)), BPRI_HI); 14870 14871 if (discon_mp == NULL) { 14872 return (-TPROTO); 14873 } 14874 eager->tcp_issocket = B_TRUE; 14875 14876 econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; 14877 econnp->conn_allzones = listener->tcp_connp->conn_allzones; 14878 ASSERT(econnp->conn_netstack == 14879 listener->tcp_connp->conn_netstack); 14880 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 14881 14882 /* Put the ref for IP */ 14883 CONN_INC_REF(econnp); 14884 14885 /* 14886 * We should have minimum of 3 references on the conn 14887 * at this point. One each for TCP and IP and one for 14888 * the T_conn_ind that was sent up when the 3-way handshake 14889 * completed. In the normal case we would also have another 14890 * reference (making a total of 4) for the conn being in the 14891 * classifier hash list. However the eager could have received 14892 * an RST subsequently and tcp_closei_local could have removed 14893 * the eager from the classifier hash list, hence we can't 14894 * assert that reference. 14895 */ 14896 ASSERT(econnp->conn_ref >= 3); 14897 14898 mutex_enter(&listener->tcp_eager_lock); 14899 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 14900 14901 tcp_t *tail; 14902 tcp_t *tcp; 14903 mblk_t *mp1; 14904 14905 tcp = listener->tcp_eager_prev_q0; 14906 /* 14907 * listener->tcp_eager_prev_q0 points to the TAIL of the 14908 * deferred T_conn_ind queue. We need to get to the head 14909 * of the queue in order to send up T_conn_ind the same 14910 * order as how the 3WHS is completed. 14911 */ 14912 while (tcp != listener) { 14913 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 && 14914 !tcp->tcp_kssl_pending) 14915 break; 14916 else 14917 tcp = tcp->tcp_eager_prev_q0; 14918 } 14919 /* None of the pending eagers can be sent up now */ 14920 if (tcp == listener) 14921 goto no_more_eagers; 14922 14923 mp1 = tcp->tcp_conn.tcp_eager_conn_ind; 14924 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 14925 /* Move from q0 to q */ 14926 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 14927 listener->tcp_conn_req_cnt_q0--; 14928 listener->tcp_conn_req_cnt_q++; 14929 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 14930 tcp->tcp_eager_prev_q0; 14931 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 14932 tcp->tcp_eager_next_q0; 14933 tcp->tcp_eager_prev_q0 = NULL; 14934 tcp->tcp_eager_next_q0 = NULL; 14935 tcp->tcp_conn_def_q0 = B_FALSE; 14936 14937 /* Make sure the tcp isn't in the list of droppables */ 14938 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && 14939 tcp->tcp_eager_prev_drop_q0 == NULL); 14940 14941 /* 14942 * Insert at end of the queue because sockfs sends 14943 * down T_CONN_RES in chronological order. Leaving 14944 * the older conn indications at front of the queue 14945 * helps reducing search time. 
14946 */ 14947 tail = listener->tcp_eager_last_q; 14948 if (tail != NULL) { 14949 tail->tcp_eager_next_q = tcp; 14950 } else { 14951 listener->tcp_eager_next_q = tcp; 14952 } 14953 listener->tcp_eager_last_q = tcp; 14954 tcp->tcp_eager_next_q = NULL; 14955 14956 /* Need to get inside the listener perimeter */ 14957 CONN_INC_REF(listener->tcp_connp); 14958 SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1, 14959 tcp_send_pending, listener->tcp_connp, NULL, SQ_FILL, 14960 SQTAG_TCP_SEND_PENDING); 14961 } 14962 no_more_eagers: 14963 tcp_eager_unlink(eager); 14964 mutex_exit(&listener->tcp_eager_lock); 14965 14966 /* 14967 * At this point, the eager is detached from the listener 14968 * but we still have an extra refs on eager (apart from the 14969 * usual tcp references). The ref was placed in tcp_input_data 14970 * before sending the conn_ind in tcp_send_conn_ind. 14971 * The ref will be dropped in tcp_accept_finish(). 14972 */ 14973 SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish, 14974 econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); 14975 return (0); 14976 } 14977 14978 int 14979 tcp_accept(sock_lower_handle_t lproto_handle, 14980 sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, 14981 cred_t *cr) 14982 { 14983 conn_t *lconnp, *econnp; 14984 tcp_t *listener, *eager; 14985 14986 lconnp = (conn_t *)lproto_handle; 14987 listener = lconnp->conn_tcp; 14988 ASSERT(listener->tcp_state == TCPS_LISTEN); 14989 econnp = (conn_t *)eproto_handle; 14990 eager = econnp->conn_tcp; 14991 ASSERT(eager->tcp_listener != NULL); 14992 14993 /* 14994 * It is OK to manipulate these fields outside the eager's squeue 14995 * because they will not start being used until tcp_accept_finish 14996 * has been called. 14997 */ 14998 ASSERT(lconnp->conn_upper_handle != NULL); 14999 ASSERT(econnp->conn_upper_handle == NULL); 15000 econnp->conn_upper_handle = sock_handle; 15001 econnp->conn_upcalls = lconnp->conn_upcalls; 15002 ASSERT(IPCL_IS_NONSTR(econnp)); 15003 return (tcp_accept_common(lconnp, econnp, cr)); 15004 } 15005 15006 15007 /* 15008 * This is the STREAMS entry point for T_CONN_RES coming down on 15009 * Acceptor STREAM when sockfs listener does accept processing. 15010 * Read the block comment on top of tcp_input_listener(). 15011 */ 15012 void 15013 tcp_tpi_accept(queue_t *q, mblk_t *mp) 15014 { 15015 queue_t *rq = RD(q); 15016 struct T_conn_res *conn_res; 15017 tcp_t *eager; 15018 tcp_t *listener; 15019 struct T_ok_ack *ok; 15020 t_scalar_t PRIM_type; 15021 conn_t *econnp; 15022 cred_t *cr; 15023 15024 ASSERT(DB_TYPE(mp) == M_PROTO); 15025 15026 /* 15027 * All Solaris components should pass a db_credp 15028 * for this TPI message, hence we ASSERT. 15029 * But in case there is some other M_PROTO that looks 15030 * like a TPI message sent by some other kernel 15031 * component, we check and return an error. 15032 */ 15033 cr = msg_getcred(mp, NULL); 15034 ASSERT(cr != NULL); 15035 if (cr == NULL) { 15036 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); 15037 if (mp != NULL) 15038 putnext(rq, mp); 15039 return; 15040 } 15041 conn_res = (struct T_conn_res *)mp->b_rptr; 15042 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 15043 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { 15044 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 15045 if (mp != NULL) 15046 putnext(rq, mp); 15047 return; 15048 } 15049 switch (conn_res->PRIM_type) { 15050 case O_T_CONN_RES: 15051 case T_CONN_RES: 15052 /* 15053 * We pass up an err ack if allocb fails. 
This will 15054 * cause sockfs to issue a T_DISCON_REQ which will cause 15055 * tcp_eager_blowoff to be called. sockfs will then call 15056 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream. 15057 * we need to do the allocb up here because we have to 15058 * make sure rq->q_qinfo->qi_qclose still points to the 15059 * correct function (tcp_tpi_close_accept) in case allocb 15060 * fails. 15061 */ 15062 bcopy(mp->b_rptr + conn_res->OPT_offset, 15063 &eager, conn_res->OPT_length); 15064 PRIM_type = conn_res->PRIM_type; 15065 mp->b_datap->db_type = M_PCPROTO; 15066 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); 15067 ok = (struct T_ok_ack *)mp->b_rptr; 15068 ok->PRIM_type = T_OK_ACK; 15069 ok->CORRECT_prim = PRIM_type; 15070 econnp = eager->tcp_connp; 15071 econnp->conn_dev = (dev_t)RD(q)->q_ptr; 15072 econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); 15073 econnp->conn_rq = rq; 15074 econnp->conn_wq = q; 15075 rq->q_ptr = econnp; 15076 rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ 15077 q->q_ptr = econnp; 15078 q->q_qinfo = &tcp_winit; 15079 listener = eager->tcp_listener; 15080 15081 if (tcp_accept_common(listener->tcp_connp, 15082 econnp, cr) < 0) { 15083 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 15084 if (mp != NULL) 15085 putnext(rq, mp); 15086 return; 15087 } 15088 15089 /* 15090 * Send the new local address also up to sockfs. There 15091 * should already be enough space in the mp that came 15092 * down from soaccept(). 15093 */ 15094 if (econnp->conn_family == AF_INET) { 15095 sin_t *sin; 15096 15097 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 15098 (sizeof (struct T_ok_ack) + sizeof (sin_t))); 15099 sin = (sin_t *)mp->b_wptr; 15100 mp->b_wptr += sizeof (sin_t); 15101 sin->sin_family = AF_INET; 15102 sin->sin_port = econnp->conn_lport; 15103 sin->sin_addr.s_addr = econnp->conn_laddr_v4; 15104 } else { 15105 sin6_t *sin6; 15106 15107 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 15108 sizeof (struct T_ok_ack) + sizeof (sin6_t)); 15109 sin6 = (sin6_t *)mp->b_wptr; 15110 mp->b_wptr += sizeof (sin6_t); 15111 sin6->sin6_family = AF_INET6; 15112 sin6->sin6_port = econnp->conn_lport; 15113 sin6->sin6_addr = econnp->conn_laddr_v6; 15114 if (econnp->conn_ipversion == IPV4_VERSION) 15115 sin6->sin6_flowinfo = 0; 15116 else 15117 sin6->sin6_flowinfo = econnp->conn_flowinfo; 15118 if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && 15119 (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { 15120 sin6->sin6_scope_id = 15121 econnp->conn_ixa->ixa_scopeid; 15122 } else { 15123 sin6->sin6_scope_id = 0; 15124 } 15125 sin6->__sin6_src_id = 0; 15126 } 15127 15128 putnext(rq, mp); 15129 return; 15130 default: 15131 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); 15132 if (mp != NULL) 15133 putnext(rq, mp); 15134 return; 15135 } 15136 } 15137 15138 /* 15139 * Handle special out-of-band ioctl requests (see PSARC/2008/265). 
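 * These requests arrive as M_CMD messages carrying a cmdblk_t with the
 * user buffer linked as b_cont.  Only TI_GETPEERNAME and TI_GETMYNAME
 * are supported; they are answered in place with qreply() without
 * going through the squeue.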
15140 */ 15141 static void 15142 tcp_wput_cmdblk(queue_t *q, mblk_t *mp) 15143 { 15144 void *data; 15145 mblk_t *datamp = mp->b_cont; 15146 conn_t *connp = Q_TO_CONN(q); 15147 tcp_t *tcp = connp->conn_tcp; 15148 cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr; 15149 15150 if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) { 15151 cmdp->cb_error = EPROTO; 15152 qreply(q, mp); 15153 return; 15154 } 15155 15156 data = datamp->b_rptr; 15157 15158 switch (cmdp->cb_cmd) { 15159 case TI_GETPEERNAME: 15160 if (tcp->tcp_state < TCPS_SYN_RCVD) 15161 cmdp->cb_error = ENOTCONN; 15162 else 15163 cmdp->cb_error = conn_getpeername(connp, data, 15164 &cmdp->cb_len); 15165 break; 15166 case TI_GETMYNAME: 15167 cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len); 15168 break; 15169 default: 15170 cmdp->cb_error = EINVAL; 15171 break; 15172 } 15173 15174 qreply(q, mp); 15175 } 15176 15177 void 15178 tcp_wput(queue_t *q, mblk_t *mp) 15179 { 15180 conn_t *connp = Q_TO_CONN(q); 15181 tcp_t *tcp; 15182 void (*output_proc)(); 15183 t_scalar_t type; 15184 uchar_t *rptr; 15185 struct iocblk *iocp; 15186 size_t size; 15187 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 15188 15189 ASSERT(connp->conn_ref >= 2); 15190 15191 switch (DB_TYPE(mp)) { 15192 case M_DATA: 15193 tcp = connp->conn_tcp; 15194 ASSERT(tcp != NULL); 15195 15196 size = msgdsize(mp); 15197 15198 mutex_enter(&tcp->tcp_non_sq_lock); 15199 tcp->tcp_squeue_bytes += size; 15200 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { 15201 tcp_setqfull(tcp); 15202 } 15203 mutex_exit(&tcp->tcp_non_sq_lock); 15204 15205 CONN_INC_REF(connp); 15206 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, 15207 NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 15208 return; 15209 15210 case M_CMD: 15211 tcp_wput_cmdblk(q, mp); 15212 return; 15213 15214 case M_PROTO: 15215 case M_PCPROTO: 15216 /* 15217 * if it is a snmp message, don't get behind the squeue 15218 */ 15219 tcp = connp->conn_tcp; 15220 rptr = mp->b_rptr; 15221 if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 15222 type = ((union T_primitives *)rptr)->type; 15223 } else { 15224 if (connp->conn_debug) { 15225 (void) strlog(TCP_MOD_ID, 0, 1, 15226 SL_ERROR|SL_TRACE, 15227 "tcp_wput_proto, dropping one..."); 15228 } 15229 freemsg(mp); 15230 return; 15231 } 15232 if (type == T_SVR4_OPTMGMT_REQ) { 15233 /* 15234 * All Solaris components should pass a db_credp 15235 * for this TPI message, hence we ASSERT. 15236 * But in case there is some other M_PROTO that looks 15237 * like a TPI message sent by some other kernel 15238 * component, we check and return an error. 15239 */ 15240 cred_t *cr = msg_getcred(mp, NULL); 15241 15242 ASSERT(cr != NULL); 15243 if (cr == NULL) { 15244 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 15245 return; 15246 } 15247 if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get, 15248 cr)) { 15249 /* 15250 * This was a SNMP request 15251 */ 15252 return; 15253 } else { 15254 output_proc = tcp_wput_proto; 15255 } 15256 } else { 15257 output_proc = tcp_wput_proto; 15258 } 15259 break; 15260 case M_IOCTL: 15261 /* 15262 * Most ioctls can be processed right away without going via 15263 * squeues - process them right here. Those that do require 15264 * squeue (currently _SIOCSOCKFALLBACK) 15265 * are processed by tcp_wput_ioctl(). 
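 * The cases handled inline below are TCP_IOC_ABORT_CONN,
 * TI_GETPEERNAME/TI_GETMYNAME (via mi_copyin) and ND_GET/ND_SET;
 * anything else is sent to tcp_wput_ioctl() on the squeue.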
15266 */ 15267 iocp = (struct iocblk *)mp->b_rptr; 15268 tcp = connp->conn_tcp; 15269 15270 switch (iocp->ioc_cmd) { 15271 case TCP_IOC_ABORT_CONN: 15272 tcp_ioctl_abort_conn(q, mp); 15273 return; 15274 case TI_GETPEERNAME: 15275 case TI_GETMYNAME: 15276 mi_copyin(q, mp, NULL, 15277 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 15278 return; 15279 case ND_SET: 15280 /* nd_getset does the necessary checks */ 15281 case ND_GET: 15282 if (nd_getset(q, tcps->tcps_g_nd, mp)) { 15283 qreply(q, mp); 15284 return; 15285 } 15286 ip_wput_nondata(q, mp); 15287 return; 15288 15289 default: 15290 output_proc = tcp_wput_ioctl; 15291 break; 15292 } 15293 break; 15294 default: 15295 output_proc = tcp_wput_nondata; 15296 break; 15297 } 15298 15299 CONN_INC_REF(connp); 15300 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp, 15301 NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER); 15302 } 15303 15304 /* 15305 * Initial STREAMS write side put() procedure for sockets. It tries to 15306 * handle the T_CAPABILITY_REQ which sockfs sends down while setting 15307 * up the socket without using the squeue. Non T_CAPABILITY_REQ messages 15308 * are handled by tcp_wput() as usual. 15309 * 15310 * All further messages will also be handled by tcp_wput() because we cannot 15311 * be sure that the above short cut is safe later. 15312 */ 15313 static void 15314 tcp_wput_sock(queue_t *wq, mblk_t *mp) 15315 { 15316 conn_t *connp = Q_TO_CONN(wq); 15317 tcp_t *tcp = connp->conn_tcp; 15318 struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr; 15319 15320 ASSERT(wq->q_qinfo == &tcp_sock_winit); 15321 wq->q_qinfo = &tcp_winit; 15322 15323 ASSERT(IPCL_IS_TCP(connp)); 15324 ASSERT(TCP_IS_SOCKET(tcp)); 15325 15326 if (DB_TYPE(mp) == M_PCPROTO && 15327 MBLKL(mp) == sizeof (struct T_capability_req) && 15328 car->PRIM_type == T_CAPABILITY_REQ) { 15329 tcp_capability_req(tcp, mp); 15330 return; 15331 } 15332 15333 tcp_wput(wq, mp); 15334 } 15335 15336 /* ARGSUSED */ 15337 static void 15338 tcp_wput_fallback(queue_t *wq, mblk_t *mp) 15339 { 15340 #ifdef DEBUG 15341 cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n"); 15342 #endif 15343 freemsg(mp); 15344 } 15345 15346 /* 15347 * Check the usability of ZEROCOPY. It's instead checking the flag set by IP. 15348 */ 15349 static boolean_t 15350 tcp_zcopy_check(tcp_t *tcp) 15351 { 15352 conn_t *connp = tcp->tcp_connp; 15353 ip_xmit_attr_t *ixa = connp->conn_ixa; 15354 boolean_t zc_enabled = B_FALSE; 15355 tcp_stack_t *tcps = tcp->tcp_tcps; 15356 15357 if (do_tcpzcopy == 2) 15358 zc_enabled = B_TRUE; 15359 else if ((do_tcpzcopy == 1) && (ixa->ixa_flags & IXAF_ZCOPY_CAPAB)) 15360 zc_enabled = B_TRUE; 15361 15362 tcp->tcp_snd_zcopy_on = zc_enabled; 15363 if (!TCP_IS_DETACHED(tcp)) { 15364 if (zc_enabled) { 15365 ixa->ixa_flags |= IXAF_VERIFY_ZCOPY; 15366 (void) proto_set_tx_copyopt(connp->conn_rq, connp, 15367 ZCVMSAFE); 15368 TCP_STAT(tcps, tcp_zcopy_on); 15369 } else { 15370 ixa->ixa_flags &= ~IXAF_VERIFY_ZCOPY; 15371 (void) proto_set_tx_copyopt(connp->conn_rq, connp, 15372 ZCVMUNSAFE); 15373 TCP_STAT(tcps, tcp_zcopy_off); 15374 } 15375 } 15376 return (zc_enabled); 15377 } 15378 15379 /* 15380 * Backoff from a zero-copy message by copying data to a new allocated 15381 * message and freeing the original desballoca'ed segmapped message. 15382 * 15383 * This function is called by following two callers: 15384 * 1. tcp_timer: fix_xmitlist is set to B_TRUE, because it's safe to free 15385 * the origial desballoca'ed message and notify sockfs. 
This is in re- 15386 * transmit state. 15387 * 2. tcp_output: fix_xmitlist is set to B_FALSE. Flag STRUIO_ZCNOTIFY need 15388 * to be copied to new message. 15389 */ 15390 static mblk_t * 15391 tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, boolean_t fix_xmitlist) 15392 { 15393 mblk_t *nbp; 15394 mblk_t *head = NULL; 15395 mblk_t *tail = NULL; 15396 tcp_stack_t *tcps = tcp->tcp_tcps; 15397 15398 ASSERT(bp != NULL); 15399 while (bp != NULL) { 15400 if (IS_VMLOANED_MBLK(bp)) { 15401 TCP_STAT(tcps, tcp_zcopy_backoff); 15402 if ((nbp = copyb(bp)) == NULL) { 15403 tcp->tcp_xmit_zc_clean = B_FALSE; 15404 if (tail != NULL) 15405 tail->b_cont = bp; 15406 return ((head == NULL) ? bp : head); 15407 } 15408 15409 if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 15410 if (fix_xmitlist) 15411 tcp_zcopy_notify(tcp); 15412 else 15413 nbp->b_datap->db_struioflag |= 15414 STRUIO_ZCNOTIFY; 15415 } 15416 nbp->b_cont = bp->b_cont; 15417 15418 /* 15419 * Copy saved information and adjust tcp_xmit_tail 15420 * if needed. 15421 */ 15422 if (fix_xmitlist) { 15423 nbp->b_prev = bp->b_prev; 15424 nbp->b_next = bp->b_next; 15425 15426 if (tcp->tcp_xmit_tail == bp) 15427 tcp->tcp_xmit_tail = nbp; 15428 } 15429 15430 /* Free the original message. */ 15431 bp->b_prev = NULL; 15432 bp->b_next = NULL; 15433 freeb(bp); 15434 15435 bp = nbp; 15436 } 15437 15438 if (head == NULL) { 15439 head = bp; 15440 } 15441 if (tail == NULL) { 15442 tail = bp; 15443 } else { 15444 tail->b_cont = bp; 15445 tail = bp; 15446 } 15447 15448 /* Move forward. */ 15449 bp = bp->b_cont; 15450 } 15451 15452 if (fix_xmitlist) { 15453 tcp->tcp_xmit_last = tail; 15454 tcp->tcp_xmit_zc_clean = B_TRUE; 15455 } 15456 15457 return (head); 15458 } 15459 15460 static void 15461 tcp_zcopy_notify(tcp_t *tcp) 15462 { 15463 struct stdata *stp; 15464 conn_t *connp; 15465 15466 if (tcp->tcp_detached) 15467 return; 15468 connp = tcp->tcp_connp; 15469 if (IPCL_IS_NONSTR(connp)) { 15470 (*connp->conn_upcalls->su_zcopy_notify) 15471 (connp->conn_upper_handle); 15472 return; 15473 } 15474 stp = STREAM(connp->conn_rq); 15475 mutex_enter(&stp->sd_lock); 15476 stp->sd_flag |= STZCNOTIFY; 15477 cv_broadcast(&stp->sd_zcopy_wait); 15478 mutex_exit(&stp->sd_lock); 15479 } 15480 15481 /* 15482 * Update the TCP connection according to change of LSO capability. 15483 */ 15484 static void 15485 tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa) 15486 { 15487 /* 15488 * We check against IPv4 header length to preserve the old behavior 15489 * of only enabling LSO when there are no IP options. 15490 * But this restriction might not be necessary at all. Before removing 15491 * it, need to verify how LSO is handled for source routing case, with 15492 * which IP does software checksum. 15493 * 15494 * For IPv6, whenever any extension header is needed, LSO is supressed. 15495 */ 15496 if (ixa->ixa_ip_hdr_length != ((ixa->ixa_flags & IXAF_IS_IPV4) ? 15497 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN)) 15498 return; 15499 15500 /* 15501 * Either the LSO capability newly became usable, or it has changed. 15502 */ 15503 if (ixa->ixa_flags & IXAF_LSO_CAPAB) { 15504 ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; 15505 15506 ASSERT(lsoc->ill_lso_max > 0); 15507 tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, lsoc->ill_lso_max); 15508 15509 DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, 15510 boolean_t, B_TRUE, uint32_t, tcp->tcp_lso_max); 15511 15512 /* 15513 * If LSO to be enabled, notify the STREAM header with larger 15514 * data block. 
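 * Clearing tcp_maxpsz_multiplier is intended to let tcp_maxpsz_set(),
 * called at the end of this function, relax the stream head's maxpsz
 * so that writes are no longer chopped into MSS-sized multiples and
 * larger contiguous blocks can be passed down for LSO.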
15515 */ 15516 if (!tcp->tcp_lso) 15517 tcp->tcp_maxpsz_multiplier = 0; 15518 15519 tcp->tcp_lso = B_TRUE; 15520 TCP_STAT(tcp->tcp_tcps, tcp_lso_enabled); 15521 } else { /* LSO capability is not usable any more. */ 15522 DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, 15523 boolean_t, B_FALSE, uint32_t, tcp->tcp_lso_max); 15524 15525 /* 15526 * If LSO to be disabled, notify the STREAM header with smaller 15527 * data block. And need to restore fragsize to PMTU. 15528 */ 15529 if (tcp->tcp_lso) { 15530 tcp->tcp_maxpsz_multiplier = 15531 tcp->tcp_tcps->tcps_maxpsz_multiplier; 15532 ixa->ixa_fragsize = ixa->ixa_pmtu; 15533 tcp->tcp_lso = B_FALSE; 15534 TCP_STAT(tcp->tcp_tcps, tcp_lso_disabled); 15535 } 15536 } 15537 15538 (void) tcp_maxpsz_set(tcp, B_TRUE); 15539 } 15540 15541 /* 15542 * Update the TCP connection according to change of ZEROCOPY capability. 15543 */ 15544 static void 15545 tcp_update_zcopy(tcp_t *tcp) 15546 { 15547 conn_t *connp = tcp->tcp_connp; 15548 tcp_stack_t *tcps = tcp->tcp_tcps; 15549 15550 if (tcp->tcp_snd_zcopy_on) { 15551 tcp->tcp_snd_zcopy_on = B_FALSE; 15552 if (!TCP_IS_DETACHED(tcp)) { 15553 (void) proto_set_tx_copyopt(connp->conn_rq, connp, 15554 ZCVMUNSAFE); 15555 TCP_STAT(tcps, tcp_zcopy_off); 15556 } 15557 } else { 15558 tcp->tcp_snd_zcopy_on = B_TRUE; 15559 if (!TCP_IS_DETACHED(tcp)) { 15560 (void) proto_set_tx_copyopt(connp->conn_rq, connp, 15561 ZCVMSAFE); 15562 TCP_STAT(tcps, tcp_zcopy_on); 15563 } 15564 } 15565 } 15566 15567 /* 15568 * Notify function registered with ip_xmit_attr_t. It's called in the squeue 15569 * so it's safe to update the TCP connection. 15570 */ 15571 /* ARGSUSED1 */ 15572 static void 15573 tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, 15574 ixa_notify_arg_t narg) 15575 { 15576 tcp_t *tcp = (tcp_t *)arg; 15577 conn_t *connp = tcp->tcp_connp; 15578 15579 switch (ntype) { 15580 case IXAN_LSO: 15581 tcp_update_lso(tcp, connp->conn_ixa); 15582 break; 15583 case IXAN_PMTU: 15584 tcp_update_pmtu(tcp, B_FALSE); 15585 break; 15586 case IXAN_ZCOPY: 15587 tcp_update_zcopy(tcp); 15588 break; 15589 default: 15590 break; 15591 } 15592 } 15593 15594 static void 15595 tcp_send_data(tcp_t *tcp, mblk_t *mp) 15596 { 15597 conn_t *connp = tcp->tcp_connp; 15598 15599 /* 15600 * Check here to avoid sending zero-copy message down to IP when 15601 * ZEROCOPY capability has turned off. We only need to deal with 15602 * the race condition between sockfs and the notification here. 15603 * Since we have tried to backoff the tcp_xmit_head when turning 15604 * zero-copy off and new messages in tcp_output(), we simply drop 15605 * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean 15606 * is not true. 15607 */ 15608 if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on && 15609 !tcp->tcp_xmit_zc_clean) { 15610 ip_drop_output("TCP ZC was disabled but not clean", mp, NULL); 15611 freemsg(mp); 15612 return; 15613 } 15614 15615 ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp); 15616 (void) conn_ip_output(mp, connp->conn_ixa); 15617 } 15618 15619 /* 15620 * This handles the case when the receiver has shrunk its win. Per RFC 1122 15621 * if the receiver shrinks the window, i.e. moves the right window to the 15622 * left, the we should not send new data, but should retransmit normally the 15623 * old unacked data between suna and suna + swnd. We might has sent data 15624 * that is now outside the new window, pretend that we didn't send it. 
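 * For example, if tcp_suna is 1000, tcp_snxt is 5000 and the peer
 * moves the right edge back to 3000, shrunk_count is 2000: tcp_snxt
 * is pulled back to 3000 and the 2000 bytes in [3000, 5000) are
 * accounted as unsent again, to be sent once the window reopens.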
15625 */ 15626 static void 15627 tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count) 15628 { 15629 uint32_t snxt = tcp->tcp_snxt; 15630 15631 ASSERT(shrunk_count > 0); 15632 15633 if (!tcp->tcp_is_wnd_shrnk) { 15634 tcp->tcp_snxt_shrunk = snxt; 15635 tcp->tcp_is_wnd_shrnk = B_TRUE; 15636 } else if (SEQ_GT(snxt, tcp->tcp_snxt_shrunk)) { 15637 tcp->tcp_snxt_shrunk = snxt; 15638 } 15639 15640 /* Pretend we didn't send the data outside the window */ 15641 snxt -= shrunk_count; 15642 15643 /* Reset all the values per the now shrunk window */ 15644 tcp_update_xmit_tail(tcp, snxt); 15645 tcp->tcp_unsent += shrunk_count; 15646 15647 /* 15648 * If the SACK option is set, delete the entire list of 15649 * notsack'ed blocks. 15650 */ 15651 if (tcp->tcp_sack_info != NULL) { 15652 if (tcp->tcp_notsack_list != NULL) 15653 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); 15654 } 15655 15656 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0) 15657 /* 15658 * Make sure the timer is running so that we will probe a zero 15659 * window. 15660 */ 15661 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 15662 } 15663 15664 15665 /* 15666 * The TCP normal data output path. 15667 * NOTE: the logic of the fast path is duplicated from this function. 15668 */ 15669 static void 15670 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) 15671 { 15672 int len; 15673 mblk_t *local_time; 15674 mblk_t *mp1; 15675 uint32_t snxt; 15676 int tail_unsent; 15677 int tcpstate; 15678 int usable = 0; 15679 mblk_t *xmit_tail; 15680 int32_t mss; 15681 int32_t num_sack_blk = 0; 15682 int32_t total_hdr_len; 15683 int32_t tcp_hdr_len; 15684 int rc; 15685 tcp_stack_t *tcps = tcp->tcp_tcps; 15686 conn_t *connp = tcp->tcp_connp; 15687 clock_t now = LBOLT_FASTPATH; 15688 15689 tcpstate = tcp->tcp_state; 15690 if (mp == NULL) { 15691 /* 15692 * tcp_wput_data() with NULL mp should only be called when 15693 * there is unsent data. 15694 */ 15695 ASSERT(tcp->tcp_unsent > 0); 15696 /* Really tacky... but we need this for detached closes. */ 15697 len = tcp->tcp_unsent; 15698 goto data_null; 15699 } 15700 15701 #if CCS_STATS 15702 wrw_stats.tot.count++; 15703 wrw_stats.tot.bytes += msgdsize(mp); 15704 #endif 15705 ASSERT(mp->b_datap->db_type == M_DATA); 15706 /* 15707 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 15708 * or before a connection attempt has begun. 
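 * (TCP_FSS_VALID means a FIN has already been queued by an orderly
 * release or disconnect, so any data that shows up after that point is
 * simply freed below.)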
15709 */ 15710 if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 15711 (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 15712 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 15713 #ifdef DEBUG 15714 cmn_err(CE_WARN, 15715 "tcp_wput_data: data after ordrel, %s", 15716 tcp_display(tcp, NULL, 15717 DISP_ADDR_AND_PORT)); 15718 #else 15719 if (connp->conn_debug) { 15720 (void) strlog(TCP_MOD_ID, 0, 1, 15721 SL_TRACE|SL_ERROR, 15722 "tcp_wput_data: data after ordrel, %s\n", 15723 tcp_display(tcp, NULL, 15724 DISP_ADDR_AND_PORT)); 15725 } 15726 #endif /* DEBUG */ 15727 } 15728 if (tcp->tcp_snd_zcopy_aware && 15729 (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 15730 tcp_zcopy_notify(tcp); 15731 freemsg(mp); 15732 mutex_enter(&tcp->tcp_non_sq_lock); 15733 if (tcp->tcp_flow_stopped && 15734 TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { 15735 tcp_clrqfull(tcp); 15736 } 15737 mutex_exit(&tcp->tcp_non_sq_lock); 15738 return; 15739 } 15740 15741 /* Strip empties */ 15742 for (;;) { 15743 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 15744 (uintptr_t)INT_MAX); 15745 len = (int)(mp->b_wptr - mp->b_rptr); 15746 if (len > 0) 15747 break; 15748 mp1 = mp; 15749 mp = mp->b_cont; 15750 freeb(mp1); 15751 if (!mp) { 15752 return; 15753 } 15754 } 15755 15756 /* If we are the first on the list ... */ 15757 if (tcp->tcp_xmit_head == NULL) { 15758 tcp->tcp_xmit_head = mp; 15759 tcp->tcp_xmit_tail = mp; 15760 tcp->tcp_xmit_tail_unsent = len; 15761 } else { 15762 /* If tiny tx and room in txq tail, pullup to save mblks. */ 15763 struct datab *dp; 15764 15765 mp1 = tcp->tcp_xmit_last; 15766 if (len < tcp_tx_pull_len && 15767 (dp = mp1->b_datap)->db_ref == 1 && 15768 dp->db_lim - mp1->b_wptr >= len) { 15769 ASSERT(len > 0); 15770 ASSERT(!mp1->b_cont); 15771 if (len == 1) { 15772 *mp1->b_wptr++ = *mp->b_rptr; 15773 } else { 15774 bcopy(mp->b_rptr, mp1->b_wptr, len); 15775 mp1->b_wptr += len; 15776 } 15777 if (mp1 == tcp->tcp_xmit_tail) 15778 tcp->tcp_xmit_tail_unsent += len; 15779 mp1->b_cont = mp->b_cont; 15780 if (tcp->tcp_snd_zcopy_aware && 15781 (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 15782 mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 15783 freeb(mp); 15784 mp = mp1; 15785 } else { 15786 tcp->tcp_xmit_last->b_cont = mp; 15787 } 15788 len += tcp->tcp_unsent; 15789 } 15790 15791 /* Tack on however many more positive length mblks we have */ 15792 if ((mp1 = mp->b_cont) != NULL) { 15793 do { 15794 int tlen; 15795 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 15796 (uintptr_t)INT_MAX); 15797 tlen = (int)(mp1->b_wptr - mp1->b_rptr); 15798 if (tlen <= 0) { 15799 mp->b_cont = mp1->b_cont; 15800 freeb(mp1); 15801 } else { 15802 len += tlen; 15803 mp = mp1; 15804 } 15805 } while ((mp1 = mp->b_cont) != NULL); 15806 } 15807 tcp->tcp_xmit_last = mp; 15808 tcp->tcp_unsent = len; 15809 15810 if (urgent) 15811 usable = 1; 15812 15813 data_null: 15814 snxt = tcp->tcp_snxt; 15815 xmit_tail = tcp->tcp_xmit_tail; 15816 tail_unsent = tcp->tcp_xmit_tail_unsent; 15817 15818 /* 15819 * Note that tcp_mss has been adjusted to take into account the 15820 * timestamp option if applicable. Because SACK options do not 15821 * appear in every TCP segments and they are of variable lengths, 15822 * they cannot be included in tcp_mss. Thus we need to calculate 15823 * the actual segment length when we need to send a segment which 15824 * includes SACK options. 
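 * For example, with three SACK blocks the option occupies 2 NOP bytes
 * + 2 bytes of kind/length + 3 * 8 bytes of block data = 28 bytes, so
 * mss below becomes tcp_mss - 28 for those segments.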
15825 */ 15826 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 15827 int32_t opt_len; 15828 15829 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 15830 tcp->tcp_num_sack_blk); 15831 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 15832 2 + TCPOPT_HEADER_LEN; 15833 mss = tcp->tcp_mss - opt_len; 15834 total_hdr_len = connp->conn_ht_iphc_len + opt_len; 15835 tcp_hdr_len = connp->conn_ht_ulp_len + opt_len; 15836 } else { 15837 mss = tcp->tcp_mss; 15838 total_hdr_len = connp->conn_ht_iphc_len; 15839 tcp_hdr_len = connp->conn_ht_ulp_len; 15840 } 15841 15842 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 15843 (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 15844 SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); 15845 } 15846 if (tcpstate == TCPS_SYN_RCVD) { 15847 /* 15848 * The three-way connection establishment handshake is not 15849 * complete yet. We want to queue the data for transmission 15850 * after entering ESTABLISHED state (RFC793). A jump to 15851 * "done" label effectively leaves data on the queue. 15852 */ 15853 goto done; 15854 } else { 15855 int usable_r; 15856 15857 /* 15858 * In the special case when cwnd is zero, which can only 15859 * happen if the connection is ECN capable, return now. 15860 * New segments is sent using tcp_timer(). The timer 15861 * is set in tcp_input_data(). 15862 */ 15863 if (tcp->tcp_cwnd == 0) { 15864 /* 15865 * Note that tcp_cwnd is 0 before 3-way handshake is 15866 * finished. 15867 */ 15868 ASSERT(tcp->tcp_ecn_ok || 15869 tcp->tcp_state < TCPS_ESTABLISHED); 15870 return; 15871 } 15872 15873 /* NOTE: trouble if xmitting while SYN not acked? */ 15874 usable_r = snxt - tcp->tcp_suna; 15875 usable_r = tcp->tcp_swnd - usable_r; 15876 15877 /* 15878 * Check if the receiver has shrunk the window. If 15879 * tcp_wput_data() with NULL mp is called, tcp_fin_sent 15880 * cannot be set as there is unsent data, so FIN cannot 15881 * be sent out. Otherwise, we need to take into account 15882 * of FIN as it consumes an "invisible" sequence number. 15883 */ 15884 ASSERT(tcp->tcp_fin_sent == 0); 15885 if (usable_r < 0) { 15886 /* 15887 * The receiver has shrunk the window and we have sent 15888 * -usable_r date beyond the window, re-adjust. 15889 * 15890 * If TCP window scaling is enabled, there can be 15891 * round down error as the advertised receive window 15892 * is actually right shifted n bits. This means that 15893 * the lower n bits info is wiped out. It will look 15894 * like the window is shrunk. Do a check here to 15895 * see if the shrunk amount is actually within the 15896 * error in window calculation. If it is, just 15897 * return. Note that this check is inside the 15898 * shrunk window check. This makes sure that even 15899 * though tcp_process_shrunk_swnd() is not called, 15900 * we will stop further processing. 15901 */ 15902 if ((-usable_r >> tcp->tcp_snd_ws) > 0) { 15903 tcp_process_shrunk_swnd(tcp, -usable_r); 15904 } 15905 return; 15906 } 15907 15908 /* usable = MIN(swnd, cwnd) - unacked_bytes */ 15909 if (tcp->tcp_swnd > tcp->tcp_cwnd) 15910 usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd; 15911 15912 /* usable = MIN(usable, unsent) */ 15913 if (usable_r > len) 15914 usable_r = len; 15915 15916 /* usable = MAX(usable, {1 for urgent, 0 for data}) */ 15917 if (usable_r > 0) { 15918 usable = usable_r; 15919 } else { 15920 /* Bypass all other unnecessary processing. */ 15921 goto done; 15922 } 15923 } 15924 15925 local_time = (mblk_t *)now; 15926 15927 /* 15928 * "Our" Nagle Algorithm. 
This is not the same as in the old 15929 * BSD. This is more in line with the true intent of Nagle. 15930 * 15931 * The conditions are: 15932 * 1. The amount of unsent data (or amount of data which can be 15933 * sent, whichever is smaller) is less than Nagle limit. 15934 * 2. The last sent size is also less than Nagle limit. 15935 * 3. There is unack'ed data. 15936 * 4. Urgent pointer is not set. Send urgent data ignoring the 15937 * Nagle algorithm. This reduces the probability that urgent 15938 * bytes get "merged" together. 15939 * 5. The app has not closed the connection. This eliminates the 15940 * wait time of the receiving side waiting for the last piece of 15941 * (small) data. 15942 * 15943 * If all are satisified, exit without sending anything. Note 15944 * that Nagle limit can be smaller than 1 MSS. Nagle limit is 15945 * the smaller of 1 MSS and global tcp_naglim_def (default to be 15946 * 4095). 15947 */ 15948 if (usable < (int)tcp->tcp_naglim && 15949 tcp->tcp_naglim > tcp->tcp_last_sent_len && 15950 snxt != tcp->tcp_suna && 15951 !(tcp->tcp_valid_bits & TCP_URG_VALID) && 15952 !(tcp->tcp_valid_bits & TCP_FSS_VALID)) { 15953 goto done; 15954 } 15955 15956 /* 15957 * If tcp_zero_win_probe is not set and the tcp->tcp_cork option 15958 * is set, then we have to force TCP not to send partial segment 15959 * (smaller than MSS bytes). We are calculating the usable now 15960 * based on full mss and will save the rest of remaining data for 15961 * later. When tcp_zero_win_probe is set, TCP needs to send out 15962 * something to do zero window probe. 15963 */ 15964 if (tcp->tcp_cork && !tcp->tcp_zero_win_probe) { 15965 if (usable < mss) 15966 goto done; 15967 usable = (usable / mss) * mss; 15968 } 15969 15970 /* Update the latest receive window size in TCP header. */ 15971 tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 15972 15973 /* Send the packet. */ 15974 rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len, 15975 num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, 15976 local_time); 15977 15978 /* Pretend that all we were trying to send really got sent */ 15979 if (rc < 0 && tail_unsent < 0) { 15980 do { 15981 xmit_tail = xmit_tail->b_cont; 15982 xmit_tail->b_prev = local_time; 15983 ASSERT((uintptr_t)(xmit_tail->b_wptr - 15984 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 15985 tail_unsent += (int)(xmit_tail->b_wptr - 15986 xmit_tail->b_rptr); 15987 } while (tail_unsent < 0); 15988 } 15989 done:; 15990 tcp->tcp_xmit_tail = xmit_tail; 15991 tcp->tcp_xmit_tail_unsent = tail_unsent; 15992 len = tcp->tcp_snxt - snxt; 15993 if (len) { 15994 /* 15995 * If new data was sent, need to update the notsack 15996 * list, which is, afterall, data blocks that have 15997 * not been sack'ed by the receiver. New data is 15998 * not sack'ed. 15999 */ 16000 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 16001 /* len is a negative value. */ 16002 tcp->tcp_pipe -= len; 16003 tcp_notsack_update(&(tcp->tcp_notsack_list), 16004 tcp->tcp_snxt, snxt, 16005 &(tcp->tcp_num_notsack_blk), 16006 &(tcp->tcp_cnt_notsack_list)); 16007 } 16008 tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; 16009 tcp->tcp_rack = tcp->tcp_rnxt; 16010 tcp->tcp_rack_cnt = 0; 16011 if ((snxt + len) == tcp->tcp_suna) { 16012 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 16013 } 16014 } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { 16015 /* 16016 * Didn't send anything. Make sure the timer is running 16017 * so that we will probe a zero window. 
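 * The retransmit timer doubles as the zero window probe timer here;
 * tcp_timer() takes care of sending the probe when it fires.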
16018 */ 16019 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 16020 } 16021 /* Note that len is the amount we just sent but with a negative sign */ 16022 tcp->tcp_unsent += len; 16023 mutex_enter(&tcp->tcp_non_sq_lock); 16024 if (tcp->tcp_flow_stopped) { 16025 if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { 16026 tcp_clrqfull(tcp); 16027 } 16028 } else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) { 16029 if (!(tcp->tcp_detached)) 16030 tcp_setqfull(tcp); 16031 } 16032 mutex_exit(&tcp->tcp_non_sq_lock); 16033 } 16034 16035 /* 16036 * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header 16037 * with the template header, as well as other options such as time-stamp, 16038 * ECN and/or SACK. 16039 */ 16040 static void 16041 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) 16042 { 16043 tcpha_t *tcp_tmpl, *tcpha; 16044 uint32_t *dst, *src; 16045 int hdrlen; 16046 conn_t *connp = tcp->tcp_connp; 16047 16048 ASSERT(OK_32PTR(rptr)); 16049 16050 /* Template header */ 16051 tcp_tmpl = tcp->tcp_tcpha; 16052 16053 /* Header of outgoing packet */ 16054 tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length); 16055 16056 /* dst and src are opaque 32-bit fields, used for copying */ 16057 dst = (uint32_t *)rptr; 16058 src = (uint32_t *)connp->conn_ht_iphc; 16059 hdrlen = connp->conn_ht_iphc_len; 16060 16061 /* Fill time-stamp option if needed */ 16062 if (tcp->tcp_snd_ts_ok) { 16063 U32_TO_BE32((uint32_t)now, 16064 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); 16065 U32_TO_BE32(tcp->tcp_ts_recent, 16066 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); 16067 } else { 16068 ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); 16069 } 16070 16071 /* 16072 * Copy the template header; is this really more efficient than 16073 * calling bcopy()? For simple IPv4/TCP, it may be the case, 16074 * but perhaps not for other scenarios. 16075 */ 16076 dst[0] = src[0]; 16077 dst[1] = src[1]; 16078 dst[2] = src[2]; 16079 dst[3] = src[3]; 16080 dst[4] = src[4]; 16081 dst[5] = src[5]; 16082 dst[6] = src[6]; 16083 dst[7] = src[7]; 16084 dst[8] = src[8]; 16085 dst[9] = src[9]; 16086 if (hdrlen -= 40) { 16087 hdrlen >>= 2; 16088 dst += 10; 16089 src += 10; 16090 do { 16091 *dst++ = *src++; 16092 } while (--hdrlen); 16093 } 16094 16095 /* 16096 * Set the ECN info in the TCP header if it is not a zero 16097 * window probe. Zero window probe is only sent in 16098 * tcp_wput_data() and tcp_timer(). 
16099 */ 16100 if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 16101 SET_ECT(tcp, rptr); 16102 16103 if (tcp->tcp_ecn_echo_on) 16104 tcpha->tha_flags |= TH_ECE; 16105 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 16106 tcpha->tha_flags |= TH_CWR; 16107 tcp->tcp_ecn_cwr_sent = B_TRUE; 16108 } 16109 } 16110 16111 /* Fill in SACK options */ 16112 if (num_sack_blk > 0) { 16113 uchar_t *wptr = rptr + connp->conn_ht_iphc_len; 16114 sack_blk_t *tmp; 16115 int32_t i; 16116 16117 wptr[0] = TCPOPT_NOP; 16118 wptr[1] = TCPOPT_NOP; 16119 wptr[2] = TCPOPT_SACK; 16120 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 16121 sizeof (sack_blk_t); 16122 wptr += TCPOPT_REAL_SACK_LEN; 16123 16124 tmp = tcp->tcp_sack_list; 16125 for (i = 0; i < num_sack_blk; i++) { 16126 U32_TO_BE32(tmp[i].begin, wptr); 16127 wptr += sizeof (tcp_seq); 16128 U32_TO_BE32(tmp[i].end, wptr); 16129 wptr += sizeof (tcp_seq); 16130 } 16131 tcpha->tha_offset_and_reserved += 16132 ((num_sack_blk * 2 + 1) << 4); 16133 } 16134 } 16135 16136 /* 16137 * tcp_send() is called by tcp_wput_data() and returns one of the following: 16138 * 16139 * -1 = failed allocation. 16140 * 0 = success; burst count reached, or usable send window is too small, 16141 * and that we'd rather wait until later before sending again. 16142 */ 16143 static int 16144 tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, 16145 const int tcp_hdr_len, const int num_sack_blk, int *usable, 16146 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) 16147 { 16148 int num_burst_seg = tcp->tcp_snd_burst; 16149 int num_lso_seg = 1; 16150 uint_t lso_usable; 16151 boolean_t do_lso_send = B_FALSE; 16152 tcp_stack_t *tcps = tcp->tcp_tcps; 16153 conn_t *connp = tcp->tcp_connp; 16154 ip_xmit_attr_t *ixa = connp->conn_ixa; 16155 16156 /* 16157 * Check LSO possibility. The value of tcp->tcp_lso indicates whether 16158 * the underlying connection is LSO capable. Will check whether having 16159 * enough available data to initiate LSO transmission in the for(){} 16160 * loops. 16161 */ 16162 if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0) 16163 do_lso_send = B_TRUE; 16164 16165 for (;;) { 16166 struct datab *db; 16167 tcpha_t *tcpha; 16168 uint32_t sum; 16169 mblk_t *mp, *mp1; 16170 uchar_t *rptr; 16171 int len; 16172 16173 /* 16174 * Burst count reached, return successfully. 16175 */ 16176 if (num_burst_seg == 0) 16177 break; 16178 16179 /* 16180 * Calculate the maximum payload length we can send at one 16181 * time. 16182 */ 16183 if (do_lso_send) { 16184 /* 16185 * Check whether be able to to do LSO for the current 16186 * available data. 16187 */ 16188 if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) { 16189 lso_usable = MIN(tcp->tcp_lso_max, *usable); 16190 lso_usable = MIN(lso_usable, 16191 num_burst_seg * mss); 16192 16193 num_lso_seg = lso_usable / mss; 16194 if (lso_usable % mss) { 16195 num_lso_seg++; 16196 tcp->tcp_last_sent_len = (ushort_t) 16197 (lso_usable % mss); 16198 } else { 16199 tcp->tcp_last_sent_len = (ushort_t)mss; 16200 } 16201 } else { 16202 do_lso_send = B_FALSE; 16203 num_lso_seg = 1; 16204 lso_usable = mss; 16205 } 16206 } 16207 16208 ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1); 16209 #ifdef DEBUG 16210 DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t, 16211 do_lso_send); 16212 #endif 16213 /* 16214 * Adjust num_burst_seg here. 
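 * A single LSO dispatch covers num_lso_seg MSS-sized segments, so it
 * consumes that many units of the burst allowance; the non-LSO path
 * consumes one per loop iteration.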
16215 */ 16216 num_burst_seg -= num_lso_seg; 16217 16218 len = mss; 16219 if (len > *usable) { 16220 ASSERT(do_lso_send == B_FALSE); 16221 16222 len = *usable; 16223 if (len <= 0) { 16224 /* Terminate the loop */ 16225 break; /* success; too small */ 16226 } 16227 /* 16228 * Sender silly-window avoidance. 16229 * Ignore this if we are going to send a 16230 * zero window probe out. 16231 * 16232 * TODO: force data into microscopic window? 16233 * ==> (!pushed || (unsent > usable)) 16234 */ 16235 if (len < (tcp->tcp_max_swnd >> 1) && 16236 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len && 16237 !((tcp->tcp_valid_bits & TCP_URG_VALID) && 16238 len == 1) && (! tcp->tcp_zero_win_probe)) { 16239 /* 16240 * If the retransmit timer is not running 16241 * we start it so that we will retransmit 16242 * in the case when the receiver has 16243 * decremented the window. 16244 */ 16245 if (*snxt == tcp->tcp_snxt && 16246 *snxt == tcp->tcp_suna) { 16247 /* 16248 * We are not supposed to send 16249 * anything. So let's wait a little 16250 * bit longer before breaking SWS 16251 * avoidance. 16252 * 16253 * What should the value be? 16254 * Suggestion: MAX(init rexmit time, 16255 * tcp->tcp_rto) 16256 */ 16257 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 16258 } 16259 break; /* success; too small */ 16260 } 16261 } 16262 16263 tcpha = tcp->tcp_tcpha; 16264 16265 /* 16266 * The reason to adjust len here is that we need to set flags 16267 * and calculate checksum. 16268 */ 16269 if (do_lso_send) 16270 len = lso_usable; 16271 16272 *usable -= len; /* Approximate - can be adjusted later */ 16273 if (*usable > 0) 16274 tcpha->tha_flags = TH_ACK; 16275 else 16276 tcpha->tha_flags = (TH_ACK | TH_PUSH); 16277 16278 /* 16279 * Prime pump for IP's checksumming on our behalf. 16280 * Include the adjustment for a source route if any. 16281 * In case of LSO, the partial pseudo-header checksum should 16282 * exclusive TCP length, so zero tha_sum before IP calculate 16283 * pseudo-header checksum for partial checksum offload. 16284 */ 16285 if (do_lso_send) { 16286 sum = 0; 16287 } else { 16288 sum = len + tcp_hdr_len + connp->conn_sum; 16289 sum = (sum >> 16) + (sum & 0xFFFF); 16290 } 16291 tcpha->tha_sum = htons(sum); 16292 tcpha->tha_seq = htonl(*snxt); 16293 16294 /* 16295 * Branch off to tcp_xmit_mp() if any of the VALID bits is 16296 * set. For the case when TCP_FSS_VALID is the only valid 16297 * bit (normal active close), branch off only when we think 16298 * that the FIN flag needs to be set. Note for this case, 16299 * that (snxt + len) may not reflect the actual seg_len, 16300 * as len may be further reduced in tcp_xmit_mp(). If len 16301 * gets modified, we will end up here again. 16302 */ 16303 if (tcp->tcp_valid_bits != 0 && 16304 (tcp->tcp_valid_bits != TCP_FSS_VALID || 16305 ((*snxt + len) == tcp->tcp_fss))) { 16306 uchar_t *prev_rptr; 16307 uint32_t prev_snxt = tcp->tcp_snxt; 16308 16309 if (*tail_unsent == 0) { 16310 ASSERT((*xmit_tail)->b_cont != NULL); 16311 *xmit_tail = (*xmit_tail)->b_cont; 16312 prev_rptr = (*xmit_tail)->b_rptr; 16313 *tail_unsent = (int)((*xmit_tail)->b_wptr - 16314 (*xmit_tail)->b_rptr); 16315 } else { 16316 prev_rptr = (*xmit_tail)->b_rptr; 16317 (*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr - 16318 *tail_unsent; 16319 } 16320 mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL, 16321 *snxt, B_FALSE, (uint32_t *)&len, B_FALSE); 16322 /* Restore tcp_snxt so we get amount sent right. 
*/ 16323 tcp->tcp_snxt = prev_snxt; 16324 if (prev_rptr == (*xmit_tail)->b_rptr) { 16325 /* 16326 * If the previous timestamp is still in use, 16327 * don't stomp on it. 16328 */ 16329 if ((*xmit_tail)->b_next == NULL) { 16330 (*xmit_tail)->b_prev = local_time; 16331 (*xmit_tail)->b_next = 16332 (mblk_t *)(uintptr_t)(*snxt); 16333 } 16334 } else 16335 (*xmit_tail)->b_rptr = prev_rptr; 16336 16337 if (mp == NULL) { 16338 return (-1); 16339 } 16340 mp1 = mp->b_cont; 16341 16342 if (len <= mss) /* LSO is unusable (!do_lso_send) */ 16343 tcp->tcp_last_sent_len = (ushort_t)len; 16344 while (mp1->b_cont) { 16345 *xmit_tail = (*xmit_tail)->b_cont; 16346 (*xmit_tail)->b_prev = local_time; 16347 (*xmit_tail)->b_next = 16348 (mblk_t *)(uintptr_t)(*snxt); 16349 mp1 = mp1->b_cont; 16350 } 16351 *snxt += len; 16352 *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; 16353 BUMP_LOCAL(tcp->tcp_obsegs); 16354 BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); 16355 UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); 16356 tcp_send_data(tcp, mp); 16357 continue; 16358 } 16359 16360 *snxt += len; /* Adjust later if we don't send all of len */ 16361 BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); 16362 UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); 16363 16364 if (*tail_unsent) { 16365 /* Are the bytes above us in flight? */ 16366 rptr = (*xmit_tail)->b_wptr - *tail_unsent; 16367 if (rptr != (*xmit_tail)->b_rptr) { 16368 *tail_unsent -= len; 16369 if (len <= mss) /* LSO is unusable */ 16370 tcp->tcp_last_sent_len = (ushort_t)len; 16371 len += total_hdr_len; 16372 ixa->ixa_pktlen = len; 16373 16374 if (ixa->ixa_flags & IXAF_IS_IPV4) { 16375 tcp->tcp_ipha->ipha_length = htons(len); 16376 } else { 16377 tcp->tcp_ip6h->ip6_plen = 16378 htons(len - IPV6_HDR_LEN); 16379 } 16380 16381 mp = dupb(*xmit_tail); 16382 if (mp == NULL) { 16383 return (-1); /* out_of_mem */ 16384 } 16385 mp->b_rptr = rptr; 16386 /* 16387 * If the old timestamp is no longer in use, 16388 * sample a new timestamp now. 
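 * (b_prev stashes the transmit time for later RTT estimation; a
 * non-NULL b_next records the starting sequence number and marks
 * the timestamp as still in use.)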
16389 */ 16390 if ((*xmit_tail)->b_next == NULL) { 16391 (*xmit_tail)->b_prev = local_time; 16392 (*xmit_tail)->b_next = 16393 (mblk_t *)(uintptr_t)(*snxt-len); 16394 } 16395 goto must_alloc; 16396 } 16397 } else { 16398 *xmit_tail = (*xmit_tail)->b_cont; 16399 ASSERT((uintptr_t)((*xmit_tail)->b_wptr - 16400 (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX); 16401 *tail_unsent = (int)((*xmit_tail)->b_wptr - 16402 (*xmit_tail)->b_rptr); 16403 } 16404 16405 (*xmit_tail)->b_prev = local_time; 16406 (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len); 16407 16408 *tail_unsent -= len; 16409 if (len <= mss) /* LSO is unusable (!do_lso_send) */ 16410 tcp->tcp_last_sent_len = (ushort_t)len; 16411 16412 len += total_hdr_len; 16413 ixa->ixa_pktlen = len; 16414 16415 if (ixa->ixa_flags & IXAF_IS_IPV4) { 16416 tcp->tcp_ipha->ipha_length = htons(len); 16417 } else { 16418 tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 16419 } 16420 16421 mp = dupb(*xmit_tail); 16422 if (mp == NULL) { 16423 return (-1); /* out_of_mem */ 16424 } 16425 16426 len = total_hdr_len; 16427 /* 16428 * There are four reasons to allocate a new hdr mblk: 16429 * 1) The bytes above us are in use by another packet 16430 * 2) We don't have good alignment 16431 * 3) The mblk is being shared 16432 * 4) We don't have enough room for a header 16433 */ 16434 rptr = mp->b_rptr - len; 16435 if (!OK_32PTR(rptr) || 16436 ((db = mp->b_datap), db->db_ref != 2) || 16437 rptr < db->db_base) { 16438 /* NOTE: we assume allocb returns an OK_32PTR */ 16439 16440 must_alloc:; 16441 mp1 = allocb(connp->conn_ht_iphc_allocated + 16442 tcps->tcps_wroff_xtra, BPRI_MED); 16443 if (mp1 == NULL) { 16444 freemsg(mp); 16445 return (-1); /* out_of_mem */ 16446 } 16447 mp1->b_cont = mp; 16448 mp = mp1; 16449 /* Leave room for Link Level header */ 16450 len = total_hdr_len; 16451 rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; 16452 mp->b_wptr = &rptr[len]; 16453 } 16454 16455 /* 16456 * Fill in the header using the template header, and add 16457 * options such as time-stamp, ECN and/or SACK, as needed. 16458 */ 16459 tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); 16460 16461 mp->b_rptr = rptr; 16462 16463 if (*tail_unsent) { 16464 int spill = *tail_unsent; 16465 16466 mp1 = mp->b_cont; 16467 if (mp1 == NULL) 16468 mp1 = mp; 16469 16470 /* 16471 * If we're a little short, tack on more mblks until 16472 * there is no more spillover. 16473 */ 16474 while (spill < 0) { 16475 mblk_t *nmp; 16476 int nmpsz; 16477 16478 nmp = (*xmit_tail)->b_cont; 16479 nmpsz = MBLKL(nmp); 16480 16481 /* 16482 * Excess data in mblk; can we split it? 16483 * If LSO is enabled for the connection, 16484 * keep on splitting as this is a transient 16485 * send path. 16486 */ 16487 if (!do_lso_send && (spill + nmpsz > 0)) { 16488 /* 16489 * Don't split if stream head was 16490 * told to break up larger writes 16491 * into smaller ones. 16492 */ 16493 if (tcp->tcp_maxpsz_multiplier > 0) 16494 break; 16495 16496 /* 16497 * Next mblk is less than SMSS/2 16498 * rounded up to nearest 64-byte; 16499 * let it get sent as part of the 16500 * next segment. 
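 * For example, with an SMSS of 1460 the threshold is
 * roundup(730, 64) = 768 bytes.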
16501 */ 16502 if (tcp->tcp_localnet && 16503 !tcp->tcp_cork && 16504 (nmpsz < roundup((mss >> 1), 64))) 16505 break; 16506 } 16507 16508 *xmit_tail = nmp; 16509 ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX); 16510 /* Stash for rtt use later */ 16511 (*xmit_tail)->b_prev = local_time; 16512 (*xmit_tail)->b_next = 16513 (mblk_t *)(uintptr_t)(*snxt - len); 16514 mp1->b_cont = dupb(*xmit_tail); 16515 mp1 = mp1->b_cont; 16516 16517 spill += nmpsz; 16518 if (mp1 == NULL) { 16519 *tail_unsent = spill; 16520 freemsg(mp); 16521 return (-1); /* out_of_mem */ 16522 } 16523 } 16524 16525 /* Trim back any surplus on the last mblk */ 16526 if (spill >= 0) { 16527 mp1->b_wptr -= spill; 16528 *tail_unsent = spill; 16529 } else { 16530 /* 16531 * We did not send everything we could in 16532 * order to remain within the b_cont limit. 16533 */ 16534 *usable -= spill; 16535 *snxt += spill; 16536 tcp->tcp_last_sent_len += spill; 16537 UPDATE_MIB(&tcps->tcps_mib, 16538 tcpOutDataBytes, spill); 16539 /* 16540 * Adjust the checksum 16541 */ 16542 tcpha = (tcpha_t *)(rptr + 16543 ixa->ixa_ip_hdr_length); 16544 sum += spill; 16545 sum = (sum >> 16) + (sum & 0xFFFF); 16546 tcpha->tha_sum = htons(sum); 16547 if (connp->conn_ipversion == IPV4_VERSION) { 16548 sum = ntohs( 16549 ((ipha_t *)rptr)->ipha_length) + 16550 spill; 16551 ((ipha_t *)rptr)->ipha_length = 16552 htons(sum); 16553 } else { 16554 sum = ntohs( 16555 ((ip6_t *)rptr)->ip6_plen) + 16556 spill; 16557 ((ip6_t *)rptr)->ip6_plen = 16558 htons(sum); 16559 } 16560 ixa->ixa_pktlen += spill; 16561 *tail_unsent = 0; 16562 } 16563 } 16564 if (tcp->tcp_ip_forward_progress) { 16565 tcp->tcp_ip_forward_progress = B_FALSE; 16566 ixa->ixa_flags |= IXAF_REACH_CONF; 16567 } else { 16568 ixa->ixa_flags &= ~IXAF_REACH_CONF; 16569 } 16570 16571 /* 16572 * Append LSO information, both flags and mss, to the mp. 16573 */ 16574 if (do_lso_send) { 16575 lso_info_set(mp, mss, HW_LSO); 16576 ixa->ixa_fragsize = IP_MAXPACKET; 16577 ixa->ixa_extra_ident = num_lso_seg - 1; 16578 16579 DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, 16580 boolean_t, B_TRUE); 16581 16582 tcp_send_data(tcp, mp); 16583 16584 /* 16585 * Restore values of ixa_fragsize and ixa_extra_ident. 16586 */ 16587 ixa->ixa_fragsize = ixa->ixa_pmtu; 16588 ixa->ixa_extra_ident = 0; 16589 tcp->tcp_obsegs += num_lso_seg; 16590 TCP_STAT(tcps, tcp_lso_times); 16591 TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); 16592 } else { 16593 tcp_send_data(tcp, mp); 16594 BUMP_LOCAL(tcp->tcp_obsegs); 16595 } 16596 } 16597 16598 return (0); 16599 } 16600 16601 /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */ 16602 static void 16603 tcp_wput_flush(tcp_t *tcp, mblk_t *mp) 16604 { 16605 uchar_t fval = *mp->b_rptr; 16606 mblk_t *tail; 16607 conn_t *connp = tcp->tcp_connp; 16608 queue_t *q = connp->conn_wq; 16609 16610 /* TODO: How should flush interact with urgent data? */ 16611 if ((fval & FLUSHW) && tcp->tcp_xmit_head && 16612 !(tcp->tcp_valid_bits & TCP_URG_VALID)) { 16613 /* 16614 * Flush only data that has not yet been put on the wire. If 16615 * we flush data that we have already transmitted, life, as we 16616 * know it, may come to an end. 
16617 */ 16618 tail = tcp->tcp_xmit_tail; 16619 tail->b_wptr -= tcp->tcp_xmit_tail_unsent; 16620 tcp->tcp_xmit_tail_unsent = 0; 16621 tcp->tcp_unsent = 0; 16622 if (tail->b_wptr != tail->b_rptr) 16623 tail = tail->b_cont; 16624 if (tail) { 16625 mblk_t **excess = &tcp->tcp_xmit_head; 16626 for (;;) { 16627 mblk_t *mp1 = *excess; 16628 if (mp1 == tail) 16629 break; 16630 tcp->tcp_xmit_tail = mp1; 16631 tcp->tcp_xmit_last = mp1; 16632 excess = &mp1->b_cont; 16633 } 16634 *excess = NULL; 16635 tcp_close_mpp(&tail); 16636 if (tcp->tcp_snd_zcopy_aware) 16637 tcp_zcopy_notify(tcp); 16638 } 16639 /* 16640 * We have no unsent data, so unsent must be less than 16641 * conn_sndlowat, so re-enable flow. 16642 */ 16643 mutex_enter(&tcp->tcp_non_sq_lock); 16644 if (tcp->tcp_flow_stopped) { 16645 tcp_clrqfull(tcp); 16646 } 16647 mutex_exit(&tcp->tcp_non_sq_lock); 16648 } 16649 /* 16650 * TODO: you can't just flush these, you have to increase rwnd for one 16651 * thing. For another, how should urgent data interact? 16652 */ 16653 if (fval & FLUSHR) { 16654 *mp->b_rptr = fval & ~FLUSHW; 16655 /* XXX */ 16656 qreply(q, mp); 16657 return; 16658 } 16659 freemsg(mp); 16660 } 16661 16662 /* 16663 * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA 16664 * messages. 16665 */ 16666 static void 16667 tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) 16668 { 16669 mblk_t *mp1; 16670 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 16671 STRUCT_HANDLE(strbuf, sb); 16672 uint_t addrlen; 16673 conn_t *connp = tcp->tcp_connp; 16674 queue_t *q = connp->conn_wq; 16675 16676 /* Make sure it is one of ours. */ 16677 switch (iocp->ioc_cmd) { 16678 case TI_GETMYNAME: 16679 case TI_GETPEERNAME: 16680 break; 16681 default: 16682 ip_wput_nondata(q, mp); 16683 return; 16684 } 16685 switch (mi_copy_state(q, mp, &mp1)) { 16686 case -1: 16687 return; 16688 case MI_COPY_CASE(MI_COPY_IN, 1): 16689 break; 16690 case MI_COPY_CASE(MI_COPY_OUT, 1): 16691 /* Copy out the strbuf. */ 16692 mi_copyout(q, mp); 16693 return; 16694 case MI_COPY_CASE(MI_COPY_OUT, 2): 16695 /* All done. 
*/ 16696 mi_copy_done(q, mp, 0); 16697 return; 16698 default: 16699 mi_copy_done(q, mp, EPROTO); 16700 return; 16701 } 16702 /* Check alignment of the strbuf */ 16703 if (!OK_32PTR(mp1->b_rptr)) { 16704 mi_copy_done(q, mp, EINVAL); 16705 return; 16706 } 16707 16708 STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); 16709 16710 if (connp->conn_family == AF_INET) 16711 addrlen = sizeof (sin_t); 16712 else 16713 addrlen = sizeof (sin6_t); 16714 16715 if (STRUCT_FGET(sb, maxlen) < addrlen) { 16716 mi_copy_done(q, mp, EINVAL); 16717 return; 16718 } 16719 16720 switch (iocp->ioc_cmd) { 16721 case TI_GETMYNAME: 16722 break; 16723 case TI_GETPEERNAME: 16724 if (tcp->tcp_state < TCPS_SYN_RCVD) { 16725 mi_copy_done(q, mp, ENOTCONN); 16726 return; 16727 } 16728 break; 16729 } 16730 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 16731 if (!mp1) 16732 return; 16733 16734 STRUCT_FSET(sb, len, addrlen); 16735 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 16736 case TI_GETMYNAME: 16737 (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, 16738 &addrlen); 16739 break; 16740 case TI_GETPEERNAME: 16741 (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, 16742 &addrlen); 16743 break; 16744 } 16745 mp1->b_wptr += addrlen; 16746 /* Copy out the address */ 16747 mi_copyout(q, mp); 16748 } 16749 16750 static void 16751 tcp_use_pure_tpi(tcp_t *tcp) 16752 { 16753 conn_t *connp = tcp->tcp_connp; 16754 16755 #ifdef _ILP32 16756 tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq; 16757 #else 16758 tcp->tcp_acceptor_id = connp->conn_dev; 16759 #endif 16760 /* 16761 * Insert this socket into the acceptor hash. 16762 * We might need it for T_CONN_RES message 16763 */ 16764 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 16765 16766 tcp->tcp_issocket = B_FALSE; 16767 TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback); 16768 } 16769 16770 /* 16771 * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL 16772 * messages. 16773 */ 16774 /* ARGSUSED */ 16775 static void 16776 tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 16777 { 16778 conn_t *connp = (conn_t *)arg; 16779 tcp_t *tcp = connp->conn_tcp; 16780 queue_t *q = connp->conn_wq; 16781 struct iocblk *iocp; 16782 16783 ASSERT(DB_TYPE(mp) == M_IOCTL); 16784 /* 16785 * Try and ASSERT the minimum possible references on the 16786 * conn early enough. Since we are executing on write side, 16787 * the connection is obviously not detached and that means 16788 * there is a ref each for TCP and IP. Since we are behind 16789 * the squeue, the minimum references needed are 3. If the 16790 * conn is in classifier hash list, there should be an 16791 * extra ref for that (we check both the possibilities). 16792 */ 16793 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 16794 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 16795 16796 iocp = (struct iocblk *)mp->b_rptr; 16797 switch (iocp->ioc_cmd) { 16798 case _SIOCSOCKFALLBACK: 16799 /* 16800 * Either sockmod is about to be popped and the socket 16801 * would now be treated as a plain stream, or a module 16802 * is about to be pushed so we could no longer use read- 16803 * side synchronous streams for fused loopback tcp. 16804 * Drain any queued data and disable direct sockfs 16805 * interface from now on. 
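 * (tcp_use_pure_tpi() below also inserts the endpoint into the
 * acceptor hash so that a later T_CONN_RES can still find it.)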
16806 */ 16807 if (!tcp->tcp_issocket) { 16808 DB_TYPE(mp) = M_IOCNAK; 16809 iocp->ioc_error = EINVAL; 16810 } else { 16811 tcp_use_pure_tpi(tcp); 16812 DB_TYPE(mp) = M_IOCACK; 16813 iocp->ioc_error = 0; 16814 } 16815 iocp->ioc_count = 0; 16816 iocp->ioc_rval = 0; 16817 qreply(q, mp); 16818 return; 16819 } 16820 ip_wput_nondata(q, mp); 16821 } 16822 16823 /* 16824 * This routine is called by tcp_wput() to handle all TPI requests. 16825 */ 16826 /* ARGSUSED */ 16827 static void 16828 tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 16829 { 16830 conn_t *connp = (conn_t *)arg; 16831 tcp_t *tcp = connp->conn_tcp; 16832 union T_primitives *tprim = (union T_primitives *)mp->b_rptr; 16833 uchar_t *rptr; 16834 t_scalar_t type; 16835 cred_t *cr; 16836 16837 /* 16838 * Try and ASSERT the minimum possible references on the 16839 * conn early enough. Since we are executing on write side, 16840 * the connection is obviously not detached and that means 16841 * there is a ref each for TCP and IP. Since we are behind 16842 * the squeue, the minimum references needed are 3. If the 16843 * conn is in classifier hash list, there should be an 16844 * extra ref for that (we check both the possibilities). 16845 */ 16846 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 16847 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 16848 16849 rptr = mp->b_rptr; 16850 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 16851 if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 16852 type = ((union T_primitives *)rptr)->type; 16853 if (type == T_EXDATA_REQ) { 16854 tcp_output_urgent(connp, mp, arg2, NULL); 16855 } else if (type != T_DATA_REQ) { 16856 goto non_urgent_data; 16857 } else { 16858 /* TODO: options, flags, ... from user */ 16859 /* Set length to zero for reclamation below */ 16860 tcp_wput_data(tcp, mp->b_cont, B_TRUE); 16861 freeb(mp); 16862 } 16863 return; 16864 } else { 16865 if (connp->conn_debug) { 16866 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 16867 "tcp_wput_proto, dropping one..."); 16868 } 16869 freemsg(mp); 16870 return; 16871 } 16872 16873 non_urgent_data: 16874 16875 switch ((int)tprim->type) { 16876 case T_SSL_PROXY_BIND_REQ: /* an SSL proxy endpoint bind request */ 16877 /* 16878 * save the kssl_ent_t from the next block, and convert this 16879 * back to a normal bind_req. 
16880 */ 16881 if (mp->b_cont != NULL) { 16882 ASSERT(MBLKL(mp->b_cont) >= sizeof (kssl_ent_t)); 16883 16884 if (tcp->tcp_kssl_ent != NULL) { 16885 kssl_release_ent(tcp->tcp_kssl_ent, NULL, 16886 KSSL_NO_PROXY); 16887 tcp->tcp_kssl_ent = NULL; 16888 } 16889 bcopy(mp->b_cont->b_rptr, &tcp->tcp_kssl_ent, 16890 sizeof (kssl_ent_t)); 16891 kssl_hold_ent(tcp->tcp_kssl_ent); 16892 freemsg(mp->b_cont); 16893 mp->b_cont = NULL; 16894 } 16895 tprim->type = T_BIND_REQ; 16896 16897 /* FALLTHROUGH */ 16898 case O_T_BIND_REQ: /* bind request */ 16899 case T_BIND_REQ: /* new semantics bind request */ 16900 tcp_tpi_bind(tcp, mp); 16901 break; 16902 case T_UNBIND_REQ: /* unbind request */ 16903 tcp_tpi_unbind(tcp, mp); 16904 break; 16905 case O_T_CONN_RES: /* old connection response XXX */ 16906 case T_CONN_RES: /* connection response */ 16907 tcp_tli_accept(tcp, mp); 16908 break; 16909 case T_CONN_REQ: /* connection request */ 16910 tcp_tpi_connect(tcp, mp); 16911 break; 16912 case T_DISCON_REQ: /* disconnect request */ 16913 tcp_disconnect(tcp, mp); 16914 break; 16915 case T_CAPABILITY_REQ: 16916 tcp_capability_req(tcp, mp); /* capability request */ 16917 break; 16918 case T_INFO_REQ: /* information request */ 16919 tcp_info_req(tcp, mp); 16920 break; 16921 case T_SVR4_OPTMGMT_REQ: /* manage options req */ 16922 case T_OPTMGMT_REQ: 16923 /* 16924 * Note: no support for snmpcom_req() through new 16925 * T_OPTMGMT_REQ. See comments in ip.c 16926 */ 16927 16928 /* 16929 * All Solaris components should pass a db_credp 16930 * for this TPI message, hence we ASSERT. 16931 * But in case there is some other M_PROTO that looks 16932 * like a TPI message sent by some other kernel 16933 * component, we check and return an error. 16934 */ 16935 cr = msg_getcred(mp, NULL); 16936 ASSERT(cr != NULL); 16937 if (cr == NULL) { 16938 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 16939 return; 16940 } 16941 /* 16942 * If EINPROGRESS is returned, the request has been queued 16943 * for subsequent processing by ip_restart_optmgmt(), which 16944 * will do the CONN_DEC_REF(). 16945 */ 16946 if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) { 16947 svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); 16948 } else { 16949 tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); 16950 } 16951 break; 16952 16953 case T_UNITDATA_REQ: /* unitdata request */ 16954 tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 16955 break; 16956 case T_ORDREL_REQ: /* orderly release req */ 16957 freemsg(mp); 16958 16959 if (tcp->tcp_fused) 16960 tcp_unfuse(tcp); 16961 16962 if (tcp_xmit_end(tcp) != 0) { 16963 /* 16964 * We were crossing FINs and got a reset from 16965 * the other side. Just ignore it. 16966 */ 16967 if (connp->conn_debug) { 16968 (void) strlog(TCP_MOD_ID, 0, 1, 16969 SL_ERROR|SL_TRACE, 16970 "tcp_wput_proto, T_ORDREL_REQ out of " 16971 "state %s", 16972 tcp_display(tcp, NULL, 16973 DISP_ADDR_AND_PORT)); 16974 } 16975 } 16976 break; 16977 case T_ADDR_REQ: 16978 tcp_addr_req(tcp, mp); 16979 break; 16980 default: 16981 if (connp->conn_debug) { 16982 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 16983 "tcp_wput_proto, bogus TPI msg, type %d", 16984 tprim->type); 16985 } 16986 /* 16987 * We used to M_ERROR. Sending TNOTSUPPORT gives the user 16988 * to recover. 16989 */ 16990 tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 16991 break; 16992 } 16993 } 16994 16995 /* 16996 * The TCP write service routine should never be called... 
16997 */ 16998 /* ARGSUSED */ 16999 static void 17000 tcp_wsrv(queue_t *q) 17001 { 17002 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 17003 17004 TCP_STAT(tcps, tcp_wsrv_called); 17005 } 17006 17007 /* 17008 * Send out a control packet on the tcp connection specified. This routine 17009 * is typically called where we need a simple ACK or RST generated. 17010 */ 17011 static void 17012 tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) 17013 { 17014 uchar_t *rptr; 17015 tcpha_t *tcpha; 17016 ipha_t *ipha = NULL; 17017 ip6_t *ip6h = NULL; 17018 uint32_t sum; 17019 int total_hdr_len; 17020 int ip_hdr_len; 17021 mblk_t *mp; 17022 tcp_stack_t *tcps = tcp->tcp_tcps; 17023 conn_t *connp = tcp->tcp_connp; 17024 ip_xmit_attr_t *ixa = connp->conn_ixa; 17025 17026 /* 17027 * Save sum for use in source route later. 17028 */ 17029 sum = connp->conn_ht_ulp_len + connp->conn_sum; 17030 total_hdr_len = connp->conn_ht_iphc_len; 17031 ip_hdr_len = ixa->ixa_ip_hdr_length; 17032 17033 /* If a text string is passed in with the request, pass it to strlog. */ 17034 if (str != NULL && connp->conn_debug) { 17035 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 17036 "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", 17037 str, seq, ack, ctl); 17038 } 17039 mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, 17040 BPRI_MED); 17041 if (mp == NULL) { 17042 return; 17043 } 17044 rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; 17045 mp->b_rptr = rptr; 17046 mp->b_wptr = &rptr[total_hdr_len]; 17047 bcopy(connp->conn_ht_iphc, rptr, total_hdr_len); 17048 17049 ixa->ixa_pktlen = total_hdr_len; 17050 17051 if (ixa->ixa_flags & IXAF_IS_IPV4) { 17052 ipha = (ipha_t *)rptr; 17053 ipha->ipha_length = htons(total_hdr_len); 17054 } else { 17055 ip6h = (ip6_t *)rptr; 17056 ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); 17057 } 17058 tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 17059 tcpha->tha_flags = (uint8_t)ctl; 17060 if (ctl & TH_RST) { 17061 BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); 17062 BUMP_MIB(&tcps->tcps_mib, tcpOutControl); 17063 /* 17064 * Don't send TSopt w/ TH_RST packets per RFC 1323. 17065 */ 17066 if (tcp->tcp_snd_ts_ok && 17067 tcp->tcp_state > TCPS_SYN_SENT) { 17068 mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN]; 17069 *(mp->b_wptr) = TCPOPT_EOL; 17070 17071 ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN; 17072 17073 if (connp->conn_ipversion == IPV4_VERSION) { 17074 ipha->ipha_length = htons(total_hdr_len - 17075 TCPOPT_REAL_TS_LEN); 17076 } else { 17077 ip6h->ip6_plen = htons(total_hdr_len - 17078 IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN); 17079 } 17080 tcpha->tha_offset_and_reserved -= (3 << 4); 17081 sum -= TCPOPT_REAL_TS_LEN; 17082 } 17083 } 17084 if (ctl & TH_ACK) { 17085 if (tcp->tcp_snd_ts_ok) { 17086 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 17087 17088 U32_TO_BE32(llbolt, 17089 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 17090 U32_TO_BE32(tcp->tcp_ts_recent, 17091 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 17092 } 17093 17094 /* Update the latest receive window size in TCP header. */ 17095 tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 17096 /* Track what we sent to the peer */ 17097 tcp->tcp_tcpha->tha_win = tcpha->tha_win; 17098 tcp->tcp_rack = ack; 17099 tcp->tcp_rack_cnt = 0; 17100 BUMP_MIB(&tcps->tcps_mib, tcpOutAck); 17101 } 17102 BUMP_LOCAL(tcp->tcp_obsegs); 17103 tcpha->tha_seq = htonl(seq); 17104 tcpha->tha_ack = htonl(ack); 17105 /* 17106 * Include the adjustment for a source route if any. 
17107 */ 17108 sum = (sum >> 16) + (sum & 0xFFFF); 17109 tcpha->tha_sum = htons(sum); 17110 tcp_send_data(tcp, mp); 17111 } 17112 17113 /* 17114 * If this routine returns B_TRUE, TCP can generate a RST in response 17115 * to a segment. If it returns B_FALSE, TCP should not respond. 17116 */ 17117 static boolean_t 17118 tcp_send_rst_chk(tcp_stack_t *tcps) 17119 { 17120 int64_t now; 17121 17122 /* 17123 * TCP needs to protect itself from generating too many RSTs. 17124 * This can be a DoS attack by sending us random segments 17125 * soliciting RSTs. 17126 * 17127 * What we do here is to have a limit of tcp_rst_sent_rate RSTs 17128 * in each 1 second interval. In this way, TCP still generate 17129 * RSTs in normal cases but when under attack, the impact is 17130 * limited. 17131 */ 17132 if (tcps->tcps_rst_sent_rate_enabled != 0) { 17133 now = ddi_get_lbolt64(); 17134 if (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) > 17135 1*SECONDS) { 17136 tcps->tcps_last_rst_intrvl = now; 17137 tcps->tcps_rst_cnt = 1; 17138 } else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) { 17139 return (B_FALSE); 17140 } 17141 } 17142 return (B_TRUE); 17143 } 17144 17145 /* 17146 * Generate a reset based on an inbound packet, connp is set by caller 17147 * when RST is in response to an unexpected inbound packet for which 17148 * there is active tcp state in the system. 17149 * 17150 * IPSEC NOTE : Try to send the reply with the same protection as it came 17151 * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t. 17152 * That way the packet will go out at the same level of protection as it 17153 * came in with. 17154 */ 17155 static void 17156 tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl, 17157 ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp) 17158 { 17159 ipha_t *ipha = NULL; 17160 ip6_t *ip6h = NULL; 17161 ushort_t len; 17162 tcpha_t *tcpha; 17163 int i; 17164 ipaddr_t v4addr; 17165 in6_addr_t v6addr; 17166 netstack_t *ns = ipst->ips_netstack; 17167 tcp_stack_t *tcps = ns->netstack_tcp; 17168 ip_xmit_attr_t ixas, *ixa; 17169 uint_t ip_hdr_len = ira->ira_ip_hdr_length; 17170 boolean_t need_refrele = B_FALSE; /* ixa_refrele(ixa) */ 17171 ushort_t port; 17172 17173 if (!tcp_send_rst_chk(tcps)) { 17174 TCP_STAT(tcps, tcp_rst_unsent); 17175 freemsg(mp); 17176 return; 17177 } 17178 17179 /* 17180 * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other 17181 * options from the listener. In that case the caller must ensure that 17182 * we are running on the listener = connp squeue. 17183 * 17184 * We get a safe copy of conn_ixa so we don't need to restore anything 17185 * we or ip_output_simple might change in the ixa. 17186 */ 17187 if (connp != NULL) { 17188 ASSERT(connp->conn_on_sqp); 17189 17190 ixa = conn_get_ixa_exclusive(connp); 17191 if (ixa == NULL) { 17192 TCP_STAT(tcps, tcp_rst_unsent); 17193 freemsg(mp); 17194 return; 17195 } 17196 need_refrele = B_TRUE; 17197 } else { 17198 bzero(&ixas, sizeof (ixas)); 17199 ixa = &ixas; 17200 /* 17201 * IXAF_VERIFY_SOURCE is overkill since we know the 17202 * packet was for us. 
17203 */ 17204 ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE; 17205 ixa->ixa_protocol = IPPROTO_TCP; 17206 ixa->ixa_zoneid = ira->ira_zoneid; 17207 ixa->ixa_ifindex = 0; 17208 ixa->ixa_ipst = ipst; 17209 ixa->ixa_cred = kcred; 17210 ixa->ixa_cpid = NOPID; 17211 } 17212 17213 if (str && tcps->tcps_dbg) { 17214 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 17215 "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 17216 "flags 0x%x", 17217 str, seq, ack, ctl); 17218 } 17219 if (mp->b_datap->db_ref != 1) { 17220 mblk_t *mp1 = copyb(mp); 17221 freemsg(mp); 17222 mp = mp1; 17223 if (mp == NULL) 17224 goto done; 17225 } else if (mp->b_cont) { 17226 freemsg(mp->b_cont); 17227 mp->b_cont = NULL; 17228 DB_CKSUMFLAGS(mp) = 0; 17229 } 17230 /* 17231 * We skip reversing source route here. 17232 * (for now we replace all IP options with EOL) 17233 */ 17234 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 17235 ipha = (ipha_t *)mp->b_rptr; 17236 for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 17237 mp->b_rptr[i] = IPOPT_EOL; 17238 /* 17239 * Make sure that src address isn't flagrantly invalid. 17240 * Not all broadcast address checking for the src address 17241 * is possible, since we don't know the netmask of the src 17242 * addr. No check for destination address is done, since 17243 * IP will not pass up a packet with a broadcast dest 17244 * address to TCP. Similar checks are done below for IPv6. 17245 */ 17246 if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || 17247 CLASSD(ipha->ipha_src)) { 17248 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 17249 ip_drop_input("ipIfStatsInDiscards", mp, NULL); 17250 freemsg(mp); 17251 goto done; 17252 } 17253 } else { 17254 ip6h = (ip6_t *)mp->b_rptr; 17255 17256 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || 17257 IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { 17258 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); 17259 ip_drop_input("ipIfStatsInDiscards", mp, NULL); 17260 freemsg(mp); 17261 goto done; 17262 } 17263 17264 /* Remove any extension headers assuming partial overlay */ 17265 if (ip_hdr_len > IPV6_HDR_LEN) { 17266 uint8_t *to; 17267 17268 to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; 17269 ovbcopy(ip6h, to, IPV6_HDR_LEN); 17270 mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; 17271 ip_hdr_len = IPV6_HDR_LEN; 17272 ip6h = (ip6_t *)mp->b_rptr; 17273 ip6h->ip6_nxt = IPPROTO_TCP; 17274 } 17275 } 17276 tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; 17277 if (tcpha->tha_flags & TH_RST) { 17278 freemsg(mp); 17279 goto done; 17280 } 17281 tcpha->tha_offset_and_reserved = (5 << 4); 17282 len = ip_hdr_len + sizeof (tcpha_t); 17283 mp->b_wptr = &mp->b_rptr[len]; 17284 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 17285 ipha->ipha_length = htons(len); 17286 /* Swap addresses */ 17287 v4addr = ipha->ipha_src; 17288 ipha->ipha_src = ipha->ipha_dst; 17289 ipha->ipha_dst = v4addr; 17290 ipha->ipha_ident = 0; 17291 ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; 17292 ixa->ixa_flags |= IXAF_IS_IPV4; 17293 ixa->ixa_ip_hdr_length = ip_hdr_len; 17294 } else { 17295 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 17296 /* Swap addresses */ 17297 v6addr = ip6h->ip6_src; 17298 ip6h->ip6_src = ip6h->ip6_dst; 17299 ip6h->ip6_dst = v6addr; 17300 ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit; 17301 ixa->ixa_flags &= ~IXAF_IS_IPV4; 17302 17303 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { 17304 ixa->ixa_flags |= IXAF_SCOPEID_SET; 17305 ixa->ixa_scopeid = ira->ira_ruifindex; 17306 } 17307 ixa->ixa_ip_hdr_length = IPV6_HDR_LEN; 17308 } 17309 ixa->ixa_pktlen = 
len; 17310 17311 /* Swap the ports */ 17312 port = tcpha->tha_fport; 17313 tcpha->tha_fport = tcpha->tha_lport; 17314 tcpha->tha_lport = port; 17315 17316 tcpha->tha_ack = htonl(ack); 17317 tcpha->tha_seq = htonl(seq); 17318 tcpha->tha_win = 0; 17319 tcpha->tha_sum = htons(sizeof (tcpha_t)); 17320 tcpha->tha_flags = (uint8_t)ctl; 17321 if (ctl & TH_RST) { 17322 BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); 17323 BUMP_MIB(&tcps->tcps_mib, tcpOutControl); 17324 } 17325 17326 /* Discard any old label */ 17327 if (ixa->ixa_free_flags & IXA_FREE_TSL) { 17328 ASSERT(ixa->ixa_tsl != NULL); 17329 label_rele(ixa->ixa_tsl); 17330 ixa->ixa_free_flags &= ~IXA_FREE_TSL; 17331 } 17332 ixa->ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ 17333 17334 if (ira->ira_flags & IRAF_IPSEC_SECURE) { 17335 /* 17336 * Apply IPsec based on how IPsec was applied to 17337 * the packet that caused the RST. 17338 */ 17339 if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) { 17340 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); 17341 /* Note: mp already consumed and ip_drop_packet done */ 17342 goto done; 17343 } 17344 } else { 17345 /* 17346 * This is in clear. The RST message we are building 17347 * here should go out in clear, independent of our policy. 17348 */ 17349 ixa->ixa_flags |= IXAF_NO_IPSEC; 17350 } 17351 17352 /* 17353 * NOTE: one might consider tracing a TCP packet here, but 17354 * this function has no active TCP state and no tcp structure 17355 * that has a trace buffer. If we traced here, we would have 17356 * to keep a local trace buffer in tcp_record_trace(). 17357 */ 17358 17359 (void) ip_output_simple(mp, ixa); 17360 done: 17361 ixa_cleanup(ixa); 17362 if (need_refrele) { 17363 ASSERT(ixa != &ixas); 17364 ixa_refrele(ixa); 17365 } 17366 } 17367 17368 /* 17369 * Initiate closedown sequence on an active connection. (May be called as 17370 * writer.) Return value zero for OK return, non-zero for error return. 17371 */ 17372 static int 17373 tcp_xmit_end(tcp_t *tcp) 17374 { 17375 mblk_t *mp; 17376 tcp_stack_t *tcps = tcp->tcp_tcps; 17377 iulp_t uinfo; 17378 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 17379 conn_t *connp = tcp->tcp_connp; 17380 17381 if (tcp->tcp_state < TCPS_SYN_RCVD || 17382 tcp->tcp_state > TCPS_CLOSE_WAIT) { 17383 /* 17384 * Invalid state, only states TCPS_SYN_RCVD, 17385 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid 17386 */ 17387 return (-1); 17388 } 17389 17390 tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 17391 tcp->tcp_valid_bits |= TCP_FSS_VALID; 17392 /* 17393 * If there is nothing more unsent, send the FIN now. 17394 * Otherwise, it will go out with the last segment. 17395 */ 17396 if (tcp->tcp_unsent == 0) { 17397 mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 17398 tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 17399 17400 if (mp) { 17401 tcp_send_data(tcp, mp); 17402 } else { 17403 /* 17404 * Couldn't allocate msg. Pretend we got it out. 17405 * Wait for rexmit timeout. 17406 */ 17407 tcp->tcp_snxt = tcp->tcp_fss + 1; 17408 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 17409 } 17410 17411 /* 17412 * If needed, update tcp_rexmit_snxt as tcp_snxt is 17413 * changed. 17414 */ 17415 if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 17416 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 17417 } 17418 } else { 17419 /* 17420 * If tcp->tcp_cork is set, then the data will not get sent, 17421 * so we have to check that and unset it first. 
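 * (Clearing tcp_cork lets the tcp_wput_data() call below flush the
 * queued data, so the FIN set up above goes out on the last
 * segment.)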
17422 */ 17423 if (tcp->tcp_cork) 17424 tcp->tcp_cork = B_FALSE; 17425 tcp_wput_data(tcp, NULL, B_FALSE); 17426 } 17427 17428 /* 17429 * If TCP does not get enough samples of RTT or tcp_rtt_updates 17430 * is 0, don't update the cache. 17431 */ 17432 if (tcps->tcps_rtt_updates == 0 || 17433 tcp->tcp_rtt_update < tcps->tcps_rtt_updates) 17434 return (0); 17435 17436 /* 17437 * We do not have a good algorithm to update ssthresh at this time. 17438 * So don't do any update. 17439 */ 17440 bzero(&uinfo, sizeof (uinfo)); 17441 uinfo.iulp_rtt = tcp->tcp_rtt_sa; 17442 uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; 17443 17444 /* 17445 * Note that uinfo is kept for conn_faddr in the DCE. Could update even 17446 * if source routed but we don't. 17447 */ 17448 if (connp->conn_ipversion == IPV4_VERSION) { 17449 if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) { 17450 return (0); 17451 } 17452 (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst); 17453 } else { 17454 uint_t ifindex; 17455 17456 if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, 17457 &tcp->tcp_ip6h->ip6_dst))) { 17458 return (0); 17459 } 17460 ifindex = 0; 17461 if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) { 17462 ip_xmit_attr_t *ixa = connp->conn_ixa; 17463 17464 /* 17465 * If we are going to create a DCE we'd better have 17466 * an ifindex 17467 */ 17468 if (ixa->ixa_nce != NULL) { 17469 ifindex = ixa->ixa_nce->nce_common->ncec_ill-> 17470 ill_phyint->phyint_ifindex; 17471 } else { 17472 return (0); 17473 } 17474 } 17475 17476 (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo, 17477 ipst); 17478 } 17479 return (0); 17480 } 17481 17482 /* 17483 * Generate a "no listener here" RST in response to an "unknown" segment. 17484 * connp is set by caller when RST is in response to an unexpected 17485 * inbound packet for which there is active tcp state in the system. 17486 * Note that we are reusing the incoming mp to construct the outgoing RST. 17487 */ 17488 void 17489 tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst, 17490 conn_t *connp) 17491 { 17492 uchar_t *rptr; 17493 uint32_t seg_len; 17494 tcpha_t *tcpha; 17495 uint32_t seg_seq; 17496 uint32_t seg_ack; 17497 uint_t flags; 17498 ipha_t *ipha; 17499 ip6_t *ip6h; 17500 boolean_t policy_present; 17501 netstack_t *ns = ipst->ips_netstack; 17502 tcp_stack_t *tcps = ns->netstack_tcp; 17503 ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec; 17504 uint_t ip_hdr_len = ira->ira_ip_hdr_length; 17505 17506 TCP_STAT(tcps, tcp_no_listener); 17507 17508 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 17509 policy_present = ipss->ipsec_inbound_v4_policy_present; 17510 ipha = (ipha_t *)mp->b_rptr; 17511 ip6h = NULL; 17512 } else { 17513 policy_present = ipss->ipsec_inbound_v6_policy_present; 17514 ipha = NULL; 17515 ip6h = (ip6_t *)mp->b_rptr; 17516 } 17517 17518 if (policy_present) { 17519 /* 17520 * The conn_t parameter is NULL because we already know 17521 * nobody's home. 
17522 */ 17523 mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h, 17524 ira, ns); 17525 if (mp == NULL) 17526 return; 17527 } 17528 if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { 17529 DTRACE_PROBE2( 17530 tx__ip__log__error__nolistener__tcp, 17531 char *, "Could not reply with RST to mp(1)", 17532 mblk_t *, mp); 17533 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); 17534 freemsg(mp); 17535 return; 17536 } 17537 17538 rptr = mp->b_rptr; 17539 17540 tcpha = (tcpha_t *)&rptr[ip_hdr_len]; 17541 seg_seq = ntohl(tcpha->tha_seq); 17542 seg_ack = ntohl(tcpha->tha_ack); 17543 flags = tcpha->tha_flags; 17544 17545 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len); 17546 if (flags & TH_RST) { 17547 freemsg(mp); 17548 } else if (flags & TH_ACK) { 17549 tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST, 17550 ira, ipst, connp); 17551 } else { 17552 if (flags & TH_SYN) { 17553 seg_len++; 17554 } else { 17555 /* 17556 * Here we violate the RFC. Note that a normal 17557 * TCP will never send a segment without the ACK 17558 * flag, except for RST or SYN segment. This 17559 * segment is neither. Just drop it on the 17560 * floor. 17561 */ 17562 freemsg(mp); 17563 TCP_STAT(tcps, tcp_rst_unsent); 17564 return; 17565 } 17566 17567 tcp_xmit_early_reset("no tcp, reset/ack", mp, 0, 17568 seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp); 17569 } 17570 } 17571 17572 /* 17573 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 17574 * ip and tcp header ready to pass down to IP. If the mp passed in is 17575 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 17576 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 17577 * otherwise it will dup partial mblks.) 17578 * Otherwise, an appropriate ACK packet will be generated. This 17579 * routine is not usually called to send new data for the first time. It 17580 * is mostly called out of the timer for retransmits, and to generate ACKs. 17581 * 17582 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 17583 * be adjusted by *offset. And after dupb(), the offset and the ending mblk 17584 * of the original mblk chain will be returned in *offset and *end_mp. 17585 */ 17586 mblk_t * 17587 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 17588 mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 17589 boolean_t rexmit) 17590 { 17591 int data_length; 17592 int32_t off = 0; 17593 uint_t flags; 17594 mblk_t *mp1; 17595 mblk_t *mp2; 17596 uchar_t *rptr; 17597 tcpha_t *tcpha; 17598 int32_t num_sack_blk = 0; 17599 int32_t sack_opt_len = 0; 17600 tcp_stack_t *tcps = tcp->tcp_tcps; 17601 conn_t *connp = tcp->tcp_connp; 17602 ip_xmit_attr_t *ixa = connp->conn_ixa; 17603 17604 /* Allocate for our maximum TCP header + link-level */ 17605 mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, 17606 BPRI_MED); 17607 if (!mp1) 17608 return (NULL); 17609 data_length = 0; 17610 17611 /* 17612 * Note that tcp_mss has been adjusted to take into account the 17613 * timestamp option if applicable. Because SACK options do not 17614 * appear in every TCP segments and they are of variable lengths, 17615 * they cannot be included in tcp_mss. Thus we need to calculate 17616 * the actual segment length when we need to send a segment which 17617 * includes SACK options. 
17618 */ 17619 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 17620 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 17621 tcp->tcp_num_sack_blk); 17622 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 17623 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 17624 if (max_to_send + sack_opt_len > tcp->tcp_mss) 17625 max_to_send -= sack_opt_len; 17626 } 17627 17628 if (offset != NULL) { 17629 off = *offset; 17630 /* We use offset as an indicator that end_mp is not NULL. */ 17631 *end_mp = NULL; 17632 } 17633 for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { 17634 /* This could be faster with cooperation from downstream */ 17635 if (mp2 != mp1 && !sendall && 17636 data_length + (int)(mp->b_wptr - mp->b_rptr) > 17637 max_to_send) 17638 /* 17639 * Don't send the next mblk since the whole mblk 17640 * does not fit. 17641 */ 17642 break; 17643 mp2->b_cont = dupb(mp); 17644 mp2 = mp2->b_cont; 17645 if (!mp2) { 17646 freemsg(mp1); 17647 return (NULL); 17648 } 17649 mp2->b_rptr += off; 17650 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 17651 (uintptr_t)INT_MAX); 17652 17653 data_length += (int)(mp2->b_wptr - mp2->b_rptr); 17654 if (data_length > max_to_send) { 17655 mp2->b_wptr -= data_length - max_to_send; 17656 data_length = max_to_send; 17657 off = mp2->b_wptr - mp->b_rptr; 17658 break; 17659 } else { 17660 off = 0; 17661 } 17662 } 17663 if (offset != NULL) { 17664 *offset = off; 17665 *end_mp = mp; 17666 } 17667 if (seg_len != NULL) { 17668 *seg_len = data_length; 17669 } 17670 17671 /* Update the latest receive window size in TCP header. */ 17672 tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 17673 17674 rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; 17675 mp1->b_rptr = rptr; 17676 mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len; 17677 bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); 17678 tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; 17679 tcpha->tha_seq = htonl(seq); 17680 17681 /* 17682 * Use tcp_unsent to determine if the PUSH bit should be used assumes 17683 * that this function was called from tcp_wput_data. Thus, when called 17684 * to retransmit data the setting of the PUSH bit may appear some 17685 * what random in that it might get set when it should not. This 17686 * should not pose any performance issues. 17687 */ 17688 if (data_length != 0 && (tcp->tcp_unsent == 0 || 17689 tcp->tcp_unsent == data_length)) { 17690 flags = TH_ACK | TH_PUSH; 17691 } else { 17692 flags = TH_ACK; 17693 } 17694 17695 if (tcp->tcp_ecn_ok) { 17696 if (tcp->tcp_ecn_echo_on) 17697 flags |= TH_ECE; 17698 17699 /* 17700 * Only set ECT bit and ECN_CWR if a segment contains new data. 17701 * There is no TCP flow control for non-data segments, and 17702 * only data segment is transmitted reliably. 17703 */ 17704 if (data_length > 0 && !rexmit) { 17705 SET_ECT(tcp, rptr); 17706 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 17707 flags |= TH_CWR; 17708 tcp->tcp_ecn_cwr_sent = B_TRUE; 17709 } 17710 } 17711 } 17712 17713 if (tcp->tcp_valid_bits) { 17714 uint32_t u1; 17715 17716 if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && 17717 seq == tcp->tcp_iss) { 17718 uchar_t *wptr; 17719 17720 /* 17721 * If TCP_ISS_VALID and the seq number is tcp_iss, 17722 * TCP can only be in SYN-SENT, SYN-RCVD or 17723 * FIN-WAIT-1 state. It can be FIN-WAIT-1 if 17724 * our SYN is not ack'ed but the app closes this 17725 * TCP connection. 
17726 */ 17727 ASSERT(tcp->tcp_state == TCPS_SYN_SENT || 17728 tcp->tcp_state == TCPS_SYN_RCVD || 17729 tcp->tcp_state == TCPS_FIN_WAIT_1); 17730 17731 /* 17732 * Tack on the MSS option. It is always needed 17733 * for both active and passive open. 17734 * 17735 * MSS option value should be interface MTU - MIN 17736 * TCP/IP header according to RFC 793 as it means 17737 * the maximum segment size TCP can receive. But 17738 * to get around some broken middle boxes/end hosts 17739 * out there, we allow the option value to be the 17740 * same as the MSS option size on the peer side. 17741 * In this way, the other side will not send 17742 * anything larger than they can receive. 17743 * 17744 * Note that for SYN_SENT state, the ndd param 17745 * tcp_use_smss_as_mss_opt has no effect as we 17746 * don't know the peer's MSS option value. So 17747 * the only case we need to take care of is in 17748 * SYN_RCVD state, which is done later. 17749 */ 17750 wptr = mp1->b_wptr; 17751 wptr[0] = TCPOPT_MAXSEG; 17752 wptr[1] = TCPOPT_MAXSEG_LEN; 17753 wptr += 2; 17754 u1 = tcp->tcp_initial_pmtu - 17755 (connp->conn_ipversion == IPV4_VERSION ? 17756 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - 17757 TCP_MIN_HEADER_LENGTH; 17758 U16_TO_BE16(u1, wptr); 17759 mp1->b_wptr = wptr + 2; 17760 /* Update the offset to cover the additional word */ 17761 tcpha->tha_offset_and_reserved += (1 << 4); 17762 17763 /* 17764 * Note that the following way of filling in 17765 * TCP options are not optimal. Some NOPs can 17766 * be saved. But there is no need at this time 17767 * to optimize it. When it is needed, we will 17768 * do it. 17769 */ 17770 switch (tcp->tcp_state) { 17771 case TCPS_SYN_SENT: 17772 flags = TH_SYN; 17773 17774 if (tcp->tcp_snd_ts_ok) { 17775 uint32_t llbolt = 17776 (uint32_t)LBOLT_FASTPATH; 17777 17778 wptr = mp1->b_wptr; 17779 wptr[0] = TCPOPT_NOP; 17780 wptr[1] = TCPOPT_NOP; 17781 wptr[2] = TCPOPT_TSTAMP; 17782 wptr[3] = TCPOPT_TSTAMP_LEN; 17783 wptr += 4; 17784 U32_TO_BE32(llbolt, wptr); 17785 wptr += 4; 17786 ASSERT(tcp->tcp_ts_recent == 0); 17787 U32_TO_BE32(0L, wptr); 17788 mp1->b_wptr += TCPOPT_REAL_TS_LEN; 17789 tcpha->tha_offset_and_reserved += 17790 (3 << 4); 17791 } 17792 17793 /* 17794 * Set up all the bits to tell other side 17795 * we are ECN capable. 17796 */ 17797 if (tcp->tcp_ecn_ok) { 17798 flags |= (TH_ECE | TH_CWR); 17799 } 17800 break; 17801 case TCPS_SYN_RCVD: 17802 flags |= TH_SYN; 17803 17804 /* 17805 * Reset the MSS option value to be SMSS 17806 * We should probably add back the bytes 17807 * for timestamp option and IPsec. We 17808 * don't do that as this is a workaround 17809 * for broken middle boxes/end hosts, it 17810 * is better for us to be more cautious. 17811 * They may not take these things into 17812 * account in their SMSS calculation. Thus 17813 * the peer's calculated SMSS may be smaller 17814 * than what it can be. This should be OK. 17815 */ 17816 if (tcps->tcps_use_smss_as_mss_opt) { 17817 u1 = tcp->tcp_mss; 17818 U16_TO_BE16(u1, wptr); 17819 } 17820 17821 /* 17822 * If the other side is ECN capable, reply 17823 * that we are also ECN capable. 17824 */ 17825 if (tcp->tcp_ecn_ok) 17826 flags |= TH_ECE; 17827 break; 17828 default: 17829 /* 17830 * The above ASSERT() makes sure that this 17831 * must be FIN-WAIT-1 state. Our SYN has 17832 * not been ack'ed so retransmit it. 
17833 */ 17834 flags |= TH_SYN; 17835 break; 17836 } 17837 17838 if (tcp->tcp_snd_ws_ok) { 17839 wptr = mp1->b_wptr; 17840 wptr[0] = TCPOPT_NOP; 17841 wptr[1] = TCPOPT_WSCALE; 17842 wptr[2] = TCPOPT_WS_LEN; 17843 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 17844 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 17845 tcpha->tha_offset_and_reserved += (1 << 4); 17846 } 17847 17848 if (tcp->tcp_snd_sack_ok) { 17849 wptr = mp1->b_wptr; 17850 wptr[0] = TCPOPT_NOP; 17851 wptr[1] = TCPOPT_NOP; 17852 wptr[2] = TCPOPT_SACK_PERMITTED; 17853 wptr[3] = TCPOPT_SACK_OK_LEN; 17854 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 17855 tcpha->tha_offset_and_reserved += (1 << 4); 17856 } 17857 17858 /* allocb() of adequate mblk assures space */ 17859 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 17860 (uintptr_t)INT_MAX); 17861 u1 = (int)(mp1->b_wptr - mp1->b_rptr); 17862 /* 17863 * Get IP set to checksum on our behalf 17864 * Include the adjustment for a source route if any. 17865 */ 17866 u1 += connp->conn_sum; 17867 u1 = (u1 >> 16) + (u1 & 0xFFFF); 17868 tcpha->tha_sum = htons(u1); 17869 BUMP_MIB(&tcps->tcps_mib, tcpOutControl); 17870 } 17871 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 17872 (seq + data_length) == tcp->tcp_fss) { 17873 if (!tcp->tcp_fin_acked) { 17874 flags |= TH_FIN; 17875 BUMP_MIB(&tcps->tcps_mib, tcpOutControl); 17876 } 17877 if (!tcp->tcp_fin_sent) { 17878 tcp->tcp_fin_sent = B_TRUE; 17879 switch (tcp->tcp_state) { 17880 case TCPS_SYN_RCVD: 17881 case TCPS_ESTABLISHED: 17882 tcp->tcp_state = TCPS_FIN_WAIT_1; 17883 break; 17884 case TCPS_CLOSE_WAIT: 17885 tcp->tcp_state = TCPS_LAST_ACK; 17886 break; 17887 } 17888 if (tcp->tcp_suna == tcp->tcp_snxt) 17889 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 17890 tcp->tcp_snxt = tcp->tcp_fss + 1; 17891 } 17892 } 17893 /* 17894 * Note the trick here. u1 is unsigned. When tcp_urg 17895 * is smaller than seq, u1 will become a very huge value. 17896 * So the comparison will fail. Also note that tcp_urp 17897 * should be positive, see RFC 793 page 17. 
17898 */ 17899 u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION; 17900 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 && 17901 u1 < (uint32_t)(64 * 1024)) { 17902 flags |= TH_URG; 17903 BUMP_MIB(&tcps->tcps_mib, tcpOutUrg); 17904 tcpha->tha_urp = htons(u1); 17905 } 17906 } 17907 tcpha->tha_flags = (uchar_t)flags; 17908 tcp->tcp_rack = tcp->tcp_rnxt; 17909 tcp->tcp_rack_cnt = 0; 17910 17911 if (tcp->tcp_snd_ts_ok) { 17912 if (tcp->tcp_state != TCPS_SYN_SENT) { 17913 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 17914 17915 U32_TO_BE32(llbolt, 17916 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 17917 U32_TO_BE32(tcp->tcp_ts_recent, 17918 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 17919 } 17920 } 17921 17922 if (num_sack_blk > 0) { 17923 uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len; 17924 sack_blk_t *tmp; 17925 int32_t i; 17926 17927 wptr[0] = TCPOPT_NOP; 17928 wptr[1] = TCPOPT_NOP; 17929 wptr[2] = TCPOPT_SACK; 17930 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 17931 sizeof (sack_blk_t); 17932 wptr += TCPOPT_REAL_SACK_LEN; 17933 17934 tmp = tcp->tcp_sack_list; 17935 for (i = 0; i < num_sack_blk; i++) { 17936 U32_TO_BE32(tmp[i].begin, wptr); 17937 wptr += sizeof (tcp_seq); 17938 U32_TO_BE32(tmp[i].end, wptr); 17939 wptr += sizeof (tcp_seq); 17940 } 17941 tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); 17942 } 17943 ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 17944 data_length += (int)(mp1->b_wptr - rptr); 17945 17946 ixa->ixa_pktlen = data_length; 17947 17948 if (ixa->ixa_flags & IXAF_IS_IPV4) { 17949 ((ipha_t *)rptr)->ipha_length = htons(data_length); 17950 } else { 17951 ip6_t *ip6 = (ip6_t *)rptr; 17952 17953 ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN); 17954 } 17955 17956 /* 17957 * Prime pump for IP 17958 * Include the adjustment for a source route if any. 17959 */ 17960 data_length -= ixa->ixa_ip_hdr_length; 17961 data_length += connp->conn_sum; 17962 data_length = (data_length >> 16) + (data_length & 0xFFFF); 17963 tcpha->tha_sum = htons(data_length); 17964 if (tcp->tcp_ip_forward_progress) { 17965 tcp->tcp_ip_forward_progress = B_FALSE; 17966 connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; 17967 } else { 17968 connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; 17969 } 17970 return (mp1); 17971 } 17972 17973 /* This function handles the push timeout. */ 17974 void 17975 tcp_push_timer(void *arg) 17976 { 17977 conn_t *connp = (conn_t *)arg; 17978 tcp_t *tcp = connp->conn_tcp; 17979 17980 TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt); 17981 17982 ASSERT(tcp->tcp_listener == NULL); 17983 17984 ASSERT(!IPCL_IS_NONSTR(connp)); 17985 17986 tcp->tcp_push_tid = 0; 17987 17988 if (tcp->tcp_rcv_list != NULL && 17989 tcp_rcv_drain(tcp) == TH_ACK_NEEDED) 17990 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 17991 } 17992 17993 /* 17994 * This function handles delayed ACK timeout. 17995 */ 17996 static void 17997 tcp_ack_timer(void *arg) 17998 { 17999 conn_t *connp = (conn_t *)arg; 18000 tcp_t *tcp = connp->conn_tcp; 18001 mblk_t *mp; 18002 tcp_stack_t *tcps = tcp->tcp_tcps; 18003 18004 TCP_DBGSTAT(tcps, tcp_ack_timer_cnt); 18005 18006 tcp->tcp_ack_tid = 0; 18007 18008 if (tcp->tcp_fused) 18009 return; 18010 18011 /* 18012 * Do not send ACK if there is no outstanding unack'ed data. 18013 */ 18014 if (tcp->tcp_rnxt == tcp->tcp_rack) { 18015 return; 18016 } 18017 18018 if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) { 18019 /* 18020 * Make sure we don't allow deferred ACKs to result in 18021 * timer-based ACKing. 
If we have held off an ACK 18022 * when there was more than an mss here, and the timer 18023 * goes off, we have to worry about the possibility 18024 * that the sender isn't doing slow-start, or is out 18025 * of step with us for some other reason. We fall 18026 * permanently back in the direction of 18027 * ACK-every-other-packet as suggested in RFC 1122. 18028 */ 18029 if (tcp->tcp_rack_abs_max > 2) 18030 tcp->tcp_rack_abs_max--; 18031 tcp->tcp_rack_cur_max = 2; 18032 } 18033 mp = tcp_ack_mp(tcp); 18034 18035 if (mp != NULL) { 18036 BUMP_LOCAL(tcp->tcp_obsegs); 18037 BUMP_MIB(&tcps->tcps_mib, tcpOutAck); 18038 BUMP_MIB(&tcps->tcps_mib, tcpOutAckDelayed); 18039 tcp_send_data(tcp, mp); 18040 } 18041 } 18042 18043 18044 /* Generate an ACK-only (no data) segment for a TCP endpoint */ 18045 static mblk_t * 18046 tcp_ack_mp(tcp_t *tcp) 18047 { 18048 uint32_t seq_no; 18049 tcp_stack_t *tcps = tcp->tcp_tcps; 18050 conn_t *connp = tcp->tcp_connp; 18051 18052 /* 18053 * There are a few cases to be considered while setting the sequence no. 18054 * Essentially, we can come here while processing an unacceptable pkt 18055 * in the TCPS_SYN_RCVD state, in which case we set the sequence number 18056 * to snxt (per RFC 793), note the swnd wouldn't have been set yet. 18057 * If we are here for a zero window probe, stick with suna. In all 18058 * other cases, we check if suna + swnd encompasses snxt and set 18059 * the sequence number to snxt, if so. If snxt falls outside the 18060 * window (the receiver probably shrunk its window), we will go with 18061 * suna + swnd, otherwise the sequence no will be unacceptable to the 18062 * receiver. 18063 */ 18064 if (tcp->tcp_zero_win_probe) { 18065 seq_no = tcp->tcp_suna; 18066 } else if (tcp->tcp_state == TCPS_SYN_RCVD) { 18067 ASSERT(tcp->tcp_swnd == 0); 18068 seq_no = tcp->tcp_snxt; 18069 } else { 18070 seq_no = SEQ_GT(tcp->tcp_snxt, 18071 (tcp->tcp_suna + tcp->tcp_swnd)) ? 18072 (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt; 18073 } 18074 18075 if (tcp->tcp_valid_bits) { 18076 /* 18077 * For the complex case where we have to send some 18078 * controls (FIN or SYN), let tcp_xmit_mp do it. 18079 */ 18080 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE, 18081 NULL, B_FALSE)); 18082 } else { 18083 /* Generate a simple ACK */ 18084 int data_length; 18085 uchar_t *rptr; 18086 tcpha_t *tcpha; 18087 mblk_t *mp1; 18088 int32_t total_hdr_len; 18089 int32_t tcp_hdr_len; 18090 int32_t num_sack_blk = 0; 18091 int32_t sack_opt_len; 18092 ip_xmit_attr_t *ixa = connp->conn_ixa; 18093 18094 /* 18095 * Allocate space for TCP + IP headers 18096 * and link-level header 18097 */ 18098 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 18099 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 18100 tcp->tcp_num_sack_blk); 18101 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 18102 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 18103 total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len; 18104 tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len; 18105 } else { 18106 total_hdr_len = connp->conn_ht_iphc_len; 18107 tcp_hdr_len = connp->conn_ht_ulp_len; 18108 } 18109 mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); 18110 if (!mp1) 18111 return (NULL); 18112 18113 /* Update the latest receive window size in TCP header. 
*/ 18114 tcp->tcp_tcpha->tha_win = 18115 htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 18116 /* copy in prototype TCP + IP header */ 18117 rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; 18118 mp1->b_rptr = rptr; 18119 mp1->b_wptr = rptr + total_hdr_len; 18120 bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); 18121 18122 tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; 18123 18124 /* Set the TCP sequence number. */ 18125 tcpha->tha_seq = htonl(seq_no); 18126 18127 /* Set up the TCP flag field. */ 18128 tcpha->tha_flags = (uchar_t)TH_ACK; 18129 if (tcp->tcp_ecn_echo_on) 18130 tcpha->tha_flags |= TH_ECE; 18131 18132 tcp->tcp_rack = tcp->tcp_rnxt; 18133 tcp->tcp_rack_cnt = 0; 18134 18135 /* fill in timestamp option if in use */ 18136 if (tcp->tcp_snd_ts_ok) { 18137 uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; 18138 18139 U32_TO_BE32(llbolt, 18140 (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); 18141 U32_TO_BE32(tcp->tcp_ts_recent, 18142 (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); 18143 } 18144 18145 /* Fill in SACK options */ 18146 if (num_sack_blk > 0) { 18147 uchar_t *wptr = (uchar_t *)tcpha + 18148 connp->conn_ht_ulp_len; 18149 sack_blk_t *tmp; 18150 int32_t i; 18151 18152 wptr[0] = TCPOPT_NOP; 18153 wptr[1] = TCPOPT_NOP; 18154 wptr[2] = TCPOPT_SACK; 18155 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 18156 sizeof (sack_blk_t); 18157 wptr += TCPOPT_REAL_SACK_LEN; 18158 18159 tmp = tcp->tcp_sack_list; 18160 for (i = 0; i < num_sack_blk; i++) { 18161 U32_TO_BE32(tmp[i].begin, wptr); 18162 wptr += sizeof (tcp_seq); 18163 U32_TO_BE32(tmp[i].end, wptr); 18164 wptr += sizeof (tcp_seq); 18165 } 18166 tcpha->tha_offset_and_reserved += 18167 ((num_sack_blk * 2 + 1) << 4); 18168 } 18169 18170 ixa->ixa_pktlen = total_hdr_len; 18171 18172 if (ixa->ixa_flags & IXAF_IS_IPV4) { 18173 ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len); 18174 } else { 18175 ip6_t *ip6 = (ip6_t *)rptr; 18176 18177 ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); 18178 } 18179 18180 /* 18181 * Prime pump for checksum calculation in IP. Include the 18182 * adjustment for a source route if any. 18183 */ 18184 data_length = tcp_hdr_len + connp->conn_sum; 18185 data_length = (data_length >> 16) + (data_length & 0xFFFF); 18186 tcpha->tha_sum = htons(data_length); 18187 18188 if (tcp->tcp_ip_forward_progress) { 18189 tcp->tcp_ip_forward_progress = B_FALSE; 18190 connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; 18191 } else { 18192 connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; 18193 } 18194 return (mp1); 18195 } 18196 } 18197 18198 /* 18199 * Hash list insertion routine for tcp_t structures. Each hash bucket 18200 * contains a list of tcp_t entries, and each entry is bound to a unique 18201 * port. If there are multiple tcp_t's that are bound to the same port, then 18202 * one of them will be linked into the hash bucket list, and the rest will 18203 * hang off of that one entry. For each port, entries bound to a specific IP 18204 * address will be inserted before those those bound to INADDR_ANY. 
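* Concretely, only the first entry for a given port sits on the bucket's tcp_bind_hash chain; any further entries bound to that same port hang from it through their tcp_bind_hash_port links, so a lookup walks one node per distinct port.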
18205 */ 18206 static void 18207 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) 18208 { 18209 tcp_t **tcpp; 18210 tcp_t *tcpnext; 18211 tcp_t *tcphash; 18212 conn_t *connp = tcp->tcp_connp; 18213 conn_t *connext; 18214 18215 if (tcp->tcp_ptpbhn != NULL) { 18216 ASSERT(!caller_holds_lock); 18217 tcp_bind_hash_remove(tcp); 18218 } 18219 tcpp = &tbf->tf_tcp; 18220 if (!caller_holds_lock) { 18221 mutex_enter(&tbf->tf_lock); 18222 } else { 18223 ASSERT(MUTEX_HELD(&tbf->tf_lock)); 18224 } 18225 tcphash = tcpp[0]; 18226 tcpnext = NULL; 18227 if (tcphash != NULL) { 18228 /* Look for an entry using the same port */ 18229 while ((tcphash = tcpp[0]) != NULL && 18230 connp->conn_lport != tcphash->tcp_connp->conn_lport) 18231 tcpp = &(tcphash->tcp_bind_hash); 18232 18233 /* The port was not found, just add to the end */ 18234 if (tcphash == NULL) 18235 goto insert; 18236 18237 /* 18238 * OK, there already exists an entry bound to the 18239 * same port. 18240 * 18241 * If the new tcp bound to the INADDR_ANY address 18242 * and the first one in the list is not bound to 18243 * INADDR_ANY we skip all entries until we find the 18244 * first one bound to INADDR_ANY. 18245 * This makes sure that applications binding to a 18246 * specific address get preference over those binding to 18247 * INADDR_ANY. 18248 */ 18249 tcpnext = tcphash; 18250 connext = tcpnext->tcp_connp; 18251 tcphash = NULL; 18252 if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && 18253 !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { 18254 while ((tcpnext = tcpp[0]) != NULL) { 18255 connext = tcpnext->tcp_connp; 18256 if (!V6_OR_V4_INADDR_ANY( 18257 connext->conn_bound_addr_v6)) 18258 tcpp = &(tcpnext->tcp_bind_hash_port); 18259 else 18260 break; 18261 } 18262 if (tcpnext != NULL) { 18263 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; 18264 tcphash = tcpnext->tcp_bind_hash; 18265 if (tcphash != NULL) { 18266 tcphash->tcp_ptpbhn = 18267 &(tcp->tcp_bind_hash); 18268 tcpnext->tcp_bind_hash = NULL; 18269 } 18270 } 18271 } else { 18272 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; 18273 tcphash = tcpnext->tcp_bind_hash; 18274 if (tcphash != NULL) { 18275 tcphash->tcp_ptpbhn = 18276 &(tcp->tcp_bind_hash); 18277 tcpnext->tcp_bind_hash = NULL; 18278 } 18279 } 18280 } 18281 insert: 18282 tcp->tcp_bind_hash_port = tcpnext; 18283 tcp->tcp_bind_hash = tcphash; 18284 tcp->tcp_ptpbhn = tcpp; 18285 tcpp[0] = tcp; 18286 if (!caller_holds_lock) 18287 mutex_exit(&tbf->tf_lock); 18288 } 18289 18290 /* 18291 * Hash list removal routine for tcp_t structures. 18292 */ 18293 static void 18294 tcp_bind_hash_remove(tcp_t *tcp) 18295 { 18296 tcp_t *tcpnext; 18297 kmutex_t *lockp; 18298 tcp_stack_t *tcps = tcp->tcp_tcps; 18299 conn_t *connp = tcp->tcp_connp; 18300 18301 if (tcp->tcp_ptpbhn == NULL) 18302 return; 18303 18304 /* 18305 * Extract the lock pointer in case there are concurrent 18306 * hash_remove's for this instance. 
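* If another entry is bound to the same port (tcp_bind_hash_port), it is promoted into this entry's place in the chain and inherits any tcp_bind_hash link to the next port group; otherwise the chain is simply spliced through to the next port group.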
18307 */ 18308 ASSERT(connp->conn_lport != 0); 18309 lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH( 18310 connp->conn_lport)].tf_lock; 18311 18312 ASSERT(lockp != NULL); 18313 mutex_enter(lockp); 18314 if (tcp->tcp_ptpbhn) { 18315 tcpnext = tcp->tcp_bind_hash_port; 18316 if (tcpnext != NULL) { 18317 tcp->tcp_bind_hash_port = NULL; 18318 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 18319 tcpnext->tcp_bind_hash = tcp->tcp_bind_hash; 18320 if (tcpnext->tcp_bind_hash != NULL) { 18321 tcpnext->tcp_bind_hash->tcp_ptpbhn = 18322 &(tcpnext->tcp_bind_hash); 18323 tcp->tcp_bind_hash = NULL; 18324 } 18325 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) { 18326 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 18327 tcp->tcp_bind_hash = NULL; 18328 } 18329 *tcp->tcp_ptpbhn = tcpnext; 18330 tcp->tcp_ptpbhn = NULL; 18331 } 18332 mutex_exit(lockp); 18333 } 18334 18335 18336 /* 18337 * Hash list lookup routine for tcp_t structures. 18338 * Returns with a CONN_INC_REF tcp structure. Caller must do a CONN_DEC_REF. 18339 */ 18340 static tcp_t * 18341 tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *tcps) 18342 { 18343 tf_t *tf; 18344 tcp_t *tcp; 18345 18346 tf = &tcps->tcps_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 18347 mutex_enter(&tf->tf_lock); 18348 for (tcp = tf->tf_tcp; tcp != NULL; 18349 tcp = tcp->tcp_acceptor_hash) { 18350 if (tcp->tcp_acceptor_id == id) { 18351 CONN_INC_REF(tcp->tcp_connp); 18352 mutex_exit(&tf->tf_lock); 18353 return (tcp); 18354 } 18355 } 18356 mutex_exit(&tf->tf_lock); 18357 return (NULL); 18358 } 18359 18360 18361 /* 18362 * Hash list insertion routine for tcp_t structures. 18363 */ 18364 void 18365 tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp) 18366 { 18367 tf_t *tf; 18368 tcp_t **tcpp; 18369 tcp_t *tcpnext; 18370 tcp_stack_t *tcps = tcp->tcp_tcps; 18371 18372 tf = &tcps->tcps_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 18373 18374 if (tcp->tcp_ptpahn != NULL) 18375 tcp_acceptor_hash_remove(tcp); 18376 tcpp = &tf->tf_tcp; 18377 mutex_enter(&tf->tf_lock); 18378 tcpnext = tcpp[0]; 18379 if (tcpnext) 18380 tcpnext->tcp_ptpahn = &tcp->tcp_acceptor_hash; 18381 tcp->tcp_acceptor_hash = tcpnext; 18382 tcp->tcp_ptpahn = tcpp; 18383 tcpp[0] = tcp; 18384 tcp->tcp_acceptor_lockp = &tf->tf_lock; /* For tcp_*_hash_remove */ 18385 mutex_exit(&tf->tf_lock); 18386 } 18387 18388 /* 18389 * Hash list removal routine for tcp_t structures. 18390 */ 18391 static void 18392 tcp_acceptor_hash_remove(tcp_t *tcp) 18393 { 18394 tcp_t *tcpnext; 18395 kmutex_t *lockp; 18396 18397 /* 18398 * Extract the lock pointer in case there are concurrent 18399 * hash_remove's for this instance. 18400 */ 18401 lockp = tcp->tcp_acceptor_lockp; 18402 18403 if (tcp->tcp_ptpahn == NULL) 18404 return; 18405 18406 ASSERT(lockp != NULL); 18407 mutex_enter(lockp); 18408 if (tcp->tcp_ptpahn) { 18409 tcpnext = tcp->tcp_acceptor_hash; 18410 if (tcpnext) { 18411 tcpnext->tcp_ptpahn = tcp->tcp_ptpahn; 18412 tcp->tcp_acceptor_hash = NULL; 18413 } 18414 *tcp->tcp_ptpahn = tcpnext; 18415 tcp->tcp_ptpahn = NULL; 18416 } 18417 mutex_exit(lockp); 18418 tcp->tcp_acceptor_lockp = NULL; 18419 } 18420 18421 /* 18422 * Type three generator adapted from the random() function in 4.4 BSD: 18423 */ 18424 18425 /* 18426 * Copyright (c) 1983, 1993 18427 * The Regents of the University of California. All rights reserved. 18428 * 18429 * Redistribution and use in source and binary forms, with or without 18430 * modification, are permitted provided that the following conditions 18431 * are met: 18432 * 1. 
Redistributions of source code must retain the above copyright 18433 * notice, this list of conditions and the following disclaimer. 18434 * 2. Redistributions in binary form must reproduce the above copyright 18435 * notice, this list of conditions and the following disclaimer in the 18436 * documentation and/or other materials provided with the distribution. 18437 * 3. All advertising materials mentioning features or use of this software 18438 * must display the following acknowledgement: 18439 * This product includes software developed by the University of 18440 * California, Berkeley and its contributors. 18441 * 4. Neither the name of the University nor the names of its contributors 18442 * may be used to endorse or promote products derived from this software 18443 * without specific prior written permission. 18444 * 18445 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18446 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18447 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18448 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 18449 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18450 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 18451 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 18452 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 18453 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 18454 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 18455 * SUCH DAMAGE. 18456 */ 18457 18458 /* Type 3 -- x**31 + x**3 + 1 */ 18459 #define DEG_3 31 18460 #define SEP_3 3 18461 18462 18463 /* Protected by tcp_random_lock */ 18464 static int tcp_randtbl[DEG_3 + 1]; 18465 18466 static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 18467 static int *tcp_random_rptr = &tcp_randtbl[1]; 18468 18469 static int *tcp_random_state = &tcp_randtbl[1]; 18470 static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 18471 18472 kmutex_t tcp_random_lock; 18473 18474 void 18475 tcp_random_init(void) 18476 { 18477 int i; 18478 hrtime_t hrt; 18479 time_t wallclock; 18480 uint64_t result; 18481 18482 /* 18483 * Use high-res timer and current time for seed. Gethrtime() returns 18484 * a longlong, which may contain resolution down to nanoseconds. 18485 * The current time will either be a 32-bit or a 64-bit quantity. 18486 * XOR the two together in a 64-bit result variable. 18487 * Convert the result to a 32-bit value by multiplying the high-order 18488 * 32-bits by the low-order 32-bits. 18489 */ 18490 18491 hrt = gethrtime(); 18492 (void) drv_getparm(TIME, &wallclock); 18493 result = (uint64_t)wallclock ^ (uint64_t)hrt; 18494 mutex_enter(&tcp_random_lock); 18495 tcp_random_state[0] = ((result >> 32) & 0xffffffff) * 18496 (result & 0xffffffff); 18497 18498 for (i = 1; i < DEG_3; i++) 18499 tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 18500 + 12345; 18501 tcp_random_fptr = &tcp_random_state[SEP_3]; 18502 tcp_random_rptr = &tcp_random_state[0]; 18503 mutex_exit(&tcp_random_lock); 18504 for (i = 0; i < 10 * DEG_3; i++) 18505 (void) tcp_random(); 18506 } 18507 18508 /* 18509 * tcp_random: Return a random number in the range [1 - (128K + 1)]. 18510 * This range is selected to be approximately centered on TCP_ISS / 2, 18511 * and easy to compute. 
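* (The raw value comes from the additive-feedback generator below: each call adds the word at tcp_random_rptr into the word at tcp_random_fptr and advances both pointers around the 31-word state table seeded in tcp_random_init() above.)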
We get this value by generating a 32-bit random 18512 * number, selecting out the high-order 17 bits, and then adding one so 18513 * that we never return zero. 18514 */ 18515 int 18516 tcp_random(void) 18517 { 18518 int i; 18519 18520 mutex_enter(&tcp_random_lock); 18521 *tcp_random_fptr += *tcp_random_rptr; 18522 18523 /* 18524 * The high-order bits are more random than the low-order bits, 18525 * so we select out the high-order 17 bits and add one so that 18526 * we never return zero. 18527 */ 18528 i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 18529 if (++tcp_random_fptr >= tcp_random_end_ptr) { 18530 tcp_random_fptr = tcp_random_state; 18531 ++tcp_random_rptr; 18532 } else if (++tcp_random_rptr >= tcp_random_end_ptr) 18533 tcp_random_rptr = tcp_random_state; 18534 18535 mutex_exit(&tcp_random_lock); 18536 return (i); 18537 } 18538 18539 static int 18540 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, 18541 int *t_errorp, int *sys_errorp) 18542 { 18543 int error; 18544 int is_absreq_failure; 18545 t_scalar_t *opt_lenp; 18546 t_scalar_t opt_offset; 18547 int prim_type; 18548 struct T_conn_req *tcreqp; 18549 struct T_conn_res *tcresp; 18550 cred_t *cr; 18551 18552 /* 18553 * All Solaris components should pass a db_credp 18554 * for this TPI message, hence we ASSERT. 18555 * But in case there is some other M_PROTO that looks 18556 * like a TPI message sent by some other kernel 18557 * component, we check and return an error. 18558 */ 18559 cr = msg_getcred(mp, NULL); 18560 ASSERT(cr != NULL); 18561 if (cr == NULL) 18562 return (-1); 18563 18564 prim_type = ((union T_primitives *)mp->b_rptr)->type; 18565 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || 18566 prim_type == T_CONN_RES); 18567 18568 switch (prim_type) { 18569 case T_CONN_REQ: 18570 tcreqp = (struct T_conn_req *)mp->b_rptr; 18571 opt_offset = tcreqp->OPT_offset; 18572 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; 18573 break; 18574 case O_T_CONN_RES: 18575 case T_CONN_RES: 18576 tcresp = (struct T_conn_res *)mp->b_rptr; 18577 opt_offset = tcresp->OPT_offset; 18578 opt_lenp = (t_scalar_t *)&tcresp->OPT_length; 18579 break; 18580 } 18581 18582 *t_errorp = 0; 18583 *sys_errorp = 0; 18584 *do_disconnectp = 0; 18585 18586 error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, 18587 opt_offset, cr, &tcp_opt_obj, 18588 NULL, &is_absreq_failure); 18589 18590 switch (error) { 18591 case 0: /* no error */ 18592 ASSERT(is_absreq_failure == 0); 18593 return (0); 18594 case ENOPROTOOPT: 18595 *t_errorp = TBADOPT; 18596 break; 18597 case EACCES: 18598 *t_errorp = TACCES; 18599 break; 18600 default: 18601 *t_errorp = TSYSERR; *sys_errorp = error; 18602 break; 18603 } 18604 if (is_absreq_failure != 0) { 18605 /* 18606 * The connection request should get the local ack 18607 * T_OK_ACK and then a T_DISCON_IND. 18608 */ 18609 *do_disconnectp = 1; 18610 } 18611 return (-1); 18612 } 18613 18614 /* 18615 * Split this function out so that if the secret changes, I'm okay. 18616 * 18617 * Initialize the tcp_iss_cookie and tcp_iss_key. 18618 */ 18619 18620 #define PASSWD_SIZE 16 /* MUST be multiple of 4 */ 18621 18622 static void 18623 tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *tcps) 18624 { 18625 struct { 18626 int32_t current_time; 18627 uint32_t randnum; 18628 uint16_t pad; 18629 uint8_t ether[6]; 18630 uint8_t passwd[PASSWD_SIZE]; 18631 } tcp_iss_cookie; 18632 time_t t; 18633 18634 /* 18635 * Start with the current absolute time. 
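* The cookie assembled below mixes this timestamp with a coarse random value, a fixed pad, the cpu_type_info (as filler underneath the pass phrase), the caller-supplied phrase itself, and the local ethernet address; the whole structure is then folded into tcps_iss_key with MD5, and MD5Final is deferred to per-connection ISS generation.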
18636 */ 18637 (void) drv_getparm(TIME, &t); 18638 tcp_iss_cookie.current_time = t; 18639 18640 /* 18641 * XXX - Need a more random number per RFC 1750, not this crap. 18642 * OTOH, if what follows is pretty random, then I'm in better shape. 18643 */ 18644 tcp_iss_cookie.randnum = (uint32_t)(gethrtime() + tcp_random()); 18645 tcp_iss_cookie.pad = 0x365c; /* Picked from HMAC pad values. */ 18646 18647 /* 18648 * The cpu_type_info is pretty non-random. Ugggh. It does serve 18649 * as a good template. 18650 */ 18651 bcopy(&cpu_list->cpu_type_info, &tcp_iss_cookie.passwd, 18652 min(PASSWD_SIZE, sizeof (cpu_list->cpu_type_info))); 18653 18654 /* 18655 * The pass-phrase. Normally this is supplied by user-called NDD. 18656 */ 18657 bcopy(phrase, &tcp_iss_cookie.passwd, min(PASSWD_SIZE, len)); 18658 18659 /* 18660 * See 4010593 if this section becomes a problem again, 18661 * but the local ethernet address is useful here. 18662 */ 18663 (void) localetheraddr(NULL, 18664 (struct ether_addr *)&tcp_iss_cookie.ether); 18665 18666 /* 18667 * Hash 'em all together. The MD5Final is called per-connection. 18668 */ 18669 mutex_enter(&tcps->tcps_iss_key_lock); 18670 MD5Init(&tcps->tcps_iss_key); 18671 MD5Update(&tcps->tcps_iss_key, (uchar_t *)&tcp_iss_cookie, 18672 sizeof (tcp_iss_cookie)); 18673 mutex_exit(&tcps->tcps_iss_key_lock); 18674 } 18675 18676 /* 18677 * Set the RFC 1948 pass phrase 18678 */ 18679 /* ARGSUSED */ 18680 static int 18681 tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 18682 cred_t *cr) 18683 { 18684 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 18685 18686 /* 18687 * Basically, value contains a new pass phrase. Pass it along! 18688 */ 18689 tcp_iss_key_init((uint8_t *)value, strlen(value), tcps); 18690 return (0); 18691 } 18692 18693 /* ARGSUSED */ 18694 static int 18695 tcp_sack_info_constructor(void *buf, void *cdrarg, int kmflags) 18696 { 18697 bzero(buf, sizeof (tcp_sack_info_t)); 18698 return (0); 18699 } 18700 18701 /* 18702 * Called by IP when IP is loaded into the kernel 18703 */ 18704 void 18705 tcp_ddi_g_init(void) 18706 { 18707 tcp_timercache = kmem_cache_create("tcp_timercache", 18708 sizeof (tcp_timer_t) + sizeof (mblk_t), 0, 18709 NULL, NULL, NULL, NULL, NULL, 0); 18710 18711 tcp_sack_info_cache = kmem_cache_create("tcp_sack_info_cache", 18712 sizeof (tcp_sack_info_t), 0, 18713 tcp_sack_info_constructor, NULL, NULL, NULL, NULL, 0); 18714 18715 mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); 18716 18717 /* Initialize the random number generator */ 18718 tcp_random_init(); 18719 18720 /* A single callback independently of how many netstacks we have */ 18721 ip_squeue_init(tcp_squeue_add); 18722 18723 tcp_g_kstat = tcp_g_kstat_init(&tcp_g_statistics); 18724 18725 tcp_squeue_flag = tcp_squeue_switch(tcp_squeue_wput); 18726 18727 /* 18728 * We want to be informed each time a stack is created or 18729 * destroyed in the kernel, so we can maintain the 18730 * set of tcp_stack_t's. 18731 */ 18732 netstack_register(NS_TCP, tcp_stack_init, NULL, tcp_stack_fini); 18733 } 18734 18735 18736 #define INET_NAME "ip" 18737 18738 /* 18739 * Initialize the TCP stack instance. 
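* This allocates the per-netstack tcp_stack_t and sets up its locks, the default extra-privileged ports, the bind and acceptor fanout tables, the ndd parameter table, the IPsec packet dropper, the RFC 1948 secret, per-stack kstats, the LDI ident, and the ixa-cleanup, reclaim and listener-configuration state used elsewhere in the module.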
18740 */ 18741 static void * 18742 tcp_stack_init(netstackid_t stackid, netstack_t *ns) 18743 { 18744 tcp_stack_t *tcps; 18745 tcpparam_t *pa; 18746 int i; 18747 int error = 0; 18748 major_t major; 18749 18750 tcps = (tcp_stack_t *)kmem_zalloc(sizeof (*tcps), KM_SLEEP); 18751 tcps->tcps_netstack = ns; 18752 18753 /* Initialize locks */ 18754 mutex_init(&tcps->tcps_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); 18755 mutex_init(&tcps->tcps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); 18756 18757 tcps->tcps_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS; 18758 tcps->tcps_g_epriv_ports[0] = 2049; 18759 tcps->tcps_g_epriv_ports[1] = 4045; 18760 tcps->tcps_min_anonpriv_port = 512; 18761 18762 tcps->tcps_bind_fanout = kmem_zalloc(sizeof (tf_t) * 18763 TCP_BIND_FANOUT_SIZE, KM_SLEEP); 18764 tcps->tcps_acceptor_fanout = kmem_zalloc(sizeof (tf_t) * 18765 TCP_ACCEPTOR_FANOUT_SIZE, KM_SLEEP); 18766 18767 for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { 18768 mutex_init(&tcps->tcps_bind_fanout[i].tf_lock, NULL, 18769 MUTEX_DEFAULT, NULL); 18770 } 18771 18772 for (i = 0; i < TCP_ACCEPTOR_FANOUT_SIZE; i++) { 18773 mutex_init(&tcps->tcps_acceptor_fanout[i].tf_lock, NULL, 18774 MUTEX_DEFAULT, NULL); 18775 } 18776 18777 /* TCP's IPsec code calls the packet dropper. */ 18778 ip_drop_register(&tcps->tcps_dropper, "TCP IPsec policy enforcement"); 18779 18780 pa = (tcpparam_t *)kmem_alloc(sizeof (lcl_tcp_param_arr), KM_SLEEP); 18781 tcps->tcps_params = pa; 18782 bcopy(lcl_tcp_param_arr, tcps->tcps_params, sizeof (lcl_tcp_param_arr)); 18783 18784 (void) tcp_param_register(&tcps->tcps_g_nd, tcps->tcps_params, 18785 A_CNT(lcl_tcp_param_arr), tcps); 18786 18787 /* 18788 * Note: To really walk the device tree you need the devinfo 18789 * pointer to your device which is only available after probe/attach. 18790 * The following is safe only because it uses ddi_root_node() 18791 */ 18792 tcp_max_optsize = optcom_max_optsize(tcp_opt_obj.odb_opt_des_arr, 18793 tcp_opt_obj.odb_opt_arr_cnt); 18794 18795 /* 18796 * Initialize RFC 1948 secret values. This will probably be reset once 18797 * by the boot scripts. 18798 * 18799 * Use NULL name, as the name is caught by the new lockstats. 18800 * 18801 * Initialize with some random, non-guessable string, like the global 18802 * T_INFO_ACK. 18803 */ 18804 18805 tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack, 18806 sizeof (tcp_g_t_info_ack), tcps); 18807 18808 tcps->tcps_kstat = tcp_kstat2_init(stackid, &tcps->tcps_statistics); 18809 tcps->tcps_mibkp = tcp_kstat_init(stackid, tcps); 18810 18811 major = mod_name_to_major(INET_NAME); 18812 error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); 18813 ASSERT(error == 0); 18814 tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); 18815 ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); 18816 cv_init(&tcps->tcps_ixa_cleanup_cv, NULL, CV_DEFAULT, NULL); 18817 mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL); 18818 18819 mutex_init(&tcps->tcps_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 18820 tcps->tcps_reclaim = B_FALSE; 18821 tcps->tcps_reclaim_tid = 0; 18822 tcps->tcps_reclaim_period = tcps->tcps_rexmit_interval_max * 3; 18823 18824 mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL); 18825 list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t), 18826 offsetof(tcp_listener_t, tl_link)); 18827 18828 return (tcps); 18829 } 18830 18831 /* 18832 * Called when the IP module is about to be unloaded. 
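* Only the global state created in tcp_ddi_g_init() is torn down here: the global kstats, tcp_random_lock, the timer and SACK-info kmem caches, and the NS_TCP netstack registration; per-stack state is released by tcp_stack_fini().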
18833 */ 18834 void 18835 tcp_ddi_g_destroy(void) 18836 { 18837 tcp_g_kstat_fini(tcp_g_kstat); 18838 tcp_g_kstat = NULL; 18839 bzero(&tcp_g_statistics, sizeof (tcp_g_statistics)); 18840 18841 mutex_destroy(&tcp_random_lock); 18842 18843 kmem_cache_destroy(tcp_timercache); 18844 kmem_cache_destroy(tcp_sack_info_cache); 18845 18846 netstack_unregister(NS_TCP); 18847 } 18848 18849 /* 18850 * Free the TCP stack instance. 18851 */ 18852 static void 18853 tcp_stack_fini(netstackid_t stackid, void *arg) 18854 { 18855 tcp_stack_t *tcps = (tcp_stack_t *)arg; 18856 int i; 18857 18858 freeb(tcps->tcps_ixa_cleanup_mp); 18859 tcps->tcps_ixa_cleanup_mp = NULL; 18860 cv_destroy(&tcps->tcps_ixa_cleanup_cv); 18861 mutex_destroy(&tcps->tcps_ixa_cleanup_lock); 18862 18863 if (tcps->tcps_reclaim_tid != 0) 18864 (void) untimeout(tcps->tcps_reclaim_tid); 18865 mutex_destroy(&tcps->tcps_reclaim_lock); 18866 18867 tcp_listener_conf_cleanup(tcps); 18868 18869 nd_free(&tcps->tcps_g_nd); 18870 kmem_free(tcps->tcps_params, sizeof (lcl_tcp_param_arr)); 18871 tcps->tcps_params = NULL; 18872 kmem_free(tcps->tcps_wroff_xtra_param, sizeof (tcpparam_t)); 18873 tcps->tcps_wroff_xtra_param = NULL; 18874 18875 for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { 18876 ASSERT(tcps->tcps_bind_fanout[i].tf_tcp == NULL); 18877 mutex_destroy(&tcps->tcps_bind_fanout[i].tf_lock); 18878 } 18879 18880 for (i = 0; i < TCP_ACCEPTOR_FANOUT_SIZE; i++) { 18881 ASSERT(tcps->tcps_acceptor_fanout[i].tf_tcp == NULL); 18882 mutex_destroy(&tcps->tcps_acceptor_fanout[i].tf_lock); 18883 } 18884 18885 kmem_free(tcps->tcps_bind_fanout, sizeof (tf_t) * TCP_BIND_FANOUT_SIZE); 18886 tcps->tcps_bind_fanout = NULL; 18887 18888 kmem_free(tcps->tcps_acceptor_fanout, sizeof (tf_t) * 18889 TCP_ACCEPTOR_FANOUT_SIZE); 18890 tcps->tcps_acceptor_fanout = NULL; 18891 18892 mutex_destroy(&tcps->tcps_iss_key_lock); 18893 mutex_destroy(&tcps->tcps_epriv_port_lock); 18894 18895 ip_drop_unregister(&tcps->tcps_dropper); 18896 18897 tcp_kstat2_fini(stackid, tcps->tcps_kstat); 18898 tcps->tcps_kstat = NULL; 18899 bzero(&tcps->tcps_statistics, sizeof (tcps->tcps_statistics)); 18900 18901 tcp_kstat_fini(stackid, tcps->tcps_mibkp); 18902 tcps->tcps_mibkp = NULL; 18903 18904 ldi_ident_release(tcps->tcps_ldi_ident); 18905 kmem_free(tcps, sizeof (*tcps)); 18906 } 18907 18908 /* 18909 * Generate ISS, taking into account NDD changes may happen halfway through. 18910 * (If the iss is not zero, set it.) 18911 */ 18912 18913 static void 18914 tcp_iss_init(tcp_t *tcp) 18915 { 18916 MD5_CTX context; 18917 struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; 18918 uint32_t answer[4]; 18919 tcp_stack_t *tcps = tcp->tcp_tcps; 18920 conn_t *connp = tcp->tcp_connp; 18921 18922 tcps->tcps_iss_incr_extra += (ISS_INCR >> 1); 18923 tcp->tcp_iss = tcps->tcps_iss_incr_extra; 18924 switch (tcps->tcps_strong_iss) { 18925 case 2: 18926 mutex_enter(&tcps->tcps_iss_key_lock); 18927 context = tcps->tcps_iss_key; 18928 mutex_exit(&tcps->tcps_iss_key_lock); 18929 arg.ports = connp->conn_ports; 18930 arg.src = connp->conn_laddr_v6; 18931 arg.dst = connp->conn_faddr_v6; 18932 MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); 18933 MD5Final((uchar_t *)answer, &context); 18934 tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3]; 18935 /* 18936 * Now that we've hashed into a unique per-connection sequence 18937 * space, add a random increment per strong_iss == 1. So I 18938 * guess we'll have to... 
18939 */ 18940 /* FALLTHRU */ 18941 case 1: 18942 tcp->tcp_iss += (gethrtime() >> ISS_NSEC_SHT) + tcp_random(); 18943 break; 18944 default: 18945 tcp->tcp_iss += (uint32_t)gethrestime_sec() * ISS_INCR; 18946 break; 18947 } 18948 tcp->tcp_valid_bits = TCP_ISS_VALID; 18949 tcp->tcp_fss = tcp->tcp_iss - 1; 18950 tcp->tcp_suna = tcp->tcp_iss; 18951 tcp->tcp_snxt = tcp->tcp_iss + 1; 18952 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 18953 tcp->tcp_csuna = tcp->tcp_snxt; 18954 } 18955 18956 /* 18957 * Exported routine for extracting active tcp connection status. 18958 * 18959 * This is used by the Solaris Cluster Networking software to 18960 * gather a list of connections that need to be forwarded to 18961 * specific nodes in the cluster when configuration changes occur. 18962 * 18963 * The callback is invoked for each tcp_t structure from all netstacks, 18964 * if 'stack_id' is less than 0. Otherwise, only for tcp_t structures 18965 * from the netstack with the specified stack_id. Returning 18966 * non-zero from the callback routine terminates the search. 18967 */ 18968 int 18969 cl_tcp_walk_list(netstackid_t stack_id, 18970 int (*cl_callback)(cl_tcp_info_t *, void *), void *arg) 18971 { 18972 netstack_handle_t nh; 18973 netstack_t *ns; 18974 int ret = 0; 18975 18976 if (stack_id >= 0) { 18977 if ((ns = netstack_find_by_stackid(stack_id)) == NULL) 18978 return (EINVAL); 18979 18980 ret = cl_tcp_walk_list_stack(cl_callback, arg, 18981 ns->netstack_tcp); 18982 netstack_rele(ns); 18983 return (ret); 18984 } 18985 18986 netstack_next_init(&nh); 18987 while ((ns = netstack_next(&nh)) != NULL) { 18988 ret = cl_tcp_walk_list_stack(cl_callback, arg, 18989 ns->netstack_tcp); 18990 netstack_rele(ns); 18991 } 18992 netstack_next_fini(&nh); 18993 return (ret); 18994 } 18995 18996 static int 18997 cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, 18998 tcp_stack_t *tcps) 18999 { 19000 tcp_t *tcp; 19001 cl_tcp_info_t cl_tcpi; 19002 connf_t *connfp; 19003 conn_t *connp; 19004 int i; 19005 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 19006 19007 ASSERT(callback != NULL); 19008 19009 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 19010 connfp = &ipst->ips_ipcl_globalhash_fanout[i]; 19011 connp = NULL; 19012 19013 while ((connp = 19014 ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { 19015 19016 tcp = connp->conn_tcp; 19017 cl_tcpi.cl_tcpi_version = CL_TCPI_V1; 19018 cl_tcpi.cl_tcpi_ipversion = connp->conn_ipversion; 19019 cl_tcpi.cl_tcpi_state = tcp->tcp_state; 19020 cl_tcpi.cl_tcpi_lport = connp->conn_lport; 19021 cl_tcpi.cl_tcpi_fport = connp->conn_fport; 19022 cl_tcpi.cl_tcpi_laddr_v6 = connp->conn_laddr_v6; 19023 cl_tcpi.cl_tcpi_faddr_v6 = connp->conn_faddr_v6; 19024 19025 /* 19026 * If the callback returns non-zero 19027 * we terminate the traversal. 19028 */ 19029 if ((*callback)(&cl_tcpi, arg) != 0) { 19030 CONN_DEC_REF(tcp->tcp_connp); 19031 return (1); 19032 } 19033 } 19034 } 19035 19036 return (0); 19037 } 19038 19039 /* 19040 * Macros used for accessing the different types of sockaddr 19041 * structures inside a tcp_ioc_abort_conn_t. 
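* Both ac_local and ac_remote are sockaddr_storage members; the V4 and V6 macros below simply cast them to sin_t or sin6_t so the embedded address and port fields can be read or written in place.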
19042 */ 19043 #define TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local) 19044 #define TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote) 19045 #define TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr) 19046 #define TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr) 19047 #define TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port) 19048 #define TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port) 19049 #define TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local) 19050 #define TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote) 19051 #define TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr) 19052 #define TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr) 19053 #define TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port) 19054 #define TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port) 19055 19056 /* 19057 * Return the correct error code to mimic the behavior 19058 * of a connection reset. 19059 */ 19060 #define TCP_AC_GET_ERRCODE(state, err) { \ 19061 switch ((state)) { \ 19062 case TCPS_SYN_SENT: \ 19063 case TCPS_SYN_RCVD: \ 19064 (err) = ECONNREFUSED; \ 19065 break; \ 19066 case TCPS_ESTABLISHED: \ 19067 case TCPS_FIN_WAIT_1: \ 19068 case TCPS_FIN_WAIT_2: \ 19069 case TCPS_CLOSE_WAIT: \ 19070 (err) = ECONNRESET; \ 19071 break; \ 19072 case TCPS_CLOSING: \ 19073 case TCPS_LAST_ACK: \ 19074 case TCPS_TIME_WAIT: \ 19075 (err) = 0; \ 19076 break; \ 19077 default: \ 19078 (err) = ENXIO; \ 19079 } \ 19080 } 19081 19082 /* 19083 * Check if a tcp structure matches the info in acp. 19084 */ 19085 #define TCP_AC_ADDR_MATCH(acp, connp, tcp) \ 19086 (((acp)->ac_local.ss_family == AF_INET) ? \ 19087 ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ 19088 TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) && \ 19089 (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ 19090 TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) && \ 19091 (TCP_AC_V4LPORT((acp)) == 0 || \ 19092 TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) && \ 19093 (TCP_AC_V4RPORT((acp)) == 0 || \ 19094 TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) && \ 19095 (acp)->ac_start <= (tcp)->tcp_state && \ 19096 (acp)->ac_end >= (tcp)->tcp_state) : \ 19097 ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ 19098 IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ 19099 &(connp)->conn_laddr_v6)) && \ 19100 (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ 19101 IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ 19102 &(connp)->conn_faddr_v6)) && \ 19103 (TCP_AC_V6LPORT((acp)) == 0 || \ 19104 TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) && \ 19105 (TCP_AC_V6RPORT((acp)) == 0 || \ 19106 TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) && \ 19107 (acp)->ac_start <= (tcp)->tcp_state && \ 19108 (acp)->ac_end >= (tcp)->tcp_state)) 19109 19110 #define TCP_AC_MATCH(acp, connp, tcp) \ 19111 (((acp)->ac_zoneid == ALL_ZONES || \ 19112 (acp)->ac_zoneid == (connp)->conn_zoneid) ? \ 19113 TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0) 19114 19115 /* 19116 * Build a message containing a tcp_ioc_abort_conn_t structure 19117 * which is filled in with information from acp and tp. 
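* The resulting mblk carries a 32-bit TCP_IOC_ABORT_CONN tag followed immediately by the copied tcp_ioc_abort_conn_t; tcp_ioctl_abort_handler() steps over the tag to recover the structure.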
19118 */ 19119 static mblk_t * 19120 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) 19121 { 19122 mblk_t *mp; 19123 tcp_ioc_abort_conn_t *tacp; 19124 19125 mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO); 19126 if (mp == NULL) 19127 return (NULL); 19128 19129 *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; 19130 tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + 19131 sizeof (uint32_t)); 19132 19133 tacp->ac_start = acp->ac_start; 19134 tacp->ac_end = acp->ac_end; 19135 tacp->ac_zoneid = acp->ac_zoneid; 19136 19137 if (acp->ac_local.ss_family == AF_INET) { 19138 tacp->ac_local.ss_family = AF_INET; 19139 tacp->ac_remote.ss_family = AF_INET; 19140 TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4; 19141 TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4; 19142 TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport; 19143 TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport; 19144 } else { 19145 tacp->ac_local.ss_family = AF_INET6; 19146 tacp->ac_remote.ss_family = AF_INET6; 19147 TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6; 19148 TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6; 19149 TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport; 19150 TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport; 19151 } 19152 mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); 19153 return (mp); 19154 } 19155 19156 /* 19157 * Print a tcp_ioc_abort_conn_t structure. 19158 */ 19159 static void 19160 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) 19161 { 19162 char lbuf[128]; 19163 char rbuf[128]; 19164 sa_family_t af; 19165 in_port_t lport, rport; 19166 ushort_t logflags; 19167 19168 af = acp->ac_local.ss_family; 19169 19170 if (af == AF_INET) { 19171 (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp), 19172 lbuf, 128); 19173 (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp), 19174 rbuf, 128); 19175 lport = ntohs(TCP_AC_V4LPORT(acp)); 19176 rport = ntohs(TCP_AC_V4RPORT(acp)); 19177 } else { 19178 (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp), 19179 lbuf, 128); 19180 (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp), 19181 rbuf, 128); 19182 lport = ntohs(TCP_AC_V6LPORT(acp)); 19183 rport = ntohs(TCP_AC_V6RPORT(acp)); 19184 } 19185 19186 logflags = SL_TRACE | SL_NOTE; 19187 /* 19188 * Don't print this message to the console if the operation was done 19189 * to a non-global zone. 19190 */ 19191 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 19192 logflags |= SL_CONSOLE; 19193 (void) strlog(TCP_MOD_ID, 0, 1, logflags, 19194 "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " 19195 "start = %d, end = %d\n", lbuf, lport, rbuf, rport, 19196 acp->ac_start, acp->ac_end); 19197 } 19198 19199 /* 19200 * Called using SQ_FILL when a message built using 19201 * tcp_ioctl_abort_build_msg is put into a queue. 19202 * Note that when we get here there is no wildcard in acp any more. 19203 */ 19204 /* ARGSUSED2 */ 19205 static void 19206 tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, 19207 ip_recv_attr_t *dummy) 19208 { 19209 conn_t *connp = (conn_t *)arg; 19210 tcp_t *tcp = connp->conn_tcp; 19211 tcp_ioc_abort_conn_t *acp; 19212 19213 /* 19214 * Don't accept any input on a closed tcp as this TCP logically does 19215 * not exist on the system. Don't proceed further with this TCP. 19216 * For eg. this packet could trigger another close of this tcp 19217 * which would be disastrous for tcp_refcnt. tcp_close_detached / 19218 * tcp_clean_death / tcp_closei_local must be called at most once 19219 * on a TCP. 
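* The message is delivered asynchronously through the squeue, so the connection state may have advanced by the time it is processed; the state is therefore re-checked against ac_end below before tcp_clean_death() is called with the errno chosen by TCP_AC_GET_ERRCODE().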
19220 */ 19221 if (tcp->tcp_state == TCPS_CLOSED || 19222 tcp->tcp_state == TCPS_BOUND) { 19223 freemsg(mp); 19224 return; 19225 } 19226 19227 acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); 19228 if (tcp->tcp_state <= acp->ac_end) { 19229 /* 19230 * If we get here, we are already on the correct 19231 * squeue. This ioctl follows the following path 19232 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn 19233 * ->tcp_ioctl_abort->squeue_enter (if on a 19234 * different squeue) 19235 */ 19236 int errcode; 19237 19238 TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode); 19239 (void) tcp_clean_death(tcp, errcode, 26); 19240 } 19241 freemsg(mp); 19242 } 19243 19244 /* 19245 * Abort all matching connections on a hash chain. 19246 */ 19247 static int 19248 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, 19249 boolean_t exact, tcp_stack_t *tcps) 19250 { 19251 int nmatch, err = 0; 19252 tcp_t *tcp; 19253 MBLKP mp, last, listhead = NULL; 19254 conn_t *tconnp; 19255 connf_t *connfp; 19256 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 19257 19258 connfp = &ipst->ips_ipcl_conn_fanout[index]; 19259 19260 startover: 19261 nmatch = 0; 19262 19263 mutex_enter(&connfp->connf_lock); 19264 for (tconnp = connfp->connf_head; tconnp != NULL; 19265 tconnp = tconnp->conn_next) { 19266 tcp = tconnp->conn_tcp; 19267 /* 19268 * We are missing a check on sin6_scope_id for linklocals here, 19269 * but current usage is just for aborting based on zoneid 19270 * for shared-IP zones. 19271 */ 19272 if (TCP_AC_MATCH(acp, tconnp, tcp)) { 19273 CONN_INC_REF(tconnp); 19274 mp = tcp_ioctl_abort_build_msg(acp, tcp); 19275 if (mp == NULL) { 19276 err = ENOMEM; 19277 CONN_DEC_REF(tconnp); 19278 break; 19279 } 19280 mp->b_prev = (mblk_t *)tcp; 19281 19282 if (listhead == NULL) { 19283 listhead = mp; 19284 last = mp; 19285 } else { 19286 last->b_next = mp; 19287 last = mp; 19288 } 19289 nmatch++; 19290 if (exact) 19291 break; 19292 } 19293 19294 /* Avoid holding lock for too long. */ 19295 if (nmatch >= 500) 19296 break; 19297 } 19298 mutex_exit(&connfp->connf_lock); 19299 19300 /* Pass mp into the correct tcp */ 19301 while ((mp = listhead) != NULL) { 19302 listhead = listhead->b_next; 19303 tcp = (tcp_t *)mp->b_prev; 19304 mp->b_next = mp->b_prev = NULL; 19305 SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, 19306 tcp_ioctl_abort_handler, tcp->tcp_connp, NULL, 19307 SQ_FILL, SQTAG_TCP_ABORT_BUCKET); 19308 } 19309 19310 *count += nmatch; 19311 if (nmatch >= 500 && err == 0) 19312 goto startover; 19313 return (err); 19314 } 19315 19316 /* 19317 * Abort all connections that matches the attributes specified in acp. 
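* Matching is done per conn-fanout bucket by tcp_ioctl_abort_bucket() above, which collects at most 500 matches per pass to bound the time the bucket lock is held; each match is sent its own copy of acp through its squeue, and ENOENT is returned if nothing matched at all.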
19318 */ 19319 static int 19320 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps) 19321 { 19322 sa_family_t af; 19323 uint32_t ports; 19324 uint16_t *pports; 19325 int err = 0, count = 0; 19326 boolean_t exact = B_FALSE; /* set when there is no wildcard */ 19327 int index = -1; 19328 ushort_t logflags; 19329 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 19330 19331 af = acp->ac_local.ss_family; 19332 19333 if (af == AF_INET) { 19334 if (TCP_AC_V4REMOTE(acp) != INADDR_ANY && 19335 TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) { 19336 pports = (uint16_t *)&ports; 19337 pports[1] = TCP_AC_V4LPORT(acp); 19338 pports[0] = TCP_AC_V4RPORT(acp); 19339 exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY); 19340 } 19341 } else { 19342 if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) && 19343 TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) { 19344 pports = (uint16_t *)&ports; 19345 pports[1] = TCP_AC_V6LPORT(acp); 19346 pports[0] = TCP_AC_V6RPORT(acp); 19347 exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp)); 19348 } 19349 } 19350 19351 /* 19352 * For cases where remote addr, local port, and remote port are non- 19353 * wildcards, tcp_ioctl_abort_bucket will only be called once. 19354 */ 19355 if (index != -1) { 19356 err = tcp_ioctl_abort_bucket(acp, index, 19357 &count, exact, tcps); 19358 } else { 19359 /* 19360 * loop through all entries for wildcard case 19361 */ 19362 for (index = 0; 19363 index < ipst->ips_ipcl_conn_fanout_size; 19364 index++) { 19365 err = tcp_ioctl_abort_bucket(acp, index, 19366 &count, exact, tcps); 19367 if (err != 0) 19368 break; 19369 } 19370 } 19371 19372 logflags = SL_TRACE | SL_NOTE; 19373 /* 19374 * Don't print this message to the console if the operation was done 19375 * to a non-global zone. 19376 */ 19377 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 19378 logflags |= SL_CONSOLE; 19379 (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " 19380 "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); 19381 if (err == 0 && count == 0) 19382 err = ENOENT; 19383 return (err); 19384 } 19385 19386 /* 19387 * Process the TCP_IOC_ABORT_CONN ioctl request. 19388 */ 19389 static void 19390 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) 19391 { 19392 int err; 19393 IOCP iocp; 19394 MBLKP mp1; 19395 sa_family_t laf, raf; 19396 tcp_ioc_abort_conn_t *acp; 19397 zone_t *zptr; 19398 conn_t *connp = Q_TO_CONN(q); 19399 zoneid_t zoneid = connp->conn_zoneid; 19400 tcp_t *tcp = connp->conn_tcp; 19401 tcp_stack_t *tcps = tcp->tcp_tcps; 19402 19403 iocp = (IOCP)mp->b_rptr; 19404 19405 if ((mp1 = mp->b_cont) == NULL || 19406 iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) { 19407 err = EINVAL; 19408 goto out; 19409 } 19410 19411 /* check permissions */ 19412 if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { 19413 err = EPERM; 19414 goto out; 19415 } 19416 19417 if (mp1->b_cont != NULL) { 19418 freemsg(mp1->b_cont); 19419 mp1->b_cont = NULL; 19420 } 19421 19422 acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr; 19423 laf = acp->ac_local.ss_family; 19424 raf = acp->ac_remote.ss_family; 19425 19426 /* check that a zone with the supplied zoneid exists */ 19427 if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) { 19428 zptr = zone_find_by_id(zoneid); 19429 if (zptr != NULL) { 19430 zone_rele(zptr); 19431 } else { 19432 err = EINVAL; 19433 goto out; 19434 } 19435 } 19436 19437 /* 19438 * For exclusive stacks we set the zoneid to zero 19439 * to make TCP operate as if in the global zone. 
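* The checks below additionally require ac_start and ac_end to form an ordered range within TCPS_SYN_SENT through TCPS_TIME_WAIT, and both endpoints to use the same address family, either AF_INET or AF_INET6.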
19440 */ 19441 if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID) 19442 acp->ac_zoneid = GLOBAL_ZONEID; 19443 19444 if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || 19445 acp->ac_start > acp->ac_end || laf != raf || 19446 (laf != AF_INET && laf != AF_INET6)) { 19447 err = EINVAL; 19448 goto out; 19449 } 19450 19451 tcp_ioctl_abort_dump(acp); 19452 err = tcp_ioctl_abort(acp, tcps); 19453 19454 out: 19455 if (mp1 != NULL) { 19456 freemsg(mp1); 19457 mp->b_cont = NULL; 19458 } 19459 19460 if (err != 0) 19461 miocnak(q, mp, 0, err); 19462 else 19463 miocack(q, mp, 0, 0); 19464 } 19465 19466 /* 19467 * tcp_time_wait_processing() handles processing of incoming packets when 19468 * the tcp is in the TIME_WAIT state. 19469 * A TIME_WAIT tcp that has an associated open TCP stream is never put 19470 * on the time wait list. 19471 */ 19472 void 19473 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, 19474 uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira) 19475 { 19476 int32_t bytes_acked; 19477 int32_t gap; 19478 int32_t rgap; 19479 tcp_opt_t tcpopt; 19480 uint_t flags; 19481 uint32_t new_swnd = 0; 19482 conn_t *nconnp; 19483 conn_t *connp = tcp->tcp_connp; 19484 tcp_stack_t *tcps = tcp->tcp_tcps; 19485 19486 BUMP_LOCAL(tcp->tcp_ibsegs); 19487 DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); 19488 19489 flags = (unsigned int)tcpha->tha_flags & 0xFF; 19490 new_swnd = ntohs(tcpha->tha_win) << 19491 ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); 19492 if (tcp->tcp_snd_ts_ok) { 19493 if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { 19494 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 19495 tcp->tcp_rnxt, TH_ACK); 19496 goto done; 19497 } 19498 } 19499 gap = seg_seq - tcp->tcp_rnxt; 19500 rgap = tcp->tcp_rwnd - (gap + seg_len); 19501 if (gap < 0) { 19502 BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); 19503 UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, 19504 (seg_len > -gap ? -gap : seg_len)); 19505 seg_len += gap; 19506 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 19507 if (flags & TH_RST) { 19508 goto done; 19509 } 19510 if ((flags & TH_FIN) && seg_len == -1) { 19511 /* 19512 * When TCP receives a duplicate FIN in 19513 * TIME_WAIT state, restart the 2 MSL timer. 19514 * See page 73 in RFC 793. Make sure this TCP 19515 * is already on the TIME_WAIT list. If not, 19516 * just restart the timer. 19517 */ 19518 if (TCP_IS_DETACHED(tcp)) { 19519 if (tcp_time_wait_remove(tcp, NULL) == 19520 B_TRUE) { 19521 tcp_time_wait_append(tcp); 19522 TCP_DBGSTAT(tcps, 19523 tcp_rput_time_wait); 19524 } 19525 } else { 19526 ASSERT(tcp != NULL); 19527 TCP_TIMER_RESTART(tcp, 19528 tcps->tcps_time_wait_interval); 19529 } 19530 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 19531 tcp->tcp_rnxt, TH_ACK); 19532 goto done; 19533 } 19534 flags |= TH_ACK_NEEDED; 19535 seg_len = 0; 19536 goto process_ack; 19537 } 19538 19539 /* Fix seg_seq, and chew the gap off the front. */ 19540 seg_seq = tcp->tcp_rnxt; 19541 } 19542 19543 if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 19544 /* 19545 * Make sure that when we accept the connection, pick 19546 * an ISS greater than (tcp_snxt + ISS_INCR/2) for the 19547 * old connection. 19548 * 19549 * The next ISS generated is equal to tcp_iss_incr_extra 19550 * + ISS_INCR/2 + other components depending on the 19551 * value of tcp_strong_iss. We pre-calculate the new 19552 * ISS here and compare with tcp_snxt to determine if 19553 * we need to make adjustment to tcp_iss_incr_extra. 
19554 * 19555 * The above calculation is ugly and is a 19556 * waste of CPU cycles... 19557 */ 19558 uint32_t new_iss = tcps->tcps_iss_incr_extra; 19559 int32_t adj; 19560 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 19561 19562 switch (tcps->tcps_strong_iss) { 19563 case 2: { 19564 /* Add time and MD5 components. */ 19565 uint32_t answer[4]; 19566 struct { 19567 uint32_t ports; 19568 in6_addr_t src; 19569 in6_addr_t dst; 19570 } arg; 19571 MD5_CTX context; 19572 19573 mutex_enter(&tcps->tcps_iss_key_lock); 19574 context = tcps->tcps_iss_key; 19575 mutex_exit(&tcps->tcps_iss_key_lock); 19576 arg.ports = connp->conn_ports; 19577 /* We use MAPPED addresses in tcp_iss_init */ 19578 arg.src = connp->conn_laddr_v6; 19579 arg.dst = connp->conn_faddr_v6; 19580 MD5Update(&context, (uchar_t *)&arg, 19581 sizeof (arg)); 19582 MD5Final((uchar_t *)answer, &context); 19583 answer[0] ^= answer[1] ^ answer[2] ^ answer[3]; 19584 new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0]; 19585 break; 19586 } 19587 case 1: 19588 /* Add time component and min random (i.e. 1). */ 19589 new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1; 19590 break; 19591 default: 19592 /* Add only time component. */ 19593 new_iss += (uint32_t)gethrestime_sec() * ISS_INCR; 19594 break; 19595 } 19596 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 19597 /* 19598 * New ISS not guaranteed to be ISS_INCR/2 19599 * ahead of the current tcp_snxt, so add the 19600 * difference to tcp_iss_incr_extra. 19601 */ 19602 tcps->tcps_iss_incr_extra += adj; 19603 } 19604 /* 19605 * If tcp_clean_death() can not perform the task now, 19606 * drop the SYN packet and let the other side re-xmit. 19607 * Otherwise pass the SYN packet back in, since the 19608 * old tcp state has been cleaned up or freed. 19609 */ 19610 if (tcp_clean_death(tcp, 0, 27) == -1) 19611 goto done; 19612 nconnp = ipcl_classify(mp, ira, ipst); 19613 if (nconnp != NULL) { 19614 TCP_STAT(tcps, tcp_time_wait_syn_success); 19615 /* Drops ref on nconnp */ 19616 tcp_reinput(nconnp, mp, ira, ipst); 19617 return; 19618 } 19619 goto done; 19620 } 19621 19622 /* 19623 * rgap is the amount of stuff received out of window. A negative 19624 * value is the amount out of window. 19625 */ 19626 if (rgap < 0) { 19627 BUMP_MIB(&tcps->tcps_mib, tcpInDataPastWinSegs); 19628 UPDATE_MIB(&tcps->tcps_mib, tcpInDataPastWinBytes, -rgap); 19629 /* Fix seg_len and make sure there is something left. */ 19630 seg_len += rgap; 19631 if (seg_len <= 0) { 19632 if (flags & TH_RST) { 19633 goto done; 19634 } 19635 flags |= TH_ACK_NEEDED; 19636 seg_len = 0; 19637 goto process_ack; 19638 } 19639 } 19640 /* 19641 * Check whether we can update tcp_ts_recent. This test is 19642 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 19643 * Extensions for High Performance: An Update", Internet Draft. 
19644 */ 19645 if (tcp->tcp_snd_ts_ok && 19646 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 19647 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 19648 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 19649 tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64(); 19650 } 19651 19652 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 19653 /* Always ack out of order packets */ 19654 flags |= TH_ACK_NEEDED; 19655 seg_len = 0; 19656 } else if (seg_len > 0) { 19657 BUMP_MIB(&tcps->tcps_mib, tcpInClosed); 19658 BUMP_MIB(&tcps->tcps_mib, tcpInDataInorderSegs); 19659 UPDATE_MIB(&tcps->tcps_mib, tcpInDataInorderBytes, seg_len); 19660 } 19661 if (flags & TH_RST) { 19662 (void) tcp_clean_death(tcp, 0, 28); 19663 goto done; 19664 } 19665 if (flags & TH_SYN) { 19666 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 19667 TH_RST|TH_ACK); 19668 /* 19669 * Do not delete the TCP structure if it is in 19670 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 19671 */ 19672 goto done; 19673 } 19674 process_ack: 19675 if (flags & TH_ACK) { 19676 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 19677 if (bytes_acked <= 0) { 19678 if (bytes_acked == 0 && seg_len == 0 && 19679 new_swnd == tcp->tcp_swnd) 19680 BUMP_MIB(&tcps->tcps_mib, tcpInDupAck); 19681 } else { 19682 /* Acks something not sent */ 19683 flags |= TH_ACK_NEEDED; 19684 } 19685 } 19686 if (flags & TH_ACK_NEEDED) { 19687 /* 19688 * Time to send an ack for some reason. 19689 */ 19690 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 19691 tcp->tcp_rnxt, TH_ACK); 19692 } 19693 done: 19694 freemsg(mp); 19695 } 19696 19697 /* 19698 * TCP Timers Implementation. 19699 */ 19700 timeout_id_t 19701 tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim) 19702 { 19703 mblk_t *mp; 19704 tcp_timer_t *tcpt; 19705 tcp_t *tcp = connp->conn_tcp; 19706 19707 ASSERT(connp->conn_sqp != NULL); 19708 19709 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls); 19710 19711 if (tcp->tcp_timercache == NULL) { 19712 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); 19713 } else { 19714 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc); 19715 mp = tcp->tcp_timercache; 19716 tcp->tcp_timercache = mp->b_next; 19717 mp->b_next = NULL; 19718 ASSERT(mp->b_wptr == NULL); 19719 } 19720 19721 CONN_INC_REF(connp); 19722 tcpt = (tcp_timer_t *)mp->b_rptr; 19723 tcpt->connp = connp; 19724 tcpt->tcpt_proc = f; 19725 /* 19726 * TCP timers are normal timeouts. Plus, they do not require more than 19727 * a 10 millisecond resolution. By choosing a coarser resolution and by 19728 * rounding up the expiration to the next resolution boundary, we can 19729 * batch timers in the callout subsystem to make TCP timers more 19730 * efficient. The roundup also protects short timers from expiring too 19731 * early before they have a chance to be cancelled. 
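* The timeout_id_t handed back below is really the mblk carrying the tcp_timer_t, which is how tcp_timeout_cancel() later recovers both the callout id and the connection it references.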
19732 */ 19733 tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp, 19734 TICK_TO_NSEC(tim), CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); 19735 19736 return ((timeout_id_t)mp); 19737 } 19738 19739 static void 19740 tcp_timer_callback(void *arg) 19741 { 19742 mblk_t *mp = (mblk_t *)arg; 19743 tcp_timer_t *tcpt; 19744 conn_t *connp; 19745 19746 tcpt = (tcp_timer_t *)mp->b_rptr; 19747 connp = tcpt->connp; 19748 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, 19749 NULL, SQ_FILL, SQTAG_TCP_TIMER); 19750 } 19751 19752 /* ARGSUSED */ 19753 static void 19754 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 19755 { 19756 tcp_timer_t *tcpt; 19757 conn_t *connp = (conn_t *)arg; 19758 tcp_t *tcp = connp->conn_tcp; 19759 19760 tcpt = (tcp_timer_t *)mp->b_rptr; 19761 ASSERT(connp == tcpt->connp); 19762 ASSERT((squeue_t *)arg2 == connp->conn_sqp); 19763 19764 /* 19765 * If the TCP has reached the closed state, don't proceed any 19766 * further. This TCP logically does not exist on the system. 19767 * tcpt_proc could for example access queues, that have already 19768 * been qprocoff'ed off. 19769 */ 19770 if (tcp->tcp_state != TCPS_CLOSED) { 19771 (*tcpt->tcpt_proc)(connp); 19772 } else { 19773 tcp->tcp_timer_tid = 0; 19774 } 19775 tcp_timer_free(connp->conn_tcp, mp); 19776 } 19777 19778 /* 19779 * There is potential race with untimeout and the handler firing at the same 19780 * time. The mblock may be freed by the handler while we are trying to use 19781 * it. But since both should execute on the same squeue, this race should not 19782 * occur. 19783 */ 19784 clock_t 19785 tcp_timeout_cancel(conn_t *connp, timeout_id_t id) 19786 { 19787 mblk_t *mp = (mblk_t *)id; 19788 tcp_timer_t *tcpt; 19789 clock_t delta; 19790 19791 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs); 19792 19793 if (mp == NULL) 19794 return (-1); 19795 19796 tcpt = (tcp_timer_t *)mp->b_rptr; 19797 ASSERT(tcpt->connp == connp); 19798 19799 delta = untimeout_default(tcpt->tcpt_tid, 0); 19800 19801 if (delta >= 0) { 19802 TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled); 19803 tcp_timer_free(connp->conn_tcp, mp); 19804 CONN_DEC_REF(connp); 19805 } 19806 19807 return (delta); 19808 } 19809 19810 /* 19811 * Allocate space for the timer event. The allocation looks like mblk, but it is 19812 * not a proper mblk. To avoid confusion we set b_wptr to NULL. 19813 * 19814 * Dealing with failures: If we can't allocate from the timer cache we try 19815 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr 19816 * points to b_rptr. 19817 * If we can't allocate anything using allocb_tryhard(), we perform a last 19818 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and 19819 * save the actual allocation size in b_datap. 19820 */ 19821 mblk_t * 19822 tcp_timermp_alloc(int kmflags) 19823 { 19824 mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache, 19825 kmflags & ~KM_PANIC); 19826 19827 if (mp != NULL) { 19828 mp->b_next = mp->b_prev = NULL; 19829 mp->b_rptr = (uchar_t *)(&mp[1]); 19830 mp->b_wptr = NULL; 19831 mp->b_datap = NULL; 19832 mp->b_queue = NULL; 19833 mp->b_cont = NULL; 19834 } else if (kmflags & KM_PANIC) { 19835 /* 19836 * Failed to allocate memory for the timer. Try allocating from 19837 * dblock caches. 
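* The three allocation sources are distinguished later by b_wptr: NULL for the per-tcp timer cache, a pointer equal to b_rptr for the allocb_tryhard() path, and (uchar_t *)-1 (with the real size stashed in b_datap) for the final kmem_alloc_tryhard() fallback; tcp_timer_free() relies on exactly this encoding.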
19838 */ 19839 /* ipclassifier calls this from a constructor - hence no tcps */ 19840 TCP_G_STAT(tcp_timermp_allocfail); 19841 mp = allocb_tryhard(sizeof (tcp_timer_t)); 19842 if (mp == NULL) { 19843 size_t size = 0; 19844 /* 19845 * Memory is really low. Try tryhard allocation. 19846 * 19847 * ipclassifier calls this from a constructor - 19848 * hence no tcps 19849 */ 19850 TCP_G_STAT(tcp_timermp_allocdblfail); 19851 mp = kmem_alloc_tryhard(sizeof (mblk_t) + 19852 sizeof (tcp_timer_t), &size, kmflags); 19853 mp->b_rptr = (uchar_t *)(&mp[1]); 19854 mp->b_next = mp->b_prev = NULL; 19855 mp->b_wptr = (uchar_t *)-1; 19856 mp->b_datap = (dblk_t *)size; 19857 mp->b_queue = NULL; 19858 mp->b_cont = NULL; 19859 } 19860 ASSERT(mp->b_wptr != NULL); 19861 } 19862 /* ipclassifier calls this from a constructor - hence no tcps */ 19863 TCP_G_DBGSTAT(tcp_timermp_alloced); 19864 19865 return (mp); 19866 } 19867 19868 /* 19869 * Free per-tcp timer cache. 19870 * It can only contain entries from tcp_timercache. 19871 */ 19872 void 19873 tcp_timermp_free(tcp_t *tcp) 19874 { 19875 mblk_t *mp; 19876 19877 while ((mp = tcp->tcp_timercache) != NULL) { 19878 ASSERT(mp->b_wptr == NULL); 19879 tcp->tcp_timercache = tcp->tcp_timercache->b_next; 19880 kmem_cache_free(tcp_timercache, mp); 19881 } 19882 } 19883 19884 /* 19885 * Free timer event. Put it on the per-tcp timer cache if there is not too many 19886 * events there already (currently at most two events are cached). 19887 * If the event is not allocated from the timer cache, free it right away. 19888 */ 19889 static void 19890 tcp_timer_free(tcp_t *tcp, mblk_t *mp) 19891 { 19892 mblk_t *mp1 = tcp->tcp_timercache; 19893 19894 if (mp->b_wptr != NULL) { 19895 /* 19896 * This allocation is not from a timer cache, free it right 19897 * away. 19898 */ 19899 if (mp->b_wptr != (uchar_t *)-1) 19900 freeb(mp); 19901 else 19902 kmem_free(mp, (size_t)mp->b_datap); 19903 } else if (mp1 == NULL || mp1->b_next == NULL) { 19904 /* Cache this timer block for future allocations */ 19905 mp->b_rptr = (uchar_t *)(&mp[1]); 19906 mp->b_next = mp1; 19907 tcp->tcp_timercache = mp; 19908 } else { 19909 kmem_cache_free(tcp_timercache, mp); 19910 TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed); 19911 } 19912 } 19913 19914 /* 19915 * End of TCP Timers implementation. 19916 */ 19917 19918 /* 19919 * tcp_{set,clr}qfull() functions are used to either set or clear QFULL 19920 * on the specified backing STREAMS q. Note, the caller may make the 19921 * decision to call based on the tcp_t.tcp_flow_stopped value which 19922 * when check outside the q's lock is only an advisory check ... 19923 */ 19924 void 19925 tcp_setqfull(tcp_t *tcp) 19926 { 19927 tcp_stack_t *tcps = tcp->tcp_tcps; 19928 conn_t *connp = tcp->tcp_connp; 19929 19930 if (tcp->tcp_closed) 19931 return; 19932 19933 conn_setqfull(connp, &tcp->tcp_flow_stopped); 19934 if (tcp->tcp_flow_stopped) 19935 TCP_STAT(tcps, tcp_flwctl_on); 19936 } 19937 19938 void 19939 tcp_clrqfull(tcp_t *tcp) 19940 { 19941 conn_t *connp = tcp->tcp_connp; 19942 19943 if (tcp->tcp_closed) 19944 return; 19945 conn_clrqfull(connp, &tcp->tcp_flow_stopped); 19946 } 19947 19948 /* 19949 * kstats related to squeues i.e. 
not per IP instance 19950 */ 19951 static void * 19952 tcp_g_kstat_init(tcp_g_stat_t *tcp_g_statp) 19953 { 19954 kstat_t *ksp; 19955 19956 tcp_g_stat_t template = { 19957 { "tcp_timermp_alloced", KSTAT_DATA_UINT64 }, 19958 { "tcp_timermp_allocfail", KSTAT_DATA_UINT64 }, 19959 { "tcp_timermp_allocdblfail", KSTAT_DATA_UINT64 }, 19960 { "tcp_freelist_cleanup", KSTAT_DATA_UINT64 }, 19961 }; 19962 19963 ksp = kstat_create(TCP_MOD_NAME, 0, "tcpstat_g", "net", 19964 KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 19965 KSTAT_FLAG_VIRTUAL); 19966 19967 if (ksp == NULL) 19968 return (NULL); 19969 19970 bcopy(&template, tcp_g_statp, sizeof (template)); 19971 ksp->ks_data = (void *)tcp_g_statp; 19972 19973 kstat_install(ksp); 19974 return (ksp); 19975 } 19976 19977 static void 19978 tcp_g_kstat_fini(kstat_t *ksp) 19979 { 19980 if (ksp != NULL) { 19981 kstat_delete(ksp); 19982 } 19983 } 19984 19985 19986 static void * 19987 tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp) 19988 { 19989 kstat_t *ksp; 19990 19991 tcp_stat_t template = { 19992 { "tcp_time_wait", KSTAT_DATA_UINT64 }, 19993 { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, 19994 { "tcp_time_wait_syn_success", KSTAT_DATA_UINT64 }, 19995 { "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 }, 19996 { "tcp_detach_time_wait", KSTAT_DATA_UINT64 }, 19997 { "tcp_time_wait_reap", KSTAT_DATA_UINT64 }, 19998 { "tcp_clean_death_nondetached", KSTAT_DATA_UINT64 }, 19999 { "tcp_reinit_calls", KSTAT_DATA_UINT64 }, 20000 { "tcp_eager_err1", KSTAT_DATA_UINT64 }, 20001 { "tcp_eager_err2", KSTAT_DATA_UINT64 }, 20002 { "tcp_eager_blowoff_calls", KSTAT_DATA_UINT64 }, 20003 { "tcp_eager_blowoff_q", KSTAT_DATA_UINT64 }, 20004 { "tcp_eager_blowoff_q0", KSTAT_DATA_UINT64 }, 20005 { "tcp_not_hard_bound", KSTAT_DATA_UINT64 }, 20006 { "tcp_no_listener", KSTAT_DATA_UINT64 }, 20007 { "tcp_found_eager", KSTAT_DATA_UINT64 }, 20008 { "tcp_wrong_queue", KSTAT_DATA_UINT64 }, 20009 { "tcp_found_eager_binding1", KSTAT_DATA_UINT64 }, 20010 { "tcp_found_eager_bound1", KSTAT_DATA_UINT64 }, 20011 { "tcp_eager_has_listener1", KSTAT_DATA_UINT64 }, 20012 { "tcp_open_alloc", KSTAT_DATA_UINT64 }, 20013 { "tcp_open_detached_alloc", KSTAT_DATA_UINT64 }, 20014 { "tcp_rput_time_wait", KSTAT_DATA_UINT64 }, 20015 { "tcp_listendrop", KSTAT_DATA_UINT64 }, 20016 { "tcp_listendropq0", KSTAT_DATA_UINT64 }, 20017 { "tcp_wrong_rq", KSTAT_DATA_UINT64 }, 20018 { "tcp_rsrv_calls", KSTAT_DATA_UINT64 }, 20019 { "tcp_eagerfree2", KSTAT_DATA_UINT64 }, 20020 { "tcp_eagerfree3", KSTAT_DATA_UINT64 }, 20021 { "tcp_eagerfree4", KSTAT_DATA_UINT64 }, 20022 { "tcp_eagerfree5", KSTAT_DATA_UINT64 }, 20023 { "tcp_timewait_syn_fail", KSTAT_DATA_UINT64 }, 20024 { "tcp_listen_badflags", KSTAT_DATA_UINT64 }, 20025 { "tcp_timeout_calls", KSTAT_DATA_UINT64 }, 20026 { "tcp_timeout_cached_alloc", KSTAT_DATA_UINT64 }, 20027 { "tcp_timeout_cancel_reqs", KSTAT_DATA_UINT64 }, 20028 { "tcp_timeout_canceled", KSTAT_DATA_UINT64 }, 20029 { "tcp_timermp_freed", KSTAT_DATA_UINT64 }, 20030 { "tcp_push_timer_cnt", KSTAT_DATA_UINT64 }, 20031 { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 }, 20032 { "tcp_wsrv_called", KSTAT_DATA_UINT64 }, 20033 { "tcp_flwctl_on", KSTAT_DATA_UINT64 }, 20034 { "tcp_timer_fire_early", KSTAT_DATA_UINT64 }, 20035 { "tcp_timer_fire_miss", KSTAT_DATA_UINT64 }, 20036 { "tcp_rput_v6_error", KSTAT_DATA_UINT64 }, 20037 { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, 20038 { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, 20039 { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, 20040 { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 
}, 20041 { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, 20042 { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, 20043 { "tcp_fusion_putnext", KSTAT_DATA_UINT64 }, 20044 { "tcp_fusion_unfusable", KSTAT_DATA_UINT64 }, 20045 { "tcp_fusion_aborted", KSTAT_DATA_UINT64 }, 20046 { "tcp_fusion_unqualified", KSTAT_DATA_UINT64 }, 20047 { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64 }, 20048 { "tcp_fusion_rrw_msgcnt", KSTAT_DATA_UINT64 }, 20049 { "tcp_fusion_rrw_plugged", KSTAT_DATA_UINT64 }, 20050 { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 }, 20051 { "tcp_sock_fallback", KSTAT_DATA_UINT64 }, 20052 { "tcp_lso_enabled", KSTAT_DATA_UINT64 }, 20053 { "tcp_lso_disabled", KSTAT_DATA_UINT64 }, 20054 { "tcp_lso_times", KSTAT_DATA_UINT64 }, 20055 { "tcp_lso_pkt_out", KSTAT_DATA_UINT64 }, 20056 { "tcp_listen_cnt_drop", KSTAT_DATA_UINT64 }, 20057 { "tcp_listen_mem_drop", KSTAT_DATA_UINT64 }, 20058 { "tcp_zwin_ack_syn", KSTAT_DATA_UINT64 }, 20059 { "tcp_rst_unsent", KSTAT_DATA_UINT64 } 20060 }; 20061 20062 ksp = kstat_create_netstack(TCP_MOD_NAME, 0, "tcpstat", "net", 20063 KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 20064 KSTAT_FLAG_VIRTUAL, stackid); 20065 20066 if (ksp == NULL) 20067 return (NULL); 20068 20069 bcopy(&template, tcps_statisticsp, sizeof (template)); 20070 ksp->ks_data = (void *)tcps_statisticsp; 20071 ksp->ks_private = (void *)(uintptr_t)stackid; 20072 20073 kstat_install(ksp); 20074 return (ksp); 20075 } 20076 20077 static void 20078 tcp_kstat2_fini(netstackid_t stackid, kstat_t *ksp) 20079 { 20080 if (ksp != NULL) { 20081 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 20082 kstat_delete_netstack(ksp, stackid); 20083 } 20084 } 20085 20086 /* 20087 * TCP Kstats implementation 20088 */ 20089 static void * 20090 tcp_kstat_init(netstackid_t stackid, tcp_stack_t *tcps) 20091 { 20092 kstat_t *ksp; 20093 20094 tcp_named_kstat_t template = { 20095 { "rtoAlgorithm", KSTAT_DATA_INT32, 0 }, 20096 { "rtoMin", KSTAT_DATA_INT32, 0 }, 20097 { "rtoMax", KSTAT_DATA_INT32, 0 }, 20098 { "maxConn", KSTAT_DATA_INT32, 0 }, 20099 { "activeOpens", KSTAT_DATA_UINT32, 0 }, 20100 { "passiveOpens", KSTAT_DATA_UINT32, 0 }, 20101 { "attemptFails", KSTAT_DATA_UINT32, 0 }, 20102 { "estabResets", KSTAT_DATA_UINT32, 0 }, 20103 { "currEstab", KSTAT_DATA_UINT32, 0 }, 20104 { "inSegs", KSTAT_DATA_UINT64, 0 }, 20105 { "outSegs", KSTAT_DATA_UINT64, 0 }, 20106 { "retransSegs", KSTAT_DATA_UINT32, 0 }, 20107 { "connTableSize", KSTAT_DATA_INT32, 0 }, 20108 { "outRsts", KSTAT_DATA_UINT32, 0 }, 20109 { "outDataSegs", KSTAT_DATA_UINT32, 0 }, 20110 { "outDataBytes", KSTAT_DATA_UINT32, 0 }, 20111 { "retransBytes", KSTAT_DATA_UINT32, 0 }, 20112 { "outAck", KSTAT_DATA_UINT32, 0 }, 20113 { "outAckDelayed", KSTAT_DATA_UINT32, 0 }, 20114 { "outUrg", KSTAT_DATA_UINT32, 0 }, 20115 { "outWinUpdate", KSTAT_DATA_UINT32, 0 }, 20116 { "outWinProbe", KSTAT_DATA_UINT32, 0 }, 20117 { "outControl", KSTAT_DATA_UINT32, 0 }, 20118 { "outFastRetrans", KSTAT_DATA_UINT32, 0 }, 20119 { "inAckSegs", KSTAT_DATA_UINT32, 0 }, 20120 { "inAckBytes", KSTAT_DATA_UINT32, 0 }, 20121 { "inDupAck", KSTAT_DATA_UINT32, 0 }, 20122 { "inAckUnsent", KSTAT_DATA_UINT32, 0 }, 20123 { "inDataInorderSegs", KSTAT_DATA_UINT32, 0 }, 20124 { "inDataInorderBytes", KSTAT_DATA_UINT32, 0 }, 20125 { "inDataUnorderSegs", KSTAT_DATA_UINT32, 0 }, 20126 { "inDataUnorderBytes", KSTAT_DATA_UINT32, 0 }, 20127 { "inDataDupSegs", KSTAT_DATA_UINT32, 0 }, 20128 { "inDataDupBytes", KSTAT_DATA_UINT32, 0 }, 20129 { "inDataPartDupSegs", KSTAT_DATA_UINT32, 0 }, 20130 { 
"inDataPartDupBytes", KSTAT_DATA_UINT32, 0 }, 20131 { "inDataPastWinSegs", KSTAT_DATA_UINT32, 0 }, 20132 { "inDataPastWinBytes", KSTAT_DATA_UINT32, 0 }, 20133 { "inWinProbe", KSTAT_DATA_UINT32, 0 }, 20134 { "inWinUpdate", KSTAT_DATA_UINT32, 0 }, 20135 { "inClosed", KSTAT_DATA_UINT32, 0 }, 20136 { "rttUpdate", KSTAT_DATA_UINT32, 0 }, 20137 { "rttNoUpdate", KSTAT_DATA_UINT32, 0 }, 20138 { "timRetrans", KSTAT_DATA_UINT32, 0 }, 20139 { "timRetransDrop", KSTAT_DATA_UINT32, 0 }, 20140 { "timKeepalive", KSTAT_DATA_UINT32, 0 }, 20141 { "timKeepaliveProbe", KSTAT_DATA_UINT32, 0 }, 20142 { "timKeepaliveDrop", KSTAT_DATA_UINT32, 0 }, 20143 { "listenDrop", KSTAT_DATA_UINT32, 0 }, 20144 { "listenDropQ0", KSTAT_DATA_UINT32, 0 }, 20145 { "halfOpenDrop", KSTAT_DATA_UINT32, 0 }, 20146 { "outSackRetransSegs", KSTAT_DATA_UINT32, 0 }, 20147 { "connTableSize6", KSTAT_DATA_INT32, 0 } 20148 }; 20149 20150 ksp = kstat_create_netstack(TCP_MOD_NAME, 0, TCP_MOD_NAME, "mib2", 20151 KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0, stackid); 20152 20153 if (ksp == NULL) 20154 return (NULL); 20155 20156 template.rtoAlgorithm.value.ui32 = 4; 20157 template.rtoMin.value.ui32 = tcps->tcps_rexmit_interval_min; 20158 template.rtoMax.value.ui32 = tcps->tcps_rexmit_interval_max; 20159 template.maxConn.value.i32 = -1; 20160 20161 bcopy(&template, ksp->ks_data, sizeof (template)); 20162 ksp->ks_update = tcp_kstat_update; 20163 ksp->ks_private = (void *)(uintptr_t)stackid; 20164 20165 kstat_install(ksp); 20166 return (ksp); 20167 } 20168 20169 static void 20170 tcp_kstat_fini(netstackid_t stackid, kstat_t *ksp) 20171 { 20172 if (ksp != NULL) { 20173 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 20174 kstat_delete_netstack(ksp, stackid); 20175 } 20176 } 20177 20178 static int 20179 tcp_kstat_update(kstat_t *kp, int rw) 20180 { 20181 tcp_named_kstat_t *tcpkp; 20182 tcp_t *tcp; 20183 connf_t *connfp; 20184 conn_t *connp; 20185 int i; 20186 netstackid_t stackid = (netstackid_t)(uintptr_t)kp->ks_private; 20187 netstack_t *ns; 20188 tcp_stack_t *tcps; 20189 ip_stack_t *ipst; 20190 20191 if ((kp == NULL) || (kp->ks_data == NULL)) 20192 return (EIO); 20193 20194 if (rw == KSTAT_WRITE) 20195 return (EACCES); 20196 20197 ns = netstack_find_by_stackid(stackid); 20198 if (ns == NULL) 20199 return (-1); 20200 tcps = ns->netstack_tcp; 20201 if (tcps == NULL) { 20202 netstack_rele(ns); 20203 return (-1); 20204 } 20205 20206 tcpkp = (tcp_named_kstat_t *)kp->ks_data; 20207 20208 tcpkp->currEstab.value.ui32 = 0; 20209 20210 ipst = ns->netstack_ip; 20211 20212 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 20213 connfp = &ipst->ips_ipcl_globalhash_fanout[i]; 20214 connp = NULL; 20215 while ((connp = 20216 ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { 20217 tcp = connp->conn_tcp; 20218 switch (tcp_snmp_state(tcp)) { 20219 case MIB2_TCP_established: 20220 case MIB2_TCP_closeWait: 20221 tcpkp->currEstab.value.ui32++; 20222 break; 20223 } 20224 } 20225 } 20226 20227 tcpkp->activeOpens.value.ui32 = tcps->tcps_mib.tcpActiveOpens; 20228 tcpkp->passiveOpens.value.ui32 = tcps->tcps_mib.tcpPassiveOpens; 20229 tcpkp->attemptFails.value.ui32 = tcps->tcps_mib.tcpAttemptFails; 20230 tcpkp->estabResets.value.ui32 = tcps->tcps_mib.tcpEstabResets; 20231 tcpkp->inSegs.value.ui64 = tcps->tcps_mib.tcpHCInSegs; 20232 tcpkp->outSegs.value.ui64 = tcps->tcps_mib.tcpHCOutSegs; 20233 tcpkp->retransSegs.value.ui32 = tcps->tcps_mib.tcpRetransSegs; 20234 tcpkp->connTableSize.value.i32 = tcps->tcps_mib.tcpConnTableSize; 20235 
tcpkp->outRsts.value.ui32 = tcps->tcps_mib.tcpOutRsts; 20236 tcpkp->outDataSegs.value.ui32 = tcps->tcps_mib.tcpOutDataSegs; 20237 tcpkp->outDataBytes.value.ui32 = tcps->tcps_mib.tcpOutDataBytes; 20238 tcpkp->retransBytes.value.ui32 = tcps->tcps_mib.tcpRetransBytes; 20239 tcpkp->outAck.value.ui32 = tcps->tcps_mib.tcpOutAck; 20240 tcpkp->outAckDelayed.value.ui32 = tcps->tcps_mib.tcpOutAckDelayed; 20241 tcpkp->outUrg.value.ui32 = tcps->tcps_mib.tcpOutUrg; 20242 tcpkp->outWinUpdate.value.ui32 = tcps->tcps_mib.tcpOutWinUpdate; 20243 tcpkp->outWinProbe.value.ui32 = tcps->tcps_mib.tcpOutWinProbe; 20244 tcpkp->outControl.value.ui32 = tcps->tcps_mib.tcpOutControl; 20245 tcpkp->outFastRetrans.value.ui32 = tcps->tcps_mib.tcpOutFastRetrans; 20246 tcpkp->inAckSegs.value.ui32 = tcps->tcps_mib.tcpInAckSegs; 20247 tcpkp->inAckBytes.value.ui32 = tcps->tcps_mib.tcpInAckBytes; 20248 tcpkp->inDupAck.value.ui32 = tcps->tcps_mib.tcpInDupAck; 20249 tcpkp->inAckUnsent.value.ui32 = tcps->tcps_mib.tcpInAckUnsent; 20250 tcpkp->inDataInorderSegs.value.ui32 = 20251 tcps->tcps_mib.tcpInDataInorderSegs; 20252 tcpkp->inDataInorderBytes.value.ui32 = 20253 tcps->tcps_mib.tcpInDataInorderBytes; 20254 tcpkp->inDataUnorderSegs.value.ui32 = 20255 tcps->tcps_mib.tcpInDataUnorderSegs; 20256 tcpkp->inDataUnorderBytes.value.ui32 = 20257 tcps->tcps_mib.tcpInDataUnorderBytes; 20258 tcpkp->inDataDupSegs.value.ui32 = tcps->tcps_mib.tcpInDataDupSegs; 20259 tcpkp->inDataDupBytes.value.ui32 = tcps->tcps_mib.tcpInDataDupBytes; 20260 tcpkp->inDataPartDupSegs.value.ui32 = 20261 tcps->tcps_mib.tcpInDataPartDupSegs; 20262 tcpkp->inDataPartDupBytes.value.ui32 = 20263 tcps->tcps_mib.tcpInDataPartDupBytes; 20264 tcpkp->inDataPastWinSegs.value.ui32 = 20265 tcps->tcps_mib.tcpInDataPastWinSegs; 20266 tcpkp->inDataPastWinBytes.value.ui32 = 20267 tcps->tcps_mib.tcpInDataPastWinBytes; 20268 tcpkp->inWinProbe.value.ui32 = tcps->tcps_mib.tcpInWinProbe; 20269 tcpkp->inWinUpdate.value.ui32 = tcps->tcps_mib.tcpInWinUpdate; 20270 tcpkp->inClosed.value.ui32 = tcps->tcps_mib.tcpInClosed; 20271 tcpkp->rttNoUpdate.value.ui32 = tcps->tcps_mib.tcpRttNoUpdate; 20272 tcpkp->rttUpdate.value.ui32 = tcps->tcps_mib.tcpRttUpdate; 20273 tcpkp->timRetrans.value.ui32 = tcps->tcps_mib.tcpTimRetrans; 20274 tcpkp->timRetransDrop.value.ui32 = tcps->tcps_mib.tcpTimRetransDrop; 20275 tcpkp->timKeepalive.value.ui32 = tcps->tcps_mib.tcpTimKeepalive; 20276 tcpkp->timKeepaliveProbe.value.ui32 = 20277 tcps->tcps_mib.tcpTimKeepaliveProbe; 20278 tcpkp->timKeepaliveDrop.value.ui32 = 20279 tcps->tcps_mib.tcpTimKeepaliveDrop; 20280 tcpkp->listenDrop.value.ui32 = tcps->tcps_mib.tcpListenDrop; 20281 tcpkp->listenDropQ0.value.ui32 = tcps->tcps_mib.tcpListenDropQ0; 20282 tcpkp->halfOpenDrop.value.ui32 = tcps->tcps_mib.tcpHalfOpenDrop; 20283 tcpkp->outSackRetransSegs.value.ui32 = 20284 tcps->tcps_mib.tcpOutSackRetransSegs; 20285 tcpkp->connTableSize6.value.i32 = tcps->tcps_mib.tcp6ConnTableSize; 20286 20287 netstack_rele(ns); 20288 return (0); 20289 } 20290 20291 static int 20292 tcp_squeue_switch(int val) 20293 { 20294 int rval = SQ_FILL; 20295 20296 switch (val) { 20297 case 1: 20298 rval = SQ_NODRAIN; 20299 break; 20300 case 2: 20301 rval = SQ_PROCESS; 20302 break; 20303 default: 20304 break; 20305 } 20306 return (rval); 20307 } 20308 20309 /* 20310 * This is called once for each squeue - globally for all stack 20311 * instances. 
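 *
 * It hangs a tcp_squeue_priv_t off the squeue's SQPRIVATE_TCP slot and
 * arms the per-squeue TIME_WAIT collector timer. A rough sketch of how
 * a later consumer would get that state back (assuming sqp identifies
 * the squeue of interest):
 *
 *	tcp_squeue_priv_t *tsp;
 *
 *	tsp = (tcp_squeue_priv_t *)*squeue_getprivate(sqp, SQPRIVATE_TCP);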
20312 */ 20313 static void 20314 tcp_squeue_add(squeue_t *sqp) 20315 { 20316 tcp_squeue_priv_t *tcp_time_wait = kmem_zalloc( 20317 sizeof (tcp_squeue_priv_t), KM_SLEEP); 20318 20319 *squeue_getprivate(sqp, SQPRIVATE_TCP) = (intptr_t)tcp_time_wait; 20320 tcp_time_wait->tcp_time_wait_tid = 20321 timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp, 20322 TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION, 20323 CALLOUT_FLAG_ROUNDUP); 20324 if (tcp_free_list_max_cnt == 0) { 20325 int tcp_ncpus = ((boot_max_ncpus == -1) ? 20326 max_ncpus : boot_max_ncpus); 20327 20328 /* 20329 * Limit number of entries to 1% of availble memory / tcp_ncpus 20330 */ 20331 tcp_free_list_max_cnt = (freemem * PAGESIZE) / 20332 (tcp_ncpus * sizeof (tcp_t) * 100); 20333 } 20334 tcp_time_wait->tcp_free_list_cnt = 0; 20335 } 20336 20337 /* 20338 * On a labeled system we have some protocols above TCP, such as RPC, which 20339 * appear to assume that every mblk in a chain has a db_credp. 20340 */ 20341 static void 20342 tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira) 20343 { 20344 ASSERT(is_system_labeled()); 20345 ASSERT(ira->ira_cred != NULL); 20346 20347 while (mp != NULL) { 20348 mblk_setcred(mp, ira->ira_cred, NOPID); 20349 mp = mp->b_cont; 20350 } 20351 } 20352 20353 static int 20354 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, 20355 boolean_t bind_to_req_port_only, cred_t *cr) 20356 { 20357 in_port_t mlp_port; 20358 mlp_type_t addrtype, mlptype; 20359 boolean_t user_specified; 20360 in_port_t allocated_port; 20361 in_port_t requested_port = *requested_port_ptr; 20362 conn_t *connp = tcp->tcp_connp; 20363 zone_t *zone; 20364 tcp_stack_t *tcps = tcp->tcp_tcps; 20365 in6_addr_t v6addr = connp->conn_laddr_v6; 20366 20367 /* 20368 * XXX It's up to the caller to specify bind_to_req_port_only or not. 20369 */ 20370 ASSERT(cr != NULL); 20371 20372 /* 20373 * Get a valid port (within the anonymous range and should not 20374 * be a privileged one) to use if the user has not given a port. 20375 * If multiple threads are here, they may all start with 20376 * with the same initial port. But, it should be fine as long as 20377 * tcp_bindi will ensure that no two threads will be assigned 20378 * the same port. 20379 * 20380 * NOTE: XXX If a privileged process asks for an anonymous port, we 20381 * still check for ports only in the range > tcp_smallest_non_priv_port, 20382 * unless TCP_ANONPRIVBIND option is set. 20383 */ 20384 mlptype = mlptSingle; 20385 mlp_port = requested_port; 20386 if (requested_port == 0) { 20387 requested_port = connp->conn_anon_priv_bind ? 20388 tcp_get_next_priv_port(tcp) : 20389 tcp_update_next_port(tcps->tcps_next_port_to_try, 20390 tcp, B_TRUE); 20391 if (requested_port == 0) { 20392 return (-TNOADDR); 20393 } 20394 user_specified = B_FALSE; 20395 20396 /* 20397 * If the user went through one of the RPC interfaces to create 20398 * this socket and RPC is MLP in this zone, then give him an 20399 * anonymous MLP. 20400 */ 20401 if (connp->conn_anon_mlp && is_system_labeled()) { 20402 zone = crgetzone(cr); 20403 addrtype = tsol_mlp_addr_type( 20404 connp->conn_allzones ? 
ALL_ZONES : zone->zone_id, 20405 IPV6_VERSION, &v6addr, 20406 tcps->tcps_netstack->netstack_ip); 20407 if (addrtype == mlptSingle) { 20408 return (-TNOADDR); 20409 } 20410 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 20411 PMAPPORT, addrtype); 20412 mlp_port = PMAPPORT; 20413 } 20414 } else { 20415 int i; 20416 boolean_t priv = B_FALSE; 20417 20418 /* 20419 * If the requested_port is in the well-known privileged range, 20420 * verify that the stream was opened by a privileged user. 20421 * Note: No locks are held when inspecting tcp_g_*epriv_ports 20422 * but instead the code relies on: 20423 * - the fact that the address of the array and its size never 20424 * changes 20425 * - the atomic assignment of the elements of the array 20426 */ 20427 if (requested_port < tcps->tcps_smallest_nonpriv_port) { 20428 priv = B_TRUE; 20429 } else { 20430 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 20431 if (requested_port == 20432 tcps->tcps_g_epriv_ports[i]) { 20433 priv = B_TRUE; 20434 break; 20435 } 20436 } 20437 } 20438 if (priv) { 20439 if (secpolicy_net_privaddr(cr, requested_port, 20440 IPPROTO_TCP) != 0) { 20441 if (connp->conn_debug) { 20442 (void) strlog(TCP_MOD_ID, 0, 1, 20443 SL_ERROR|SL_TRACE, 20444 "tcp_bind: no priv for port %d", 20445 requested_port); 20446 } 20447 return (-TACCES); 20448 } 20449 } 20450 user_specified = B_TRUE; 20451 20452 connp = tcp->tcp_connp; 20453 if (is_system_labeled()) { 20454 zone = crgetzone(cr); 20455 addrtype = tsol_mlp_addr_type( 20456 connp->conn_allzones ? ALL_ZONES : zone->zone_id, 20457 IPV6_VERSION, &v6addr, 20458 tcps->tcps_netstack->netstack_ip); 20459 if (addrtype == mlptSingle) { 20460 return (-TNOADDR); 20461 } 20462 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 20463 requested_port, addrtype); 20464 } 20465 } 20466 20467 if (mlptype != mlptSingle) { 20468 if (secpolicy_net_bindmlp(cr) != 0) { 20469 if (connp->conn_debug) { 20470 (void) strlog(TCP_MOD_ID, 0, 1, 20471 SL_ERROR|SL_TRACE, 20472 "tcp_bind: no priv for multilevel port %d", 20473 requested_port); 20474 } 20475 return (-TACCES); 20476 } 20477 20478 /* 20479 * If we're specifically binding a shared IP address and the 20480 * port is MLP on shared addresses, then check to see if this 20481 * zone actually owns the MLP. Reject if not. 20482 */ 20483 if (mlptype == mlptShared && addrtype == mlptShared) { 20484 /* 20485 * No need to handle exclusive-stack zones since 20486 * ALL_ZONES only applies to the shared stack. 
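 *
 * Note that the port checked here is mlp_port, i.e. the port the
 * caller asked for, or PMAPPORT when an anonymous RPC MLP was set
 * up above.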
20487 */ 20488 zoneid_t mlpzone; 20489 20490 mlpzone = tsol_mlp_findzone(IPPROTO_TCP, 20491 htons(mlp_port)); 20492 if (connp->conn_zoneid != mlpzone) { 20493 if (connp->conn_debug) { 20494 (void) strlog(TCP_MOD_ID, 0, 1, 20495 SL_ERROR|SL_TRACE, 20496 "tcp_bind: attempt to bind port " 20497 "%d on shared addr in zone %d " 20498 "(should be %d)", 20499 mlp_port, connp->conn_zoneid, 20500 mlpzone); 20501 } 20502 return (-TACCES); 20503 } 20504 } 20505 20506 if (!user_specified) { 20507 int err; 20508 err = tsol_mlp_anon(zone, mlptype, connp->conn_proto, 20509 requested_port, B_TRUE); 20510 if (err != 0) { 20511 if (connp->conn_debug) { 20512 (void) strlog(TCP_MOD_ID, 0, 1, 20513 SL_ERROR|SL_TRACE, 20514 "tcp_bind: cannot establish anon " 20515 "MLP for port %d", 20516 requested_port); 20517 } 20518 return (err); 20519 } 20520 connp->conn_anon_port = B_TRUE; 20521 } 20522 connp->conn_mlp_type = mlptype; 20523 } 20524 20525 allocated_port = tcp_bindi(tcp, requested_port, &v6addr, 20526 connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, 20527 user_specified); 20528 20529 if (allocated_port == 0) { 20530 connp->conn_mlp_type = mlptSingle; 20531 if (connp->conn_anon_port) { 20532 connp->conn_anon_port = B_FALSE; 20533 (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto, 20534 requested_port, B_FALSE); 20535 } 20536 if (bind_to_req_port_only) { 20537 if (connp->conn_debug) { 20538 (void) strlog(TCP_MOD_ID, 0, 1, 20539 SL_ERROR|SL_TRACE, 20540 "tcp_bind: requested addr busy"); 20541 } 20542 return (-TADDRBUSY); 20543 } else { 20544 /* If we are out of ports, fail the bind. */ 20545 if (connp->conn_debug) { 20546 (void) strlog(TCP_MOD_ID, 0, 1, 20547 SL_ERROR|SL_TRACE, 20548 "tcp_bind: out of ports?"); 20549 } 20550 return (-TNOADDR); 20551 } 20552 } 20553 20554 /* Pass the allocated port back */ 20555 *requested_port_ptr = allocated_port; 20556 return (0); 20557 } 20558 20559 /* 20560 * Check the address and check/pick a local port number. 
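 *
 * Returns zero on success. On failure the return value is either a
 * positive errno (e.g. EADDRNOTAVAIL, EAFNOSUPPORT) or a negative TLI
 * error (e.g. -TOUTSTATE, -TPROTO, -TADDRBUSY); the negative values are
 * eventually mapped to errnos by the socket entry points, see tcp_bind()
 * and tcp_listen() below.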
20561 */ 20562 static int 20563 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, 20564 boolean_t bind_to_req_port_only) 20565 { 20566 tcp_t *tcp = connp->conn_tcp; 20567 sin_t *sin; 20568 sin6_t *sin6; 20569 in_port_t requested_port; 20570 ipaddr_t v4addr; 20571 in6_addr_t v6addr; 20572 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 20573 zoneid_t zoneid = IPCL_ZONEID(connp); 20574 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 20575 uint_t scopeid = 0; 20576 int error = 0; 20577 ip_xmit_attr_t *ixa = connp->conn_ixa; 20578 20579 ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); 20580 20581 if (tcp->tcp_state == TCPS_BOUND) { 20582 return (0); 20583 } else if (tcp->tcp_state > TCPS_BOUND) { 20584 if (connp->conn_debug) { 20585 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 20586 "tcp_bind: bad state, %d", tcp->tcp_state); 20587 } 20588 return (-TOUTSTATE); 20589 } 20590 20591 ASSERT(sa != NULL && len != 0); 20592 20593 if (!OK_32PTR((char *)sa)) { 20594 if (connp->conn_debug) { 20595 (void) strlog(TCP_MOD_ID, 0, 1, 20596 SL_ERROR|SL_TRACE, 20597 "tcp_bind: bad address parameter, " 20598 "address %p, len %d", 20599 (void *)sa, len); 20600 } 20601 return (-TPROTO); 20602 } 20603 20604 error = proto_verify_ip_addr(connp->conn_family, sa, len); 20605 if (error != 0) { 20606 return (error); 20607 } 20608 20609 switch (len) { 20610 case sizeof (sin_t): /* Complete IPv4 address */ 20611 sin = (sin_t *)sa; 20612 requested_port = ntohs(sin->sin_port); 20613 v4addr = sin->sin_addr.s_addr; 20614 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); 20615 if (v4addr != INADDR_ANY) { 20616 laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst, 20617 B_FALSE); 20618 } 20619 break; 20620 20621 case sizeof (sin6_t): /* Complete IPv6 address */ 20622 sin6 = (sin6_t *)sa; 20623 v6addr = sin6->sin6_addr; 20624 requested_port = ntohs(sin6->sin6_port); 20625 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) { 20626 if (connp->conn_ipv6_v6only) 20627 return (EADDRNOTAVAIL); 20628 20629 IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr); 20630 if (v4addr != INADDR_ANY) { 20631 laddr_type = ip_laddr_verify_v4(v4addr, 20632 zoneid, ipst, B_FALSE); 20633 } 20634 } else { 20635 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) { 20636 if (IN6_IS_ADDR_LINKSCOPE(&v6addr)) 20637 scopeid = sin6->sin6_scope_id; 20638 laddr_type = ip_laddr_verify_v6(&v6addr, 20639 zoneid, ipst, B_FALSE, scopeid); 20640 } 20641 } 20642 break; 20643 20644 default: 20645 if (connp->conn_debug) { 20646 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 20647 "tcp_bind: bad address length, %d", len); 20648 } 20649 return (EAFNOSUPPORT); 20650 /* return (-TBADADDR); */ 20651 } 20652 20653 /* Is the local address a valid unicast address? 
*/ 20654 if (laddr_type == IPVL_BAD) 20655 return (EADDRNOTAVAIL); 20656 20657 connp->conn_bound_addr_v6 = v6addr; 20658 if (scopeid != 0) { 20659 ixa->ixa_flags |= IXAF_SCOPEID_SET; 20660 ixa->ixa_scopeid = scopeid; 20661 connp->conn_incoming_ifindex = scopeid; 20662 } else { 20663 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 20664 connp->conn_incoming_ifindex = connp->conn_bound_if; 20665 } 20666 20667 connp->conn_laddr_v6 = v6addr; 20668 connp->conn_saddr_v6 = v6addr; 20669 20670 bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; 20671 20672 error = tcp_bind_select_lport(tcp, &requested_port, 20673 bind_to_req_port_only, cr); 20674 if (error != 0) { 20675 connp->conn_laddr_v6 = ipv6_all_zeros; 20676 connp->conn_saddr_v6 = ipv6_all_zeros; 20677 connp->conn_bound_addr_v6 = ipv6_all_zeros; 20678 } 20679 return (error); 20680 } 20681 20682 /* 20683 * Return unix error is tli error is TSYSERR, otherwise return a negative 20684 * tli error. 20685 */ 20686 int 20687 tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, 20688 boolean_t bind_to_req_port_only) 20689 { 20690 int error; 20691 tcp_t *tcp = connp->conn_tcp; 20692 20693 if (tcp->tcp_state >= TCPS_BOUND) { 20694 if (connp->conn_debug) { 20695 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 20696 "tcp_bind: bad state, %d", tcp->tcp_state); 20697 } 20698 return (-TOUTSTATE); 20699 } 20700 20701 error = tcp_bind_check(connp, sa, len, cr, bind_to_req_port_only); 20702 if (error != 0) 20703 return (error); 20704 20705 ASSERT(tcp->tcp_state == TCPS_BOUND); 20706 tcp->tcp_conn_req_max = 0; 20707 return (0); 20708 } 20709 20710 int 20711 tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, 20712 socklen_t len, cred_t *cr) 20713 { 20714 int error; 20715 conn_t *connp = (conn_t *)proto_handle; 20716 squeue_t *sqp = connp->conn_sqp; 20717 20718 /* All Solaris components should pass a cred for this operation. */ 20719 ASSERT(cr != NULL); 20720 20721 ASSERT(sqp != NULL); 20722 ASSERT(connp->conn_upper_handle != NULL); 20723 20724 error = squeue_synch_enter(sqp, connp, NULL); 20725 if (error != 0) { 20726 /* failed to enter */ 20727 return (ENOSR); 20728 } 20729 20730 /* binding to a NULL address really means unbind */ 20731 if (sa == NULL) { 20732 if (connp->conn_tcp->tcp_state < TCPS_LISTEN) 20733 error = tcp_do_unbind(connp); 20734 else 20735 error = EINVAL; 20736 } else { 20737 error = tcp_do_bind(connp, sa, len, cr, B_TRUE); 20738 } 20739 20740 squeue_synch_exit(sqp, connp); 20741 20742 if (error < 0) { 20743 if (error == -TOUTSTATE) 20744 error = EINVAL; 20745 else 20746 error = proto_tlitosyserr(-error); 20747 } 20748 20749 return (error); 20750 } 20751 20752 /* 20753 * If the return value from this function is positive, it's a UNIX error. 20754 * Otherwise, if it's negative, then the absolute value is a TLI error. 20755 * the TPI routine tcp_tpi_connect() is a wrapper function for this. 
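 *
 * For the non-STREAMS path, tcp_connect() below does the mapping itself,
 * roughly (a sketch of the shape, not the exact code):
 *
 *	error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid);
 *	if (error < 0)
 *		error = (error == -TOUTSTATE) ?
 *		    <errno picked from tcp_state> : proto_tlitosyserr(-error);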
20756 */ 20757 int 20758 tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, 20759 cred_t *cr, pid_t pid) 20760 { 20761 tcp_t *tcp = connp->conn_tcp; 20762 sin_t *sin = (sin_t *)sa; 20763 sin6_t *sin6 = (sin6_t *)sa; 20764 ipaddr_t *dstaddrp; 20765 in_port_t dstport; 20766 uint_t srcid; 20767 int error; 20768 uint32_t mss; 20769 mblk_t *syn_mp; 20770 tcp_stack_t *tcps = tcp->tcp_tcps; 20771 int32_t oldstate; 20772 ip_xmit_attr_t *ixa = connp->conn_ixa; 20773 20774 oldstate = tcp->tcp_state; 20775 20776 switch (len) { 20777 default: 20778 /* 20779 * Should never happen 20780 */ 20781 return (EINVAL); 20782 20783 case sizeof (sin_t): 20784 sin = (sin_t *)sa; 20785 if (sin->sin_port == 0) { 20786 return (-TBADADDR); 20787 } 20788 if (connp->conn_ipv6_v6only) { 20789 return (EAFNOSUPPORT); 20790 } 20791 break; 20792 20793 case sizeof (sin6_t): 20794 sin6 = (sin6_t *)sa; 20795 if (sin6->sin6_port == 0) { 20796 return (-TBADADDR); 20797 } 20798 break; 20799 } 20800 /* 20801 * If we're connecting to an IPv4-mapped IPv6 address, we need to 20802 * make sure that the conn_ipversion is IPV4_VERSION. We 20803 * need to this before we call tcp_bindi() so that the port lookup 20804 * code will look for ports in the correct port space (IPv4 and 20805 * IPv6 have separate port spaces). 20806 */ 20807 if (connp->conn_family == AF_INET6 && 20808 connp->conn_ipversion == IPV6_VERSION && 20809 IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 20810 if (connp->conn_ipv6_v6only) 20811 return (EADDRNOTAVAIL); 20812 20813 connp->conn_ipversion = IPV4_VERSION; 20814 } 20815 20816 switch (tcp->tcp_state) { 20817 case TCPS_LISTEN: 20818 /* 20819 * Listening sockets are not allowed to issue connect(). 20820 */ 20821 if (IPCL_IS_NONSTR(connp)) 20822 return (EOPNOTSUPP); 20823 /* FALLTHRU */ 20824 case TCPS_IDLE: 20825 /* 20826 * We support quick connect, refer to comments in 20827 * tcp_connect_*() 20828 */ 20829 /* FALLTHRU */ 20830 case TCPS_BOUND: 20831 break; 20832 default: 20833 return (-TOUTSTATE); 20834 } 20835 20836 /* 20837 * We update our cred/cpid based on the caller of connect 20838 */ 20839 if (connp->conn_cred != cr) { 20840 crhold(cr); 20841 crfree(connp->conn_cred); 20842 connp->conn_cred = cr; 20843 } 20844 connp->conn_cpid = pid; 20845 20846 /* Cache things in the ixa without any refhold */ 20847 ixa->ixa_cred = cr; 20848 ixa->ixa_cpid = pid; 20849 if (is_system_labeled()) { 20850 /* We need to restart with a label based on the cred */ 20851 ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); 20852 } 20853 20854 if (connp->conn_family == AF_INET6) { 20855 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 20856 error = tcp_connect_ipv6(tcp, &sin6->sin6_addr, 20857 sin6->sin6_port, sin6->sin6_flowinfo, 20858 sin6->__sin6_src_id, sin6->sin6_scope_id); 20859 } else { 20860 /* 20861 * Destination adress is mapped IPv6 address. 20862 * Source bound address should be unspecified or 20863 * IPv6 mapped address as well. 
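 *
 * For example (illustrative addresses only): a socket bound to ::
 * or to ::ffff:192.0.2.1 can proceed with a connect to
 * ::ffff:198.51.100.1, whereas one bound to 2001:db8::1 fails here
 * with EADDRNOTAVAIL.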
20864 */ 20865 if (!IN6_IS_ADDR_UNSPECIFIED( 20866 &connp->conn_bound_addr_v6) && 20867 !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) { 20868 return (EADDRNOTAVAIL); 20869 } 20870 dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); 20871 dstport = sin6->sin6_port; 20872 srcid = sin6->__sin6_src_id; 20873 error = tcp_connect_ipv4(tcp, dstaddrp, dstport, 20874 srcid); 20875 } 20876 } else { 20877 dstaddrp = &sin->sin_addr.s_addr; 20878 dstport = sin->sin_port; 20879 srcid = 0; 20880 error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid); 20881 } 20882 20883 if (error != 0) 20884 goto connect_failed; 20885 20886 CL_INET_CONNECT(connp, B_TRUE, error); 20887 if (error != 0) 20888 goto connect_failed; 20889 20890 /* connect succeeded */ 20891 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); 20892 tcp->tcp_active_open = 1; 20893 20894 /* 20895 * tcp_set_destination() does not adjust for TCP/IP header length. 20896 */ 20897 mss = tcp->tcp_mss - connp->conn_ht_iphc_len; 20898 20899 /* 20900 * Just make sure our rwnd is at least rcvbuf * MSS large, and round up 20901 * to the nearest MSS. 20902 * 20903 * We do the round up here because we need to get the interface MTU 20904 * first before we can do the round up. 20905 */ 20906 tcp->tcp_rwnd = connp->conn_rcvbuf; 20907 tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 20908 tcps->tcps_recv_hiwat_minmss * mss); 20909 connp->conn_rcvbuf = tcp->tcp_rwnd; 20910 tcp_set_ws_value(tcp); 20911 tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); 20912 if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) 20913 tcp->tcp_snd_ws_ok = B_TRUE; 20914 20915 /* 20916 * Set tcp_snd_ts_ok to true 20917 * so that tcp_xmit_mp will 20918 * include the timestamp 20919 * option in the SYN segment. 20920 */ 20921 if (tcps->tcps_tstamp_always || 20922 (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { 20923 tcp->tcp_snd_ts_ok = B_TRUE; 20924 } 20925 20926 /* 20927 * tcp_snd_sack_ok can be set in 20928 * tcp_set_destination() if the sack metric 20929 * is set. So check it here also. 20930 */ 20931 if (tcps->tcps_sack_permitted == 2 || 20932 tcp->tcp_snd_sack_ok) { 20933 if (tcp->tcp_sack_info == NULL) { 20934 tcp->tcp_sack_info = kmem_cache_alloc( 20935 tcp_sack_info_cache, KM_SLEEP); 20936 } 20937 tcp->tcp_snd_sack_ok = B_TRUE; 20938 } 20939 20940 /* 20941 * Should we use ECN? Note that the current 20942 * default value (SunOS 5.9) of tcp_ecn_permitted 20943 * is 1. The reason for doing this is that there 20944 * are equipments out there that will drop ECN 20945 * enabled IP packets. Setting it to 1 avoids 20946 * compatibility problems. 20947 */ 20948 if (tcps->tcps_ecn_permitted == 2) 20949 tcp->tcp_ecn_ok = B_TRUE; 20950 20951 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 20952 syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 20953 tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 20954 if (syn_mp != NULL) { 20955 /* 20956 * We must bump the generation before sending the syn 20957 * to ensure that we use the right generation in case 20958 * this thread issues a "connected" up call. 
20959 */ 20960 SOCK_CONNID_BUMP(tcp->tcp_connid); 20961 tcp_send_data(tcp, syn_mp); 20962 } 20963 20964 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 20965 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 20966 return (0); 20967 20968 connect_failed: 20969 connp->conn_faddr_v6 = ipv6_all_zeros; 20970 connp->conn_fport = 0; 20971 tcp->tcp_state = oldstate; 20972 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 20973 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 20974 return (error); 20975 } 20976 20977 int 20978 tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, 20979 socklen_t len, sock_connid_t *id, cred_t *cr) 20980 { 20981 conn_t *connp = (conn_t *)proto_handle; 20982 squeue_t *sqp = connp->conn_sqp; 20983 int error; 20984 20985 ASSERT(connp->conn_upper_handle != NULL); 20986 20987 /* All Solaris components should pass a cred for this operation. */ 20988 ASSERT(cr != NULL); 20989 20990 error = proto_verify_ip_addr(connp->conn_family, sa, len); 20991 if (error != 0) { 20992 return (error); 20993 } 20994 20995 error = squeue_synch_enter(sqp, connp, NULL); 20996 if (error != 0) { 20997 /* failed to enter */ 20998 return (ENOSR); 20999 } 21000 21001 /* 21002 * TCP supports quick connect, so no need to do an implicit bind 21003 */ 21004 error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); 21005 if (error == 0) { 21006 *id = connp->conn_tcp->tcp_connid; 21007 } else if (error < 0) { 21008 if (error == -TOUTSTATE) { 21009 switch (connp->conn_tcp->tcp_state) { 21010 case TCPS_SYN_SENT: 21011 error = EALREADY; 21012 break; 21013 case TCPS_ESTABLISHED: 21014 error = EISCONN; 21015 break; 21016 case TCPS_LISTEN: 21017 error = EOPNOTSUPP; 21018 break; 21019 default: 21020 error = EINVAL; 21021 break; 21022 } 21023 } else { 21024 error = proto_tlitosyserr(-error); 21025 } 21026 } 21027 21028 if (connp->conn_tcp->tcp_loopback) { 21029 struct sock_proto_props sopp; 21030 21031 sopp.sopp_flags = SOCKOPT_LOOPBACK; 21032 sopp.sopp_loopback = B_TRUE; 21033 21034 (*connp->conn_upcalls->su_set_proto_props)( 21035 connp->conn_upper_handle, &sopp); 21036 } 21037 done: 21038 squeue_synch_exit(sqp, connp); 21039 21040 return ((error == 0) ? EINPROGRESS : error); 21041 } 21042 21043 /* ARGSUSED */ 21044 sock_lower_handle_t 21045 tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, 21046 uint_t *smodep, int *errorp, int flags, cred_t *credp) 21047 { 21048 conn_t *connp; 21049 boolean_t isv6 = family == AF_INET6; 21050 if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || 21051 (proto != 0 && proto != IPPROTO_TCP)) { 21052 *errorp = EPROTONOSUPPORT; 21053 return (NULL); 21054 } 21055 21056 connp = tcp_create_common(credp, isv6, B_TRUE, errorp); 21057 if (connp == NULL) { 21058 return (NULL); 21059 } 21060 21061 /* 21062 * Put the ref for TCP. Ref for IP was already put 21063 * by ipcl_conn_create. 
Also Make the conn_t globally 21064 * visible to walkers 21065 */ 21066 mutex_enter(&connp->conn_lock); 21067 CONN_INC_REF_LOCKED(connp); 21068 ASSERT(connp->conn_ref == 2); 21069 connp->conn_state_flags &= ~CONN_INCIPIENT; 21070 21071 connp->conn_flags |= IPCL_NONSTR; 21072 mutex_exit(&connp->conn_lock); 21073 21074 ASSERT(errorp != NULL); 21075 *errorp = 0; 21076 *sock_downcalls = &sock_tcp_downcalls; 21077 *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP | 21078 SM_SENDFILESUPP; 21079 21080 return ((sock_lower_handle_t)connp); 21081 } 21082 21083 /* ARGSUSED */ 21084 void 21085 tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, 21086 sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) 21087 { 21088 conn_t *connp = (conn_t *)proto_handle; 21089 struct sock_proto_props sopp; 21090 21091 ASSERT(connp->conn_upper_handle == NULL); 21092 21093 /* All Solaris components should pass a cred for this operation. */ 21094 ASSERT(cr != NULL); 21095 21096 sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | 21097 SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | 21098 SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; 21099 21100 sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; 21101 sopp.sopp_rxlowat = SOCKET_RECVLOWATER; 21102 sopp.sopp_maxpsz = INFPSZ; 21103 sopp.sopp_maxblk = INFPSZ; 21104 sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; 21105 sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; 21106 sopp.sopp_maxaddrlen = sizeof (sin6_t); 21107 sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 0 : 21108 tcp_rinfo.mi_minpsz; 21109 21110 connp->conn_upcalls = sock_upcalls; 21111 connp->conn_upper_handle = sock_handle; 21112 21113 ASSERT(connp->conn_rcvbuf != 0 && 21114 connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); 21115 (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); 21116 } 21117 21118 /* ARGSUSED */ 21119 int 21120 tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) 21121 { 21122 conn_t *connp = (conn_t *)proto_handle; 21123 21124 ASSERT(connp->conn_upper_handle != NULL); 21125 21126 /* All Solaris components should pass a cred for this operation. */ 21127 ASSERT(cr != NULL); 21128 21129 tcp_close_common(connp, flags); 21130 21131 ip_free_helper_stream(connp); 21132 21133 /* 21134 * Drop IP's reference on the conn. This is the last reference 21135 * on the connp if the state was less than established. If the 21136 * connection has gone into timewait state, then we will have 21137 * one ref for the TCP and one more ref (total of two) for the 21138 * classifier connected hash list (a timewait connections stays 21139 * in connected hash till closed). 21140 * 21141 * We can't assert the references because there might be other 21142 * transient reference places because of some walkers or queued 21143 * packets in squeue for the timewait state. 21144 */ 21145 CONN_DEC_REF(connp); 21146 return (0); 21147 } 21148 21149 /* ARGSUSED */ 21150 int 21151 tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, 21152 cred_t *cr) 21153 { 21154 tcp_t *tcp; 21155 uint32_t msize; 21156 conn_t *connp = (conn_t *)proto_handle; 21157 int32_t tcpstate; 21158 21159 /* All Solaris components should pass a cred for this operation. 
*/ 21160 ASSERT(cr != NULL); 21161 21162 ASSERT(connp->conn_ref >= 2); 21163 ASSERT(connp->conn_upper_handle != NULL); 21164 21165 if (msg->msg_controllen != 0) { 21166 freemsg(mp); 21167 return (EOPNOTSUPP); 21168 } 21169 21170 switch (DB_TYPE(mp)) { 21171 case M_DATA: 21172 tcp = connp->conn_tcp; 21173 ASSERT(tcp != NULL); 21174 21175 tcpstate = tcp->tcp_state; 21176 if (tcpstate < TCPS_ESTABLISHED) { 21177 freemsg(mp); 21178 /* 21179 * We return ENOTCONN if the endpoint is trying to 21180 * connect or has never been connected, and EPIPE if it 21181 * has been disconnected. The connection id helps us 21182 * distinguish between the last two cases. 21183 */ 21184 return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN : 21185 ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN)); 21186 } else if (tcpstate > TCPS_CLOSE_WAIT) { 21187 freemsg(mp); 21188 return (EPIPE); 21189 } 21190 21191 msize = msgdsize(mp); 21192 21193 mutex_enter(&tcp->tcp_non_sq_lock); 21194 tcp->tcp_squeue_bytes += msize; 21195 /* 21196 * Squeue Flow Control 21197 */ 21198 if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { 21199 tcp_setqfull(tcp); 21200 } 21201 mutex_exit(&tcp->tcp_non_sq_lock); 21202 21203 /* 21204 * The application may pass in an address in the msghdr, but 21205 * we ignore the address on connection-oriented sockets. 21206 * Just like BSD this code does not generate an error for 21207 * TCP (a CONNREQUIRED socket) when sending to an address 21208 * passed in with sendto/sendmsg. Instead the data is 21209 * delivered on the connection as if no address had been 21210 * supplied. 21211 */ 21212 CONN_INC_REF(connp); 21213 21214 if (msg->msg_flags & MSG_OOB) { 21215 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, 21216 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 21217 } else { 21218 SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, 21219 connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); 21220 } 21221 21222 return (0); 21223 21224 default: 21225 ASSERT(0); 21226 } 21227 21228 freemsg(mp); 21229 return (0); 21230 } 21231 21232 /* ARGSUSED2 */ 21233 void 21234 tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 21235 { 21236 int len; 21237 uint32_t msize; 21238 conn_t *connp = (conn_t *)arg; 21239 tcp_t *tcp = connp->conn_tcp; 21240 21241 msize = msgdsize(mp); 21242 21243 len = msize - 1; 21244 if (len < 0) { 21245 freemsg(mp); 21246 return; 21247 } 21248 21249 /* 21250 * Try to force urgent data out on the wire. Even if we have unsent 21251 * data this will at least send the urgent flag. 21252 * XXX does not handle more flag correctly. 21253 */ 21254 len += tcp->tcp_unsent; 21255 len += tcp->tcp_snxt; 21256 tcp->tcp_urg = len; 21257 tcp->tcp_valid_bits |= TCP_URG_VALID; 21258 21259 /* Bypass tcp protocol for fused tcp loopback */ 21260 if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 21261 return; 21262 21263 /* Strip off the T_EXDATA_REQ if the data is from TPI */ 21264 if (DB_TYPE(mp) != M_DATA) { 21265 mblk_t *mp1 = mp; 21266 ASSERT(!IPCL_IS_NONSTR(connp)); 21267 mp = mp->b_cont; 21268 freeb(mp1); 21269 } 21270 tcp_wput_data(tcp, mp, B_TRUE); 21271 } 21272 21273 /* ARGSUSED3 */ 21274 int 21275 tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, 21276 socklen_t *addrlenp, cred_t *cr) 21277 { 21278 conn_t *connp = (conn_t *)proto_handle; 21279 tcp_t *tcp = connp->conn_tcp; 21280 21281 ASSERT(connp->conn_upper_handle != NULL); 21282 /* All Solaris components should pass a cred for this operation. 
*/ 21283 ASSERT(cr != NULL); 21284 21285 ASSERT(tcp != NULL); 21286 if (tcp->tcp_state < TCPS_SYN_RCVD) 21287 return (ENOTCONN); 21288 21289 return (conn_getpeername(connp, addr, addrlenp)); 21290 } 21291 21292 /* ARGSUSED3 */ 21293 int 21294 tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, 21295 socklen_t *addrlenp, cred_t *cr) 21296 { 21297 conn_t *connp = (conn_t *)proto_handle; 21298 21299 /* All Solaris components should pass a cred for this operation. */ 21300 ASSERT(cr != NULL); 21301 21302 ASSERT(connp->conn_upper_handle != NULL); 21303 return (conn_getsockname(connp, addr, addrlenp)); 21304 } 21305 21306 /* 21307 * tcp_fallback 21308 * 21309 * A direct socket is falling back to using STREAMS. The queue 21310 * that is being passed down was created using tcp_open() with 21311 * the SO_FALLBACK flag set. As a result, the queue is not 21312 * associated with a conn, and the q_ptrs instead contain the 21313 * dev and minor area that should be used. 21314 * 21315 * The 'issocket' flag indicates whether the FireEngine 21316 * optimizations should be used. The common case would be that 21317 * optimizations are enabled, and they might be subsequently 21318 * disabled using the _SIOCSOCKFALLBACK ioctl. 21319 */ 21320 21321 /* 21322 * An active connection is falling back to TPI. Gather all the information 21323 * required by the STREAM head and TPI sonode and send it up. 21324 */ 21325 void 21326 tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, 21327 boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb) 21328 { 21329 conn_t *connp = tcp->tcp_connp; 21330 struct stroptions *stropt; 21331 struct T_capability_ack tca; 21332 struct sockaddr_in6 laddr, faddr; 21333 socklen_t laddrlen, faddrlen; 21334 short opts; 21335 int error; 21336 mblk_t *mp; 21337 21338 connp->conn_dev = (dev_t)RD(q)->q_ptr; 21339 connp->conn_minor_arena = WR(q)->q_ptr; 21340 21341 RD(q)->q_ptr = WR(q)->q_ptr = connp; 21342 21343 connp->conn_rq = RD(q); 21344 connp->conn_wq = WR(q); 21345 21346 WR(q)->q_qinfo = &tcp_sock_winit; 21347 21348 if (!issocket) 21349 tcp_use_pure_tpi(tcp); 21350 21351 /* 21352 * free the helper stream 21353 */ 21354 ip_free_helper_stream(connp); 21355 21356 /* 21357 * Notify the STREAM head about options 21358 */ 21359 DB_TYPE(stropt_mp) = M_SETOPTS; 21360 stropt = (struct stroptions *)stropt_mp->b_rptr; 21361 stropt_mp->b_wptr += sizeof (struct stroptions); 21362 stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; 21363 21364 stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 
0 : 21365 tcp->tcp_tcps->tcps_wroff_xtra); 21366 if (tcp->tcp_snd_sack_ok) 21367 stropt->so_wroff += TCPOPT_MAX_SACK_LEN; 21368 stropt->so_hiwat = connp->conn_rcvbuf; 21369 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 21370 21371 putnext(RD(q), stropt_mp); 21372 21373 /* 21374 * Collect the information needed to sync with the sonode 21375 */ 21376 tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); 21377 21378 laddrlen = faddrlen = sizeof (sin6_t); 21379 (void) tcp_getsockname((sock_lower_handle_t)connp, 21380 (struct sockaddr *)&laddr, &laddrlen, CRED()); 21381 error = tcp_getpeername((sock_lower_handle_t)connp, 21382 (struct sockaddr *)&faddr, &faddrlen, CRED()); 21383 if (error != 0) 21384 faddrlen = 0; 21385 21386 opts = 0; 21387 if (connp->conn_oobinline) 21388 opts |= SO_OOBINLINE; 21389 if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) 21390 opts |= SO_DONTROUTE; 21391 21392 /* 21393 * Notify the socket that the protocol is now quiescent, 21394 * and it's therefore safe move data from the socket 21395 * to the stream head. 21396 */ 21397 (*quiesced_cb)(connp->conn_upper_handle, q, &tca, 21398 (struct sockaddr *)&laddr, laddrlen, 21399 (struct sockaddr *)&faddr, faddrlen, opts); 21400 21401 while ((mp = tcp->tcp_rcv_list) != NULL) { 21402 tcp->tcp_rcv_list = mp->b_next; 21403 mp->b_next = NULL; 21404 /* We never do fallback for kernel RPC */ 21405 putnext(q, mp); 21406 } 21407 tcp->tcp_rcv_last_head = NULL; 21408 tcp->tcp_rcv_last_tail = NULL; 21409 tcp->tcp_rcv_cnt = 0; 21410 } 21411 21412 /* 21413 * An eager is falling back to TPI. All we have to do is send 21414 * up a T_CONN_IND. 21415 */ 21416 void 21417 tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs) 21418 { 21419 tcp_t *listener = eager->tcp_listener; 21420 mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind; 21421 21422 ASSERT(listener != NULL); 21423 ASSERT(mp != NULL); 21424 21425 eager->tcp_conn.tcp_eager_conn_ind = NULL; 21426 21427 /* 21428 * TLI/XTI applications will get confused by 21429 * sending eager as an option since it violates 21430 * the option semantics. So remove the eager as 21431 * option since TLI/XTI app doesn't need it anyway. 21432 */ 21433 if (!direct_sockfs) { 21434 struct T_conn_ind *conn_ind; 21435 21436 conn_ind = (struct T_conn_ind *)mp->b_rptr; 21437 conn_ind->OPT_length = 0; 21438 conn_ind->OPT_offset = 0; 21439 } 21440 21441 /* 21442 * Sockfs guarantees that the listener will not be closed 21443 * during fallback. So we can safely use the listener's queue. 21444 */ 21445 putnext(listener->tcp_connp->conn_rq, mp); 21446 } 21447 21448 int 21449 tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, 21450 boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) 21451 { 21452 tcp_t *tcp; 21453 conn_t *connp = (conn_t *)proto_handle; 21454 int error; 21455 mblk_t *stropt_mp; 21456 mblk_t *ordrel_mp; 21457 21458 tcp = connp->conn_tcp; 21459 21460 stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, 21461 NULL); 21462 21463 /* Pre-allocate the T_ordrel_ind mblk. 
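It is allocated up front with allocb_wait() so that the T_ordrel_ind can later be sent upstream without requiring an allocation at that point.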
*/ 21464 ASSERT(tcp->tcp_ordrel_mp == NULL); 21465 ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, 21466 STR_NOSIG, NULL); 21467 ordrel_mp->b_datap->db_type = M_PROTO; 21468 ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; 21469 ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); 21470 21471 /* 21472 * Enter the squeue so that no new packets can come in 21473 */ 21474 error = squeue_synch_enter(connp->conn_sqp, connp, NULL); 21475 if (error != 0) { 21476 /* failed to enter, free all the pre-allocated messages. */ 21477 freeb(stropt_mp); 21478 freeb(ordrel_mp); 21479 /* 21480 * We cannot process the eager, so at least send out a 21481 * RST so the peer can reconnect. 21482 */ 21483 if (tcp->tcp_listener != NULL) { 21484 (void) tcp_eager_blowoff(tcp->tcp_listener, 21485 tcp->tcp_conn_req_seqnum); 21486 } 21487 return (ENOMEM); 21488 } 21489 21490 /* 21491 * Both endpoints must be of the same type (either STREAMS or 21492 * non-STREAMS) for fusion to be enabled. So if we are fused, 21493 * we have to unfuse. 21494 */ 21495 if (tcp->tcp_fused) 21496 tcp_unfuse(tcp); 21497 21498 /* 21499 * No longer a direct socket 21500 */ 21501 connp->conn_flags &= ~IPCL_NONSTR; 21502 tcp->tcp_ordrel_mp = ordrel_mp; 21503 21504 if (tcp->tcp_listener != NULL) { 21505 /* The eager will deal with opts when accept() is called */ 21506 freeb(stropt_mp); 21507 tcp_fallback_eager(tcp, direct_sockfs); 21508 } else { 21509 tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, 21510 quiesced_cb); 21511 } 21512 21513 /* 21514 * There should be atleast two ref's (IP + TCP) 21515 */ 21516 ASSERT(connp->conn_ref >= 2); 21517 squeue_synch_exit(connp->conn_sqp, connp); 21518 21519 return (0); 21520 } 21521 21522 /* ARGSUSED */ 21523 static void 21524 tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) 21525 { 21526 conn_t *connp = (conn_t *)arg; 21527 tcp_t *tcp = connp->conn_tcp; 21528 21529 freemsg(mp); 21530 21531 if (tcp->tcp_fused) 21532 tcp_unfuse(tcp); 21533 21534 if (tcp_xmit_end(tcp) != 0) { 21535 /* 21536 * We were crossing FINs and got a reset from 21537 * the other side. Just ignore it. 21538 */ 21539 if (connp->conn_debug) { 21540 (void) strlog(TCP_MOD_ID, 0, 1, 21541 SL_ERROR|SL_TRACE, 21542 "tcp_shutdown_output() out of state %s", 21543 tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); 21544 } 21545 } 21546 } 21547 21548 /* ARGSUSED */ 21549 int 21550 tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) 21551 { 21552 conn_t *connp = (conn_t *)proto_handle; 21553 tcp_t *tcp = connp->conn_tcp; 21554 21555 ASSERT(connp->conn_upper_handle != NULL); 21556 21557 /* All Solaris components should pass a cred for this operation. */ 21558 ASSERT(cr != NULL); 21559 21560 /* 21561 * X/Open requires that we check the connected state. 
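 *
 * Beyond that check, the how argument is handled as follows:
 *
 *	SHUT_WR or SHUT_RDWR	queue tcp_shutdown_output() on the squeue
 *				and notify sockfs via SOCK_OPCTL_SHUT_SEND
 *	SHUT_RD or SHUT_RDWR	notify sockfs via SOCK_OPCTL_SHUT_RECV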
21562 */ 21563 if (tcp->tcp_state < TCPS_SYN_SENT) 21564 return (ENOTCONN); 21565 21566 /* shutdown the send side */ 21567 if (how != SHUT_RD) { 21568 mblk_t *bp; 21569 21570 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 21571 CONN_INC_REF(connp); 21572 SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, 21573 connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); 21574 21575 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 21576 SOCK_OPCTL_SHUT_SEND, 0); 21577 } 21578 21579 /* shutdown the recv side */ 21580 if (how != SHUT_WR) 21581 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 21582 SOCK_OPCTL_SHUT_RECV, 0); 21583 21584 return (0); 21585 } 21586 21587 /* 21588 * SOP_LISTEN() calls into tcp_listen(). 21589 */ 21590 /* ARGSUSED */ 21591 int 21592 tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) 21593 { 21594 conn_t *connp = (conn_t *)proto_handle; 21595 int error; 21596 squeue_t *sqp = connp->conn_sqp; 21597 21598 ASSERT(connp->conn_upper_handle != NULL); 21599 21600 /* All Solaris components should pass a cred for this operation. */ 21601 ASSERT(cr != NULL); 21602 21603 error = squeue_synch_enter(sqp, connp, NULL); 21604 if (error != 0) { 21605 /* failed to enter */ 21606 return (ENOBUFS); 21607 } 21608 21609 error = tcp_do_listen(connp, NULL, 0, backlog, cr, FALSE); 21610 if (error == 0) { 21611 (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, 21612 SOCK_OPCTL_ENAB_ACCEPT, (uintptr_t)backlog); 21613 } else if (error < 0) { 21614 if (error == -TOUTSTATE) 21615 error = EINVAL; 21616 else 21617 error = proto_tlitosyserr(-error); 21618 } 21619 squeue_synch_exit(sqp, connp); 21620 return (error); 21621 } 21622 21623 static int 21624 tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len, 21625 int backlog, cred_t *cr, boolean_t bind_to_req_port_only) 21626 { 21627 tcp_t *tcp = connp->conn_tcp; 21628 int error = 0; 21629 tcp_stack_t *tcps = tcp->tcp_tcps; 21630 21631 /* All Solaris components should pass a cred for this operation. */ 21632 ASSERT(cr != NULL); 21633 21634 if (tcp->tcp_state >= TCPS_BOUND) { 21635 if ((tcp->tcp_state == TCPS_BOUND || 21636 tcp->tcp_state == TCPS_LISTEN) && backlog > 0) { 21637 /* 21638 * Handle listen() increasing backlog. 21639 * This is more "liberal" then what the TPI spec 21640 * requires but is needed to avoid a t_unbind 21641 * when handling listen() since the port number 21642 * might be "stolen" between the unbind and bind. 21643 */ 21644 goto do_listen; 21645 } 21646 if (connp->conn_debug) { 21647 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 21648 "tcp_listen: bad state, %d", tcp->tcp_state); 21649 } 21650 return (-TOUTSTATE); 21651 } else { 21652 if (sa == NULL) { 21653 sin6_t addr; 21654 sin_t *sin; 21655 sin6_t *sin6; 21656 21657 ASSERT(IPCL_IS_NONSTR(connp)); 21658 /* Do an implicit bind: Request for a generic port. 
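A zeroed sin_t/sin6_t with port 0 is handed to tcp_bind_check() below, so tcp_bind_select_lport() ends up choosing an anonymous port, just as an explicit bind to port 0 would.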
*/ 21659 if (connp->conn_family == AF_INET) { 21660 len = sizeof (sin_t); 21661 sin = (sin_t *)&addr; 21662 *sin = sin_null; 21663 sin->sin_family = AF_INET; 21664 } else { 21665 ASSERT(connp->conn_family == AF_INET6); 21666 len = sizeof (sin6_t); 21667 sin6 = (sin6_t *)&addr; 21668 *sin6 = sin6_null; 21669 sin6->sin6_family = AF_INET6; 21670 } 21671 sa = (struct sockaddr *)&addr; 21672 } 21673 21674 error = tcp_bind_check(connp, sa, len, cr, 21675 bind_to_req_port_only); 21676 if (error) 21677 return (error); 21678 /* Fall through and do the fanout insertion */ 21679 } 21680 21681 do_listen: 21682 ASSERT(tcp->tcp_state == TCPS_BOUND || tcp->tcp_state == TCPS_LISTEN); 21683 tcp->tcp_conn_req_max = backlog; 21684 if (tcp->tcp_conn_req_max) { 21685 if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min) 21686 tcp->tcp_conn_req_max = tcps->tcps_conn_req_min; 21687 if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q) 21688 tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q; 21689 /* 21690 * If this is a listener, do not reset the eager list 21691 * and other stuffs. Note that we don't check if the 21692 * existing eager list meets the new tcp_conn_req_max 21693 * requirement. 21694 */ 21695 if (tcp->tcp_state != TCPS_LISTEN) { 21696 tcp->tcp_state = TCPS_LISTEN; 21697 /* Initialize the chain. Don't need the eager_lock */ 21698 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 21699 tcp->tcp_eager_next_drop_q0 = tcp; 21700 tcp->tcp_eager_prev_drop_q0 = tcp; 21701 tcp->tcp_second_ctimer_threshold = 21702 tcps->tcps_ip_abort_linterval; 21703 } 21704 } 21705 21706 /* 21707 * We need to make sure that the conn_recv is set to a non-null 21708 * value before we insert the conn into the classifier table. 21709 * This is to avoid a race with an incoming packet which does an 21710 * ipcl_classify(). 21711 * We initially set it to tcp_input_listener_unbound to try to 21712 * pick a good squeue for the listener when the first SYN arrives. 21713 * tcp_input_listener_unbound sets it to tcp_input_listener on that 21714 * first SYN. 21715 */ 21716 connp->conn_recv = tcp_input_listener_unbound; 21717 21718 /* Insert the listener in the classifier table */ 21719 error = ip_laddr_fanout_insert(connp); 21720 if (error != 0) { 21721 /* Undo the bind - release the port number */ 21722 tcp->tcp_state = TCPS_IDLE; 21723 connp->conn_bound_addr_v6 = ipv6_all_zeros; 21724 21725 connp->conn_laddr_v6 = ipv6_all_zeros; 21726 connp->conn_saddr_v6 = ipv6_all_zeros; 21727 connp->conn_ports = 0; 21728 21729 if (connp->conn_anon_port) { 21730 zone_t *zone; 21731 21732 zone = crgetzone(cr); 21733 connp->conn_anon_port = B_FALSE; 21734 (void) tsol_mlp_anon(zone, connp->conn_mlp_type, 21735 connp->conn_proto, connp->conn_lport, B_FALSE); 21736 } 21737 connp->conn_mlp_type = mlptSingle; 21738 21739 tcp_bind_hash_remove(tcp); 21740 return (error); 21741 } else { 21742 /* 21743 * If there is a connection limit, allocate and initialize 21744 * the counter struct. Note that since listen can be called 21745 * multiple times, the struct may have been allready allocated. 21746 */ 21747 if (!list_is_empty(&tcps->tcps_listener_conf) && 21748 tcp->tcp_listen_cnt == NULL) { 21749 tcp_listen_cnt_t *tlc; 21750 uint32_t ratio; 21751 21752 ratio = tcp_find_listener_conf(tcps, 21753 ntohs(connp->conn_lport)); 21754 if (ratio != 0) { 21755 uint32_t mem_ratio, tot_buf; 21756 21757 tlc = kmem_alloc(sizeof (tcp_listen_cnt_t), 21758 KM_SLEEP); 21759 /* 21760 * Calculate the connection limit based on 21761 * the configured ratio and maxusers. 

void
tcp_clr_flowctrl(sock_lower_handle_t proto_handle)
{
    conn_t *connp = (conn_t *)proto_handle;
    tcp_t *tcp = connp->conn_tcp;
    mblk_t *mp;
    int error;

    ASSERT(connp->conn_upper_handle != NULL);

    /*
     * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl()
     * is currently running.
     */
    mutex_enter(&tcp->tcp_rsrv_mp_lock);
    if ((mp = tcp->tcp_rsrv_mp) == NULL) {
        mutex_exit(&tcp->tcp_rsrv_mp_lock);
        return;
    }
    tcp->tcp_rsrv_mp = NULL;
    mutex_exit(&tcp->tcp_rsrv_mp_lock);

    error = squeue_synch_enter(connp->conn_sqp, connp, mp);
    ASSERT(error == 0);

    mutex_enter(&tcp->tcp_rsrv_mp_lock);
    tcp->tcp_rsrv_mp = mp;
    mutex_exit(&tcp->tcp_rsrv_mp_lock);

    if (tcp->tcp_fused) {
        tcp_fuse_backenable(tcp);
    } else {
        tcp->tcp_rwnd = connp->conn_rcvbuf;
        /*
         * Send back a window update immediately if TCP is in
         * ESTABLISHED state or above and the increase of the rcv
         * window that the other side knows is at least 1 MSS after
         * flow control is lifted.
         */
        if (tcp->tcp_state >= TCPS_ESTABLISHED &&
            tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
            tcp_xmit_ctl(NULL, tcp,
                (tcp->tcp_swnd == 0) ? tcp->tcp_suna :
                tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
        }
    }

    squeue_synch_exit(connp->conn_sqp, connp);
}
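
/*
 * Illustrative note: with an MSS of, say, 1460 bytes, lifting flow control
 * above only produces an immediate window-update ACK when the window the
 * peer would now be offered has grown by at least 1460 bytes over what it
 * last saw; smaller increases are simply advertised on the next regular ACK.
 * The MSS figure here is an example, not a value taken from this file.
 */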
21863 */ 21864 error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); 21865 if (error != 0) { 21866 ip0dbg(("tcp_ioctl: create of IP helper stream " 21867 "failed %d\n", error)); 21868 return (error); 21869 } 21870 } 21871 21872 switch (cmd) { 21873 case ND_SET: 21874 case ND_GET: 21875 case _SIOCSOCKFALLBACK: 21876 case TCP_IOC_ABORT_CONN: 21877 case TI_GETPEERNAME: 21878 case TI_GETMYNAME: 21879 ip1dbg(("tcp_ioctl: cmd 0x%x on non sreams socket", 21880 cmd)); 21881 error = EINVAL; 21882 break; 21883 default: 21884 /* 21885 * Pass on to IP using helper stream 21886 */ 21887 error = ldi_ioctl(connp->conn_helper_info->iphs_handle, 21888 cmd, arg, mode, cr, rvalp); 21889 break; 21890 } 21891 return (error); 21892 } 21893 21894 sock_downcalls_t sock_tcp_downcalls = { 21895 tcp_activate, 21896 tcp_accept, 21897 tcp_bind, 21898 tcp_listen, 21899 tcp_connect, 21900 tcp_getpeername, 21901 tcp_getsockname, 21902 tcp_getsockopt, 21903 tcp_setsockopt, 21904 tcp_sendmsg, 21905 NULL, 21906 NULL, 21907 NULL, 21908 tcp_shutdown, 21909 tcp_clr_flowctrl, 21910 tcp_ioctl, 21911 tcp_close, 21912 }; 21913 21914 /* 21915 * Timeout function to reset the TCP stack variable tcps_reclaim to false. 21916 */ 21917 static void 21918 tcp_reclaim_timer(void *arg) 21919 { 21920 tcp_stack_t *tcps = (tcp_stack_t *)arg; 21921 21922 mutex_enter(&tcps->tcps_reclaim_lock); 21923 tcps->tcps_reclaim = B_FALSE; 21924 tcps->tcps_reclaim_tid = 0; 21925 mutex_exit(&tcps->tcps_reclaim_lock); 21926 /* Only need to print this once. */ 21927 if (tcps->tcps_netstack->netstack_stackid == GLOBAL_ZONEID) 21928 cmn_err(CE_WARN, "TCP defensive mode off\n"); 21929 } 21930 21931 /* 21932 * Kmem reclaim call back function. When the system is under memory 21933 * pressure, we set the TCP stack variable tcps_reclaim to true. This 21934 * variable is reset to false after tcps_reclaim_period msecs. During this 21935 * period, TCP will be more aggressive in aborting connections not making 21936 * progress, meaning retransmitting for some time (tcp_early_abort seconds). 21937 * TCP will also not accept new connection request for those listeners whose 21938 * q or q0 is not empty. 21939 */ 21940 /* ARGSUSED */ 21941 void 21942 tcp_conn_reclaim(void *arg) 21943 { 21944 netstack_handle_t nh; 21945 netstack_t *ns; 21946 tcp_stack_t *tcps; 21947 boolean_t new = B_FALSE; 21948 21949 netstack_next_init(&nh); 21950 while ((ns = netstack_next(&nh)) != NULL) { 21951 tcps = ns->netstack_tcp; 21952 mutex_enter(&tcps->tcps_reclaim_lock); 21953 if (!tcps->tcps_reclaim) { 21954 tcps->tcps_reclaim = B_TRUE; 21955 tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer, 21956 tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period)); 21957 new = B_TRUE; 21958 } 21959 mutex_exit(&tcps->tcps_reclaim_lock); 21960 netstack_rele(ns); 21961 } 21962 netstack_next_fini(&nh); 21963 if (new) 21964 cmn_err(CE_WARN, "Memory pressure: TCP defensive mode on\n"); 21965 } 21966 21967 /* 21968 * Given a tcp_stack_t and a port (in host byte order), find a listener 21969 * configuration for that port and return the ratio. 
21970 */ 21971 static uint32_t 21972 tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port) 21973 { 21974 tcp_listener_t *tl; 21975 uint32_t ratio = 0; 21976 21977 mutex_enter(&tcps->tcps_listener_conf_lock); 21978 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 21979 tl = list_next(&tcps->tcps_listener_conf, tl)) { 21980 if (tl->tl_port == port) { 21981 ratio = tl->tl_ratio; 21982 break; 21983 } 21984 } 21985 mutex_exit(&tcps->tcps_listener_conf_lock); 21986 return (ratio); 21987 } 21988 21989 /* 21990 * Ndd param helper routine to return the current list of listener limit 21991 * configuration. 21992 */ 21993 /* ARGSUSED */ 21994 static int 21995 tcp_listener_conf_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 21996 { 21997 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 21998 tcp_listener_t *tl; 21999 22000 mutex_enter(&tcps->tcps_listener_conf_lock); 22001 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 22002 tl = list_next(&tcps->tcps_listener_conf, tl)) { 22003 (void) mi_mpprintf(mp, "%d:%d ", tl->tl_port, tl->tl_ratio); 22004 } 22005 mutex_exit(&tcps->tcps_listener_conf_lock); 22006 return (0); 22007 } 22008 22009 /* 22010 * Ndd param helper routine to add a new listener limit configuration. 22011 */ 22012 /* ARGSUSED */ 22013 static int 22014 tcp_listener_conf_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 22015 cred_t *cr) 22016 { 22017 tcp_listener_t *new_tl; 22018 tcp_listener_t *tl; 22019 long lport; 22020 long ratio; 22021 char *colon; 22022 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 22023 22024 if (ddi_strtol(value, &colon, 10, &lport) != 0 || lport <= 0 || 22025 lport > USHRT_MAX || *colon != ':') { 22026 return (EINVAL); 22027 } 22028 if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0) 22029 return (EINVAL); 22030 22031 mutex_enter(&tcps->tcps_listener_conf_lock); 22032 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 22033 tl = list_next(&tcps->tcps_listener_conf, tl)) { 22034 /* There is an existing entry, so update its ratio value. */ 22035 if (tl->tl_port == lport) { 22036 tl->tl_ratio = ratio; 22037 mutex_exit(&tcps->tcps_listener_conf_lock); 22038 return (0); 22039 } 22040 } 22041 22042 if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) == 22043 NULL) { 22044 mutex_exit(&tcps->tcps_listener_conf_lock); 22045 return (ENOMEM); 22046 } 22047 22048 new_tl->tl_port = lport; 22049 new_tl->tl_ratio = ratio; 22050 list_insert_tail(&tcps->tcps_listener_conf, new_tl); 22051 mutex_exit(&tcps->tcps_listener_conf_lock); 22052 return (0); 22053 } 22054 22055 /* 22056 * Ndd param helper routine to remove a listener limit configuration. 
22057 */ 22058 /* ARGSUSED */ 22059 static int 22060 tcp_listener_conf_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 22061 cred_t *cr) 22062 { 22063 tcp_listener_t *tl; 22064 long lport; 22065 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 22066 22067 if (ddi_strtol(value, NULL, 10, &lport) != 0 || lport <= 0 || 22068 lport > USHRT_MAX) { 22069 return (EINVAL); 22070 } 22071 mutex_enter(&tcps->tcps_listener_conf_lock); 22072 for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; 22073 tl = list_next(&tcps->tcps_listener_conf, tl)) { 22074 if (tl->tl_port == lport) { 22075 list_remove(&tcps->tcps_listener_conf, tl); 22076 mutex_exit(&tcps->tcps_listener_conf_lock); 22077 kmem_free(tl, sizeof (tcp_listener_t)); 22078 return (0); 22079 } 22080 } 22081 mutex_exit(&tcps->tcps_listener_conf_lock); 22082 return (ESRCH); 22083 } 22084 22085 /* 22086 * To remove all listener limit configuration in a tcp_stack_t. 22087 */ 22088 static void 22089 tcp_listener_conf_cleanup(tcp_stack_t *tcps) 22090 { 22091 tcp_listener_t *tl; 22092 22093 mutex_enter(&tcps->tcps_listener_conf_lock); 22094 while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) { 22095 list_remove(&tcps->tcps_listener_conf, tl); 22096 kmem_free(tl, sizeof (tcp_listener_t)); 22097 } 22098 mutex_destroy(&tcps->tcps_listener_conf_lock); 22099 list_destroy(&tcps->tcps_listener_conf); 22100 } 22101