1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. */ 27 28 #pragma ident "%Z%%M% %I% %E% SMI" 29 const char tcp_version[] = "%Z%%M% %I% %E% SMI"; 30 31 32 #include <sys/types.h> 33 #include <sys/stream.h> 34 #include <sys/strsun.h> 35 #include <sys/strsubr.h> 36 #include <sys/stropts.h> 37 #include <sys/strlog.h> 38 #include <sys/strsun.h> 39 #define _SUN_TPI_VERSION 2 40 #include <sys/tihdr.h> 41 #include <sys/timod.h> 42 #include <sys/ddi.h> 43 #include <sys/sunddi.h> 44 #include <sys/suntpi.h> 45 #include <sys/xti_inet.h> 46 #include <sys/cmn_err.h> 47 #include <sys/debug.h> 48 #include <sys/sdt.h> 49 #include <sys/vtrace.h> 50 #include <sys/kmem.h> 51 #include <sys/ethernet.h> 52 #include <sys/cpuvar.h> 53 #include <sys/dlpi.h> 54 #include <sys/multidata.h> 55 #include <sys/multidata_impl.h> 56 #include <sys/pattr.h> 57 #include <sys/policy.h> 58 #include <sys/priv.h> 59 #include <sys/zone.h> 60 #include <sys/sunldi.h> 61 62 #include <sys/errno.h> 63 #include <sys/signal.h> 64 #include <sys/socket.h> 65 #include <sys/sockio.h> 66 #include <sys/isa_defs.h> 67 #include <sys/md5.h> 68 #include <sys/random.h> 69 #include <sys/sodirect.h> 70 #include <sys/uio.h> 71 #include <netinet/in.h> 72 #include <netinet/tcp.h> 73 #include <netinet/ip6.h> 74 #include <netinet/icmp6.h> 75 #include <net/if.h> 76 #include <net/route.h> 77 #include <inet/ipsec_impl.h> 78 79 #include <inet/common.h> 80 #include <inet/ip.h> 81 #include <inet/ip_impl.h> 82 #include <inet/ip6.h> 83 #include <inet/ip_ndp.h> 84 #include <inet/mi.h> 85 #include <inet/mib2.h> 86 #include <inet/nd.h> 87 #include <inet/optcom.h> 88 #include <inet/snmpcom.h> 89 #include <inet/kstatcom.h> 90 #include <inet/tcp.h> 91 #include <inet/tcp_impl.h> 92 #include <net/pfkeyv2.h> 93 #include <inet/ipsec_info.h> 94 #include <inet/ipdrop.h> 95 #include <inet/tcp_trace.h> 96 97 #include <inet/ipclassifier.h> 98 #include <inet/ip_ire.h> 99 #include <inet/ip_ftable.h> 100 #include <inet/ip_if.h> 101 #include <inet/ipp_common.h> 102 #include <inet/ip_netinfo.h> 103 #include <sys/squeue.h> 104 #include <inet/kssl/ksslapi.h> 105 #include <sys/tsol/label.h> 106 #include <sys/tsol/tnet.h> 107 #include <rpc/pmap_prot.h> 108 109 /* 110 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433) 111 * 112 * (Read the detailed design doc in PSARC case directory) 113 * 114 * The entire tcp state is contained in tcp_t and conn_t structure 115 * which are allocated in tandem using ipcl_conn_create() and passing 116 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect 117 * the references on the tcp_t. 
The tcp_t structure is never compressed 118 * and packets always land on the correct TCP perimeter from the time 119 * eager is created till the time tcp_t dies (as such the old mentat 120 * TCP global queue is not used for detached state and no IPSEC checking 121 * is required). The global queue is still allocated to send out resets 122 * for connection which have no listeners and IP directly calls 123 * tcp_xmit_listeners_reset() which does any policy check. 124 * 125 * Protection and Synchronisation mechanism: 126 * 127 * The tcp data structure does not use any kind of lock for protecting 128 * its state but instead uses 'squeues' for mutual exclusion from various 129 * read and write side threads. To access a tcp member, the thread should 130 * always be behind squeue (via squeue_enter, squeue_enter_nodrain, or 131 * squeue_fill). Since the squeues allow a direct function call, caller 132 * can pass any tcp function having prototype of edesc_t as argument 133 * (different from traditional STREAMs model where packets come in only 134 * designated entry points). The list of functions that can be directly 135 * called via squeue are listed before the usual function prototype. 136 * 137 * Referencing: 138 * 139 * TCP is MT-Hot and we use a reference based scheme to make sure that the 140 * tcp structure doesn't disappear when its needed. When the application 141 * creates an outgoing connection or accepts an incoming connection, we 142 * start out with 2 references on 'conn_ref'. One for TCP and one for IP. 143 * The IP reference is just a symbolic reference since ip_tcpclose() 144 * looks at tcp structure after tcp_close_output() returns which could 145 * have dropped the last TCP reference. So as long as the connection is 146 * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the 147 * conn_t. The classifier puts its own reference when the connection is 148 * inserted in listen or connected hash. Anytime a thread needs to enter 149 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr 150 * on write side or by doing a classify on read side and then puts a 151 * reference on the conn before doing squeue_enter/tryenter/fill. For 152 * read side, the classifier itself puts the reference under fanout lock 153 * to make sure that tcp can't disappear before it gets processed. The 154 * squeue will drop this reference automatically so the called function 155 * doesn't have to do a DEC_REF. 156 * 157 * Opening a new connection: 158 * 159 * The outgoing connection open is pretty simple. tcp_open() does the 160 * work in creating the conn/tcp structure and initializing it. The 161 * squeue assignment is done based on the CPU the application 162 * is running on. So for outbound connections, processing is always done 163 * on application CPU which might be different from the incoming CPU 164 * being interrupted by the NIC. An optimal way would be to figure out 165 * the NIC <-> CPU binding at listen time, and assign the outgoing 166 * connection to the squeue attached to the CPU that will be interrupted 167 * for incoming packets (we know the NIC based on the bind IP address). 168 * This might seem like a problem if more data is going out but the 169 * fact is that in most cases the transmit is ACK driven transmit where 170 * the outgoing data normally sits on TCP's xmit queue waiting to be 171 * transmitted. 
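 *
 * As an illustration only (a sketch, not an entry point of this file):
 * a write side caller typically gets onto the perimeter described
 * above by taking a reference on the conn_t and handing the mblk to
 * the squeue together with an edesc_t style function, roughly
 *
 *	conn_t *connp = Q_TO_CONN(q);
 *
 *	CONN_INC_REF(connp);
 *	(*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
 *	    tcp_output, connp, SQTAG_TCP_OUTPUT);
 *
 * The squeue drops that reference once the call-back has run, so
 * tcp_output() itself does not need to do a CONN_DEC_REF.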
172 *
173 * Accepting a connection:
174 *
175 * This is a more interesting case because of various races involved in
176 * establishing an eager in its own perimeter. Read the meta comment on
177 * top of tcp_conn_request(). But briefly, the squeue is picked by
178 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
179 *
180 * Closing a connection:
181 *
182 * The close is fairly straightforward. tcp_close() calls tcp_close_output()
183 * via squeue to do the close and mark the tcp as detached if the connection
184 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
185 * reference but tcp_close() always drops IP's reference. So if tcp was
186 * not killed, it is sitting in the time_wait list with 2 references - 1 for TCP
187 * and 1 because it is in the classifier's connected hash. This is the condition
188 * we use to determine that it's OK to clean up the tcp outside of the squeue
189 * when time wait expires (check the ref under fanout and conn_lock and
190 * if it is 2, remove it from the fanout hash and kill it).
191 *
192 * Although close just drops the necessary references and marks the
193 * tcp_detached state, tcp_close needs to know that tcp_detached has been
194 * set (under squeue) before letting the STREAM go away (because an
195 * inbound packet might attempt to go up the STREAM while the close
196 * has happened and tcp_detached is not set). So a special lock and
197 * flag is used along with a condition variable (tcp_closelock, tcp_closed,
198 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
199 * tcp_detached.
200 *
201 * Special provisions and fast paths:
202 *
203 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
204 * can't have 'ipv6_recvpktinfo' set and for these types of sockets, IP
205 * will never send an M_CTL to TCP. As such, ip_tcp_input() which handles
206 * all TCP packets from the wire makes an IPCL_IS_TCP4_CONNECTED_NO_POLICY
207 * check to send packets directly to tcp_rput_data via squeue. Everyone
208 * else comes through tcp_input() on the read side.
209 *
210 * We also make special provisions for sockfs by marking tcp_issocket
211 * whenever we have only sockfs on top of TCP. This allows us to skip
212 * putting the tcp in the acceptor hash since a sockfs listener can never
213 * become an acceptor and also avoid allocating a tcp_t for the acceptor STREAM
214 * since the eager has already been allocated and the accept now happens
215 * on the acceptor STREAM. There is a big blob of comment on top of
216 * tcp_conn_request explaining the new accept. When the socket is POP'd,
217 * sockfs sends us an ioctl to mark the fact and we go back to the old
218 * behaviour. Once tcp_issocket is unset, it's never set for the
219 * life of that connection.
220 *
221 * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
222 * two consolidation-private KAPIs are used to enqueue M_DATA mblk_t's
223 * directly to the socket (sodirect) and start an asynchronous copyout
224 * to a user-land receive-side buffer (uioa) when a blocking socket read
225 * (e.g. read, recv, ...) is pending.
226 *
227 * This is accomplished when tcp_issocket is set and tcp_sodirect is not
228 * NULL (i.e. points to an sodirect_t) and that sodirect_t is marked
229 * enabled; in that case we enqueue all mblk_t's directly to the socket.
230 *
231 * Further, if the sodirect_t's sod_uioa is marked enabled (due to a
232 * blocking socket read, e.g. a user-land read, recv, ...), then an asynchronous
233 * copyout will be started directly to the user-land uio buffer.
Also, as we
234 * have a pending read, TCP's push logic can take into account the number of
235 * bytes to be received and only awake the blocked read()er when the uioa_t
236 * byte count has been satisfied.
237 *
238 * IPsec notes:
239 *
240 * Since a packet is always executed on the correct TCP perimeter,
241 * all IPsec processing is deferred to IP, including checking new
242 * connections and setting IPsec policies for new connections. The
243 * only exception is tcp_xmit_listeners_reset() which is called
244 * directly from IP and needs to do a policy check to see if TH_RST
245 * can be sent out.
246 *
247 * PFHooks notes:
248 *
249 * For the MDT case, one meta buffer contains multiple packets. Mblks for every
250 * packet are assembled and passed to the hooks. When packets are blocked,
251 * or the boundary of any packet is changed, the MDT processing is stopped, and
252 * packets of the meta buffer are sent to the IP path one by one.
253 */
254
255 /*
256 * Values for squeue switch:
257 * 1: squeue_enter_nodrain
258 * 2: squeue_enter
259 * 3: squeue_fill
260 */
261 int tcp_squeue_close = 2; /* Settable in /etc/system */
262 int tcp_squeue_wput = 2;
263
264 squeue_func_t tcp_squeue_close_proc;
265 squeue_func_t tcp_squeue_wput_proc;
266
267 /*
268 * Macros for sodirect:
269 *
270 * SOD_PTR_ENTER(tcp, sodp) - for the tcp_t pointer "tcp" set the
271 * sodirect_t pointer "sodp" to the socket/tcp shared sodirect_t
272 * if it exists and is enabled, else to NULL. Note, in the current
273 * sodirect implementation the sod_lock must not be held across any
274 * STREAMS call (e.g. putnext) else a "recursive mutex_enter" PANIC
275 * will result as sod_lock is the streamhead stdata.sd_lock.
276 *
277 * SOD_NOT_ENABLED(tcp) - return true if not a sodirect tcp_t or the
278 * sodirect_t isn't enabled, useful for ASSERT()ing that a receive
279 * side tcp code path dealing with a tcp_rcv_list or putnext() isn't
280 * being used when sodirect code paths should be.
281 */
282
283 #define SOD_PTR_ENTER(tcp, sodp) \
284 (sodp) = (tcp)->tcp_sodirect; \
285 \
286 if ((sodp) != NULL) { \
287 mutex_enter((sodp)->sod_lock); \
288 if (!((sodp)->sod_state & SOD_ENABLED)) { \
289 mutex_exit((sodp)->sod_lock); \
290 (sodp) = NULL; \
291 } \
292 }
293
294 #define SOD_NOT_ENABLED(tcp) \
295 ((tcp)->tcp_sodirect == NULL || \
296 !((tcp)->tcp_sodirect->sod_state & SOD_ENABLED))
297
298 /*
299 * This controls how tiny a write must be before we try to copy it
300 * into the mblk on the tail of the transmit queue. Not much
301 * speedup is observed for values larger than sixteen. Zero will
302 * disable the optimisation.
303 */
304 int tcp_tx_pull_len = 16;
305
306 /*
307 * TCP Statistics.
308 *
309 * How TCP statistics work.
310 *
311 * There are two types of statistics invoked by two macros.
312 *
313 * TCP_STAT(name) does a non-atomic increment of a named stat counter. It is
314 * supposed to be used in non MT-hot paths of the code.
315 *
316 * TCP_DBGSTAT(name) does an atomic increment of a named stat counter. It is
317 * supposed to be used for DEBUG purposes and may be used on a hot path.
318 *
319 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
320 * (use "kstat tcp" to get them).
321 *
322 * There is also an additional debugging facility that marks tcp_clean_death()
323 * instances and saves them in the tcp_t structure. It is triggered by
324 * the TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
325 * tcp_clean_death() calls that counts the number of times each tag was hit.
It 326 * is triggered by TCP_CLD_COUNTERS define. 327 * 328 * How to add new counters. 329 * 330 * 1) Add a field in the tcp_stat structure describing your counter. 331 * 2) Add a line in the template in tcp_kstat2_init() with the name 332 * of the counter. 333 * 334 * IMPORTANT!! - make sure that both are in sync !! 335 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name. 336 * 337 * Please avoid using private counters which are not kstat-exported. 338 * 339 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances 340 * in tcp_t structure. 341 * 342 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags. 343 */ 344 345 #ifndef TCP_DEBUG_COUNTER 346 #ifdef DEBUG 347 #define TCP_DEBUG_COUNTER 1 348 #else 349 #define TCP_DEBUG_COUNTER 0 350 #endif 351 #endif 352 353 #define TCP_CLD_COUNTERS 0 354 355 #define TCP_TAG_CLEAN_DEATH 1 356 #define TCP_MAX_CLEAN_DEATH_TAG 32 357 358 #ifdef lint 359 static int _lint_dummy_; 360 #endif 361 362 #if TCP_CLD_COUNTERS 363 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; 364 #define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++ 365 #elif defined(lint) 366 #define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0); 367 #else 368 #define TCP_CLD_STAT(x) 369 #endif 370 371 #if TCP_DEBUG_COUNTER 372 #define TCP_DBGSTAT(tcps, x) \ 373 atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1) 374 #define TCP_G_DBGSTAT(x) \ 375 atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1) 376 #elif defined(lint) 377 #define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0); 378 #define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0); 379 #else 380 #define TCP_DBGSTAT(tcps, x) 381 #define TCP_G_DBGSTAT(x) 382 #endif 383 384 #define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++) 385 386 tcp_g_stat_t tcp_g_statistics; 387 kstat_t *tcp_g_kstat; 388 389 /* 390 * Call either ip_output or ip_output_v6. This replaces putnext() calls on the 391 * tcp write side. 392 */ 393 #define CALL_IP_WPUT(connp, q, mp) { \ 394 tcp_stack_t *tcps; \ 395 \ 396 tcps = connp->conn_netstack->netstack_tcp; \ 397 ASSERT(((q)->q_flag & QREADR) == 0); \ 398 TCP_DBGSTAT(tcps, tcp_ip_output); \ 399 connp->conn_send(connp, (mp), (q), IP_WPUT); \ 400 } 401 402 /* Macros for timestamp comparisons */ 403 #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 404 #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 405 406 /* 407 * Parameters for TCP Initial Send Sequence number (ISS) generation. When 408 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated 409 * by adding three components: a time component which grows by 1 every 4096 410 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); 411 * a per-connection component which grows by 125000 for every new connection; 412 * and an "extra" component that grows by a random amount centered 413 * approximately on 64000. This causes the the ISS generator to cycle every 414 * 4.89 hours if no TCP connections are made, and faster if connections are 415 * made. 416 * 417 * When tcp_strong_iss is set to 0, ISS is calculated by adding two 418 * components: a time component which grows by 250000 every second; and 419 * a per-connection component which grows by 125000 for every new connections. 420 * 421 * A third method, when tcp_strong_iss is set to 2, for generating ISS is 422 * prescribed by Steve Bellovin. 
This involves adding time, the 125000 per 423 * connection, and a one-way hash (MD5) of the connection ID <sport, dport, 424 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered 425 * password. 426 */ 427 #define ISS_INCR 250000 428 #define ISS_NSEC_SHT 12 429 430 static sin_t sin_null; /* Zero address for quick clears */ 431 static sin6_t sin6_null; /* Zero address for quick clears */ 432 433 /* 434 * This implementation follows the 4.3BSD interpretation of the urgent 435 * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause 436 * incompatible changes in protocols like telnet and rlogin. 437 */ 438 #define TCP_OLD_URP_INTERPRETATION 1 439 440 #define TCP_IS_DETACHED_NONEAGER(tcp) \ 441 (TCP_IS_DETACHED(tcp) && \ 442 (!(tcp)->tcp_hard_binding)) 443 444 /* 445 * TCP reassembly macros. We hide starting and ending sequence numbers in 446 * b_next and b_prev of messages on the reassembly queue. The messages are 447 * chained using b_cont. These macros are used in tcp_reass() so we don't 448 * have to see the ugly casts and assignments. 449 */ 450 #define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next)) 451 #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \ 452 (mblk_t *)(uintptr_t)(u)) 453 #define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev)) 454 #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \ 455 (mblk_t *)(uintptr_t)(u)) 456 457 /* 458 * Implementation of TCP Timers. 459 * ============================= 460 * 461 * INTERFACE: 462 * 463 * There are two basic functions dealing with tcp timers: 464 * 465 * timeout_id_t tcp_timeout(connp, func, time) 466 * clock_t tcp_timeout_cancel(connp, timeout_id) 467 * TCP_TIMER_RESTART(tcp, intvl) 468 * 469 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' 470 * after 'time' ticks passed. The function called by timeout() must adhere to 471 * the same restrictions as a driver soft interrupt handler - it must not sleep 472 * or call other functions that might sleep. The value returned is the opaque 473 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to 474 * cancel the request. The call to tcp_timeout() may fail in which case it 475 * returns zero. This is different from the timeout(9F) function which never 476 * fails. 477 * 478 * The call-back function 'func' always receives 'connp' as its single 479 * argument. It is always executed in the squeue corresponding to the tcp 480 * structure. The tcp structure is guaranteed to be present at the time the 481 * call-back is called. 482 * 483 * NOTE: The call-back function 'func' is never called if tcp is in 484 * the TCPS_CLOSED state. 485 * 486 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() 487 * request. locks acquired by the call-back routine should not be held across 488 * the call to tcp_timeout_cancel() or a deadlock may result. 489 * 490 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request. 491 * Otherwise, it returns an integer value greater than or equal to 0. In 492 * particular, if the call-back function is already placed on the squeue, it can 493 * not be canceled. 494 * 495 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called 496 * within squeue context corresponding to the tcp instance. Since the 497 * call-back is also called via the same squeue, there are no race 498 * conditions described in untimeout(9F) manual page since all calls are 499 * strictly serialized. 
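 *
 * As a rough usage sketch (illustrative only; the real callers live
 * further down in this file), arming a timer from within the squeue
 * and cancelling it later looks like
 *
 *	tcp->tcp_ka_tid = tcp_timeout(connp, tcp_keepalive_killer,
 *	    MSEC_TO_TICK(tcp->tcp_ka_interval));
 *	...
 *	if (tcp->tcp_ka_tid != 0 &&
 *	    tcp_timeout_cancel(connp, tcp->tcp_ka_tid) >= 0)
 *		tcp->tcp_ka_tid = 0;
 *
 * where a non-negative return from tcp_timeout_cancel() means the
 * call-back will not run, and -1 means it is already on the squeue,
 * so the call-back must cope with firing anyway, as described above.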
500 * 501 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout 502 * stored in tcp_timer_tid and starts a new one using 503 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back 504 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid 505 * field. 506 * 507 * NOTE: since the timeout cancellation is not guaranteed, the cancelled 508 * call-back may still be called, so it is possible tcp_timer() will be 509 * called several times. This should not be a problem since tcp_timer() 510 * should always check the tcp instance state. 511 * 512 * 513 * IMPLEMENTATION: 514 * 515 * TCP timers are implemented using three-stage process. The call to 516 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function 517 * when the timer expires. The tcp_timer_callback() arranges the call of the 518 * tcp_timer_handler() function via squeue corresponding to the tcp 519 * instance. The tcp_timer_handler() calls actual requested timeout call-back 520 * and passes tcp instance as an argument to it. Information is passed between 521 * stages using the tcp_timer_t structure which contains the connp pointer, the 522 * tcp call-back to call and the timeout id returned by the timeout(9F). 523 * 524 * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t - 525 * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo 526 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout() 527 * returns the pointer to this mblk. 528 * 529 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It 530 * looks like a normal mblk without actual dblk attached to it. 531 * 532 * To optimize performance each tcp instance holds a small cache of timer 533 * mblocks. In the current implementation it caches up to two timer mblocks per 534 * tcp instance. The cache is preserved over tcp frees and is only freed when 535 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp 536 * timer processing happens on a corresponding squeue, the cache manipulation 537 * does not require any locks. Experiments show that majority of timer mblocks 538 * allocations are satisfied from the tcp cache and do not involve kmem calls. 539 * 540 * The tcp_timeout() places a refhold on the connp instance which guarantees 541 * that it will be present at the time the call-back function fires. The 542 * tcp_timer_handler() drops the reference after calling the call-back, so the 543 * call-back function does not need to manipulate the references explicitly. 544 */ 545 546 typedef struct tcp_timer_s { 547 conn_t *connp; 548 void (*tcpt_proc)(void *); 549 timeout_id_t tcpt_tid; 550 } tcp_timer_t; 551 552 static kmem_cache_t *tcp_timercache; 553 kmem_cache_t *tcp_sack_info_cache; 554 kmem_cache_t *tcp_iphc_cache; 555 556 /* 557 * For scalability, we must not run a timer for every TCP connection 558 * in TIME_WAIT state. To see why, consider (for time wait interval of 559 * 4 minutes): 560 * 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's 561 * 562 * This list is ordered by time, so you need only delete from the head 563 * until you get to entries which aren't old enough to delete yet. 564 * The list consists of only the detached TIME_WAIT connections. 565 * 566 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t 567 * becomes detached TIME_WAIT (either by changing the state and already 568 * being detached or the other way around). 
This means that the TIME_WAIT 569 * state can be extended (up to doubled) if the connection doesn't become 570 * detached for a long time. 571 * 572 * The list manipulations (including tcp_time_wait_next/prev) 573 * are protected by the tcp_time_wait_lock. The content of the 574 * detached TIME_WAIT connections is protected by the normal perimeters. 575 * 576 * This list is per squeue and squeues are shared across the tcp_stack_t's. 577 * Things on tcp_time_wait_head remain associated with the tcp_stack_t 578 * and conn_netstack. 579 * The tcp_t's that are added to tcp_free_list are disassociated and 580 * have NULL tcp_tcps and conn_netstack pointers. 581 */ 582 typedef struct tcp_squeue_priv_s { 583 kmutex_t tcp_time_wait_lock; 584 timeout_id_t tcp_time_wait_tid; 585 tcp_t *tcp_time_wait_head; 586 tcp_t *tcp_time_wait_tail; 587 tcp_t *tcp_free_list; 588 uint_t tcp_free_list_cnt; 589 } tcp_squeue_priv_t; 590 591 /* 592 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. 593 * Running it every 5 seconds seems to give the best results. 594 */ 595 #define TCP_TIME_WAIT_DELAY drv_usectohz(5000000) 596 597 /* 598 * To prevent memory hog, limit the number of entries in tcp_free_list 599 * to 1% of available memory / number of cpus 600 */ 601 uint_t tcp_free_list_max_cnt = 0; 602 603 #define TCP_XMIT_LOWATER 4096 604 #define TCP_XMIT_HIWATER 49152 605 #define TCP_RECV_LOWATER 2048 606 #define TCP_RECV_HIWATER 49152 607 608 /* 609 * PAWS needs a timer for 24 days. This is the number of ticks in 24 days 610 */ 611 #define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) 612 613 #define TIDUSZ 4096 /* transport interface data unit size */ 614 615 /* 616 * Bind hash list size and has function. It has to be a power of 2 for 617 * hashing. 618 */ 619 #define TCP_BIND_FANOUT_SIZE 512 620 #define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1)) 621 /* 622 * Size of listen and acceptor hash list. It has to be a power of 2 for 623 * hashing. 624 */ 625 #define TCP_FANOUT_SIZE 256 626 627 #ifdef _ILP32 628 #define TCP_ACCEPTOR_HASH(accid) \ 629 (((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1)) 630 #else 631 #define TCP_ACCEPTOR_HASH(accid) \ 632 ((uint_t)(accid) & (TCP_FANOUT_SIZE - 1)) 633 #endif /* _ILP32 */ 634 635 #define IP_ADDR_CACHE_SIZE 2048 636 #define IP_ADDR_CACHE_HASH(faddr) \ 637 (ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1)) 638 639 /* Hash for HSPs uses all 32 bits, since both networks and hosts are in table */ 640 #define TCP_HSP_HASH_SIZE 256 641 642 #define TCP_HSP_HASH(addr) \ 643 (((addr>>24) ^ (addr >>16) ^ \ 644 (addr>>8) ^ (addr)) % TCP_HSP_HASH_SIZE) 645 646 /* 647 * TCP options struct returned from tcp_parse_options. 648 */ 649 typedef struct tcp_opt_s { 650 uint32_t tcp_opt_mss; 651 uint32_t tcp_opt_wscale; 652 uint32_t tcp_opt_ts_val; 653 uint32_t tcp_opt_ts_ecr; 654 tcp_t *tcp; 655 } tcp_opt_t; 656 657 /* 658 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing 659 */ 660 661 #ifdef _BIG_ENDIAN 662 #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 663 (TCPOPT_TSTAMP << 8) | 10) 664 #else 665 #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ 666 (TCPOPT_NOP << 8) | TCPOPT_NOP) 667 #endif 668 669 /* 670 * Flags returned from tcp_parse_options. 
671 */
672 #define TCP_OPT_MSS_PRESENT 1
673 #define TCP_OPT_WSCALE_PRESENT 2
674 #define TCP_OPT_TSTAMP_PRESENT 4
675 #define TCP_OPT_SACK_OK_PRESENT 8
676 #define TCP_OPT_SACK_PRESENT 16
677
678 /* TCP option length */
679 #define TCPOPT_NOP_LEN 1
680 #define TCPOPT_MAXSEG_LEN 4
681 #define TCPOPT_WS_LEN 3
682 #define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1)
683 #define TCPOPT_TSTAMP_LEN 10
684 #define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2)
685 #define TCPOPT_SACK_OK_LEN 2
686 #define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2)
687 #define TCPOPT_REAL_SACK_LEN 4
688 #define TCPOPT_MAX_SACK_LEN 36
689 #define TCPOPT_HEADER_LEN 2
690
691 /* TCP cwnd burst factor. */
692 #define TCP_CWND_INFINITE 65535
693 #define TCP_CWND_SS 3
694 #define TCP_CWND_NORMAL 5
695
696 /* Maximum TCP initial cwin (start/restart). */
697 #define TCP_MAX_INIT_CWND 8
698
699 /*
700 * Initialize cwnd according to RFC 3390. def_max_init_cwnd is
701 * either tcp_slow_start_initial or tcp_slow_start_after_idle
702 * depending on the caller. If the upper layer has not used the
703 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
704 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
705 * If the upper layer has set tcp_init_cwnd, just use
706 * it to calculate the tcp_cwnd.
707 */
708 #define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \
709 { \
710 if ((tcp)->tcp_init_cwnd == 0) { \
711 (tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \
712 MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
713 } else { \
714 (tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \
715 } \
716 tcp->tcp_cwnd_cnt = 0; \
717 }
718
719 /* TCP Timer control structure */
720 typedef struct tcpt_s {
721 pfv_t tcpt_pfv; /* The routine we are to call */
722 tcp_t *tcpt_tcp; /* The parameter we are to pass in */
723 } tcpt_t;
724
725 /* Host Specific Parameter structure */
726 typedef struct tcp_hsp {
727 struct tcp_hsp *tcp_hsp_next;
728 in6_addr_t tcp_hsp_addr_v6;
729 in6_addr_t tcp_hsp_subnet_v6;
730 uint_t tcp_hsp_vers; /* IPV4_VERSION | IPV6_VERSION */
731 int32_t tcp_hsp_sendspace;
732 int32_t tcp_hsp_recvspace;
733 int32_t tcp_hsp_tstamp;
734 } tcp_hsp_t;
735 #define tcp_hsp_addr V4_PART_OF_V6(tcp_hsp_addr_v6)
736 #define tcp_hsp_subnet V4_PART_OF_V6(tcp_hsp_subnet_v6)
737
738 /*
739 * Functions called directly via squeue having a prototype of edesc_t.
740 */ 741 void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); 742 static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2); 743 void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2); 744 static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2); 745 static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2); 746 void tcp_input(void *arg, mblk_t *mp, void *arg2); 747 void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); 748 static void tcp_close_output(void *arg, mblk_t *mp, void *arg2); 749 void tcp_output(void *arg, mblk_t *mp, void *arg2); 750 static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2); 751 static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2); 752 static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2); 753 754 755 /* Prototype for TCP functions */ 756 static void tcp_random_init(void); 757 int tcp_random(void); 758 static void tcp_accept(tcp_t *tcp, mblk_t *mp); 759 static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, 760 tcp_t *eager); 761 static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp); 762 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 763 int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only, 764 boolean_t user_specified); 765 static void tcp_closei_local(tcp_t *tcp); 766 static void tcp_close_detached(tcp_t *tcp); 767 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, 768 mblk_t *idmp, mblk_t **defermp); 769 static void tcp_connect(tcp_t *tcp, mblk_t *mp); 770 static void tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, 771 in_port_t dstport, uint_t srcid); 772 static void tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, 773 in_port_t dstport, uint32_t flowinfo, uint_t srcid, 774 uint32_t scope_id); 775 static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); 776 static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp); 777 static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); 778 static char *tcp_display(tcp_t *tcp, char *, char); 779 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum); 780 static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only); 781 static void tcp_eager_unlink(tcp_t *tcp); 782 static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr, 783 int unixerr); 784 static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 785 int tlierr, int unixerr); 786 static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, 787 cred_t *cr); 788 static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, 789 char *value, caddr_t cp, cred_t *cr); 790 static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, 791 char *value, caddr_t cp, cred_t *cr); 792 static int tcp_tpistate(tcp_t *tcp); 793 static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp, 794 int caller_holds_lock); 795 static void tcp_bind_hash_remove(tcp_t *tcp); 796 static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *); 797 void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp); 798 static void tcp_acceptor_hash_remove(tcp_t *tcp); 799 static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); 800 static void tcp_info_req(tcp_t *tcp, mblk_t *mp); 801 static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); 802 static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp); 803 void tcp_g_q_setup(tcp_stack_t *); 804 void tcp_g_q_create(tcp_stack_t *); 805 void tcp_g_q_destroy(tcp_stack_t *); 806 static int tcp_header_init_ipv4(tcp_t *tcp); 807 static int tcp_header_init_ipv6(tcp_t *tcp); 808 
int tcp_init(tcp_t *tcp, queue_t *q); 809 static int tcp_init_values(tcp_t *tcp); 810 static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic); 811 static mblk_t *tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, 812 t_scalar_t addr_length); 813 static void tcp_ip_ire_mark_advice(tcp_t *tcp); 814 static void tcp_ip_notify(tcp_t *tcp); 815 static mblk_t *tcp_ire_mp(mblk_t *mp); 816 static void tcp_iss_init(tcp_t *tcp); 817 static void tcp_keepalive_killer(void *arg); 818 static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt); 819 static void tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss); 820 static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, 821 int *do_disconnectp, int *t_errorp, int *sys_errorp); 822 static boolean_t tcp_allow_connopt_set(int level, int name); 823 int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); 824 int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr); 825 int tcp_opt_set(queue_t *q, uint_t optset_context, int level, 826 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, 827 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, 828 mblk_t *mblk); 829 static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha); 830 static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, 831 uchar_t *ptr, uint_t len); 832 static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); 833 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, 834 tcp_stack_t *); 835 static int tcp_param_set(queue_t *q, mblk_t *mp, char *value, 836 caddr_t cp, cred_t *cr); 837 static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, 838 caddr_t cp, cred_t *cr); 839 static void tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *); 840 static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, 841 caddr_t cp, cred_t *cr); 842 static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt); 843 static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start); 844 static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp); 845 static void tcp_reinit(tcp_t *tcp); 846 static void tcp_reinit_values(tcp_t *tcp); 847 static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, 848 tcp_t *thisstream, cred_t *cr); 849 850 static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp); 851 static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); 852 static boolean_t tcp_send_rst_chk(tcp_stack_t *); 853 static void tcp_ss_rexmit(tcp_t *tcp); 854 static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp); 855 static void tcp_process_options(tcp_t *, tcph_t *); 856 static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); 857 static void tcp_rsrv(queue_t *q); 858 static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd); 859 static int tcp_snmp_state(tcp_t *tcp); 860 static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, 861 cred_t *cr); 862 static int tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 863 cred_t *cr); 864 static int tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 865 cred_t *cr); 866 static int tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 867 cred_t *cr); 868 static int tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 869 cred_t *cr); 870 static int tcp_host_param_set(queue_t *q, mblk_t *mp, char *value, 871 caddr_t cp, cred_t *cr); 872 static int tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value, 873 caddr_t cp, cred_t *cr); 874 static int tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, 875 cred_t *cr); 
876 static void tcp_timer(void *arg); 877 static void tcp_timer_callback(void *); 878 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp, 879 boolean_t random); 880 static in_port_t tcp_get_next_priv_port(const tcp_t *); 881 static void tcp_wput_sock(queue_t *q, mblk_t *mp); 882 void tcp_wput_accept(queue_t *q, mblk_t *mp); 883 static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); 884 static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); 885 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); 886 static int tcp_send(queue_t *q, tcp_t *tcp, const int mss, 887 const int tcp_hdr_len, const int tcp_tcp_hdr_len, 888 const int num_sack_blk, int *usable, uint_t *snxt, 889 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 890 const int mdt_thres); 891 static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, 892 const int tcp_hdr_len, const int tcp_tcp_hdr_len, 893 const int num_sack_blk, int *usable, uint_t *snxt, 894 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 895 const int mdt_thres); 896 static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, 897 int num_sack_blk); 898 static void tcp_wsrv(queue_t *q); 899 static int tcp_xmit_end(tcp_t *tcp); 900 static void tcp_ack_timer(void *arg); 901 static mblk_t *tcp_ack_mp(tcp_t *tcp); 902 static void tcp_xmit_early_reset(char *str, mblk_t *mp, 903 uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len, 904 zoneid_t zoneid, tcp_stack_t *, conn_t *connp); 905 static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, 906 uint32_t ack, int ctl); 907 static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *); 908 static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr, tcp_stack_t *); 909 static int setmaxps(queue_t *q, int maxpsz); 910 static void tcp_set_rto(tcp_t *, time_t); 911 static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, 912 boolean_t, boolean_t); 913 static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, 914 boolean_t ipsec_mctl); 915 static mblk_t *tcp_setsockopt_mp(int level, int cmd, 916 char *opt, int optlen); 917 static int tcp_build_hdrs(queue_t *, tcp_t *); 918 static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, 919 uint32_t seg_seq, uint32_t seg_ack, int seg_len, 920 tcph_t *tcph); 921 boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp); 922 boolean_t tcp_reserved_port_add(int, in_port_t *, in_port_t *); 923 boolean_t tcp_reserved_port_del(in_port_t, in_port_t); 924 boolean_t tcp_reserved_port_check(in_port_t, tcp_stack_t *); 925 static tcp_t *tcp_alloc_temp_tcp(in_port_t, tcp_stack_t *); 926 static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *); 927 static mblk_t *tcp_mdt_info_mp(mblk_t *); 928 static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); 929 static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *, 930 const boolean_t, const uint32_t, const uint32_t, 931 const uint32_t, const uint32_t, tcp_stack_t *); 932 static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *, 933 const uint_t, const uint_t, boolean_t *); 934 static mblk_t *tcp_lso_info_mp(mblk_t *); 935 static void tcp_lso_update(tcp_t *, ill_lso_capab_t *); 936 static void tcp_send_data(tcp_t *, queue_t *, mblk_t *); 937 extern mblk_t *tcp_timermp_alloc(int); 938 extern void tcp_timermp_free(tcp_t *); 939 static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); 940 static void tcp_stop_lingering(tcp_t *tcp); 941 static void tcp_close_linger_timeout(void *arg); 942 static void 
*tcp_stack_init(netstackid_t stackid, netstack_t *ns); 943 static void tcp_stack_shutdown(netstackid_t stackid, void *arg); 944 static void tcp_stack_fini(netstackid_t stackid, void *arg); 945 static void *tcp_g_kstat_init(tcp_g_stat_t *); 946 static void tcp_g_kstat_fini(kstat_t *); 947 static void *tcp_kstat_init(netstackid_t, tcp_stack_t *); 948 static void tcp_kstat_fini(netstackid_t, kstat_t *); 949 static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *); 950 static void tcp_kstat2_fini(netstackid_t, kstat_t *); 951 static int tcp_kstat_update(kstat_t *kp, int rw); 952 void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp); 953 static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, 954 tcph_t *tcph, uint_t ipvers, mblk_t *idmp); 955 static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, 956 tcph_t *tcph, mblk_t *idmp); 957 static squeue_func_t tcp_squeue_switch(int); 958 959 static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t); 960 static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *); 961 static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *); 962 static int tcp_close(queue_t *, int); 963 static int tcpclose_accept(queue_t *); 964 965 static void tcp_squeue_add(squeue_t *); 966 static boolean_t tcp_zcopy_check(tcp_t *); 967 static void tcp_zcopy_notify(tcp_t *); 968 static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *); 969 static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int); 970 static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t); 971 972 extern void tcp_kssl_input(tcp_t *, mblk_t *); 973 974 void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2); 975 void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2); 976 977 /* 978 * Routines related to the TCP_IOC_ABORT_CONN ioctl command. 979 * 980 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting 981 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure 982 * (defined in tcp.h) needs to be filled in and passed into the kernel 983 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t 984 * structure contains the four-tuple of a TCP connection and a range of TCP 985 * states (specified by ac_start and ac_end). The use of wildcard addresses 986 * and ports is allowed. Connections with a matching four tuple and a state 987 * within the specified range will be aborted. The valid states for the 988 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, 989 * inclusive. 990 * 991 * An application which has its connection aborted by this ioctl will receive 992 * an error that is dependent on the connection state at the time of the abort. 993 * If the connection state is < TCPS_TIME_WAIT, an application should behave as 994 * though a RST packet has been received. If the connection state is equal to 995 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel 996 * and all resources associated with the connection will be freed. 
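 *
 * As a rough sketch of how a privileged caller might drive the ioctl
 * (illustrative only; "tcp_fd" is a hypothetical descriptor for an
 * open TCP stream and the exact structure layout lives in inet/tcp.h):
 *
 *	tcp_ioc_abort_conn_t acp;
 *	struct strioctl ic;
 *
 *	(fill in acp.ac_local and acp.ac_remote with the four-tuple;
 *	wildcard addresses and ports are allowed)
 *	acp.ac_start = TCPS_SYN_SENT;
 *	acp.ac_end = TCPS_TIME_WAIT;
 *
 *	ic.ic_cmd = TCP_IOC_ABORT_CONN;
 *	ic.ic_timout = 0;
 *	ic.ic_len = sizeof (acp);
 *	ic.ic_dp = (char *)&acp;
 *	error = ioctl(tcp_fd, I_STR, &ic);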
997 */
998 static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
999 static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
1000 static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
1001 static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
1002 static void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
1003 static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
1004 boolean_t, tcp_stack_t *);
1005
1006 static struct module_info tcp_rinfo = {
1007 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
1008 };
1009
1010 static struct module_info tcp_winfo = {
1011 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
1012 };
1013
1014 /*
1015 * Entry points for TCP as a device. The normal case which supports
1016 * the TCP functionality.
1017 * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
1018 */
1019 struct qinit tcp_rinitv4 = {
1020 NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, NULL, &tcp_rinfo
1021 };
1022
1023 struct qinit tcp_rinitv6 = {
1024 NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_close, NULL, &tcp_rinfo
1025 };
1026
1027 struct qinit tcp_winit = {
1028 (pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
1029 };
1030
1031 /* Initial entry point for TCP in socket mode. */
1032 struct qinit tcp_sock_winit = {
1033 (pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
1034 };
1035
1036 /*
1037 * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
1038 * an accept. Avoid allocating data structures since the eager has already
1039 * been created.
1040 */
1041 struct qinit tcp_acceptor_rinit = {
1042 NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo
1043 };
1044
1045 struct qinit tcp_acceptor_winit = {
1046 (pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
1047 };
1048
1049 /*
1050 * Entry points for TCP loopback (read side only).
1051 * The open routine is only used for reopens, thus no need to
1052 * have a separate one for tcp_openv6.
1053 */
1054 struct qinit tcp_loopback_rinit = {
1055 (pfi_t)0, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, (pfi_t)0,
1056 &tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
1057 };
1058
1059 /* For AF_INET aka /dev/tcp */
1060 struct streamtab tcpinfov4 = {
1061 &tcp_rinitv4, &tcp_winit
1062 };
1063
1064 /* For AF_INET6 aka /dev/tcp6 */
1065 struct streamtab tcpinfov6 = {
1066 &tcp_rinitv6, &tcp_winit
1067 };
1068
1069 /*
1070 * Have to ensure that tcp_g_q_close is not done by an
1071 * interrupt thread.
1072 */
1073 static taskq_t *tcp_taskq;
1074
1075 /*
1076 * TCP has a private interface for other kernel modules to reserve a
1077 * port range for them to use. Once reserved, TCP will not use any ports
1078 * in the range. This interface relies on the TCP_EXCLBIND feature. If
1079 * the semantics of TCP_EXCLBIND are changed, the implementation of this
1080 * interface has to be verified.
1081 *
1082 * There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges. Each port
1083 * range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports. A port
1084 * range is [port a, port b] inclusive. Each port range lies between
1085 * TCP_SMALLEST_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive.
1086 *
1087 * Note that the default anonymous port range starts from 32768. There is
1088 * no port "collision" between that and the reserved port range.
If there
1089 * is port collision (because the default smallest anonymous port is lowered
1090 * or some apps specifically bind to ports in the reserved port range), the
1091 * system may not be able to reserve a port range even if there are enough
1092 * unbound ports, since a reserved port range consists of consecutive ports.
1093 */
1094 #define TCP_RESERVED_PORTS_ARRAY_MAX_SIZE 5
1095 #define TCP_RESERVED_PORTS_RANGE_MAX 1000
1096 #define TCP_SMALLEST_RESERVED_PORT 10240
1097 #define TCP_LARGEST_RESERVED_PORT 20480
1098
1099 /* Structure to represent those reserved port ranges. */
1100 typedef struct tcp_rport_s {
1101 in_port_t lo_port;
1102 in_port_t hi_port;
1103 tcp_t **temp_tcp_array;
1104 } tcp_rport_t;
1105
1106 /* Settable only in /etc/system. Move to ndd? */
1107 boolean_t tcp_icmp_source_quench = B_FALSE;
1108
1109 /*
1110 * The following assumes TPI alignment requirements stay on 32-bit
1111 * boundaries.
1112 */
1113 #define ROUNDUP32(x) \
1114 (((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
1115
1116 /* Template for response to info request. */
1117 static struct T_info_ack tcp_g_t_info_ack = {
1118 T_INFO_ACK, /* PRIM_type */
1119 0, /* TSDU_size */
1120 T_INFINITE, /* ETSDU_size */
1121 T_INVALID, /* CDATA_size */
1122 T_INVALID, /* DDATA_size */
1123 sizeof (sin_t), /* ADDR_size */
1124 0, /* OPT_size - not initialized here */
1125 TIDUSZ, /* TIDU_size */
1126 T_COTS_ORD, /* SERV_type */
1127 TCPS_IDLE, /* CURRENT_state */
1128 (XPG4_1|EXPINLINE) /* PROVIDER_flag */
1129 };
1130
1131 static struct T_info_ack tcp_g_t_info_ack_v6 = {
1132 T_INFO_ACK, /* PRIM_type */
1133 0, /* TSDU_size */
1134 T_INFINITE, /* ETSDU_size */
1135 T_INVALID, /* CDATA_size */
1136 T_INVALID, /* DDATA_size */
1137 sizeof (sin6_t), /* ADDR_size */
1138 0, /* OPT_size - not initialized here */
1139 TIDUSZ, /* TIDU_size */
1140 T_COTS_ORD, /* SERV_type */
1141 TCPS_IDLE, /* CURRENT_state */
1142 (XPG4_1|EXPINLINE) /* PROVIDER_flag */
1143 };
1144
1145 #define MS 1L
1146 #define SECONDS (1000 * MS)
1147 #define MINUTES (60 * SECONDS)
1148 #define HOURS (60 * MINUTES)
1149 #define DAYS (24 * HOURS)
1150
1151 #define PARAM_MAX (~(uint32_t)0)
1152
1153 /* Max size IP datagram is 64k - 1 */
1154 #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
1155 #define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
1156 /* Max of the above */
1157 #define TCP_MSS_MAX TCP_MSS_MAX_IPV4
1158
1159 /* Largest TCP port number */
1160 #define TCP_MAX_PORT (64 * 1024 - 1)
1161
1162 /*
1163 * tcp_wroff_xtra is the extra space in front of the TCP/IP header for the
1164 * link layer header. It has to be a multiple of 4.
1165 */
1166 static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
1167 #define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val
1168
1169 /*
1170 * All of these are alterable, within the min/max values given, at run time.
1171 * Note that the default value of "tcp_time_wait_interval" is four minutes,
1172 * per the TCP spec.
1173 */ 1174 /* BEGIN CSTYLED */ 1175 static tcpparam_t lcl_tcp_param_arr[] = { 1176 /*min max value name */ 1177 { 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"}, 1178 { 1, PARAM_MAX, 128, "tcp_conn_req_max_q" }, 1179 { 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" }, 1180 { 1, 1024, 1, "tcp_conn_req_min" }, 1181 { 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" }, 1182 { 128, (1<<30), 1024*1024, "tcp_cwnd_max" }, 1183 { 0, 10, 0, "tcp_debug" }, 1184 { 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"}, 1185 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"}, 1186 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"}, 1187 { 500*MS, PARAM_MAX, 8*MINUTES, "tcp_ip_abort_interval"}, 1188 { 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"}, 1189 { 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"}, 1190 { 1, 255, 64, "tcp_ipv4_ttl"}, 1191 { 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"}, 1192 { 0, 100, 10, "tcp_maxpsz_multiplier" }, 1193 { 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"}, 1194 { 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"}, 1195 { 1, TCP_MSS_MAX, 108, "tcp_mss_min"}, 1196 { 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"}, 1197 { 1*MS, 20*SECONDS, 3*SECONDS, "tcp_rexmit_interval_initial"}, 1198 { 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"}, 1199 { 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"}, 1200 { 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" }, 1201 { 0, 16, 0, "tcp_snd_lowat_fraction" }, 1202 { 0, 128000, 0, "tcp_sth_rcv_hiwat" }, 1203 { 0, 128000, 0, "tcp_sth_rcv_lowat" }, 1204 { 1, 10000, 3, "tcp_dupack_fast_retransmit" }, 1205 { 0, 1, 0, "tcp_ignore_path_mtu" }, 1206 { 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"}, 1207 { 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"}, 1208 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"}, 1209 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"}, 1210 { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"}, 1211 { 1, 65536, 4, "tcp_recv_hiwat_minmss"}, 1212 { 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"}, 1213 { 0, TCP_MSS_MAX, 64, "tcp_co_min"}, 1214 { 8192, (1<<30), 1024*1024, "tcp_max_buf"}, 1215 /* 1216 * Question: What default value should I set for tcp_strong_iss? 
1217 */ 1218 { 0, 2, 1, "tcp_strong_iss"}, 1219 { 0, 65536, 20, "tcp_rtt_updates"}, 1220 { 0, 1, 1, "tcp_wscale_always"}, 1221 { 0, 1, 0, "tcp_tstamp_always"}, 1222 { 0, 1, 1, "tcp_tstamp_if_wscale"}, 1223 { 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"}, 1224 { 0, 16, 2, "tcp_deferred_acks_max"}, 1225 { 1, 16384, 4, "tcp_slow_start_after_idle"}, 1226 { 1, 4, 4, "tcp_slow_start_initial"}, 1227 { 10*MS, 50*MS, 20*MS, "tcp_co_timer_interval"}, 1228 { 0, 2, 2, "tcp_sack_permitted"}, 1229 { 0, 1, 0, "tcp_trace"}, 1230 { 0, 1, 1, "tcp_compression_enabled"}, 1231 { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"}, 1232 { 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"}, 1233 { 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"}, 1234 { 0, 1, 0, "tcp_rev_src_routes"}, 1235 { 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"}, 1236 { 100*MS, 60*SECONDS, 1*SECONDS, "tcp_ndd_get_info_interval"}, 1237 { 0, 16, 8, "tcp_local_dacks_max"}, 1238 { 0, 2, 1, "tcp_ecn_permitted"}, 1239 { 0, 1, 1, "tcp_rst_sent_rate_enabled"}, 1240 { 0, PARAM_MAX, 40, "tcp_rst_sent_rate"}, 1241 { 0, 100*MS, 50*MS, "tcp_push_timer_interval"}, 1242 { 0, 1, 0, "tcp_use_smss_as_mss_opt"}, 1243 { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"}, 1244 }; 1245 /* END CSTYLED */ 1246 1247 /* 1248 * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of 1249 * each header fragment in the header buffer. Each parameter value has 1250 * to be a multiple of 4 (32-bit aligned). 1251 */ 1252 static tcpparam_t lcl_tcp_mdt_head_param = 1253 { 32, 256, 32, "tcp_mdt_hdr_head_min" }; 1254 static tcpparam_t lcl_tcp_mdt_tail_param = 1255 { 0, 256, 32, "tcp_mdt_hdr_tail_min" }; 1256 #define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val 1257 #define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val 1258 1259 /* 1260 * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out 1261 * the maximum number of payload buffers associated per Multidata. 1262 */ 1263 static tcpparam_t lcl_tcp_mdt_max_pbufs_param = 1264 { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" }; 1265 #define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val 1266 1267 /* Round up the value to the nearest mss. */ 1268 #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) 1269 1270 /* 1271 * Set ECN capable transport (ECT) code point in IP header. 1272 * 1273 * Note that there are 2 ECT code points '01' and '10', which are called 1274 * ECT(1) and ECT(0) respectively. Here we follow the original ECT code 1275 * point ECT(0) for TCP as described in RFC 2481. 1276 */ 1277 #define SET_ECT(tcp, iph) \ 1278 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1279 /* We need to clear the code point first. */ \ 1280 ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ 1281 ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ 1282 } else { \ 1283 ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \ 1284 ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ 1285 } 1286 1287 /* 1288 * The format argument to pass to tcp_display(). 1289 * DISP_PORT_ONLY means that the returned string has only port info. 1290 * DISP_ADDR_AND_PORT means that the returned string also contains the 1291 * remote and local IP address. 
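 *
 * A typical use (sketch only, mirroring the strlog() calls elsewhere
 * in this file) would be
 *
 *	(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "closing %s",
 *	    tcp_display(tcp, NULL, DISP_ADDR_AND_PORT));
 *
 * where passing a NULL buffer lets tcp_display() fall back to a small
 * private buffer of its own.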
1292 */ 1293 #define DISP_PORT_ONLY 1 1294 #define DISP_ADDR_AND_PORT 2 1295 1296 #define NDD_TOO_QUICK_MSG \ 1297 "ndd get info rate too high for non-privileged users, try again " \ 1298 "later.\n" 1299 #define NDD_OUT_OF_BUF_MSG "<< Out of buffer >>\n" 1300 1301 #define IS_VMLOANED_MBLK(mp) \ 1302 (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) 1303 1304 1305 /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ 1306 boolean_t tcp_mdt_chain = B_TRUE; 1307 1308 /* 1309 * MDT threshold in the form of effective send MSS multiplier; we take 1310 * the MDT path if the amount of unsent data exceeds the threshold value 1311 * (default threshold is 1*SMSS). 1312 */ 1313 uint_t tcp_mdt_smss_threshold = 1; 1314 1315 uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ 1316 1317 /* 1318 * Forces all connections to obey the value of the tcps_maxpsz_multiplier 1319 * tunable settable via NDD. Otherwise, the per-connection behavior is 1320 * determined dynamically during tcp_adapt_ire(), which is the default. 1321 */ 1322 boolean_t tcp_static_maxpsz = B_FALSE; 1323 1324 /* Setable in /etc/system */ 1325 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ 1326 uint32_t tcp_random_anon_port = 1; 1327 1328 /* 1329 * To reach to an eager in Q0 which can be dropped due to an incoming 1330 * new SYN request when Q0 is full, a new doubly linked list is 1331 * introduced. This list allows to select an eager from Q0 in O(1) time. 1332 * This is needed to avoid spending too much time walking through the 1333 * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of 1334 * this new list has to be a member of Q0. 1335 * This list is headed by listener's tcp_t. When the list is empty, 1336 * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0, 1337 * of listener's tcp_t point to listener's tcp_t itself. 1338 * 1339 * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager 1340 * in the list. MAKE_UNDROPPABLE() takes the eager out of the list. 1341 * These macros do not affect the eager's membership to Q0. 1342 */ 1343 1344 1345 #define MAKE_DROPPABLE(listener, eager) \ 1346 if ((eager)->tcp_eager_next_drop_q0 == NULL) { \ 1347 (listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\ 1348 = (eager); \ 1349 (eager)->tcp_eager_prev_drop_q0 = (listener); \ 1350 (eager)->tcp_eager_next_drop_q0 = \ 1351 (listener)->tcp_eager_next_drop_q0; \ 1352 (listener)->tcp_eager_next_drop_q0 = (eager); \ 1353 } 1354 1355 #define MAKE_UNDROPPABLE(eager) \ 1356 if ((eager)->tcp_eager_next_drop_q0 != NULL) { \ 1357 (eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \ 1358 = (eager)->tcp_eager_prev_drop_q0; \ 1359 (eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \ 1360 = (eager)->tcp_eager_next_drop_q0; \ 1361 (eager)->tcp_eager_prev_drop_q0 = NULL; \ 1362 (eager)->tcp_eager_next_drop_q0 = NULL; \ 1363 } 1364 1365 /* 1366 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more 1367 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent 1368 * data, TCP will not respond with an ACK. RFC 793 requires that 1369 * TCP responds with an ACK for such a bogus ACK. By not following 1370 * the RFC, we prevent TCP from getting into an ACK storm if somehow 1371 * an attacker successfully spoofs an acceptable segment to our 1372 * peer; or when our peer is "confused." 
1373 */ 1374 uint32_t tcp_drop_ack_unsent_cnt = 10; 1375 1376 /* 1377 * Hook functions to enable cluster networking 1378 * On non-clustered systems these vectors must always be NULL. 1379 */ 1380 1381 void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family, 1382 uint8_t *laddrp, in_port_t lport) = NULL; 1383 void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family, 1384 uint8_t *laddrp, in_port_t lport) = NULL; 1385 void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family, 1386 uint8_t *laddrp, in_port_t lport, 1387 uint8_t *faddrp, in_port_t fport) = NULL; 1388 void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family, 1389 uint8_t *laddrp, in_port_t lport, 1390 uint8_t *faddrp, in_port_t fport) = NULL; 1391 1392 /* 1393 * The following are defined in ip.c 1394 */ 1395 extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family, 1396 uint8_t *laddrp); 1397 extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, 1398 uint8_t *laddrp, uint8_t *faddrp); 1399 1400 #define CL_INET_CONNECT(tcp) { \ 1401 if (cl_inet_connect != NULL) { \ 1402 /* \ 1403 * Running in cluster mode - register active connection \ 1404 * information \ 1405 */ \ 1406 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1407 if ((tcp)->tcp_ipha->ipha_src != 0) { \ 1408 (*cl_inet_connect)(IPPROTO_TCP, AF_INET,\ 1409 (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\ 1410 (in_port_t)(tcp)->tcp_lport, \ 1411 (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ 1412 (in_port_t)(tcp)->tcp_fport); \ 1413 } \ 1414 } else { \ 1415 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1416 &(tcp)->tcp_ip6h->ip6_src)) {\ 1417 (*cl_inet_connect)(IPPROTO_TCP, AF_INET6,\ 1418 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\ 1419 (in_port_t)(tcp)->tcp_lport, \ 1420 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ 1421 (in_port_t)(tcp)->tcp_fport); \ 1422 } \ 1423 } \ 1424 } \ 1425 } 1426 1427 #define CL_INET_DISCONNECT(tcp) { \ 1428 if (cl_inet_disconnect != NULL) { \ 1429 /* \ 1430 * Running in cluster mode - deregister active \ 1431 * connection information \ 1432 */ \ 1433 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1434 if ((tcp)->tcp_ip_src != 0) { \ 1435 (*cl_inet_disconnect)(IPPROTO_TCP, \ 1436 AF_INET, \ 1437 (uint8_t *)(&((tcp)->tcp_ip_src)),\ 1438 (in_port_t)(tcp)->tcp_lport, \ 1439 (uint8_t *) \ 1440 (&((tcp)->tcp_ipha->ipha_dst)),\ 1441 (in_port_t)(tcp)->tcp_fport); \ 1442 } \ 1443 } else { \ 1444 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1445 &(tcp)->tcp_ip_src_v6)) { \ 1446 (*cl_inet_disconnect)(IPPROTO_TCP, AF_INET6,\ 1447 (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\ 1448 (in_port_t)(tcp)->tcp_lport, \ 1449 (uint8_t *) \ 1450 (&((tcp)->tcp_ip6h->ip6_dst)),\ 1451 (in_port_t)(tcp)->tcp_fport); \ 1452 } \ 1453 } \ 1454 } \ 1455 } 1456 1457 /* 1458 * Cluster networking hook for traversing current connection list. 1459 * This routine is used to extract the current list of live connections 1460 * which must continue to to be dispatched to this node. 1461 */ 1462 int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg); 1463 1464 static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), 1465 void *arg, tcp_stack_t *tcps); 1466 1467 /* 1468 * Figure out the value of window scale opton. Note that the rwnd is 1469 * ASSUMED to be rounded up to the nearest MSS before the calculation. 1470 * We cannot find the scale value and then do a round up of tcp_rwnd 1471 * because the scale value may not be correct after that. 1472 * 1473 * Set the compiler flag to make this function inline. 
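 *
 * Worked example: with tcp_rwnd already rounded up to 1 MB (1048576
 * bytes), the loop below shifts rwnd right until it fits in the 16-bit
 * window field, giving tcp_rcv_ws = 5, since 1048576 >> 5 = 32768 fits
 * under TCP_MAXWIN (65535) while 1048576 >> 4 = 65536 does not.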
1474 */ 1475 static void 1476 tcp_set_ws_value(tcp_t *tcp) 1477 { 1478 int i; 1479 uint32_t rwnd = tcp->tcp_rwnd; 1480 1481 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; 1482 i++, rwnd >>= 1) 1483 ; 1484 tcp->tcp_rcv_ws = i; 1485 } 1486 1487 /* 1488 * Remove a connection from the list of detached TIME_WAIT connections. 1489 * It returns B_FALSE if it can't remove the connection from the list 1490 * as the connection has already been removed from the list due to an 1491 * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. 1492 */ 1493 static boolean_t 1494 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) 1495 { 1496 boolean_t locked = B_FALSE; 1497 1498 if (tcp_time_wait == NULL) { 1499 tcp_time_wait = *((tcp_squeue_priv_t **) 1500 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); 1501 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1502 locked = B_TRUE; 1503 } else { 1504 ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); 1505 } 1506 1507 if (tcp->tcp_time_wait_expire == 0) { 1508 ASSERT(tcp->tcp_time_wait_next == NULL); 1509 ASSERT(tcp->tcp_time_wait_prev == NULL); 1510 if (locked) 1511 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1512 return (B_FALSE); 1513 } 1514 ASSERT(TCP_IS_DETACHED(tcp)); 1515 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1516 1517 if (tcp == tcp_time_wait->tcp_time_wait_head) { 1518 ASSERT(tcp->tcp_time_wait_prev == NULL); 1519 tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; 1520 if (tcp_time_wait->tcp_time_wait_head != NULL) { 1521 tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = 1522 NULL; 1523 } else { 1524 tcp_time_wait->tcp_time_wait_tail = NULL; 1525 } 1526 } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { 1527 ASSERT(tcp != tcp_time_wait->tcp_time_wait_head); 1528 ASSERT(tcp->tcp_time_wait_next == NULL); 1529 tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; 1530 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1531 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; 1532 } else { 1533 ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 1534 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 1535 tcp->tcp_time_wait_prev->tcp_time_wait_next = 1536 tcp->tcp_time_wait_next; 1537 tcp->tcp_time_wait_next->tcp_time_wait_prev = 1538 tcp->tcp_time_wait_prev; 1539 } 1540 tcp->tcp_time_wait_next = NULL; 1541 tcp->tcp_time_wait_prev = NULL; 1542 tcp->tcp_time_wait_expire = 0; 1543 1544 if (locked) 1545 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1546 return (B_TRUE); 1547 } 1548 1549 /* 1550 * Add a connection to the list of detached TIME_WAIT connections 1551 * and set its time to expire. 
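 *
 * The expiry is expressed in lbolt ticks: the current ddi_get_lbolt()
 * value plus drv_usectohz(tcps_time_wait_interval * 1000).  Zero is
 * reserved to mean "not on the TIME_WAIT list", which is why a computed
 * expiry of zero is bumped to one below.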
1552 */ 1553 static void 1554 tcp_time_wait_append(tcp_t *tcp) 1555 { 1556 tcp_stack_t *tcps = tcp->tcp_tcps; 1557 tcp_squeue_priv_t *tcp_time_wait = 1558 *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, 1559 SQPRIVATE_TCP)); 1560 1561 tcp_timers_stop(tcp); 1562 1563 /* Freed above */ 1564 ASSERT(tcp->tcp_timer_tid == 0); 1565 ASSERT(tcp->tcp_ack_tid == 0); 1566 1567 /* must have happened at the time of detaching the tcp */ 1568 ASSERT(tcp->tcp_ptpahn == NULL); 1569 ASSERT(tcp->tcp_flow_stopped == 0); 1570 ASSERT(tcp->tcp_time_wait_next == NULL); 1571 ASSERT(tcp->tcp_time_wait_prev == NULL); 1572 ASSERT(tcp->tcp_time_wait_expire == NULL); 1573 ASSERT(tcp->tcp_listener == NULL); 1574 1575 tcp->tcp_time_wait_expire = ddi_get_lbolt(); 1576 /* 1577 * The value computed below in tcp->tcp_time_wait_expire may 1578 * appear negative or wrap around. That is ok since our 1579 * interest is only in the difference between the current lbolt 1580 * value and tcp->tcp_time_wait_expire. But the value should not 1581 * be zero, since it means the tcp is not in the TIME_WAIT list. 1582 * The corresponding comparison in tcp_time_wait_collector() uses 1583 * modular arithmetic. 1584 */ 1585 tcp->tcp_time_wait_expire += 1586 drv_usectohz(tcps->tcps_time_wait_interval * 1000); 1587 if (tcp->tcp_time_wait_expire == 0) 1588 tcp->tcp_time_wait_expire = 1; 1589 1590 ASSERT(TCP_IS_DETACHED(tcp)); 1591 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1592 ASSERT(tcp->tcp_time_wait_next == NULL); 1593 ASSERT(tcp->tcp_time_wait_prev == NULL); 1594 TCP_DBGSTAT(tcps, tcp_time_wait); 1595 1596 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1597 if (tcp_time_wait->tcp_time_wait_head == NULL) { 1598 ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); 1599 tcp_time_wait->tcp_time_wait_head = tcp; 1600 } else { 1601 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1602 ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == 1603 TCPS_TIME_WAIT); 1604 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; 1605 tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; 1606 } 1607 tcp_time_wait->tcp_time_wait_tail = tcp; 1608 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1609 } 1610 1611 /* ARGSUSED */ 1612 void 1613 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) 1614 { 1615 conn_t *connp = (conn_t *)arg; 1616 tcp_t *tcp = connp->conn_tcp; 1617 tcp_stack_t *tcps = tcp->tcp_tcps; 1618 1619 ASSERT(tcp != NULL); 1620 if (tcp->tcp_state == TCPS_CLOSED) { 1621 return; 1622 } 1623 1624 ASSERT((tcp->tcp_family == AF_INET && 1625 tcp->tcp_ipversion == IPV4_VERSION) || 1626 (tcp->tcp_family == AF_INET6 && 1627 (tcp->tcp_ipversion == IPV4_VERSION || 1628 tcp->tcp_ipversion == IPV6_VERSION))); 1629 ASSERT(!tcp->tcp_listener); 1630 1631 TCP_STAT(tcps, tcp_time_wait_reap); 1632 ASSERT(TCP_IS_DETACHED(tcp)); 1633 1634 /* 1635 * Because they have no upstream client to rebind or tcp_close() 1636 * them later, we axe the connection here and now. 1637 */ 1638 tcp_close_detached(tcp); 1639 } 1640 1641 /* 1642 * Remove cached/latched IPsec references. 
1643 */ 1644 void 1645 tcp_ipsec_cleanup(tcp_t *tcp) 1646 { 1647 conn_t *connp = tcp->tcp_connp; 1648 1649 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1650 1651 if (connp->conn_latch != NULL) { 1652 IPLATCH_REFRELE(connp->conn_latch, 1653 connp->conn_netstack); 1654 connp->conn_latch = NULL; 1655 } 1656 if (connp->conn_policy != NULL) { 1657 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); 1658 connp->conn_policy = NULL; 1659 } 1660 } 1661 1662 /* 1663 * Cleaup before placing on free list. 1664 * Disassociate from the netstack/tcp_stack_t since the freelist 1665 * is per squeue and not per netstack. 1666 */ 1667 void 1668 tcp_cleanup(tcp_t *tcp) 1669 { 1670 mblk_t *mp; 1671 char *tcp_iphc; 1672 int tcp_iphc_len; 1673 int tcp_hdr_grown; 1674 tcp_sack_info_t *tcp_sack_info; 1675 conn_t *connp = tcp->tcp_connp; 1676 tcp_stack_t *tcps = tcp->tcp_tcps; 1677 netstack_t *ns = tcps->tcps_netstack; 1678 1679 tcp_bind_hash_remove(tcp); 1680 1681 /* Cleanup that which needs the netstack first */ 1682 tcp_ipsec_cleanup(tcp); 1683 1684 tcp_free(tcp); 1685 1686 /* Release any SSL context */ 1687 if (tcp->tcp_kssl_ent != NULL) { 1688 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 1689 tcp->tcp_kssl_ent = NULL; 1690 } 1691 1692 if (tcp->tcp_kssl_ctx != NULL) { 1693 kssl_release_ctx(tcp->tcp_kssl_ctx); 1694 tcp->tcp_kssl_ctx = NULL; 1695 } 1696 tcp->tcp_kssl_pending = B_FALSE; 1697 1698 conn_delete_ire(connp, NULL); 1699 1700 /* 1701 * Since we will bzero the entire structure, we need to 1702 * remove it and reinsert it in global hash list. We 1703 * know the walkers can't get to this conn because we 1704 * had set CONDEMNED flag earlier and checked reference 1705 * under conn_lock so walker won't pick it and when we 1706 * go the ipcl_globalhash_remove() below, no walker 1707 * can get to it. 1708 */ 1709 ipcl_globalhash_remove(connp); 1710 1711 /* 1712 * Now it is safe to decrement the reference counts. 1713 * This might be the last reference on the netstack and TCPS 1714 * in which case it will cause the tcp_g_q_close and 1715 * the freeing of the IP Instance. 1716 */ 1717 connp->conn_netstack = NULL; 1718 netstack_rele(ns); 1719 ASSERT(tcps != NULL); 1720 tcp->tcp_tcps = NULL; 1721 TCPS_REFRELE(tcps); 1722 1723 /* Save some state */ 1724 mp = tcp->tcp_timercache; 1725 1726 tcp_sack_info = tcp->tcp_sack_info; 1727 tcp_iphc = tcp->tcp_iphc; 1728 tcp_iphc_len = tcp->tcp_iphc_len; 1729 tcp_hdr_grown = tcp->tcp_hdr_grown; 1730 1731 if (connp->conn_cred != NULL) { 1732 crfree(connp->conn_cred); 1733 connp->conn_cred = NULL; 1734 } 1735 if (connp->conn_peercred != NULL) { 1736 crfree(connp->conn_peercred); 1737 connp->conn_peercred = NULL; 1738 } 1739 ipcl_conn_cleanup(connp); 1740 connp->conn_flags = IPCL_TCPCONN; 1741 bzero(tcp, sizeof (tcp_t)); 1742 1743 /* restore the state */ 1744 tcp->tcp_timercache = mp; 1745 1746 tcp->tcp_sack_info = tcp_sack_info; 1747 tcp->tcp_iphc = tcp_iphc; 1748 tcp->tcp_iphc_len = tcp_iphc_len; 1749 tcp->tcp_hdr_grown = tcp_hdr_grown; 1750 1751 tcp->tcp_connp = connp; 1752 1753 ASSERT(connp->conn_tcp == tcp); 1754 ASSERT(connp->conn_flags & IPCL_TCPCONN); 1755 connp->conn_state_flags = CONN_INCIPIENT; 1756 ASSERT(connp->conn_ulp == IPPROTO_TCP); 1757 ASSERT(connp->conn_ref == 1); 1758 } 1759 1760 /* 1761 * Blows away all tcps whose TIME_WAIT has expired. List traversal 1762 * is done forwards from the head. 1763 * This walks all stack instances since 1764 * tcp_time_wait remains global across all stacks. 
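 *
 * Each invocation runs off a per-squeue timeout(): it first drains the
 * per-squeue free list if that list has sat untouched since the last
 * run, then reaps the connections whose expiry has passed, and finally
 * re-arms itself with TCP_TIME_WAIT_DELAY.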
1765 */ 1766 /* ARGSUSED */ 1767 void 1768 tcp_time_wait_collector(void *arg) 1769 { 1770 tcp_t *tcp; 1771 clock_t now; 1772 mblk_t *mp; 1773 conn_t *connp; 1774 kmutex_t *lock; 1775 boolean_t removed; 1776 1777 squeue_t *sqp = (squeue_t *)arg; 1778 tcp_squeue_priv_t *tcp_time_wait = 1779 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 1780 1781 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1782 tcp_time_wait->tcp_time_wait_tid = 0; 1783 1784 if (tcp_time_wait->tcp_free_list != NULL && 1785 tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { 1786 TCP_G_STAT(tcp_freelist_cleanup); 1787 while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { 1788 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 1789 tcp->tcp_time_wait_next = NULL; 1790 tcp_time_wait->tcp_free_list_cnt--; 1791 ASSERT(tcp->tcp_tcps == NULL); 1792 CONN_DEC_REF(tcp->tcp_connp); 1793 } 1794 ASSERT(tcp_time_wait->tcp_free_list_cnt == 0); 1795 } 1796 1797 /* 1798 * In order to reap time waits reliably, we should use a 1799 * source of time that is not adjustable by the user -- hence 1800 * the call to ddi_get_lbolt(). 1801 */ 1802 now = ddi_get_lbolt(); 1803 while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { 1804 /* 1805 * Compare times using modular arithmetic, since 1806 * lbolt can wrapover. 1807 */ 1808 if ((now - tcp->tcp_time_wait_expire) < 0) { 1809 break; 1810 } 1811 1812 removed = tcp_time_wait_remove(tcp, tcp_time_wait); 1813 ASSERT(removed); 1814 1815 connp = tcp->tcp_connp; 1816 ASSERT(connp->conn_fanout != NULL); 1817 lock = &connp->conn_fanout->connf_lock; 1818 /* 1819 * This is essentially a TW reclaim fast path optimization for 1820 * performance where the timewait collector checks under the 1821 * fanout lock (so that no one else can get access to the 1822 * conn_t) that the refcnt is 2 i.e. one for TCP and one for 1823 * the classifier hash list. If ref count is indeed 2, we can 1824 * just remove the conn under the fanout lock and avoid 1825 * cleaning up the conn under the squeue, provided that 1826 * clustering callbacks are not enabled. If clustering is 1827 * enabled, we need to make the clustering callback before 1828 * setting the CONDEMNED flag and after dropping all locks and 1829 * so we forego this optimization and fall back to the slow 1830 * path. Also please see the comments in tcp_closei_local 1831 * regarding the refcnt logic. 1832 * 1833 * Since we are holding the tcp_time_wait_lock, its better 1834 * not to block on the fanout_lock because other connections 1835 * can't add themselves to time_wait list. So we do a 1836 * tryenter instead of mutex_enter. 1837 */ 1838 if (mutex_tryenter(lock)) { 1839 mutex_enter(&connp->conn_lock); 1840 if ((connp->conn_ref == 2) && 1841 (cl_inet_disconnect == NULL)) { 1842 ipcl_hash_remove_locked(connp, 1843 connp->conn_fanout); 1844 /* 1845 * Set the CONDEMNED flag now itself so that 1846 * the refcnt cannot increase due to any 1847 * walker. But we have still not cleaned up 1848 * conn_ire_cache. 
This is still ok since 1849 * we are going to clean it up in tcp_cleanup 1850 * immediately and any interface unplumb 1851 * thread will wait till the ire is blown away 1852 */ 1853 connp->conn_state_flags |= CONN_CONDEMNED; 1854 mutex_exit(lock); 1855 mutex_exit(&connp->conn_lock); 1856 if (tcp_time_wait->tcp_free_list_cnt < 1857 tcp_free_list_max_cnt) { 1858 /* Add to head of tcp_free_list */ 1859 mutex_exit( 1860 &tcp_time_wait->tcp_time_wait_lock); 1861 tcp_cleanup(tcp); 1862 ASSERT(connp->conn_latch == NULL); 1863 ASSERT(connp->conn_policy == NULL); 1864 ASSERT(tcp->tcp_tcps == NULL); 1865 ASSERT(connp->conn_netstack == NULL); 1866 1867 mutex_enter( 1868 &tcp_time_wait->tcp_time_wait_lock); 1869 tcp->tcp_time_wait_next = 1870 tcp_time_wait->tcp_free_list; 1871 tcp_time_wait->tcp_free_list = tcp; 1872 tcp_time_wait->tcp_free_list_cnt++; 1873 continue; 1874 } else { 1875 /* Do not add to tcp_free_list */ 1876 mutex_exit( 1877 &tcp_time_wait->tcp_time_wait_lock); 1878 tcp_bind_hash_remove(tcp); 1879 conn_delete_ire(tcp->tcp_connp, NULL); 1880 tcp_ipsec_cleanup(tcp); 1881 CONN_DEC_REF(tcp->tcp_connp); 1882 } 1883 } else { 1884 CONN_INC_REF_LOCKED(connp); 1885 mutex_exit(lock); 1886 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1887 mutex_exit(&connp->conn_lock); 1888 /* 1889 * We can reuse the closemp here since conn has 1890 * detached (otherwise we wouldn't even be in 1891 * time_wait list). tcp_closemp_used can safely 1892 * be changed without taking a lock as no other 1893 * thread can concurrently access it at this 1894 * point in the connection lifecycle. 1895 */ 1896 1897 if (tcp->tcp_closemp.b_prev == NULL) 1898 tcp->tcp_closemp_used = B_TRUE; 1899 else 1900 cmn_err(CE_PANIC, 1901 "tcp_timewait_collector: " 1902 "concurrent use of tcp_closemp: " 1903 "connp %p tcp %p\n", (void *)connp, 1904 (void *)tcp); 1905 1906 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 1907 mp = &tcp->tcp_closemp; 1908 squeue_fill(connp->conn_sqp, mp, 1909 tcp_timewait_output, connp, 1910 SQTAG_TCP_TIMEWAIT); 1911 } 1912 } else { 1913 mutex_enter(&connp->conn_lock); 1914 CONN_INC_REF_LOCKED(connp); 1915 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1916 mutex_exit(&connp->conn_lock); 1917 /* 1918 * We can reuse the closemp here since conn has 1919 * detached (otherwise we wouldn't even be in 1920 * time_wait list). tcp_closemp_used can safely 1921 * be changed without taking a lock as no other 1922 * thread can concurrently access it at this 1923 * point in the connection lifecycle. 1924 */ 1925 1926 if (tcp->tcp_closemp.b_prev == NULL) 1927 tcp->tcp_closemp_used = B_TRUE; 1928 else 1929 cmn_err(CE_PANIC, "tcp_timewait_collector: " 1930 "concurrent use of tcp_closemp: " 1931 "connp %p tcp %p\n", (void *)connp, 1932 (void *)tcp); 1933 1934 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 1935 mp = &tcp->tcp_closemp; 1936 squeue_fill(connp->conn_sqp, mp, 1937 tcp_timewait_output, connp, 0); 1938 } 1939 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1940 } 1941 1942 if (tcp_time_wait->tcp_free_list != NULL) 1943 tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; 1944 1945 tcp_time_wait->tcp_time_wait_tid = 1946 timeout(tcp_time_wait_collector, sqp, TCP_TIME_WAIT_DELAY); 1947 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1948 } 1949 /* 1950 * Reply to a clients T_CONN_RES TPI message. This function 1951 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES 1952 * on the acceptor STREAM and processed in tcp_wput_accept(). 1953 * Read the block comment on top of tcp_conn_request(). 
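 *
 * In TPI terms: a T_CONN_IND was sent up on the listener when the
 * three-way handshake completed, and the TLI/XTI client now answers
 * with a T_CONN_RES whose SEQ_number identifies the pending (eager)
 * connection and whose ACCEPTOR_id names the stream on which the
 * connection is to be accepted.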
1954 */ 1955 static void 1956 tcp_accept(tcp_t *listener, mblk_t *mp) 1957 { 1958 tcp_t *acceptor; 1959 tcp_t *eager; 1960 tcp_t *tcp; 1961 struct T_conn_res *tcr; 1962 t_uscalar_t acceptor_id; 1963 t_scalar_t seqnum; 1964 mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ 1965 mblk_t *ok_mp; 1966 mblk_t *mp1; 1967 tcp_stack_t *tcps = listener->tcp_tcps; 1968 1969 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1970 tcp_err_ack(listener, mp, TPROTO, 0); 1971 return; 1972 } 1973 tcr = (struct T_conn_res *)mp->b_rptr; 1974 1975 /* 1976 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1977 * read side queue of the streams device underneath us i.e. the 1978 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1979 * look it up in the queue_hash. Under LP64 it sends down the 1980 * minor_t of the accepting endpoint. 1981 * 1982 * Once the acceptor/eager are modified (in tcp_accept_swap) the 1983 * fanout hash lock is held. 1984 * This prevents any thread from entering the acceptor queue from 1985 * below (since it has not been hard bound yet i.e. any inbound 1986 * packets will arrive on the listener or default tcp queue and 1987 * go through tcp_lookup). 1988 * The CONN_INC_REF will prevent the acceptor from closing. 1989 * 1990 * XXX It is still possible for a tli application to send down data 1991 * on the accepting stream while another thread calls t_accept. 1992 * This should not be a problem for well-behaved applications since 1993 * the T_OK_ACK is sent after the queue swapping is completed. 1994 * 1995 * If the accepting fd is the same as the listening fd, avoid 1996 * queue hash lookup since that will return an eager listener in a 1997 * already established state. 1998 */ 1999 acceptor_id = tcr->ACCEPTOR_id; 2000 mutex_enter(&listener->tcp_eager_lock); 2001 if (listener->tcp_acceptor_id == acceptor_id) { 2002 eager = listener->tcp_eager_next_q; 2003 /* only count how many T_CONN_INDs so don't count q0 */ 2004 if ((listener->tcp_conn_req_cnt_q != 1) || 2005 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 2006 mutex_exit(&listener->tcp_eager_lock); 2007 tcp_err_ack(listener, mp, TBADF, 0); 2008 return; 2009 } 2010 if (listener->tcp_conn_req_cnt_q0 != 0) { 2011 /* Throw away all the eagers on q0. */ 2012 tcp_eager_cleanup(listener, 1); 2013 } 2014 if (listener->tcp_syn_defense) { 2015 listener->tcp_syn_defense = B_FALSE; 2016 if (listener->tcp_ip_addr_cache != NULL) { 2017 kmem_free(listener->tcp_ip_addr_cache, 2018 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 2019 listener->tcp_ip_addr_cache = NULL; 2020 } 2021 } 2022 /* 2023 * Transfer tcp_conn_req_max to the eager so that when 2024 * a disconnect occurs we can revert the endpoint to the 2025 * listen state. 2026 */ 2027 eager->tcp_conn_req_max = listener->tcp_conn_req_max; 2028 ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 2029 /* 2030 * Get a reference on the acceptor just like the 2031 * tcp_acceptor_hash_lookup below. 2032 */ 2033 acceptor = listener; 2034 CONN_INC_REF(acceptor->tcp_connp); 2035 } else { 2036 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); 2037 if (acceptor == NULL) { 2038 if (listener->tcp_debug) { 2039 (void) strlog(TCP_MOD_ID, 0, 1, 2040 SL_ERROR|SL_TRACE, 2041 "tcp_accept: did not find acceptor 0x%x\n", 2042 acceptor_id); 2043 } 2044 mutex_exit(&listener->tcp_eager_lock); 2045 tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 2046 return; 2047 } 2048 /* 2049 * Verify acceptor state. The acceptable states for an acceptor 2050 * include TCPS_IDLE and TCPS_BOUND. 
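 * TCPS_IDLE is a freshly opened, unbound endpoint and TCPS_BOUND one
 * that is bound but neither listening nor connected; any other state
 * means the stream is already in use, so the request is rejected with
 * TOUTSTATE below.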
2051 */ 2052 switch (acceptor->tcp_state) { 2053 case TCPS_IDLE: 2054 /* FALLTHRU */ 2055 case TCPS_BOUND: 2056 break; 2057 default: 2058 CONN_DEC_REF(acceptor->tcp_connp); 2059 mutex_exit(&listener->tcp_eager_lock); 2060 tcp_err_ack(listener, mp, TOUTSTATE, 0); 2061 return; 2062 } 2063 } 2064 2065 /* The listener must be in TCPS_LISTEN */ 2066 if (listener->tcp_state != TCPS_LISTEN) { 2067 CONN_DEC_REF(acceptor->tcp_connp); 2068 mutex_exit(&listener->tcp_eager_lock); 2069 tcp_err_ack(listener, mp, TOUTSTATE, 0); 2070 return; 2071 } 2072 2073 /* 2074 * Rendezvous with an eager connection request packet hanging off 2075 * 'tcp' that has the 'seqnum' tag. We tagged the detached open 2076 * tcp structure when the connection packet arrived in 2077 * tcp_conn_request(). 2078 */ 2079 seqnum = tcr->SEQ_number; 2080 eager = listener; 2081 do { 2082 eager = eager->tcp_eager_next_q; 2083 if (eager == NULL) { 2084 CONN_DEC_REF(acceptor->tcp_connp); 2085 mutex_exit(&listener->tcp_eager_lock); 2086 tcp_err_ack(listener, mp, TBADSEQ, 0); 2087 return; 2088 } 2089 } while (eager->tcp_conn_req_seqnum != seqnum); 2090 mutex_exit(&listener->tcp_eager_lock); 2091 2092 /* 2093 * At this point, both acceptor and listener have 2 ref 2094 * that they begin with. Acceptor has one additional ref 2095 * we placed in lookup while listener has 3 additional 2096 * ref for being behind the squeue (tcp_accept() is 2097 * done on listener's squeue); being in classifier hash; 2098 * and eager's ref on listener. 2099 */ 2100 ASSERT(listener->tcp_connp->conn_ref >= 5); 2101 ASSERT(acceptor->tcp_connp->conn_ref >= 3); 2102 2103 /* 2104 * The eager at this point is set in its own squeue and 2105 * could easily have been killed (tcp_accept_finish will 2106 * deal with that) because of a TH_RST so we can only 2107 * ASSERT for a single ref. 2108 */ 2109 ASSERT(eager->tcp_connp->conn_ref >= 1); 2110 2111 /* Pre allocate the stroptions mblk also */ 2112 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI); 2113 if (opt_mp == NULL) { 2114 CONN_DEC_REF(acceptor->tcp_connp); 2115 CONN_DEC_REF(eager->tcp_connp); 2116 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2117 return; 2118 } 2119 DB_TYPE(opt_mp) = M_SETOPTS; 2120 opt_mp->b_wptr += sizeof (struct stroptions); 2121 2122 /* 2123 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO 2124 * from listener to acceptor. The message is chained on opt_mp 2125 * which will be sent onto eager's squeue. 2126 */ 2127 if (listener->tcp_bound_if != 0) { 2128 /* allocate optmgmt req */ 2129 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6, 2130 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if, 2131 sizeof (int)); 2132 if (mp1 != NULL) 2133 linkb(opt_mp, mp1); 2134 } 2135 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { 2136 uint_t on = 1; 2137 2138 /* allocate optmgmt req */ 2139 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6, 2140 IPV6_RECVPKTINFO, (char *)&on, sizeof (on)); 2141 if (mp1 != NULL) 2142 linkb(opt_mp, mp1); 2143 } 2144 2145 /* Re-use mp1 to hold a copy of mp, in case reallocb fails */ 2146 if ((mp1 = copymsg(mp)) == NULL) { 2147 CONN_DEC_REF(acceptor->tcp_connp); 2148 CONN_DEC_REF(eager->tcp_connp); 2149 freemsg(opt_mp); 2150 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2151 return; 2152 } 2153 2154 tcr = (struct T_conn_res *)mp1->b_rptr; 2155 2156 /* 2157 * This is an expanded version of mi_tpi_ok_ack_alloc() 2158 * which allocates a larger mblk and appends the new 2159 * local address to the ok_ack. The address is copied by 2160 * soaccept() for getsockname(). 
2161 */ 2162 { 2163 int extra; 2164 2165 extra = (eager->tcp_family == AF_INET) ? 2166 sizeof (sin_t) : sizeof (sin6_t); 2167 2168 /* 2169 * Try to re-use mp, if possible. Otherwise, allocate 2170 * an mblk and return it as ok_mp. In any case, mp 2171 * is no longer usable upon return. 2172 */ 2173 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 2174 CONN_DEC_REF(acceptor->tcp_connp); 2175 CONN_DEC_REF(eager->tcp_connp); 2176 freemsg(opt_mp); 2177 /* Original mp has been freed by now, so use mp1 */ 2178 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 2179 return; 2180 } 2181 2182 mp = NULL; /* We should never use mp after this point */ 2183 2184 switch (extra) { 2185 case sizeof (sin_t): { 2186 sin_t *sin = (sin_t *)ok_mp->b_wptr; 2187 2188 ok_mp->b_wptr += extra; 2189 sin->sin_family = AF_INET; 2190 sin->sin_port = eager->tcp_lport; 2191 sin->sin_addr.s_addr = 2192 eager->tcp_ipha->ipha_src; 2193 break; 2194 } 2195 case sizeof (sin6_t): { 2196 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 2197 2198 ok_mp->b_wptr += extra; 2199 sin6->sin6_family = AF_INET6; 2200 sin6->sin6_port = eager->tcp_lport; 2201 if (eager->tcp_ipversion == IPV4_VERSION) { 2202 sin6->sin6_flowinfo = 0; 2203 IN6_IPADDR_TO_V4MAPPED( 2204 eager->tcp_ipha->ipha_src, 2205 &sin6->sin6_addr); 2206 } else { 2207 ASSERT(eager->tcp_ip6h != NULL); 2208 sin6->sin6_flowinfo = 2209 eager->tcp_ip6h->ip6_vcf & 2210 ~IPV6_VERS_AND_FLOW_MASK; 2211 sin6->sin6_addr = 2212 eager->tcp_ip6h->ip6_src; 2213 } 2214 sin6->sin6_scope_id = 0; 2215 sin6->__sin6_src_id = 0; 2216 break; 2217 } 2218 default: 2219 break; 2220 } 2221 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 2222 } 2223 2224 /* 2225 * If there are no options we know that the T_CONN_RES will 2226 * succeed. However, we can't send the T_OK_ACK upstream until 2227 * the tcp_accept_swap is done since it would be dangerous to 2228 * let the application start using the new fd prior to the swap. 2229 */ 2230 tcp_accept_swap(listener, acceptor, eager); 2231 2232 /* 2233 * tcp_accept_swap unlinks eager from listener but does not drop 2234 * the eager's reference on the listener. 2235 */ 2236 ASSERT(eager->tcp_listener == NULL); 2237 ASSERT(listener->tcp_connp->conn_ref >= 5); 2238 2239 /* 2240 * The eager is now associated with its own queue. Insert in 2241 * the hash so that the connection can be reused for a future 2242 * T_CONN_RES. 2243 */ 2244 tcp_acceptor_hash_insert(acceptor_id, eager); 2245 2246 /* 2247 * We now do the processing of options with T_CONN_RES. 2248 * We delay till now since we wanted to have queue to pass to 2249 * option processing routines that points back to the right 2250 * instance structure which does not happen until after 2251 * tcp_accept_swap(). 2252 * 2253 * Note: 2254 * The sanity of the logic here assumes that whatever options 2255 * are appropriate to inherit from listner=>eager are done 2256 * before this point, and whatever were to be overridden (or not) 2257 * in transfer logic from eager=>acceptor in tcp_accept_swap(). 2258 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 2259 * before its ACCEPTOR_id comes down in T_CONN_RES ] 2260 * This may not be true at this point in time but can be fixed 2261 * independently. This option processing code starts with 2262 * the instantiated acceptor instance and the final queue at 2263 * this point. 
2264 */ 2265 2266 if (tcr->OPT_length != 0) { 2267 /* Options to process */ 2268 int t_error = 0; 2269 int sys_error = 0; 2270 int do_disconnect = 0; 2271 2272 if (tcp_conprim_opt_process(eager, mp1, 2273 &do_disconnect, &t_error, &sys_error) < 0) { 2274 eager->tcp_accept_error = 1; 2275 if (do_disconnect) { 2276 /* 2277 * An option failed which does not allow 2278 * connection to be accepted. 2279 * 2280 * We allow T_CONN_RES to succeed and 2281 * put a T_DISCON_IND on the eager queue. 2282 */ 2283 ASSERT(t_error == 0 && sys_error == 0); 2284 eager->tcp_send_discon_ind = 1; 2285 } else { 2286 ASSERT(t_error != 0); 2287 freemsg(ok_mp); 2288 /* 2289 * Original mp was either freed or set 2290 * to ok_mp above, so use mp1 instead. 2291 */ 2292 tcp_err_ack(listener, mp1, t_error, sys_error); 2293 goto finish; 2294 } 2295 } 2296 /* 2297 * Most likely success in setting options (except if 2298 * eager->tcp_send_discon_ind set). 2299 * mp1 option buffer represented by OPT_length/offset 2300 * potentially modified and contains results of setting 2301 * options at this point 2302 */ 2303 } 2304 2305 /* We no longer need mp1, since all options processing has passed */ 2306 freemsg(mp1); 2307 2308 putnext(listener->tcp_rq, ok_mp); 2309 2310 mutex_enter(&listener->tcp_eager_lock); 2311 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 2312 tcp_t *tail; 2313 mblk_t *conn_ind; 2314 2315 /* 2316 * This path should not be executed if listener and 2317 * acceptor streams are the same. 2318 */ 2319 ASSERT(listener != acceptor); 2320 2321 tcp = listener->tcp_eager_prev_q0; 2322 /* 2323 * listener->tcp_eager_prev_q0 points to the TAIL of the 2324 * deferred T_conn_ind queue. We need to get to the head of 2325 * the queue in order to send up T_conn_ind the same order as 2326 * how the 3WHS is completed. 2327 */ 2328 while (tcp != listener) { 2329 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 2330 break; 2331 else 2332 tcp = tcp->tcp_eager_prev_q0; 2333 } 2334 ASSERT(tcp != listener); 2335 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 2336 ASSERT(conn_ind != NULL); 2337 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 2338 2339 /* Move from q0 to q */ 2340 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 2341 listener->tcp_conn_req_cnt_q0--; 2342 listener->tcp_conn_req_cnt_q++; 2343 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 2344 tcp->tcp_eager_prev_q0; 2345 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 2346 tcp->tcp_eager_next_q0; 2347 tcp->tcp_eager_prev_q0 = NULL; 2348 tcp->tcp_eager_next_q0 = NULL; 2349 tcp->tcp_conn_def_q0 = B_FALSE; 2350 2351 /* Make sure the tcp isn't in the list of droppables */ 2352 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && 2353 tcp->tcp_eager_prev_drop_q0 == NULL); 2354 2355 /* 2356 * Insert at end of the queue because sockfs sends 2357 * down T_CONN_RES in chronological order. Leaving 2358 * the older conn indications at front of the queue 2359 * helps reducing search time. 2360 */ 2361 tail = listener->tcp_eager_last_q; 2362 if (tail != NULL) 2363 tail->tcp_eager_next_q = tcp; 2364 else 2365 listener->tcp_eager_next_q = tcp; 2366 listener->tcp_eager_last_q = tcp; 2367 tcp->tcp_eager_next_q = NULL; 2368 mutex_exit(&listener->tcp_eager_lock); 2369 putnext(tcp->tcp_rq, conn_ind); 2370 } else { 2371 mutex_exit(&listener->tcp_eager_lock); 2372 } 2373 2374 /* 2375 * Done with the acceptor - free it 2376 * 2377 * Note: from this point on, no access to listener should be made 2378 * as listener can be equal to acceptor. 
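 * (In the same-stream case handled earlier, where the T_CONN_RES came
 * down on the listening stream itself, acceptor was set to listener,
 * so the tcp_clean_death() below acts on the listener's tcp_t.)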
2379 */ 2380 finish: 2381 ASSERT(acceptor->tcp_detached); 2382 ASSERT(tcps->tcps_g_q != NULL); 2383 acceptor->tcp_rq = tcps->tcps_g_q; 2384 acceptor->tcp_wq = WR(tcps->tcps_g_q); 2385 (void) tcp_clean_death(acceptor, 0, 2); 2386 CONN_DEC_REF(acceptor->tcp_connp); 2387 2388 /* 2389 * In case we already received a FIN we have to make tcp_rput send 2390 * the ordrel_ind. This will also send up a window update if the window 2391 * has opened up. 2392 * 2393 * In the normal case of a successful connection acceptance 2394 * we give the O_T_BIND_REQ to the read side put procedure as an 2395 * indication that this was just accepted. This tells tcp_rput to 2396 * pass up any data queued in tcp_rcv_list. 2397 * 2398 * In the fringe case where options sent with T_CONN_RES failed and 2399 * we required, we would be indicating a T_DISCON_IND to blow 2400 * away this connection. 2401 */ 2402 2403 /* 2404 * XXX: we currently have a problem if XTI application closes the 2405 * acceptor stream in between. This problem exists in on10-gate also 2406 * and is well know but nothing can be done short of major rewrite 2407 * to fix it. Now it is possible to take care of it by assigning TLI/XTI 2408 * eager same squeue as listener (we can distinguish non socket 2409 * listeners at the time of handling a SYN in tcp_conn_request) 2410 * and do most of the work that tcp_accept_finish does here itself 2411 * and then get behind the acceptor squeue to access the acceptor 2412 * queue. 2413 */ 2414 /* 2415 * We already have a ref on tcp so no need to do one before squeue_fill 2416 */ 2417 squeue_fill(eager->tcp_connp->conn_sqp, opt_mp, 2418 tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH); 2419 } 2420 2421 /* 2422 * Swap information between the eager and acceptor for a TLI/XTI client. 2423 * The sockfs accept is done on the acceptor stream and control goes 2424 * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not 2425 * called. In either case, both the eager and listener are in their own 2426 * perimeter (squeue) and the code has to deal with potential race. 2427 * 2428 * See the block comment on top of tcp_accept() and tcp_wput_accept(). 2429 */ 2430 static void 2431 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) 2432 { 2433 conn_t *econnp, *aconnp; 2434 2435 ASSERT(eager->tcp_rq == listener->tcp_rq); 2436 ASSERT(eager->tcp_detached && !acceptor->tcp_detached); 2437 ASSERT(!eager->tcp_hard_bound); 2438 ASSERT(!TCP_IS_SOCKET(acceptor)); 2439 ASSERT(!TCP_IS_SOCKET(eager)); 2440 ASSERT(!TCP_IS_SOCKET(listener)); 2441 2442 acceptor->tcp_detached = B_TRUE; 2443 /* 2444 * To permit stream re-use by TLI/XTI, the eager needs a copy of 2445 * the acceptor id. 2446 */ 2447 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; 2448 2449 /* remove eager from listen list... */ 2450 mutex_enter(&listener->tcp_eager_lock); 2451 tcp_eager_unlink(eager); 2452 ASSERT(eager->tcp_eager_next_q == NULL && 2453 eager->tcp_eager_last_q == NULL); 2454 ASSERT(eager->tcp_eager_next_q0 == NULL && 2455 eager->tcp_eager_prev_q0 == NULL); 2456 mutex_exit(&listener->tcp_eager_lock); 2457 eager->tcp_rq = acceptor->tcp_rq; 2458 eager->tcp_wq = acceptor->tcp_wq; 2459 2460 econnp = eager->tcp_connp; 2461 aconnp = acceptor->tcp_connp; 2462 2463 eager->tcp_rq->q_ptr = econnp; 2464 eager->tcp_wq->q_ptr = econnp; 2465 2466 /* 2467 * In the TLI/XTI loopback case, we are inside the listener's squeue, 2468 * which might be a different squeue from our peer TCP instance. 
2469 * For TCP Fusion, the peer expects that whenever tcp_detached is 2470 * clear, our TCP queues point to the acceptor's queues. Thus, use 2471 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq 2472 * above reach global visibility prior to the clearing of tcp_detached. 2473 */ 2474 membar_producer(); 2475 eager->tcp_detached = B_FALSE; 2476 2477 ASSERT(eager->tcp_ack_tid == 0); 2478 2479 econnp->conn_dev = aconnp->conn_dev; 2480 econnp->conn_minor_arena = aconnp->conn_minor_arena; 2481 ASSERT(econnp->conn_minor_arena != NULL); 2482 if (eager->tcp_cred != NULL) 2483 crfree(eager->tcp_cred); 2484 eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred; 2485 ASSERT(econnp->conn_netstack == aconnp->conn_netstack); 2486 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); 2487 2488 aconnp->conn_cred = NULL; 2489 2490 econnp->conn_zoneid = aconnp->conn_zoneid; 2491 econnp->conn_allzones = aconnp->conn_allzones; 2492 2493 econnp->conn_mac_exempt = aconnp->conn_mac_exempt; 2494 aconnp->conn_mac_exempt = B_FALSE; 2495 2496 ASSERT(aconnp->conn_peercred == NULL); 2497 2498 /* Do the IPC initialization */ 2499 CONN_INC_REF(econnp); 2500 2501 econnp->conn_multicast_loop = aconnp->conn_multicast_loop; 2502 econnp->conn_af_isv6 = aconnp->conn_af_isv6; 2503 econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6; 2504 2505 /* Done with old IPC. Drop its ref on its connp */ 2506 CONN_DEC_REF(aconnp); 2507 } 2508 2509 2510 /* 2511 * Adapt to the information, such as rtt and rtt_sd, provided from the 2512 * ire cached in conn_cache_ire. If no ire cached, do a ire lookup. 2513 * 2514 * Checks for multicast and broadcast destination address. 2515 * Returns zero on failure; non-zero if ok. 2516 * 2517 * Note that the MSS calculation here is based on the info given in 2518 * the IRE. We do not do any calculation based on TCP options. They 2519 * will be handled in tcp_rput_other() and tcp_rput_data() when TCP 2520 * knows which options to use. 2521 * 2522 * Note on how TCP gets its parameters for a connection. 2523 * 2524 * When a tcp_t structure is allocated, it gets all the default parameters. 2525 * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd, 2526 * spipe, rpipe, ... from the route metrics. Route metric overrides the 2527 * default. But if there is an associated tcp_host_param, it will override 2528 * the metrics. 2529 * 2530 * An incoming SYN with a multicast or broadcast destination address, is dropped 2531 * in 1 of 2 places. 2532 * 2533 * 1. If the packet was received over the wire it is dropped in 2534 * ip_rput_process_broadcast() 2535 * 2536 * 2. If the packet was received through internal IP loopback, i.e. the packet 2537 * was generated and received on the same machine, it is dropped in 2538 * ip_wput_local() 2539 * 2540 * An incoming SYN with a multicast or broadcast source address is always 2541 * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to 2542 * reject an attempt to connect to a broadcast or multicast (destination) 2543 * address. 
2544 */ 2545 static int 2546 tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) 2547 { 2548 tcp_hsp_t *hsp; 2549 ire_t *ire; 2550 ire_t *sire = NULL; 2551 iulp_t *ire_uinfo = NULL; 2552 uint32_t mss_max; 2553 uint32_t mss; 2554 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 2555 conn_t *connp = tcp->tcp_connp; 2556 boolean_t ire_cacheable = B_FALSE; 2557 zoneid_t zoneid = connp->conn_zoneid; 2558 int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 2559 MATCH_IRE_SECATTR; 2560 ts_label_t *tsl = crgetlabel(CONN_CRED(connp)); 2561 ill_t *ill = NULL; 2562 boolean_t incoming = (ire_mp == NULL); 2563 tcp_stack_t *tcps = tcp->tcp_tcps; 2564 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 2565 2566 ASSERT(connp->conn_ire_cache == NULL); 2567 2568 if (tcp->tcp_ipversion == IPV4_VERSION) { 2569 2570 if (CLASSD(tcp->tcp_connp->conn_rem)) { 2571 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 2572 return (0); 2573 } 2574 /* 2575 * If IP_NEXTHOP is set, then look for an IRE_CACHE 2576 * for the destination with the nexthop as gateway. 2577 * ire_ctable_lookup() is used because this particular 2578 * ire, if it exists, will be marked private. 2579 * If that is not available, use the interface ire 2580 * for the nexthop. 2581 * 2582 * TSol: tcp_update_label will detect label mismatches based 2583 * only on the destination's label, but that would not 2584 * detect label mismatches based on the security attributes 2585 * of routes or next hop gateway. Hence we need to pass the 2586 * label to ire_ftable_lookup below in order to locate the 2587 * right prefix (and/or) ire cache. Similarly we also need 2588 * pass the label to the ire_cache_lookup below to locate 2589 * the right ire that also matches on the label. 2590 */ 2591 if (tcp->tcp_connp->conn_nexthop_set) { 2592 ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem, 2593 tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid, 2594 tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, 2595 ipst); 2596 if (ire == NULL) { 2597 ire = ire_ftable_lookup( 2598 tcp->tcp_connp->conn_nexthop_v4, 2599 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0, 2600 tsl, match_flags, ipst); 2601 if (ire == NULL) 2602 return (0); 2603 } else { 2604 ire_uinfo = &ire->ire_uinfo; 2605 } 2606 } else { 2607 ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, 2608 zoneid, tsl, ipst); 2609 if (ire != NULL) { 2610 ire_cacheable = B_TRUE; 2611 ire_uinfo = (ire_mp != NULL) ? 2612 &((ire_t *)ire_mp->b_rptr)->ire_uinfo: 2613 &ire->ire_uinfo; 2614 2615 } else { 2616 if (ire_mp == NULL) { 2617 ire = ire_ftable_lookup( 2618 tcp->tcp_connp->conn_rem, 2619 0, 0, 0, NULL, &sire, zoneid, 0, 2620 tsl, (MATCH_IRE_RECURSIVE | 2621 MATCH_IRE_DEFAULT), ipst); 2622 if (ire == NULL) 2623 return (0); 2624 ire_uinfo = (sire != NULL) ? 2625 &sire->ire_uinfo : 2626 &ire->ire_uinfo; 2627 } else { 2628 ire = (ire_t *)ire_mp->b_rptr; 2629 ire_uinfo = 2630 &((ire_t *) 2631 ire_mp->b_rptr)->ire_uinfo; 2632 } 2633 } 2634 } 2635 ASSERT(ire != NULL); 2636 2637 if ((ire->ire_src_addr == INADDR_ANY) || 2638 (ire->ire_type & IRE_BROADCAST)) { 2639 /* 2640 * ire->ire_mp is non null when ire_mp passed in is used 2641 * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 2642 */ 2643 if (ire->ire_mp == NULL) 2644 ire_refrele(ire); 2645 if (sire != NULL) 2646 ire_refrele(sire); 2647 return (0); 2648 } 2649 2650 if (tcp->tcp_ipha->ipha_src == INADDR_ANY) { 2651 ipaddr_t src_addr; 2652 2653 /* 2654 * ip_bind_connected() has stored the correct source 2655 * address in conn_src. 
2656 */ 2657 src_addr = tcp->tcp_connp->conn_src; 2658 tcp->tcp_ipha->ipha_src = src_addr; 2659 /* 2660 * Copy of the src addr. in tcp_t is needed 2661 * for the lookup funcs. 2662 */ 2663 IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6); 2664 } 2665 /* 2666 * Set the fragment bit so that IP will tell us if the MTU 2667 * should change. IP tells us the latest setting of 2668 * ip_path_mtu_discovery through ire_frag_flag. 2669 */ 2670 if (ipst->ips_ip_path_mtu_discovery) { 2671 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 2672 htons(IPH_DF); 2673 } 2674 /* 2675 * If ire_uinfo is NULL, this is the IRE_INTERFACE case 2676 * for IP_NEXTHOP. No cache ire has been found for the 2677 * destination and we are working with the nexthop's 2678 * interface ire. Since we need to forward all packets 2679 * to the nexthop first, we "blindly" set tcp_localnet 2680 * to false, eventhough the destination may also be 2681 * onlink. 2682 */ 2683 if (ire_uinfo == NULL) 2684 tcp->tcp_localnet = 0; 2685 else 2686 tcp->tcp_localnet = (ire->ire_gateway_addr == 0); 2687 } else { 2688 /* 2689 * For incoming connection ire_mp = NULL 2690 * For outgoing connection ire_mp != NULL 2691 * Technically we should check conn_incoming_ill 2692 * when ire_mp is NULL and conn_outgoing_ill when 2693 * ire_mp is non-NULL. But this is performance 2694 * critical path and for IPV*_BOUND_IF, outgoing 2695 * and incoming ill are always set to the same value. 2696 */ 2697 ill_t *dst_ill = NULL; 2698 ipif_t *dst_ipif = NULL; 2699 2700 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 2701 2702 if (connp->conn_outgoing_ill != NULL) { 2703 /* Outgoing or incoming path */ 2704 int err; 2705 2706 dst_ill = conn_get_held_ill(connp, 2707 &connp->conn_outgoing_ill, &err); 2708 if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) { 2709 ip1dbg(("tcp_adapt_ire: ill_lookup failed\n")); 2710 return (0); 2711 } 2712 match_flags |= MATCH_IRE_ILL; 2713 dst_ipif = dst_ill->ill_ipif; 2714 } 2715 ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6, 2716 0, 0, dst_ipif, zoneid, tsl, match_flags, ipst); 2717 2718 if (ire != NULL) { 2719 ire_cacheable = B_TRUE; 2720 ire_uinfo = (ire_mp != NULL) ? 2721 &((ire_t *)ire_mp->b_rptr)->ire_uinfo: 2722 &ire->ire_uinfo; 2723 } else { 2724 if (ire_mp == NULL) { 2725 ire = ire_ftable_lookup_v6( 2726 &tcp->tcp_connp->conn_remv6, 2727 0, 0, 0, dst_ipif, &sire, zoneid, 2728 0, tsl, match_flags, ipst); 2729 if (ire == NULL) { 2730 if (dst_ill != NULL) 2731 ill_refrele(dst_ill); 2732 return (0); 2733 } 2734 ire_uinfo = (sire != NULL) ? &sire->ire_uinfo : 2735 &ire->ire_uinfo; 2736 } else { 2737 ire = (ire_t *)ire_mp->b_rptr; 2738 ire_uinfo = 2739 &((ire_t *)ire_mp->b_rptr)->ire_uinfo; 2740 } 2741 } 2742 if (dst_ill != NULL) 2743 ill_refrele(dst_ill); 2744 2745 ASSERT(ire != NULL); 2746 ASSERT(ire_uinfo != NULL); 2747 2748 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) || 2749 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 2750 /* 2751 * ire->ire_mp is non null when ire_mp passed in is used 2752 * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 2753 */ 2754 if (ire->ire_mp == NULL) 2755 ire_refrele(ire); 2756 if (sire != NULL) 2757 ire_refrele(sire); 2758 return (0); 2759 } 2760 2761 if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { 2762 in6_addr_t src_addr; 2763 2764 /* 2765 * ip_bind_connected_v6() has stored the correct source 2766 * address per IPv6 addr. selection policy in 2767 * conn_src_v6. 
2768 */ 2769 src_addr = tcp->tcp_connp->conn_srcv6; 2770 2771 tcp->tcp_ip6h->ip6_src = src_addr; 2772 /* 2773 * Copy of the src addr. in tcp_t is needed 2774 * for the lookup funcs. 2775 */ 2776 tcp->tcp_ip_src_v6 = src_addr; 2777 ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src, 2778 &connp->conn_srcv6)); 2779 } 2780 tcp->tcp_localnet = 2781 IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6); 2782 } 2783 2784 /* 2785 * This allows applications to fail quickly when connections are made 2786 * to dead hosts. Hosts can be labeled dead by adding a reject route 2787 * with both the RTF_REJECT and RTF_PRIVATE flags set. 2788 */ 2789 if ((ire->ire_flags & RTF_REJECT) && 2790 (ire->ire_flags & RTF_PRIVATE)) 2791 goto error; 2792 2793 /* 2794 * Make use of the cached rtt and rtt_sd values to calculate the 2795 * initial RTO. Note that they are already initialized in 2796 * tcp_init_values(). 2797 * If ire_uinfo is NULL, i.e., we do not have a cache ire for 2798 * IP_NEXTHOP, but instead are using the interface ire for the 2799 * nexthop, then we do not use the ire_uinfo from that ire to 2800 * do any initializations. 2801 */ 2802 if (ire_uinfo != NULL) { 2803 if (ire_uinfo->iulp_rtt != 0) { 2804 clock_t rto; 2805 2806 tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt; 2807 tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd; 2808 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 2809 tcps->tcps_rexmit_interval_extra + 2810 (tcp->tcp_rtt_sa >> 5); 2811 2812 if (rto > tcps->tcps_rexmit_interval_max) { 2813 tcp->tcp_rto = tcps->tcps_rexmit_interval_max; 2814 } else if (rto < tcps->tcps_rexmit_interval_min) { 2815 tcp->tcp_rto = tcps->tcps_rexmit_interval_min; 2816 } else { 2817 tcp->tcp_rto = rto; 2818 } 2819 } 2820 if (ire_uinfo->iulp_ssthresh != 0) 2821 tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh; 2822 else 2823 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 2824 if (ire_uinfo->iulp_spipe > 0) { 2825 tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe, 2826 tcps->tcps_max_buf); 2827 if (tcps->tcps_snd_lowat_fraction != 0) 2828 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / 2829 tcps->tcps_snd_lowat_fraction; 2830 (void) tcp_maxpsz_set(tcp, B_TRUE); 2831 } 2832 /* 2833 * Note that up till now, acceptor always inherits receive 2834 * window from the listener. But if there is a metrics 2835 * associated with a host, we should use that instead of 2836 * inheriting it from listener. Thus we need to pass this 2837 * info back to the caller. 2838 */ 2839 if (ire_uinfo->iulp_rpipe > 0) { 2840 tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, 2841 tcps->tcps_max_buf); 2842 } 2843 2844 if (ire_uinfo->iulp_rtomax > 0) { 2845 tcp->tcp_second_timer_threshold = 2846 ire_uinfo->iulp_rtomax; 2847 } 2848 2849 /* 2850 * Use the metric option settings, iulp_tstamp_ok and 2851 * iulp_wscale_ok, only for active open. What this means 2852 * is that if the other side uses timestamp or window 2853 * scale option, TCP will also use those options. That 2854 * is for passive open. If the application sets a 2855 * large window, window scale is enabled regardless of 2856 * the value in iulp_wscale_ok. This is the behavior 2857 * since 2.6. So we keep it. 2858 * The only case left in passive open processing is the 2859 * check for SACK. 2860 * For ECN, it should probably be like SACK. But the 2861 * current value is binary, so we treat it like the other 2862 * cases. The metric only controls active open.For passive 2863 * open, the ndd param, tcp_ecn_permitted, controls the 2864 * behavior. 
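 *
 * Summarising: on an active open the route metrics may turn
 * timestamps, window scaling, SACK and ECN on (never off); on a
 * passive open only iulp_sack is consulted here, and ECN stays under
 * the control of tcp_ecn_permitted.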
2865 */ 2866 if (!tcp_detached) { 2867 /* 2868 * The if check means that the following can only 2869 * be turned on by the metrics only IRE, but not off. 2870 */ 2871 if (ire_uinfo->iulp_tstamp_ok) 2872 tcp->tcp_snd_ts_ok = B_TRUE; 2873 if (ire_uinfo->iulp_wscale_ok) 2874 tcp->tcp_snd_ws_ok = B_TRUE; 2875 if (ire_uinfo->iulp_sack == 2) 2876 tcp->tcp_snd_sack_ok = B_TRUE; 2877 if (ire_uinfo->iulp_ecn_ok) 2878 tcp->tcp_ecn_ok = B_TRUE; 2879 } else { 2880 /* 2881 * Passive open. 2882 * 2883 * As above, the if check means that SACK can only be 2884 * turned on by the metric only IRE. 2885 */ 2886 if (ire_uinfo->iulp_sack > 0) { 2887 tcp->tcp_snd_sack_ok = B_TRUE; 2888 } 2889 } 2890 } 2891 2892 2893 /* 2894 * XXX: Note that currently, ire_max_frag can be as small as 68 2895 * because of PMTUd. So tcp_mss may go to negative if combined 2896 * length of all those options exceeds 28 bytes. But because 2897 * of the tcp_mss_min check below, we may not have a problem if 2898 * tcp_mss_min is of a reasonable value. The default is 1 so 2899 * the negative problem still exists. And the check defeats PMTUd. 2900 * In fact, if PMTUd finds that the MSS should be smaller than 2901 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min 2902 * value. 2903 * 2904 * We do not deal with that now. All those problems related to 2905 * PMTUd will be fixed later. 2906 */ 2907 ASSERT(ire->ire_max_frag != 0); 2908 mss = tcp->tcp_if_mtu = ire->ire_max_frag; 2909 if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) { 2910 if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) { 2911 mss = MIN(mss, IPV6_MIN_MTU); 2912 } 2913 } 2914 2915 /* Sanity check for MSS value. */ 2916 if (tcp->tcp_ipversion == IPV4_VERSION) 2917 mss_max = tcps->tcps_mss_max_ipv4; 2918 else 2919 mss_max = tcps->tcps_mss_max_ipv6; 2920 2921 if (tcp->tcp_ipversion == IPV6_VERSION && 2922 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 2923 /* 2924 * After receiving an ICMPv6 "packet too big" message with a 2925 * MTU < 1280, and for multirouted IPv6 packets, the IP layer 2926 * will insert a 8-byte fragment header in every packet; we 2927 * reduce the MSS by that amount here. 2928 */ 2929 mss -= sizeof (ip6_frag_t); 2930 } 2931 2932 if (tcp->tcp_ipsec_overhead == 0) 2933 tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); 2934 2935 mss -= tcp->tcp_ipsec_overhead; 2936 2937 if (mss < tcps->tcps_mss_min) 2938 mss = tcps->tcps_mss_min; 2939 if (mss > mss_max) 2940 mss = mss_max; 2941 2942 /* Note that this is the maximum MSS, excluding all options. */ 2943 tcp->tcp_mss = mss; 2944 2945 /* 2946 * Initialize the ISS here now that we have the full connection ID. 2947 * The RFC 1948 method of initial sequence number generation requires 2948 * knowledge of the full connection ID before setting the ISS. 
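 *
 * Roughly, RFC 1948 computes ISS = M + F(laddr, lport, faddr, fport,
 * secret), where M is a clock-driven component and F is a one-way
 * hash (the RFC suggests MD5); that is why all four elements of the
 * connection ID must be known before tcp_iss_init() runs.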
2949 */ 2950 2951 tcp_iss_init(tcp); 2952 2953 if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL)) 2954 tcp->tcp_loopback = B_TRUE; 2955 2956 if (tcp->tcp_ipversion == IPV4_VERSION) { 2957 hsp = tcp_hsp_lookup(tcp->tcp_remote, tcps); 2958 } else { 2959 hsp = tcp_hsp_lookup_ipv6(&tcp->tcp_remote_v6, tcps); 2960 } 2961 2962 if (hsp != NULL) { 2963 /* Only modify if we're going to make them bigger */ 2964 if (hsp->tcp_hsp_sendspace > tcp->tcp_xmit_hiwater) { 2965 tcp->tcp_xmit_hiwater = hsp->tcp_hsp_sendspace; 2966 if (tcps->tcps_snd_lowat_fraction != 0) 2967 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / 2968 tcps->tcps_snd_lowat_fraction; 2969 } 2970 2971 if (hsp->tcp_hsp_recvspace > tcp->tcp_rwnd) { 2972 tcp->tcp_rwnd = hsp->tcp_hsp_recvspace; 2973 } 2974 2975 /* Copy timestamp flag only for active open */ 2976 if (!tcp_detached) 2977 tcp->tcp_snd_ts_ok = hsp->tcp_hsp_tstamp; 2978 } 2979 2980 if (sire != NULL) 2981 IRE_REFRELE(sire); 2982 2983 /* 2984 * If we got an IRE_CACHE and an ILL, go through their properties; 2985 * otherwise, this is deferred until later when we have an IRE_CACHE. 2986 */ 2987 if (tcp->tcp_loopback || 2988 (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) { 2989 /* 2990 * For incoming, see if this tcp may be MDT-capable. For 2991 * outgoing, this process has been taken care of through 2992 * tcp_rput_other. 2993 */ 2994 tcp_ire_ill_check(tcp, ire, ill, incoming); 2995 tcp->tcp_ire_ill_check_done = B_TRUE; 2996 } 2997 2998 mutex_enter(&connp->conn_lock); 2999 /* 3000 * Make sure that conn is not marked incipient 3001 * for incoming connections. A blind 3002 * removal of incipient flag is cheaper than 3003 * check and removal. 3004 */ 3005 connp->conn_state_flags &= ~CONN_INCIPIENT; 3006 3007 /* 3008 * Must not cache forwarding table routes 3009 * or recache an IRE after the conn_t has 3010 * had conn_ire_cache cleared and is flagged 3011 * unusable, (see the CONN_CACHE_IRE() macro). 3012 */ 3013 if (ire_cacheable && CONN_CACHE_IRE(connp)) { 3014 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 3015 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 3016 connp->conn_ire_cache = ire; 3017 IRE_UNTRACE_REF(ire); 3018 rw_exit(&ire->ire_bucket->irb_lock); 3019 mutex_exit(&connp->conn_lock); 3020 return (1); 3021 } 3022 rw_exit(&ire->ire_bucket->irb_lock); 3023 } 3024 mutex_exit(&connp->conn_lock); 3025 3026 if (ire->ire_mp == NULL) 3027 ire_refrele(ire); 3028 return (1); 3029 3030 error: 3031 if (ire->ire_mp == NULL) 3032 ire_refrele(ire); 3033 if (sire != NULL) 3034 ire_refrele(sire); 3035 return (0); 3036 } 3037 3038 /* 3039 * tcp_bind is called (holding the writer lock) by tcp_wput_proto to process a 3040 * O_T_BIND_REQ/T_BIND_REQ message. 
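 *
 * The request mblk carries a struct T_bind_req followed by the local
 * address: no address (ADDR_length of 0) to request any port, a sin_t
 * for a complete IPv4 address, or a sin6_t for an IPv6 (possibly
 * V4-mapped) address.  A non-zero CONIND_number is the listen backlog.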
3041 */ 3042 static void 3043 tcp_bind(tcp_t *tcp, mblk_t *mp) 3044 { 3045 sin_t *sin; 3046 sin6_t *sin6; 3047 mblk_t *mp1; 3048 in_port_t requested_port; 3049 in_port_t allocated_port; 3050 struct T_bind_req *tbr; 3051 boolean_t bind_to_req_port_only; 3052 boolean_t backlog_update = B_FALSE; 3053 boolean_t user_specified; 3054 in6_addr_t v6addr; 3055 ipaddr_t v4addr; 3056 uint_t origipversion; 3057 int err; 3058 queue_t *q = tcp->tcp_wq; 3059 conn_t *connp = tcp->tcp_connp; 3060 mlp_type_t addrtype, mlptype; 3061 zone_t *zone; 3062 cred_t *cr; 3063 in_port_t mlp_port; 3064 tcp_stack_t *tcps = tcp->tcp_tcps; 3065 3066 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 3067 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 3068 if (tcp->tcp_debug) { 3069 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3070 "tcp_bind: bad req, len %u", 3071 (uint_t)(mp->b_wptr - mp->b_rptr)); 3072 } 3073 tcp_err_ack(tcp, mp, TPROTO, 0); 3074 return; 3075 } 3076 /* Make sure the largest address fits */ 3077 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); 3078 if (mp1 == NULL) { 3079 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3080 return; 3081 } 3082 mp = mp1; 3083 tbr = (struct T_bind_req *)mp->b_rptr; 3084 if (tcp->tcp_state >= TCPS_BOUND) { 3085 if ((tcp->tcp_state == TCPS_BOUND || 3086 tcp->tcp_state == TCPS_LISTEN) && 3087 tcp->tcp_conn_req_max != tbr->CONIND_number && 3088 tbr->CONIND_number > 0) { 3089 /* 3090 * Handle listen() increasing CONIND_number. 3091 * This is more "liberal" then what the TPI spec 3092 * requires but is needed to avoid a t_unbind 3093 * when handling listen() since the port number 3094 * might be "stolen" between the unbind and bind. 3095 */ 3096 backlog_update = B_TRUE; 3097 goto do_bind; 3098 } 3099 if (tcp->tcp_debug) { 3100 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3101 "tcp_bind: bad state, %d", tcp->tcp_state); 3102 } 3103 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 3104 return; 3105 } 3106 origipversion = tcp->tcp_ipversion; 3107 3108 switch (tbr->ADDR_length) { 3109 case 0: /* request for a generic port */ 3110 tbr->ADDR_offset = sizeof (struct T_bind_req); 3111 if (tcp->tcp_family == AF_INET) { 3112 tbr->ADDR_length = sizeof (sin_t); 3113 sin = (sin_t *)&tbr[1]; 3114 *sin = sin_null; 3115 sin->sin_family = AF_INET; 3116 mp->b_wptr = (uchar_t *)&sin[1]; 3117 tcp->tcp_ipversion = IPV4_VERSION; 3118 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr); 3119 } else { 3120 ASSERT(tcp->tcp_family == AF_INET6); 3121 tbr->ADDR_length = sizeof (sin6_t); 3122 sin6 = (sin6_t *)&tbr[1]; 3123 *sin6 = sin6_null; 3124 sin6->sin6_family = AF_INET6; 3125 mp->b_wptr = (uchar_t *)&sin6[1]; 3126 tcp->tcp_ipversion = IPV6_VERSION; 3127 V6_SET_ZERO(v6addr); 3128 } 3129 requested_port = 0; 3130 break; 3131 3132 case sizeof (sin_t): /* Complete IPv4 address */ 3133 sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset, 3134 sizeof (sin_t)); 3135 if (sin == NULL || !OK_32PTR((char *)sin)) { 3136 if (tcp->tcp_debug) { 3137 (void) strlog(TCP_MOD_ID, 0, 1, 3138 SL_ERROR|SL_TRACE, 3139 "tcp_bind: bad address parameter, " 3140 "offset %d, len %d", 3141 tbr->ADDR_offset, tbr->ADDR_length); 3142 } 3143 tcp_err_ack(tcp, mp, TPROTO, 0); 3144 return; 3145 } 3146 /* 3147 * With sockets sockfs will accept bogus sin_family in 3148 * bind() and replace it with the family used in the socket 3149 * call. 
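 * So for sockets the family has already been sanitised by the time the
 * request reaches TCP; a mismatch seen here will typically come from a
 * raw TPI user and is rejected with EAFNOSUPPORT.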
3150 */ 3151 if (sin->sin_family != AF_INET || 3152 tcp->tcp_family != AF_INET) { 3153 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 3154 return; 3155 } 3156 requested_port = ntohs(sin->sin_port); 3157 tcp->tcp_ipversion = IPV4_VERSION; 3158 v4addr = sin->sin_addr.s_addr; 3159 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); 3160 break; 3161 3162 case sizeof (sin6_t): /* Complete IPv6 address */ 3163 sin6 = (sin6_t *)mi_offset_param(mp, 3164 tbr->ADDR_offset, sizeof (sin6_t)); 3165 if (sin6 == NULL || !OK_32PTR((char *)sin6)) { 3166 if (tcp->tcp_debug) { 3167 (void) strlog(TCP_MOD_ID, 0, 1, 3168 SL_ERROR|SL_TRACE, 3169 "tcp_bind: bad IPv6 address parameter, " 3170 "offset %d, len %d", tbr->ADDR_offset, 3171 tbr->ADDR_length); 3172 } 3173 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 3174 return; 3175 } 3176 if (sin6->sin6_family != AF_INET6 || 3177 tcp->tcp_family != AF_INET6) { 3178 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 3179 return; 3180 } 3181 requested_port = ntohs(sin6->sin6_port); 3182 tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 3183 IPV4_VERSION : IPV6_VERSION; 3184 v6addr = sin6->sin6_addr; 3185 break; 3186 3187 default: 3188 if (tcp->tcp_debug) { 3189 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3190 "tcp_bind: bad address length, %d", 3191 tbr->ADDR_length); 3192 } 3193 tcp_err_ack(tcp, mp, TBADADDR, 0); 3194 return; 3195 } 3196 tcp->tcp_bound_source_v6 = v6addr; 3197 3198 /* Check for change in ipversion */ 3199 if (origipversion != tcp->tcp_ipversion) { 3200 ASSERT(tcp->tcp_family == AF_INET6); 3201 err = tcp->tcp_ipversion == IPV6_VERSION ? 3202 tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp); 3203 if (err) { 3204 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3205 return; 3206 } 3207 } 3208 3209 /* 3210 * Initialize family specific fields. Copy of the src addr. 3211 * in tcp_t is needed for the lookup funcs. 3212 */ 3213 if (tcp->tcp_ipversion == IPV6_VERSION) { 3214 tcp->tcp_ip6h->ip6_src = v6addr; 3215 } else { 3216 IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src); 3217 } 3218 tcp->tcp_ip_src_v6 = v6addr; 3219 3220 /* 3221 * For O_T_BIND_REQ: 3222 * Verify that the target port/addr is available, or choose 3223 * another. 3224 * For T_BIND_REQ: 3225 * Verify that the target port/addr is available or fail. 3226 * In both cases when it succeeds the tcp is inserted in the 3227 * bind hash table. This ensures that the operation is atomic 3228 * under the lock on the hash bucket. 3229 */ 3230 bind_to_req_port_only = requested_port != 0 && 3231 tbr->PRIM_type != O_T_BIND_REQ; 3232 /* 3233 * Get a valid port (within the anonymous range and should not 3234 * be a privileged one) to use if the user has not given a port. 3235 * If multiple threads are here, they may all start with 3236 * with the same initial port. But, it should be fine as long as 3237 * tcp_bindi will ensure that no two threads will be assigned 3238 * the same port. 3239 * 3240 * NOTE: XXX If a privileged process asks for an anonymous port, we 3241 * still check for ports only in the range > tcp_smallest_non_priv_port, 3242 * unless TCP_ANONPRIVBIND option is set. 3243 */ 3244 mlptype = mlptSingle; 3245 mlp_port = requested_port; 3246 if (requested_port == 0) { 3247 requested_port = tcp->tcp_anon_priv_bind ? 
3248 tcp_get_next_priv_port(tcp) : 3249 tcp_update_next_port(tcps->tcps_next_port_to_try, 3250 tcp, B_TRUE); 3251 if (requested_port == 0) { 3252 tcp_err_ack(tcp, mp, TNOADDR, 0); 3253 return; 3254 } 3255 user_specified = B_FALSE; 3256 3257 /* 3258 * If the user went through one of the RPC interfaces to create 3259 * this socket and RPC is MLP in this zone, then give him an 3260 * anonymous MLP. 3261 */ 3262 cr = DB_CREDDEF(mp, tcp->tcp_cred); 3263 if (connp->conn_anon_mlp && is_system_labeled()) { 3264 zone = crgetzone(cr); 3265 addrtype = tsol_mlp_addr_type(zone->zone_id, 3266 IPV6_VERSION, &v6addr, 3267 tcps->tcps_netstack->netstack_ip); 3268 if (addrtype == mlptSingle) { 3269 tcp_err_ack(tcp, mp, TNOADDR, 0); 3270 return; 3271 } 3272 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 3273 PMAPPORT, addrtype); 3274 mlp_port = PMAPPORT; 3275 } 3276 } else { 3277 int i; 3278 boolean_t priv = B_FALSE; 3279 3280 /* 3281 * If the requested_port is in the well-known privileged range, 3282 * verify that the stream was opened by a privileged user. 3283 * Note: No locks are held when inspecting tcp_g_*epriv_ports 3284 * but instead the code relies on: 3285 * - the fact that the address of the array and its size never 3286 * changes 3287 * - the atomic assignment of the elements of the array 3288 */ 3289 cr = DB_CREDDEF(mp, tcp->tcp_cred); 3290 if (requested_port < tcps->tcps_smallest_nonpriv_port) { 3291 priv = B_TRUE; 3292 } else { 3293 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 3294 if (requested_port == 3295 tcps->tcps_g_epriv_ports[i]) { 3296 priv = B_TRUE; 3297 break; 3298 } 3299 } 3300 } 3301 if (priv) { 3302 if (secpolicy_net_privaddr(cr, requested_port, 3303 IPPROTO_TCP) != 0) { 3304 if (tcp->tcp_debug) { 3305 (void) strlog(TCP_MOD_ID, 0, 1, 3306 SL_ERROR|SL_TRACE, 3307 "tcp_bind: no priv for port %d", 3308 requested_port); 3309 } 3310 tcp_err_ack(tcp, mp, TACCES, 0); 3311 return; 3312 } 3313 } 3314 user_specified = B_TRUE; 3315 3316 if (is_system_labeled()) { 3317 zone = crgetzone(cr); 3318 addrtype = tsol_mlp_addr_type(zone->zone_id, 3319 IPV6_VERSION, &v6addr, 3320 tcps->tcps_netstack->netstack_ip); 3321 if (addrtype == mlptSingle) { 3322 tcp_err_ack(tcp, mp, TNOADDR, 0); 3323 return; 3324 } 3325 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 3326 requested_port, addrtype); 3327 } 3328 } 3329 3330 if (mlptype != mlptSingle) { 3331 if (secpolicy_net_bindmlp(cr) != 0) { 3332 if (tcp->tcp_debug) { 3333 (void) strlog(TCP_MOD_ID, 0, 1, 3334 SL_ERROR|SL_TRACE, 3335 "tcp_bind: no priv for multilevel port %d", 3336 requested_port); 3337 } 3338 tcp_err_ack(tcp, mp, TACCES, 0); 3339 return; 3340 } 3341 3342 /* 3343 * If we're specifically binding a shared IP address and the 3344 * port is MLP on shared addresses, then check to see if this 3345 * zone actually owns the MLP. Reject if not. 3346 */ 3347 if (mlptype == mlptShared && addrtype == mlptShared) { 3348 /* 3349 * No need to handle exclusive-stack zones since 3350 * ALL_ZONES only applies to the shared stack. 
3351 */ 3352 zoneid_t mlpzone; 3353 3354 mlpzone = tsol_mlp_findzone(IPPROTO_TCP, 3355 htons(mlp_port)); 3356 if (connp->conn_zoneid != mlpzone) { 3357 if (tcp->tcp_debug) { 3358 (void) strlog(TCP_MOD_ID, 0, 1, 3359 SL_ERROR|SL_TRACE, 3360 "tcp_bind: attempt to bind port " 3361 "%d on shared addr in zone %d " 3362 "(should be %d)", 3363 mlp_port, connp->conn_zoneid, 3364 mlpzone); 3365 } 3366 tcp_err_ack(tcp, mp, TACCES, 0); 3367 return; 3368 } 3369 } 3370 3371 if (!user_specified) { 3372 err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3373 requested_port, B_TRUE); 3374 if (err != 0) { 3375 if (tcp->tcp_debug) { 3376 (void) strlog(TCP_MOD_ID, 0, 1, 3377 SL_ERROR|SL_TRACE, 3378 "tcp_bind: cannot establish anon " 3379 "MLP for port %d", 3380 requested_port); 3381 } 3382 tcp_err_ack(tcp, mp, TSYSERR, err); 3383 return; 3384 } 3385 connp->conn_anon_port = B_TRUE; 3386 } 3387 connp->conn_mlp_type = mlptype; 3388 } 3389 3390 allocated_port = tcp_bindi(tcp, requested_port, &v6addr, 3391 tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified); 3392 3393 if (allocated_port == 0) { 3394 connp->conn_mlp_type = mlptSingle; 3395 if (connp->conn_anon_port) { 3396 connp->conn_anon_port = B_FALSE; 3397 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3398 requested_port, B_FALSE); 3399 } 3400 if (bind_to_req_port_only) { 3401 if (tcp->tcp_debug) { 3402 (void) strlog(TCP_MOD_ID, 0, 1, 3403 SL_ERROR|SL_TRACE, 3404 "tcp_bind: requested addr busy"); 3405 } 3406 tcp_err_ack(tcp, mp, TADDRBUSY, 0); 3407 } else { 3408 /* If we are out of ports, fail the bind. */ 3409 if (tcp->tcp_debug) { 3410 (void) strlog(TCP_MOD_ID, 0, 1, 3411 SL_ERROR|SL_TRACE, 3412 "tcp_bind: out of ports?"); 3413 } 3414 tcp_err_ack(tcp, mp, TNOADDR, 0); 3415 } 3416 return; 3417 } 3418 ASSERT(tcp->tcp_state == TCPS_BOUND); 3419 do_bind: 3420 if (!backlog_update) { 3421 if (tcp->tcp_family == AF_INET) 3422 sin->sin_port = htons(allocated_port); 3423 else 3424 sin6->sin6_port = htons(allocated_port); 3425 } 3426 if (tcp->tcp_family == AF_INET) { 3427 if (tbr->CONIND_number != 0) { 3428 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3429 sizeof (sin_t)); 3430 } else { 3431 /* Just verify the local IP address */ 3432 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, IP_ADDR_LEN); 3433 } 3434 } else { 3435 if (tbr->CONIND_number != 0) { 3436 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3437 sizeof (sin6_t)); 3438 } else { 3439 /* Just verify the local IP address */ 3440 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3441 IPV6_ADDR_LEN); 3442 } 3443 } 3444 if (mp1 == NULL) { 3445 if (connp->conn_anon_port) { 3446 connp->conn_anon_port = B_FALSE; 3447 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3448 requested_port, B_FALSE); 3449 } 3450 connp->conn_mlp_type = mlptSingle; 3451 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3452 return; 3453 } 3454 3455 tbr->PRIM_type = T_BIND_ACK; 3456 mp->b_datap->db_type = M_PCPROTO; 3457 3458 /* Chain in the reply mp for tcp_rput() */ 3459 mp1->b_cont = mp; 3460 mp = mp1; 3461 3462 tcp->tcp_conn_req_max = tbr->CONIND_number; 3463 if (tcp->tcp_conn_req_max) { 3464 if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min) 3465 tcp->tcp_conn_req_max = tcps->tcps_conn_req_min; 3466 if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q) 3467 tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q; 3468 /* 3469 * If this is a listener, do not reset the eager list 3470 * and other stuffs. Note that we don't check if the 3471 * existing eager list meets the new tcp_conn_req_max 3472 * requirement. 
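 * (Illustrative note, an assumption about the userland path rather
 * than original text: this is typically reached when an application
 * that is already listening asks for a larger backlog, e.g.
 *
 *	(void) listen(fd, 5);
 *	(void) listen(fd, 50);
 *
 * in which case only tcp_conn_req_max is updated above, clamped to the
 * tcps_conn_req_min .. tcps_conn_req_max_q range, while the existing
 * eager chain and listener state are left untouched.)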
3473 */ 3474 if (tcp->tcp_state != TCPS_LISTEN) { 3475 tcp->tcp_state = TCPS_LISTEN; 3476 /* Initialize the chain. Don't need the eager_lock */ 3477 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 3478 tcp->tcp_eager_next_drop_q0 = tcp; 3479 tcp->tcp_eager_prev_drop_q0 = tcp; 3480 tcp->tcp_second_ctimer_threshold = 3481 tcps->tcps_ip_abort_linterval; 3482 } 3483 } 3484 3485 /* 3486 * We can call ip_bind directly which returns a T_BIND_ACK mp. The 3487 * processing continues in tcp_rput_other(). 3488 * 3489 * We need to make sure that the conn_recv is set to a non-null 3490 * value before we insert the conn into the classifier table. 3491 * This is to avoid a race with an incoming packet which does an 3492 * ipcl_classify(). 3493 */ 3494 connp->conn_recv = tcp_conn_request; 3495 if (tcp->tcp_family == AF_INET6) { 3496 ASSERT(tcp->tcp_connp->conn_af_isv6); 3497 mp = ip_bind_v6(q, mp, tcp->tcp_connp, &tcp->tcp_sticky_ipp); 3498 } else { 3499 ASSERT(!tcp->tcp_connp->conn_af_isv6); 3500 mp = ip_bind_v4(q, mp, tcp->tcp_connp); 3501 } 3502 /* 3503 * If the bind cannot complete immediately 3504 * IP will arrange to call tcp_rput_other 3505 * when the bind completes. 3506 */ 3507 if (mp != NULL) { 3508 tcp_rput_other(tcp, mp); 3509 } else { 3510 /* 3511 * Bind will be resumed later. Need to ensure 3512 * that conn doesn't disappear when that happens. 3513 * This will be decremented in ip_resume_tcp_bind(). 3514 */ 3515 CONN_INC_REF(tcp->tcp_connp); 3516 } 3517 } 3518 3519 3520 /* 3521 * If the "bind_to_req_port_only" parameter is set, if the requested port 3522 * number is available, return it, If not return 0 3523 * 3524 * If "bind_to_req_port_only" parameter is not set and 3525 * If the requested port number is available, return it. If not, return 3526 * the first anonymous port we happen across. If no anonymous ports are 3527 * available, return 0. addr is the requested local address, if any. 3528 * 3529 * In either case, when succeeding update the tcp_t to record the port number 3530 * and insert it in the bind hash table. 3531 * 3532 * Note that TCP over IPv4 and IPv6 sockets can use the same port number 3533 * without setting SO_REUSEADDR. This is needed so that they 3534 * can be viewed as two independent transport protocols. 3535 */ 3536 static in_port_t 3537 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 3538 int reuseaddr, boolean_t quick_connect, 3539 boolean_t bind_to_req_port_only, boolean_t user_specified) 3540 { 3541 /* number of times we have run around the loop */ 3542 int count = 0; 3543 /* maximum number of times to run around the loop */ 3544 int loopmax; 3545 conn_t *connp = tcp->tcp_connp; 3546 zoneid_t zoneid = connp->conn_zoneid; 3547 tcp_stack_t *tcps = tcp->tcp_tcps; 3548 3549 /* 3550 * Lookup for free addresses is done in a loop and "loopmax" 3551 * influences how long we spin in the loop 3552 */ 3553 if (bind_to_req_port_only) { 3554 /* 3555 * If the requested port is busy, don't bother to look 3556 * for a new one. Setting loop maximum count to 1 has 3557 * that effect. 3558 */ 3559 loopmax = 1; 3560 } else { 3561 /* 3562 * If the requested port is busy, look for a free one 3563 * in the anonymous port range. 3564 * Set loopmax appropriately so that one does not look 3565 * forever in the case all of the anonymous ports are in use. 
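 * (Worked example, using assumed Solaris defaults that are not stated
 * here: with tcps_smallest_anon_port = 32768 and
 * tcps_largest_anon_port = 65535 the non-privileged case below gives
 * loopmax = 65535 - 32768 + 1 = 32768 probes, while with
 * tcps_min_anonpriv_port = 512 the privileged case gives
 * loopmax = IPPORT_RESERVED - 512 = 1024 - 512 = 512.)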
3566 */ 3567 if (tcp->tcp_anon_priv_bind) { 3568 /* 3569 * loopmax = 3570 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 3571 */ 3572 loopmax = IPPORT_RESERVED - 3573 tcps->tcps_min_anonpriv_port; 3574 } else { 3575 loopmax = (tcps->tcps_largest_anon_port - 3576 tcps->tcps_smallest_anon_port + 1); 3577 } 3578 } 3579 do { 3580 uint16_t lport; 3581 tf_t *tbf; 3582 tcp_t *ltcp; 3583 conn_t *lconnp; 3584 3585 lport = htons(port); 3586 3587 /* 3588 * Ensure that the tcp_t is not currently in the bind hash. 3589 * Hold the lock on the hash bucket to ensure that 3590 * the duplicate check plus the insertion is an atomic 3591 * operation. 3592 * 3593 * This function does an inline lookup on the bind hash list 3594 * Make sure that we access only members of tcp_t 3595 * and that we don't look at tcp_tcp, since we are not 3596 * doing a CONN_INC_REF. 3597 */ 3598 tcp_bind_hash_remove(tcp); 3599 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; 3600 mutex_enter(&tbf->tf_lock); 3601 for (ltcp = tbf->tf_tcp; ltcp != NULL; 3602 ltcp = ltcp->tcp_bind_hash) { 3603 boolean_t not_socket; 3604 boolean_t exclbind; 3605 3606 if (lport != ltcp->tcp_lport) 3607 continue; 3608 3609 lconnp = ltcp->tcp_connp; 3610 3611 /* 3612 * On a labeled system, we must treat bindings to ports 3613 * on shared IP addresses by sockets with MAC exemption 3614 * privilege as being in all zones, as there's 3615 * otherwise no way to identify the right receiver. 3616 */ 3617 if (!(IPCL_ZONE_MATCH(ltcp->tcp_connp, zoneid) || 3618 IPCL_ZONE_MATCH(connp, 3619 ltcp->tcp_connp->conn_zoneid)) && 3620 !lconnp->conn_mac_exempt && 3621 !connp->conn_mac_exempt) 3622 continue; 3623 3624 /* 3625 * If TCP_EXCLBIND is set for either the bound or 3626 * binding endpoint, the semantics of bind 3627 * is changed according to the following. 3628 * 3629 * spec = specified address (v4 or v6) 3630 * unspec = unspecified address (v4 or v6) 3631 * A = specified addresses are different for endpoints 3632 * 3633 * bound bind to allowed 3634 * ------------------------------------- 3635 * unspec unspec no 3636 * unspec spec no 3637 * spec unspec no 3638 * spec spec yes if A 3639 * 3640 * For labeled systems, SO_MAC_EXEMPT behaves the same 3641 * as TCP_EXCLBIND, except that zoneid is ignored. 3642 * 3643 * Note: 3644 * 3645 * 1. Because of TLI semantics, an endpoint can go 3646 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or 3647 * TCPS_BOUND, depending on whether it is originally 3648 * a listener or not. That is why we need to check 3649 * for states greater than or equal to TCPS_BOUND 3650 * here. 3651 * 3652 * 2. Ideally, we should only check for state equals 3653 * to TCPS_LISTEN. And the following check should be 3654 * added. 3655 * 3656 * if (ltcp->tcp_state == TCPS_LISTEN || 3657 * !reuseaddr || !ltcp->tcp_reuseaddr) { 3658 * ... 3659 * } 3660 * 3661 * The semantics will be changed to this. If the 3662 * endpoint on the list is in state not equal to 3663 * TCPS_LISTEN and both endpoints have SO_REUSEADDR 3664 * set, let the bind succeed. 3665 * 3666 * Because of (1), we cannot do that for TLI 3667 * endpoints. But we can do that for socket endpoints. 3668 * If in future, we can change this going back 3669 * semantics, we can use the above check for TLI also. 
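 * (Illustrative walk through the table above, added for clarity: with
 * exclusive binding in effect on either endpoint, a bind to
 * <10.0.0.1, port 5000> while <INADDR_ANY, port 5000> is already bound
 * is refused, since one side is unspecified; only the
 * "spec spec yes if A" row, e.g. 10.0.0.1:5000 alongside
 * 10.0.0.2:5000, lets two endpoints share the port.)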
3670 */ 3671 not_socket = !(TCP_IS_SOCKET(ltcp) && 3672 TCP_IS_SOCKET(tcp)); 3673 exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind; 3674 3675 if (lconnp->conn_mac_exempt || connp->conn_mac_exempt || 3676 (exclbind && (not_socket || 3677 ltcp->tcp_state <= TCPS_ESTABLISHED))) { 3678 if (V6_OR_V4_INADDR_ANY( 3679 ltcp->tcp_bound_source_v6) || 3680 V6_OR_V4_INADDR_ANY(*laddr) || 3681 IN6_ARE_ADDR_EQUAL(laddr, 3682 &ltcp->tcp_bound_source_v6)) { 3683 break; 3684 } 3685 continue; 3686 } 3687 3688 /* 3689 * Check ipversion to allow IPv4 and IPv6 sockets to 3690 * have disjoint port number spaces, if *_EXCLBIND 3691 * is not set and only if the application binds to a 3692 * specific port. We use the same autoassigned port 3693 * number space for IPv4 and IPv6 sockets. 3694 */ 3695 if (tcp->tcp_ipversion != ltcp->tcp_ipversion && 3696 bind_to_req_port_only) 3697 continue; 3698 3699 /* 3700 * Ideally, we should make sure that the source 3701 * address, remote address, and remote port in the 3702 * four tuple for this tcp-connection are unique. 3703 * However, trying to find out the local source 3704 * address would require too much code duplication 3705 * with IP, since IP needs to have that code 3706 * to support userland TCP implementations. 3707 */ 3708 if (quick_connect && 3709 (ltcp->tcp_state > TCPS_LISTEN) && 3710 ((tcp->tcp_fport != ltcp->tcp_fport) || 3711 !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, 3712 &ltcp->tcp_remote_v6))) 3713 continue; 3714 3715 if (!reuseaddr) { 3716 /* 3717 * No socket option SO_REUSEADDR. 3718 * If the existing port is bound to 3719 * a non-wildcard IP address 3720 * and the requesting stream is 3721 * bound to a different 3722 * (also non-wildcard) IP 3723 * address, keep 3724 * going. 3725 */ 3726 if (!V6_OR_V4_INADDR_ANY(*laddr) && 3727 !V6_OR_V4_INADDR_ANY( 3728 ltcp->tcp_bound_source_v6) && 3729 !IN6_ARE_ADDR_EQUAL(laddr, 3730 &ltcp->tcp_bound_source_v6)) 3731 continue; 3732 if (ltcp->tcp_state >= TCPS_BOUND) { 3733 /* 3734 * This port is being used and 3735 * its state is >= TCPS_BOUND, 3736 * so we can't bind to it. 3737 */ 3738 break; 3739 } 3740 } else { 3741 /* 3742 * socket option SO_REUSEADDR is set on the 3743 * binding tcp_t. 3744 * 3745 * If two streams are bound to the 3746 * same IP address or both addr 3747 * and bound source are wildcards 3748 * (INADDR_ANY), we want to stop 3749 * searching. 3750 * We have found a match of IP source 3751 * address and source port, which is 3752 * refused regardless of the 3753 * SO_REUSEADDR setting, so we break. 3754 */ 3755 if (IN6_ARE_ADDR_EQUAL(laddr, 3756 &ltcp->tcp_bound_source_v6) && 3757 (ltcp->tcp_state == TCPS_LISTEN || 3758 ltcp->tcp_state == TCPS_BOUND)) 3759 break; 3760 } 3761 } 3762 if (ltcp != NULL) { 3763 /* The port number is busy */ 3764 mutex_exit(&tbf->tf_lock); 3765 } else { 3766 /* 3767 * This port is ours. Insert in fanout and mark as 3768 * bound to prevent others from getting the port 3769 * number. 3770 */ 3771 tcp->tcp_state = TCPS_BOUND; 3772 tcp->tcp_lport = htons(port); 3773 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 3774 3775 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( 3776 tcp->tcp_lport)] == tbf); 3777 tcp_bind_hash_insert(tbf, tcp, 1); 3778 3779 mutex_exit(&tbf->tf_lock); 3780 3781 /* 3782 * We don't want tcp_next_port_to_try to "inherit" 3783 * a port number supplied by the user in a bind. 3784 */ 3785 if (user_specified) 3786 return (port); 3787 3788 /* 3789 * This is the only place where tcp_next_port_to_try 3790 * is updated.
After the update, it may or may not 3791 * be in the valid range. 3792 */ 3793 if (!tcp->tcp_anon_priv_bind) 3794 tcps->tcps_next_port_to_try = port + 1; 3795 return (port); 3796 } 3797 3798 if (tcp->tcp_anon_priv_bind) { 3799 port = tcp_get_next_priv_port(tcp); 3800 } else { 3801 if (count == 0 && user_specified) { 3802 /* 3803 * We may have to return an anonymous port. So 3804 * get one to start with. 3805 */ 3806 port = 3807 tcp_update_next_port( 3808 tcps->tcps_next_port_to_try, 3809 tcp, B_TRUE); 3810 user_specified = B_FALSE; 3811 } else { 3812 port = tcp_update_next_port(port + 1, tcp, 3813 B_FALSE); 3814 } 3815 } 3816 if (port == 0) 3817 break; 3818 3819 /* 3820 * Don't let this loop run forever in the case where 3821 * all of the anonymous ports are in use. 3822 */ 3823 } while (++count < loopmax); 3824 return (0); 3825 } 3826 3827 /* 3828 * tcp_clean_death / tcp_close_detached must not be called more than once 3829 * on a tcp. Thus every function that potentially calls tcp_clean_death 3830 * must check for the tcp state before calling tcp_clean_death. 3831 * Eg. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper, 3832 * tcp_timer_handler, all check for the tcp state. 3833 */ 3834 /* ARGSUSED */ 3835 void 3836 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2) 3837 { 3838 tcp_t *tcp = ((conn_t *)arg)->conn_tcp; 3839 3840 freemsg(mp); 3841 if (tcp->tcp_state > TCPS_BOUND) 3842 (void) tcp_clean_death(((conn_t *)arg)->conn_tcp, 3843 ETIMEDOUT, 5); 3844 } 3845 3846 /* 3847 * We are dying for some reason. Try to do it gracefully. (May be called 3848 * as writer.) 3849 * 3850 * Return -1 if the structure was not cleaned up (if the cleanup had to be 3851 * done by a service procedure). 3852 * TBD - Should the return value distinguish between the tcp_t being 3853 * freed and it being reinitialized? 3854 */ 3855 static int 3856 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) 3857 { 3858 mblk_t *mp; 3859 queue_t *q; 3860 tcp_stack_t *tcps = tcp->tcp_tcps; 3861 sodirect_t *sodp; 3862 3863 TCP_CLD_STAT(tag); 3864 3865 #if TCP_TAG_CLEAN_DEATH 3866 tcp->tcp_cleandeathtag = tag; 3867 #endif 3868 3869 if (tcp->tcp_fused) 3870 tcp_unfuse(tcp); 3871 3872 if (tcp->tcp_linger_tid != 0 && 3873 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 3874 tcp_stop_lingering(tcp); 3875 } 3876 3877 ASSERT(tcp != NULL); 3878 ASSERT((tcp->tcp_family == AF_INET && 3879 tcp->tcp_ipversion == IPV4_VERSION) || 3880 (tcp->tcp_family == AF_INET6 && 3881 (tcp->tcp_ipversion == IPV4_VERSION || 3882 tcp->tcp_ipversion == IPV6_VERSION))); 3883 3884 if (TCP_IS_DETACHED(tcp)) { 3885 if (tcp->tcp_hard_binding) { 3886 /* 3887 * Its an eager that we are dealing with. We close the 3888 * eager but in case a conn_ind has already gone to the 3889 * listener, let tcp_accept_finish() send a discon_ind 3890 * to the listener and drop the last reference. If the 3891 * listener doesn't even know about the eager i.e. the 3892 * conn_ind hasn't gone up, blow away the eager and drop 3893 * the last reference as well. If the conn_ind has gone 3894 * up, state should be BOUND. tcp_accept_finish 3895 * will figure out that the connection has received a 3896 * RST and will send a DISCON_IND to the application. 
3897 */ 3898 tcp_closei_local(tcp); 3899 if (!tcp->tcp_tconnind_started) { 3900 CONN_DEC_REF(tcp->tcp_connp); 3901 } else { 3902 tcp->tcp_state = TCPS_BOUND; 3903 } 3904 } else { 3905 tcp_close_detached(tcp); 3906 } 3907 return (0); 3908 } 3909 3910 TCP_STAT(tcps, tcp_clean_death_nondetached); 3911 3912 /* 3913 * If T_ORDREL_IND has not been sent yet (done when service routine 3914 * is run) postpone cleaning up the endpoint until service routine 3915 * has sent up the T_ORDREL_IND. Avoid clearing out an existing 3916 * client_errno since tcp_close uses the client_errno field. 3917 */ 3918 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 3919 if (err != 0) 3920 tcp->tcp_client_errno = err; 3921 3922 tcp->tcp_deferred_clean_death = B_TRUE; 3923 return (-1); 3924 } 3925 3926 /* If sodirect, not anymore */ 3927 SOD_PTR_ENTER(tcp, sodp); 3928 if (sodp != NULL) { 3929 tcp->tcp_sodirect = NULL; 3930 mutex_exit(sodp->sod_lock); 3931 } 3932 3933 q = tcp->tcp_rq; 3934 3935 /* Trash all inbound data */ 3936 flushq(q, FLUSHALL); 3937 3938 /* 3939 * If we are at least part way open and there is error 3940 * (err==0 implies no error) 3941 * notify our client by a T_DISCON_IND. 3942 */ 3943 if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) { 3944 if (tcp->tcp_state >= TCPS_ESTABLISHED && 3945 !TCP_IS_SOCKET(tcp)) { 3946 /* 3947 * Send M_FLUSH according to TPI. Because sockets will 3948 * (and must) ignore FLUSHR we do that only for TPI 3949 * endpoints and sockets in STREAMS mode. 3950 */ 3951 (void) putnextctl1(q, M_FLUSH, FLUSHR); 3952 } 3953 if (tcp->tcp_debug) { 3954 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 3955 "tcp_clean_death: discon err %d", err); 3956 } 3957 mp = mi_tpi_discon_ind(NULL, err, 0); 3958 if (mp != NULL) { 3959 putnext(q, mp); 3960 } else { 3961 if (tcp->tcp_debug) { 3962 (void) strlog(TCP_MOD_ID, 0, 1, 3963 SL_ERROR|SL_TRACE, 3964 "tcp_clean_death, sending M_ERROR"); 3965 } 3966 (void) putnextctl1(q, M_ERROR, EPROTO); 3967 } 3968 if (tcp->tcp_state <= TCPS_SYN_RCVD) { 3969 /* SYN_SENT or SYN_RCVD */ 3970 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); 3971 } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) { 3972 /* ESTABLISHED or CLOSE_WAIT */ 3973 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); 3974 } 3975 } 3976 3977 tcp_reinit(tcp); 3978 return (-1); 3979 } 3980 3981 /* 3982 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout 3983 * to expire, stop the wait and finish the close. 3984 */ 3985 static void 3986 tcp_stop_lingering(tcp_t *tcp) 3987 { 3988 clock_t delta = 0; 3989 tcp_stack_t *tcps = tcp->tcp_tcps; 3990 3991 tcp->tcp_linger_tid = 0; 3992 if (tcp->tcp_state > TCPS_LISTEN) { 3993 tcp_acceptor_hash_remove(tcp); 3994 mutex_enter(&tcp->tcp_non_sq_lock); 3995 if (tcp->tcp_flow_stopped) { 3996 tcp_clrqfull(tcp); 3997 } 3998 mutex_exit(&tcp->tcp_non_sq_lock); 3999 4000 if (tcp->tcp_timer_tid != 0) { 4001 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 4002 tcp->tcp_timer_tid = 0; 4003 } 4004 /* 4005 * Need to cancel those timers which will not be used when 4006 * TCP is detached. This has to be done before the tcp_wq 4007 * is set to the global queue. 
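 * (Cross-reference added for clarity: tcp_timers_stop(), called just
 * below, cancels tcp_timer_tid, tcp_ka_tid, tcp_ack_tid and
 * tcp_push_tid, i.e. the retransmit, keepalive, delayed ACK and push
 * timers; see its definition further down in this file.)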
4008 */ 4009 tcp_timers_stop(tcp); 4010 4011 4012 tcp->tcp_detached = B_TRUE; 4013 ASSERT(tcps->tcps_g_q != NULL); 4014 tcp->tcp_rq = tcps->tcps_g_q; 4015 tcp->tcp_wq = WR(tcps->tcps_g_q); 4016 4017 if (tcp->tcp_state == TCPS_TIME_WAIT) { 4018 tcp_time_wait_append(tcp); 4019 TCP_DBGSTAT(tcps, tcp_detach_time_wait); 4020 goto finish; 4021 } 4022 4023 /* 4024 * If delta is zero the timer event wasn't executed and was 4025 * successfully canceled. In this case we need to restart it 4026 * with the minimal delta possible. 4027 */ 4028 if (delta >= 0) { 4029 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 4030 delta ? delta : 1); 4031 } 4032 } else { 4033 tcp_closei_local(tcp); 4034 CONN_DEC_REF(tcp->tcp_connp); 4035 } 4036 finish: 4037 /* Signal closing thread that it can complete close */ 4038 mutex_enter(&tcp->tcp_closelock); 4039 tcp->tcp_detached = B_TRUE; 4040 ASSERT(tcps->tcps_g_q != NULL); 4041 tcp->tcp_rq = tcps->tcps_g_q; 4042 tcp->tcp_wq = WR(tcps->tcps_g_q); 4043 tcp->tcp_closed = 1; 4044 cv_signal(&tcp->tcp_closecv); 4045 mutex_exit(&tcp->tcp_closelock); 4046 } 4047 4048 /* 4049 * Handle lingering timeouts. This function is called when the SO_LINGER timeout 4050 * expires. 4051 */ 4052 static void 4053 tcp_close_linger_timeout(void *arg) 4054 { 4055 conn_t *connp = (conn_t *)arg; 4056 tcp_t *tcp = connp->conn_tcp; 4057 4058 tcp->tcp_client_errno = ETIMEDOUT; 4059 tcp_stop_lingering(tcp); 4060 } 4061 4062 static int 4063 tcp_close(queue_t *q, int flags) 4064 { 4065 conn_t *connp = Q_TO_CONN(q); 4066 tcp_t *tcp = connp->conn_tcp; 4067 mblk_t *mp = &tcp->tcp_closemp; 4068 boolean_t conn_ioctl_cleanup_reqd = B_FALSE; 4069 mblk_t *bp; 4070 4071 ASSERT(WR(q)->q_next == NULL); 4072 ASSERT(connp->conn_ref >= 2); 4073 4074 /* 4075 * We are being closed as /dev/tcp or /dev/tcp6. 4076 * 4077 * Mark the conn as closing. ill_pending_mp_add will not 4078 * add any mp to the pending mp list, after this conn has 4079 * started closing. Same for sq_pending_mp_add 4080 */ 4081 mutex_enter(&connp->conn_lock); 4082 connp->conn_state_flags |= CONN_CLOSING; 4083 if (connp->conn_oper_pending_ill != NULL) 4084 conn_ioctl_cleanup_reqd = B_TRUE; 4085 CONN_INC_REF_LOCKED(connp); 4086 mutex_exit(&connp->conn_lock); 4087 tcp->tcp_closeflags = (uint8_t)flags; 4088 ASSERT(connp->conn_ref >= 3); 4089 4090 /* 4091 * tcp_closemp_used is used below without any protection of a lock 4092 * as we don't expect any one else to use it concurrently at this 4093 * point otherwise it would be a major defect. 4094 */ 4095 4096 if (mp->b_prev == NULL) 4097 tcp->tcp_closemp_used = B_TRUE; 4098 else 4099 cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: " 4100 "connp %p tcp %p\n", (void *)connp, (void *)tcp); 4101 4102 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); 4103 4104 (*tcp_squeue_close_proc)(connp->conn_sqp, mp, 4105 tcp_close_output, connp, SQTAG_IP_TCP_CLOSE); 4106 4107 mutex_enter(&tcp->tcp_closelock); 4108 while (!tcp->tcp_closed) { 4109 if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) { 4110 /* 4111 * The cv_wait_sig() was interrupted. We now do the 4112 * following: 4113 * 4114 * 1) If the endpoint was lingering, we allow this 4115 * to be interrupted by cancelling the linger timeout 4116 * and closing normally. 4117 * 4118 * 2) Revert to calling cv_wait() 4119 * 4120 * We revert to using cv_wait() to avoid an 4121 * infinite loop which can occur if the calling 4122 * thread is higher priority than the squeue worker 4123 * thread and is bound to the same cpu. 
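 * (Note added for clarity: the interrupted-linger case is handled by
 * tcp_linger_interrupted(), queued on the squeue below, which cancels
 * the pending linger timer and records EINTR in tcp_client_errno; see
 * its definition later in this file.)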
4124 */ 4125 if (tcp->tcp_linger && tcp->tcp_lingertime > 0) { 4126 mutex_exit(&tcp->tcp_closelock); 4127 /* Entering squeue, bump ref count. */ 4128 CONN_INC_REF(connp); 4129 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); 4130 squeue_enter(connp->conn_sqp, bp, 4131 tcp_linger_interrupted, connp, 4132 SQTAG_IP_TCP_CLOSE); 4133 mutex_enter(&tcp->tcp_closelock); 4134 } 4135 break; 4136 } 4137 } 4138 while (!tcp->tcp_closed) 4139 cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock); 4140 mutex_exit(&tcp->tcp_closelock); 4141 4142 /* 4143 * In the case of listener streams that have eagers in the q or q0 4144 * we wait for the eagers to drop their reference to us. tcp_rq and 4145 * tcp_wq of the eagers point to our queues. By waiting for the 4146 * refcnt to drop to 1, we are sure that the eagers have cleaned 4147 * up their queue pointers and also dropped their references to us. 4148 */ 4149 if (tcp->tcp_wait_for_eagers) { 4150 mutex_enter(&connp->conn_lock); 4151 while (connp->conn_ref != 1) { 4152 cv_wait(&connp->conn_cv, &connp->conn_lock); 4153 } 4154 mutex_exit(&connp->conn_lock); 4155 } 4156 /* 4157 * ioctl cleanup. The mp is queued in the 4158 * ill_pending_mp or in the sq_pending_mp. 4159 */ 4160 if (conn_ioctl_cleanup_reqd) 4161 conn_ioctl_cleanup(connp); 4162 4163 qprocsoff(q); 4164 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 4165 4166 tcp->tcp_cpid = -1; 4167 4168 /* 4169 * Drop IP's reference on the conn. This is the last reference 4170 * on the connp if the state was less than established. If the 4171 * connection has gone into timewait state, then we will have 4172 * one ref for the TCP and one more ref (total of two) for the 4173 * classifier connected hash list (a timewait connections stays 4174 * in connected hash till closed). 4175 * 4176 * We can't assert the references because there might be other 4177 * transient reference places because of some walkers or queued 4178 * packets in squeue for the timewait state. 4179 */ 4180 CONN_DEC_REF(connp); 4181 q->q_ptr = WR(q)->q_ptr = NULL; 4182 return (0); 4183 } 4184 4185 static int 4186 tcpclose_accept(queue_t *q) 4187 { 4188 vmem_t *minor_arena; 4189 dev_t conn_dev; 4190 4191 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); 4192 4193 /* 4194 * We had opened an acceptor STREAM for sockfs which is 4195 * now being closed due to some error. 4196 */ 4197 qprocsoff(q); 4198 4199 minor_arena = (vmem_t *)WR(q)->q_ptr; 4200 conn_dev = (dev_t)RD(q)->q_ptr; 4201 ASSERT(minor_arena != NULL); 4202 ASSERT(conn_dev != 0); 4203 inet_minor_free(minor_arena, conn_dev); 4204 q->q_ptr = WR(q)->q_ptr = NULL; 4205 return (0); 4206 } 4207 4208 /* 4209 * Called by tcp_close() routine via squeue when lingering is 4210 * interrupted by a signal. 4211 */ 4212 4213 /* ARGSUSED */ 4214 static void 4215 tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2) 4216 { 4217 conn_t *connp = (conn_t *)arg; 4218 tcp_t *tcp = connp->conn_tcp; 4219 4220 freeb(mp); 4221 if (tcp->tcp_linger_tid != 0 && 4222 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 4223 tcp_stop_lingering(tcp); 4224 tcp->tcp_client_errno = EINTR; 4225 } 4226 } 4227 4228 /* 4229 * Called by streams close routine via squeues when our client blows off her 4230 * descriptor, we take this to mean: "close the stream state NOW, close the tcp 4231 * connection politely" When SO_LINGER is set (with a non-zero linger time and 4232 * it is not a nonblocking socket) then this routine sleeps until the FIN is 4233 * acked. 4234 * 4235 * NOTE: tcp_close potentially returns error when lingering. 
4236 * However, the stream head currently does not pass these errors 4237 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK 4238 * errors to the application (from tsleep()) and not errors 4239 * like ECONNRESET caused by receiving a reset packet. 4240 */ 4241 4242 /* ARGSUSED */ 4243 static void 4244 tcp_close_output(void *arg, mblk_t *mp, void *arg2) 4245 { 4246 char *msg; 4247 conn_t *connp = (conn_t *)arg; 4248 tcp_t *tcp = connp->conn_tcp; 4249 clock_t delta = 0; 4250 tcp_stack_t *tcps = tcp->tcp_tcps; 4251 4252 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 4253 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 4254 4255 /* Cancel any pending timeout */ 4256 if (tcp->tcp_ordrelid != 0) { 4257 if (tcp->tcp_timeout) { 4258 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ordrelid); 4259 } 4260 tcp->tcp_ordrelid = 0; 4261 tcp->tcp_timeout = B_FALSE; 4262 } 4263 4264 mutex_enter(&tcp->tcp_eager_lock); 4265 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 4266 /* Cleanup for listener */ 4267 tcp_eager_cleanup(tcp, 0); 4268 tcp->tcp_wait_for_eagers = 1; 4269 } 4270 mutex_exit(&tcp->tcp_eager_lock); 4271 4272 connp->conn_mdt_ok = B_FALSE; 4273 tcp->tcp_mdt = B_FALSE; 4274 4275 connp->conn_lso_ok = B_FALSE; 4276 tcp->tcp_lso = B_FALSE; 4277 4278 msg = NULL; 4279 switch (tcp->tcp_state) { 4280 case TCPS_CLOSED: 4281 case TCPS_IDLE: 4282 case TCPS_BOUND: 4283 case TCPS_LISTEN: 4284 break; 4285 case TCPS_SYN_SENT: 4286 msg = "tcp_close, during connect"; 4287 break; 4288 case TCPS_SYN_RCVD: 4289 /* 4290 * Close during the connect 3-way handshake 4291 * but here there may or may not be pending data 4292 * already on queue. Process almost same as in 4293 * the ESTABLISHED state. 4294 */ 4295 /* FALLTHRU */ 4296 default: 4297 if (tcp->tcp_sodirect != NULL) { 4298 /* Ok, no more sodirect */ 4299 tcp->tcp_sodirect = NULL; 4300 } 4301 4302 if (tcp->tcp_fused) 4303 tcp_unfuse(tcp); 4304 4305 /* 4306 * If SO_LINGER has set a zero linger time, abort the 4307 * connection with a reset. 4308 */ 4309 if (tcp->tcp_linger && tcp->tcp_lingertime == 0) { 4310 msg = "tcp_close, zero lingertime"; 4311 break; 4312 } 4313 4314 ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding); 4315 /* 4316 * Abort connection if there is unread data queued. 4317 */ 4318 if (tcp->tcp_rcv_list || tcp->tcp_reass_head) { 4319 msg = "tcp_close, unread data"; 4320 break; 4321 } 4322 /* 4323 * tcp_hard_bound is now cleared thus all packets go through 4324 * tcp_lookup. This fact is used by tcp_detach below. 4325 * 4326 * We have done a qwait() above which could have possibly 4327 * drained more messages in turn causing transition to a 4328 * different state. Check whether we have to do the rest 4329 * of the processing or not. 4330 */ 4331 if (tcp->tcp_state <= TCPS_LISTEN) 4332 break; 4333 4334 /* 4335 * Transmit the FIN before detaching the tcp_t. 4336 * After tcp_detach returns this queue/perimeter 4337 * no longer owns the tcp_t thus others can modify it. 4338 */ 4339 (void) tcp_xmit_end(tcp); 4340 4341 /* 4342 * If lingering on close then wait until the fin is acked, 4343 * the SO_LINGER time passes, or a reset is sent/received. 
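 * (Illustrative userland view, added here rather than taken from the
 * original comment: the lingering close below is what an application
 * opts into with something like
 *
 *	struct linger l;
 *	l.l_onoff = 1;
 *	l.l_linger = 30;
 *	(void) setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof (l));
 *	(void) close(fd);
 *
 * so that close() blocks until the FIN is acked or 30 seconds pass; a
 * zero l_linger instead takes the abortive reset path handled earlier
 * in this function.)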
4344 */ 4345 if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && 4346 !(tcp->tcp_fin_acked) && 4347 tcp->tcp_state >= TCPS_ESTABLISHED) { 4348 if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { 4349 tcp->tcp_client_errno = EWOULDBLOCK; 4350 } else if (tcp->tcp_client_errno == 0) { 4351 4352 ASSERT(tcp->tcp_linger_tid == 0); 4353 4354 tcp->tcp_linger_tid = TCP_TIMER(tcp, 4355 tcp_close_linger_timeout, 4356 tcp->tcp_lingertime * hz); 4357 4358 /* tcp_close_linger_timeout will finish close */ 4359 if (tcp->tcp_linger_tid == 0) 4360 tcp->tcp_client_errno = ENOSR; 4361 else 4362 return; 4363 } 4364 4365 /* 4366 * Check if we need to detach or just close 4367 * the instance. 4368 */ 4369 if (tcp->tcp_state <= TCPS_LISTEN) 4370 break; 4371 } 4372 4373 /* 4374 * Make sure that no other thread will access the tcp_rq of 4375 * this instance (through lookups etc.) as tcp_rq will go 4376 * away shortly. 4377 */ 4378 tcp_acceptor_hash_remove(tcp); 4379 4380 mutex_enter(&tcp->tcp_non_sq_lock); 4381 if (tcp->tcp_flow_stopped) { 4382 tcp_clrqfull(tcp); 4383 } 4384 mutex_exit(&tcp->tcp_non_sq_lock); 4385 4386 if (tcp->tcp_timer_tid != 0) { 4387 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 4388 tcp->tcp_timer_tid = 0; 4389 } 4390 /* 4391 * Need to cancel those timers which will not be used when 4392 * TCP is detached. This has to be done before the tcp_wq 4393 * is set to the global queue. 4394 */ 4395 tcp_timers_stop(tcp); 4396 4397 tcp->tcp_detached = B_TRUE; 4398 if (tcp->tcp_state == TCPS_TIME_WAIT) { 4399 tcp_time_wait_append(tcp); 4400 TCP_DBGSTAT(tcps, tcp_detach_time_wait); 4401 ASSERT(connp->conn_ref >= 3); 4402 goto finish; 4403 } 4404 4405 /* 4406 * If delta is zero the timer event wasn't executed and was 4407 * successfully canceled. In this case we need to restart it 4408 * with the minimal delta possible. 4409 */ 4410 if (delta >= 0) 4411 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 4412 delta ? delta : 1); 4413 4414 ASSERT(connp->conn_ref >= 3); 4415 goto finish; 4416 } 4417 4418 /* Detach did not complete. Still need to remove q from stream. */ 4419 if (msg) { 4420 if (tcp->tcp_state == TCPS_ESTABLISHED || 4421 tcp->tcp_state == TCPS_CLOSE_WAIT) 4422 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); 4423 if (tcp->tcp_state == TCPS_SYN_SENT || 4424 tcp->tcp_state == TCPS_SYN_RCVD) 4425 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); 4426 tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST); 4427 } 4428 4429 tcp_closei_local(tcp); 4430 CONN_DEC_REF(connp); 4431 ASSERT(connp->conn_ref >= 2); 4432 4433 finish: 4434 /* 4435 * Although packets are always processed on the correct 4436 * tcp's perimeter and access is serialized via squeue's, 4437 * IP still needs a queue when sending packets in time_wait 4438 * state so use WR(tcps_g_q) till ip_output() can be 4439 * changed to deal with just connp. For read side, we 4440 * could have set tcp_rq to NULL but there are some cases 4441 * in tcp_rput_data() from early days of this code which 4442 * do a putnext without checking if tcp is closed. Those 4443 * need to be identified before both tcp_rq and tcp_wq 4444 * can be set to NULL and tcps_g_q can disappear forever. 4445 */ 4446 mutex_enter(&tcp->tcp_closelock); 4447 /* 4448 * Don't change the queues in the case of a listener that has 4449 * eagers in its q or q0. It could surprise the eagers. 4450 * Instead wait for the eagers outside the squeue. 
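 * (Cross-reference added for clarity: that wait happens back in
 * tcp_close(), which cv_waits on conn_cv until conn_ref drops to 1,
 * i.e. until every eager has cleared its tcp_rq/tcp_wq pointers and
 * dropped its reference on this listener.)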
4451 */ 4452 if (!tcp->tcp_wait_for_eagers) { 4453 tcp->tcp_detached = B_TRUE; 4454 /* 4455 * When default queue is closing we set tcps_g_q to NULL 4456 * after the close is done. 4457 */ 4458 ASSERT(tcps->tcps_g_q != NULL); 4459 tcp->tcp_rq = tcps->tcps_g_q; 4460 tcp->tcp_wq = WR(tcps->tcps_g_q); 4461 } 4462 4463 /* Signal tcp_close() to finish closing. */ 4464 tcp->tcp_closed = 1; 4465 cv_signal(&tcp->tcp_closecv); 4466 mutex_exit(&tcp->tcp_closelock); 4467 } 4468 4469 4470 /* 4471 * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp. 4472 * Some stream heads get upset if they see these later on as anything but NULL. 4473 */ 4474 static void 4475 tcp_close_mpp(mblk_t **mpp) 4476 { 4477 mblk_t *mp; 4478 4479 if ((mp = *mpp) != NULL) { 4480 do { 4481 mp->b_next = NULL; 4482 mp->b_prev = NULL; 4483 } while ((mp = mp->b_cont) != NULL); 4484 4485 mp = *mpp; 4486 *mpp = NULL; 4487 freemsg(mp); 4488 } 4489 } 4490 4491 /* Do detached close. */ 4492 static void 4493 tcp_close_detached(tcp_t *tcp) 4494 { 4495 if (tcp->tcp_fused) 4496 tcp_unfuse(tcp); 4497 4498 /* 4499 * Clustering code serializes TCP disconnect callbacks and 4500 * cluster tcp list walks by blocking a TCP disconnect callback 4501 * if a cluster tcp list walk is in progress. This ensures 4502 * accurate accounting of TCPs in the cluster code even though 4503 * the TCP list walk itself is not atomic. 4504 */ 4505 tcp_closei_local(tcp); 4506 CONN_DEC_REF(tcp->tcp_connp); 4507 } 4508 4509 /* 4510 * Stop all TCP timers, and free the timer mblks if requested. 4511 */ 4512 void 4513 tcp_timers_stop(tcp_t *tcp) 4514 { 4515 if (tcp->tcp_timer_tid != 0) { 4516 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 4517 tcp->tcp_timer_tid = 0; 4518 } 4519 if (tcp->tcp_ka_tid != 0) { 4520 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); 4521 tcp->tcp_ka_tid = 0; 4522 } 4523 if (tcp->tcp_ack_tid != 0) { 4524 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 4525 tcp->tcp_ack_tid = 0; 4526 } 4527 if (tcp->tcp_push_tid != 0) { 4528 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 4529 tcp->tcp_push_tid = 0; 4530 } 4531 } 4532 4533 /* 4534 * The tcp_t is going away. Remove it from all lists and set it 4535 * to TCPS_CLOSED. The freeing up of memory is deferred until 4536 * tcp_inactive. This is needed since a thread in tcp_rput might have 4537 * done a CONN_INC_REF on this structure before it was removed from the 4538 * hashes. 4539 */ 4540 static void 4541 tcp_closei_local(tcp_t *tcp) 4542 { 4543 ire_t *ire; 4544 conn_t *connp = tcp->tcp_connp; 4545 tcp_stack_t *tcps = tcp->tcp_tcps; 4546 4547 if (!TCP_IS_SOCKET(tcp)) 4548 tcp_acceptor_hash_remove(tcp); 4549 4550 UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs); 4551 tcp->tcp_ibsegs = 0; 4552 UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs); 4553 tcp->tcp_obsegs = 0; 4554 4555 /* 4556 * If we are an eager connection hanging off a listener that 4557 * hasn't formally accepted the connection yet, get off his 4558 * list and blow off any data that we have accumulated. 4559 */ 4560 if (tcp->tcp_listener != NULL) { 4561 tcp_t *listener = tcp->tcp_listener; 4562 mutex_enter(&listener->tcp_eager_lock); 4563 /* 4564 * tcp_tconnind_started == B_TRUE means that the 4565 * conn_ind has already gone to listener. At 4566 * this point, eager will be closed but we 4567 * leave it in listeners eager list so that 4568 * if listener decides to close without doing 4569 * accept, we can clean this up. In tcp_wput_accept 4570 * we take care of the case of accept on closed 4571 * eager. 
4572 */ 4573 if (!tcp->tcp_tconnind_started) { 4574 tcp_eager_unlink(tcp); 4575 mutex_exit(&listener->tcp_eager_lock); 4576 /* 4577 * We don't want to have any pointers to the 4578 * listener queue, after we have released our 4579 * reference on the listener 4580 */ 4581 ASSERT(tcps->tcps_g_q != NULL); 4582 tcp->tcp_rq = tcps->tcps_g_q; 4583 tcp->tcp_wq = WR(tcps->tcps_g_q); 4584 CONN_DEC_REF(listener->tcp_connp); 4585 } else { 4586 mutex_exit(&listener->tcp_eager_lock); 4587 } 4588 } 4589 4590 /* Stop all the timers */ 4591 tcp_timers_stop(tcp); 4592 4593 if (tcp->tcp_state == TCPS_LISTEN) { 4594 if (tcp->tcp_ip_addr_cache) { 4595 kmem_free((void *)tcp->tcp_ip_addr_cache, 4596 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 4597 tcp->tcp_ip_addr_cache = NULL; 4598 } 4599 } 4600 mutex_enter(&tcp->tcp_non_sq_lock); 4601 if (tcp->tcp_flow_stopped) 4602 tcp_clrqfull(tcp); 4603 mutex_exit(&tcp->tcp_non_sq_lock); 4604 4605 tcp_bind_hash_remove(tcp); 4606 /* 4607 * If the tcp_time_wait_collector (which runs outside the squeue) 4608 * is trying to remove this tcp from the time wait list, we will 4609 * block in tcp_time_wait_remove while trying to acquire the 4610 * tcp_time_wait_lock. The logic in tcp_time_wait_collector also 4611 * requires the ipcl_hash_remove to be ordered after the 4612 * tcp_time_wait_remove for the refcnt checks to work correctly. 4613 */ 4614 if (tcp->tcp_state == TCPS_TIME_WAIT) 4615 (void) tcp_time_wait_remove(tcp, NULL); 4616 CL_INET_DISCONNECT(tcp); 4617 ipcl_hash_remove(connp); 4618 4619 /* 4620 * Delete the cached ire in conn_ire_cache and also mark 4621 * the conn as CONDEMNED 4622 */ 4623 mutex_enter(&connp->conn_lock); 4624 connp->conn_state_flags |= CONN_CONDEMNED; 4625 ire = connp->conn_ire_cache; 4626 connp->conn_ire_cache = NULL; 4627 mutex_exit(&connp->conn_lock); 4628 if (ire != NULL) 4629 IRE_REFRELE_NOTR(ire); 4630 4631 /* Need to cleanup any pending ioctls */ 4632 ASSERT(tcp->tcp_time_wait_next == NULL); 4633 ASSERT(tcp->tcp_time_wait_prev == NULL); 4634 ASSERT(tcp->tcp_time_wait_expire == 0); 4635 tcp->tcp_state = TCPS_CLOSED; 4636 4637 /* Release any SSL context */ 4638 if (tcp->tcp_kssl_ent != NULL) { 4639 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 4640 tcp->tcp_kssl_ent = NULL; 4641 } 4642 if (tcp->tcp_kssl_ctx != NULL) { 4643 kssl_release_ctx(tcp->tcp_kssl_ctx); 4644 tcp->tcp_kssl_ctx = NULL; 4645 } 4646 tcp->tcp_kssl_pending = B_FALSE; 4647 4648 tcp_ipsec_cleanup(tcp); 4649 } 4650 4651 /* 4652 * tcp is dying (called from ipcl_conn_destroy and error cases). 4653 * Free the tcp_t in either case. 
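 * (Summary added for clarity, drawn from the body below: this releases
 * the transmit, reassembly and receive mblk chains, any urgent-pointer
 * mblks, SACK bookkeeping, the sticky IPv6 options (hopopts, dstopts,
 * rtdstopts, rthdr), the ip/tcp header template contents and the
 * trace buffer.)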
4654 */ 4655 void 4656 tcp_free(tcp_t *tcp) 4657 { 4658 mblk_t *mp; 4659 ip6_pkt_t *ipp; 4660 4661 ASSERT(tcp != NULL); 4662 ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL); 4663 4664 tcp->tcp_rq = NULL; 4665 tcp->tcp_wq = NULL; 4666 4667 tcp_close_mpp(&tcp->tcp_xmit_head); 4668 tcp_close_mpp(&tcp->tcp_reass_head); 4669 if (tcp->tcp_rcv_list != NULL) { 4670 /* Free b_next chain */ 4671 tcp_close_mpp(&tcp->tcp_rcv_list); 4672 } 4673 if ((mp = tcp->tcp_urp_mp) != NULL) { 4674 freemsg(mp); 4675 } 4676 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 4677 freemsg(mp); 4678 } 4679 4680 if (tcp->tcp_fused_sigurg_mp != NULL) { 4681 freeb(tcp->tcp_fused_sigurg_mp); 4682 tcp->tcp_fused_sigurg_mp = NULL; 4683 } 4684 4685 if (tcp->tcp_sack_info != NULL) { 4686 if (tcp->tcp_notsack_list != NULL) { 4687 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 4688 } 4689 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 4690 } 4691 4692 if (tcp->tcp_hopopts != NULL) { 4693 mi_free(tcp->tcp_hopopts); 4694 tcp->tcp_hopopts = NULL; 4695 tcp->tcp_hopoptslen = 0; 4696 } 4697 ASSERT(tcp->tcp_hopoptslen == 0); 4698 if (tcp->tcp_dstopts != NULL) { 4699 mi_free(tcp->tcp_dstopts); 4700 tcp->tcp_dstopts = NULL; 4701 tcp->tcp_dstoptslen = 0; 4702 } 4703 ASSERT(tcp->tcp_dstoptslen == 0); 4704 if (tcp->tcp_rtdstopts != NULL) { 4705 mi_free(tcp->tcp_rtdstopts); 4706 tcp->tcp_rtdstopts = NULL; 4707 tcp->tcp_rtdstoptslen = 0; 4708 } 4709 ASSERT(tcp->tcp_rtdstoptslen == 0); 4710 if (tcp->tcp_rthdr != NULL) { 4711 mi_free(tcp->tcp_rthdr); 4712 tcp->tcp_rthdr = NULL; 4713 tcp->tcp_rthdrlen = 0; 4714 } 4715 ASSERT(tcp->tcp_rthdrlen == 0); 4716 4717 ipp = &tcp->tcp_sticky_ipp; 4718 if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | 4719 IPPF_RTHDR)) 4720 ip6_pkt_free(ipp); 4721 4722 /* 4723 * Free memory associated with the tcp/ip header template. 4724 */ 4725 4726 if (tcp->tcp_iphc != NULL) 4727 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 4728 4729 /* 4730 * Following is really a blowing away a union. 4731 * It happens to have exactly two members of identical size 4732 * the following code is enough. 4733 */ 4734 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 4735 4736 if (tcp->tcp_tracebuf != NULL) { 4737 kmem_free(tcp->tcp_tracebuf, sizeof (tcptrch_t)); 4738 tcp->tcp_tracebuf = NULL; 4739 } 4740 } 4741 4742 4743 /* 4744 * Put a connection confirmation message upstream built from the 4745 * address information within 'iph' and 'tcph'. Report our success or failure. 4746 */ 4747 static boolean_t 4748 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, 4749 mblk_t **defermp) 4750 { 4751 sin_t sin; 4752 sin6_t sin6; 4753 mblk_t *mp; 4754 char *optp = NULL; 4755 int optlen = 0; 4756 cred_t *cr; 4757 4758 if (defermp != NULL) 4759 *defermp = NULL; 4760 4761 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { 4762 /* 4763 * Return in T_CONN_CON results of option negotiation through 4764 * the T_CONN_REQ. Note: If there is an real end-to-end option 4765 * negotiation, then what is received from remote end needs 4766 * to be taken into account but there is no such thing (yet?) 4767 * in our TCP/IP. 4768 * Note: We do not use mi_offset_param() here as 4769 * tcp_opts_conn_req contents do not directly come from 4770 * an application and are either generated in kernel or 4771 * from user input that was already verified. 
4772 */ 4773 mp = tcp->tcp_conn.tcp_opts_conn_req; 4774 optp = (char *)(mp->b_rptr + 4775 ((struct T_conn_req *)mp->b_rptr)->OPT_offset); 4776 optlen = (int) 4777 ((struct T_conn_req *)mp->b_rptr)->OPT_length; 4778 } 4779 4780 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { 4781 ipha_t *ipha = (ipha_t *)iphdr; 4782 4783 /* packet is IPv4 */ 4784 if (tcp->tcp_family == AF_INET) { 4785 sin = sin_null; 4786 sin.sin_addr.s_addr = ipha->ipha_src; 4787 sin.sin_port = *(uint16_t *)tcph->th_lport; 4788 sin.sin_family = AF_INET; 4789 mp = mi_tpi_conn_con(NULL, (char *)&sin, 4790 (int)sizeof (sin_t), optp, optlen); 4791 } else { 4792 sin6 = sin6_null; 4793 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); 4794 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4795 sin6.sin6_family = AF_INET6; 4796 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 4797 (int)sizeof (sin6_t), optp, optlen); 4798 4799 } 4800 } else { 4801 ip6_t *ip6h = (ip6_t *)iphdr; 4802 4803 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); 4804 ASSERT(tcp->tcp_family == AF_INET6); 4805 sin6 = sin6_null; 4806 sin6.sin6_addr = ip6h->ip6_src; 4807 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4808 sin6.sin6_family = AF_INET6; 4809 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 4810 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 4811 (int)sizeof (sin6_t), optp, optlen); 4812 } 4813 4814 if (!mp) 4815 return (B_FALSE); 4816 4817 if ((cr = DB_CRED(idmp)) != NULL) { 4818 mblk_setcred(mp, cr); 4819 DB_CPID(mp) = DB_CPID(idmp); 4820 } 4821 4822 if (defermp == NULL) 4823 putnext(tcp->tcp_rq, mp); 4824 else 4825 *defermp = mp; 4826 4827 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 4828 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 4829 return (B_TRUE); 4830 } 4831 4832 /* 4833 * Defense for the SYN attack - 4834 * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest 4835 * one from the list of droppable eagers. This list is a subset of q0. 4836 * see comments before the definition of MAKE_DROPPABLE(). 4837 * 2. Don't drop a SYN request before its first timeout. This gives every 4838 * request at least til the first timeout to complete its 3-way handshake. 4839 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many 4840 * requests currently on the queue that has timed out. This will be used 4841 * as an indicator of whether an attack is under way, so that appropriate 4842 * actions can be taken. (It's incremented in tcp_timer() and decremented 4843 * either when eager goes into ESTABLISHED, or gets freed up.) 4844 * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on 4845 * # of timeout drops back to <= q0len/32 => SYN alert off 4846 */ 4847 static boolean_t 4848 tcp_drop_q0(tcp_t *tcp) 4849 { 4850 tcp_t *eager; 4851 mblk_t *mp; 4852 tcp_stack_t *tcps = tcp->tcp_tcps; 4853 4854 ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); 4855 ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); 4856 4857 /* Pick oldest eager from the list of droppable eagers */ 4858 eager = tcp->tcp_eager_prev_drop_q0; 4859 4860 /* If list is empty. return B_FALSE */ 4861 if (eager == tcp) { 4862 return (B_FALSE); 4863 } 4864 4865 /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */ 4866 if ((mp = allocb(0, BPRI_HI)) == NULL) 4867 return (B_FALSE); 4868 4869 /* 4870 * Take this eager out from the list of droppable eagers since we are 4871 * going to drop it. 
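 * (Note added for clarity: the teardown itself is deferred; the
 * squeue_fill() below hands the preallocated mblk to
 * tcp_clean_death_wrapper() on the eager's squeue, which frees the
 * mblk and calls tcp_clean_death() with ETIMEDOUT if the eager is
 * still past TCPS_BOUND; see that wrapper earlier in this file.)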
4872 */ 4873 MAKE_UNDROPPABLE(eager); 4874 4875 if (tcp->tcp_debug) { 4876 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 4877 "tcp_drop_q0: listen half-open queue (max=%d) overflow" 4878 " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, 4879 tcp->tcp_conn_req_cnt_q0, 4880 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 4881 } 4882 4883 BUMP_MIB(&tcps->tcps_mib, tcpHalfOpenDrop); 4884 4885 /* Put a reference on the conn as we are enqueueing it in the sqeue */ 4886 CONN_INC_REF(eager->tcp_connp); 4887 4888 /* Mark the IRE created for this SYN request temporary */ 4889 tcp_ip_ire_mark_advice(eager); 4890 squeue_fill(eager->tcp_connp->conn_sqp, mp, 4891 tcp_clean_death_wrapper, eager->tcp_connp, SQTAG_TCP_DROP_Q0); 4892 4893 return (B_TRUE); 4894 } 4895 4896 int 4897 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, 4898 tcph_t *tcph, uint_t ipvers, mblk_t *idmp) 4899 { 4900 tcp_t *ltcp = lconnp->conn_tcp; 4901 tcp_t *tcp = connp->conn_tcp; 4902 mblk_t *tpi_mp; 4903 ipha_t *ipha; 4904 ip6_t *ip6h; 4905 sin6_t sin6; 4906 in6_addr_t v6dst; 4907 int err; 4908 int ifindex = 0; 4909 cred_t *cr; 4910 tcp_stack_t *tcps = tcp->tcp_tcps; 4911 4912 if (ipvers == IPV4_VERSION) { 4913 ipha = (ipha_t *)mp->b_rptr; 4914 4915 connp->conn_send = ip_output; 4916 connp->conn_recv = tcp_input; 4917 4918 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); 4919 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); 4920 4921 sin6 = sin6_null; 4922 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); 4923 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 4924 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4925 sin6.sin6_family = AF_INET6; 4926 sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst, 4927 lconnp->conn_zoneid, tcps->tcps_netstack); 4928 if (tcp->tcp_recvdstaddr) { 4929 sin6_t sin6d; 4930 4931 sin6d = sin6_null; 4932 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, 4933 &sin6d.sin6_addr); 4934 sin6d.sin6_port = *(uint16_t *)tcph->th_fport; 4935 sin6d.sin6_family = AF_INET; 4936 tpi_mp = mi_tpi_extconn_ind(NULL, 4937 (char *)&sin6d, sizeof (sin6_t), 4938 (char *)&tcp, 4939 (t_scalar_t)sizeof (intptr_t), 4940 (char *)&sin6d, sizeof (sin6_t), 4941 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4942 } else { 4943 tpi_mp = mi_tpi_conn_ind(NULL, 4944 (char *)&sin6, sizeof (sin6_t), 4945 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4946 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4947 } 4948 } else { 4949 ip6h = (ip6_t *)mp->b_rptr; 4950 4951 connp->conn_send = ip_output_v6; 4952 connp->conn_recv = tcp_input; 4953 4954 connp->conn_srcv6 = ip6h->ip6_dst; 4955 connp->conn_remv6 = ip6h->ip6_src; 4956 4957 /* db_cksumstuff is set at ip_fanout_tcp_v6 */ 4958 ifindex = (int)DB_CKSUMSTUFF(mp); 4959 DB_CKSUMSTUFF(mp) = 0; 4960 4961 sin6 = sin6_null; 4962 sin6.sin6_addr = ip6h->ip6_src; 4963 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4964 sin6.sin6_family = AF_INET6; 4965 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 4966 sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 4967 lconnp->conn_zoneid, tcps->tcps_netstack); 4968 4969 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 4970 /* Pass up the scope_id of remote addr */ 4971 sin6.sin6_scope_id = ifindex; 4972 } else { 4973 sin6.sin6_scope_id = 0; 4974 } 4975 if (tcp->tcp_recvdstaddr) { 4976 sin6_t sin6d; 4977 4978 sin6d = sin6_null; 4979 sin6.sin6_addr = ip6h->ip6_dst; 4980 sin6d.sin6_port = *(uint16_t *)tcph->th_fport; 4981 sin6d.sin6_family = AF_INET; 4982 tpi_mp = mi_tpi_extconn_ind(NULL, 4983 (char *)&sin6d, sizeof (sin6_t), 4984 (char *)&tcp, 
(t_scalar_t)sizeof (intptr_t), 4985 (char *)&sin6d, sizeof (sin6_t), 4986 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4987 } else { 4988 tpi_mp = mi_tpi_conn_ind(NULL, 4989 (char *)&sin6, sizeof (sin6_t), 4990 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4991 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4992 } 4993 } 4994 4995 if (tpi_mp == NULL) 4996 return (ENOMEM); 4997 4998 connp->conn_fport = *(uint16_t *)tcph->th_lport; 4999 connp->conn_lport = *(uint16_t *)tcph->th_fport; 5000 connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER); 5001 connp->conn_fully_bound = B_FALSE; 5002 5003 if (tcps->tcps_trace) 5004 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP); 5005 5006 /* Inherit information from the "parent" */ 5007 tcp->tcp_ipversion = ltcp->tcp_ipversion; 5008 tcp->tcp_family = ltcp->tcp_family; 5009 tcp->tcp_wq = ltcp->tcp_wq; 5010 tcp->tcp_rq = ltcp->tcp_rq; 5011 tcp->tcp_mss = tcps->tcps_mss_def_ipv6; 5012 tcp->tcp_detached = B_TRUE; 5013 if ((err = tcp_init_values(tcp)) != 0) { 5014 freemsg(tpi_mp); 5015 return (err); 5016 } 5017 5018 if (ipvers == IPV4_VERSION) { 5019 if ((err = tcp_header_init_ipv4(tcp)) != 0) { 5020 freemsg(tpi_mp); 5021 return (err); 5022 } 5023 ASSERT(tcp->tcp_ipha != NULL); 5024 } else { 5025 /* ifindex must be already set */ 5026 ASSERT(ifindex != 0); 5027 5028 if (ltcp->tcp_bound_if != 0) { 5029 /* 5030 * Set newtcp's bound_if equal to 5031 * listener's value. If ifindex is 5032 * not the same as ltcp->tcp_bound_if, 5033 * it must be a packet for the ipmp group 5034 * of interfaces 5035 */ 5036 tcp->tcp_bound_if = ltcp->tcp_bound_if; 5037 } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 5038 tcp->tcp_bound_if = ifindex; 5039 } 5040 5041 tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary; 5042 tcp->tcp_recvifindex = 0; 5043 tcp->tcp_recvhops = 0xffffffffU; 5044 ASSERT(tcp->tcp_ip6h != NULL); 5045 } 5046 5047 tcp->tcp_lport = ltcp->tcp_lport; 5048 5049 if (ltcp->tcp_ipversion == tcp->tcp_ipversion) { 5050 if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) { 5051 /* 5052 * Listener had options of some sort; eager inherits. 5053 * Free up the eager template and allocate one 5054 * of the right size. 
5055 */ 5056 if (tcp->tcp_hdr_grown) { 5057 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 5058 } else { 5059 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 5060 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 5061 } 5062 tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len, 5063 KM_NOSLEEP); 5064 if (tcp->tcp_iphc == NULL) { 5065 tcp->tcp_iphc_len = 0; 5066 freemsg(tpi_mp); 5067 return (ENOMEM); 5068 } 5069 tcp->tcp_iphc_len = ltcp->tcp_iphc_len; 5070 tcp->tcp_hdr_grown = B_TRUE; 5071 } 5072 tcp->tcp_hdr_len = ltcp->tcp_hdr_len; 5073 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; 5074 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 5075 tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops; 5076 tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf; 5077 5078 /* 5079 * Copy the IP+TCP header template from listener to eager 5080 */ 5081 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); 5082 if (tcp->tcp_ipversion == IPV6_VERSION) { 5083 if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt == 5084 IPPROTO_RAW) { 5085 tcp->tcp_ip6h = 5086 (ip6_t *)(tcp->tcp_iphc + 5087 sizeof (ip6i_t)); 5088 } else { 5089 tcp->tcp_ip6h = 5090 (ip6_t *)(tcp->tcp_iphc); 5091 } 5092 tcp->tcp_ipha = NULL; 5093 } else { 5094 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 5095 tcp->tcp_ip6h = NULL; 5096 } 5097 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + 5098 tcp->tcp_ip_hdr_len); 5099 } else { 5100 /* 5101 * only valid case when ipversion of listener and 5102 * eager differ is when listener is IPv6 and 5103 * eager is IPv4. 5104 * Eager header template has been initialized to the 5105 * maximum v4 header sizes, which includes space for 5106 * TCP and IP options. 5107 */ 5108 ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) && 5109 (tcp->tcp_ipversion == IPV4_VERSION)); 5110 ASSERT(tcp->tcp_iphc_len >= 5111 TCP_MAX_COMBINED_HEADER_LENGTH); 5112 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 5113 /* copy IP header fields individually */ 5114 tcp->tcp_ipha->ipha_ttl = 5115 ltcp->tcp_ip6h->ip6_hops; 5116 bcopy(ltcp->tcp_tcph->th_lport, 5117 tcp->tcp_tcph->th_lport, sizeof (ushort_t)); 5118 } 5119 5120 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); 5121 bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport, 5122 sizeof (in_port_t)); 5123 5124 if (ltcp->tcp_lport == 0) { 5125 tcp->tcp_lport = *(in_port_t *)tcph->th_fport; 5126 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, 5127 sizeof (in_port_t)); 5128 } 5129 5130 if (tcp->tcp_ipversion == IPV4_VERSION) { 5131 ASSERT(ipha != NULL); 5132 tcp->tcp_ipha->ipha_dst = ipha->ipha_src; 5133 tcp->tcp_ipha->ipha_src = ipha->ipha_dst; 5134 5135 /* Source routing option copyover (reverse it) */ 5136 if (tcps->tcps_rev_src_routes) 5137 tcp_opt_reverse(tcp, ipha); 5138 } else { 5139 ASSERT(ip6h != NULL); 5140 tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src; 5141 tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst; 5142 } 5143 5144 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 5145 ASSERT(!tcp->tcp_tconnind_started); 5146 /* 5147 * If the SYN contains a credential, it's a loopback packet; attach 5148 * the credential to the TPI message. 
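 * (Only locally originated mblks carry a cred in their dblk, so a
 * non-NULL DB_CRED() here implies the SYN arrived over loopback;
 * DB_CPID carries the sending process' pid along with it.)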
5149 */ 5150 if ((cr = DB_CRED(idmp)) != NULL) { 5151 mblk_setcred(tpi_mp, cr); 5152 DB_CPID(tpi_mp) = DB_CPID(idmp); 5153 } 5154 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; 5155 5156 /* Inherit the listener's SSL protection state */ 5157 5158 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { 5159 kssl_hold_ent(tcp->tcp_kssl_ent); 5160 tcp->tcp_kssl_pending = B_TRUE; 5161 } 5162 5163 return (0); 5164 } 5165 5166 5167 int 5168 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, 5169 tcph_t *tcph, mblk_t *idmp) 5170 { 5171 tcp_t *ltcp = lconnp->conn_tcp; 5172 tcp_t *tcp = connp->conn_tcp; 5173 sin_t sin; 5174 mblk_t *tpi_mp = NULL; 5175 int err; 5176 cred_t *cr; 5177 tcp_stack_t *tcps = tcp->tcp_tcps; 5178 5179 sin = sin_null; 5180 sin.sin_addr.s_addr = ipha->ipha_src; 5181 sin.sin_port = *(uint16_t *)tcph->th_lport; 5182 sin.sin_family = AF_INET; 5183 if (ltcp->tcp_recvdstaddr) { 5184 sin_t sind; 5185 5186 sind = sin_null; 5187 sind.sin_addr.s_addr = ipha->ipha_dst; 5188 sind.sin_port = *(uint16_t *)tcph->th_fport; 5189 sind.sin_family = AF_INET; 5190 tpi_mp = mi_tpi_extconn_ind(NULL, 5191 (char *)&sind, sizeof (sin_t), (char *)&tcp, 5192 (t_scalar_t)sizeof (intptr_t), (char *)&sind, 5193 sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum); 5194 } else { 5195 tpi_mp = mi_tpi_conn_ind(NULL, 5196 (char *)&sin, sizeof (sin_t), 5197 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 5198 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 5199 } 5200 5201 if (tpi_mp == NULL) { 5202 return (ENOMEM); 5203 } 5204 5205 connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER); 5206 connp->conn_send = ip_output; 5207 connp->conn_recv = tcp_input; 5208 connp->conn_fully_bound = B_FALSE; 5209 5210 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); 5211 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); 5212 connp->conn_fport = *(uint16_t *)tcph->th_lport; 5213 connp->conn_lport = *(uint16_t *)tcph->th_fport; 5214 5215 if (tcps->tcps_trace) { 5216 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP); 5217 } 5218 5219 /* Inherit information from the "parent" */ 5220 tcp->tcp_ipversion = ltcp->tcp_ipversion; 5221 tcp->tcp_family = ltcp->tcp_family; 5222 tcp->tcp_wq = ltcp->tcp_wq; 5223 tcp->tcp_rq = ltcp->tcp_rq; 5224 tcp->tcp_mss = tcps->tcps_mss_def_ipv4; 5225 tcp->tcp_detached = B_TRUE; 5226 if ((err = tcp_init_values(tcp)) != 0) { 5227 freemsg(tpi_mp); 5228 return (err); 5229 } 5230 5231 /* 5232 * Let's make sure that eager tcp template has enough space to 5233 * copy IPv4 listener's tcp template. Since the conn_t structure is 5234 * preserved and tcp_iphc_len is also preserved, an eager conn_t may 5235 * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or 5236 * more (in case of re-allocation of conn_t with tcp-IPv6 template with 5237 * extension headers or with ip6i_t struct). Note that bcopy() below 5238 * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_ 5239 * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener. 
5240 */ 5241 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 5242 ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH); 5243 5244 tcp->tcp_hdr_len = ltcp->tcp_hdr_len; 5245 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; 5246 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 5247 tcp->tcp_ttl = ltcp->tcp_ttl; 5248 tcp->tcp_tos = ltcp->tcp_tos; 5249 5250 /* Copy the IP+TCP header template from listener to eager */ 5251 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); 5252 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 5253 tcp->tcp_ip6h = NULL; 5254 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + 5255 tcp->tcp_ip_hdr_len); 5256 5257 /* Initialize the IP addresses and Ports */ 5258 tcp->tcp_ipha->ipha_dst = ipha->ipha_src; 5259 tcp->tcp_ipha->ipha_src = ipha->ipha_dst; 5260 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); 5261 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t)); 5262 5263 /* Source routing option copyover (reverse it) */ 5264 if (tcps->tcps_rev_src_routes) 5265 tcp_opt_reverse(tcp, ipha); 5266 5267 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 5268 ASSERT(!tcp->tcp_tconnind_started); 5269 5270 /* 5271 * If the SYN contains a credential, it's a loopback packet; attach 5272 * the credential to the TPI message. 5273 */ 5274 if ((cr = DB_CRED(idmp)) != NULL) { 5275 mblk_setcred(tpi_mp, cr); 5276 DB_CPID(tpi_mp) = DB_CPID(idmp); 5277 } 5278 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; 5279 5280 /* Inherit the listener's SSL protection state */ 5281 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { 5282 kssl_hold_ent(tcp->tcp_kssl_ent); 5283 tcp->tcp_kssl_pending = B_TRUE; 5284 } 5285 5286 return (0); 5287 } 5288 5289 /* 5290 * sets up conn for ipsec. 5291 * if the first mblk is M_CTL it is consumed and mpp is updated. 5292 * in case of error mpp is freed. 
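 *
 * A minimal caller sketch (illustrative only; the real caller is
 * tcp_conn_request() below):
 *
 *	mp = first_mp;				may begin with an M_CTL
 *	econnp = tcp_get_ipsec_conn(tcp, sqp, &mp);
 *	if (econnp == NULL)
 *		return;				mp was already freed
 *	... mp now points at the M_DATA block carrying the SYN ...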
5293 */ 5294 conn_t * 5295 tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) 5296 { 5297 conn_t *connp = tcp->tcp_connp; 5298 conn_t *econnp; 5299 squeue_t *new_sqp; 5300 mblk_t *first_mp = *mpp; 5301 mblk_t *mp = *mpp; 5302 boolean_t mctl_present = B_FALSE; 5303 uint_t ipvers; 5304 5305 econnp = tcp_get_conn(sqp, tcp->tcp_tcps); 5306 if (econnp == NULL) { 5307 freemsg(first_mp); 5308 return (NULL); 5309 } 5310 if (DB_TYPE(mp) == M_CTL) { 5311 if (mp->b_cont == NULL || 5312 mp->b_cont->b_datap->db_type != M_DATA) { 5313 freemsg(first_mp); 5314 return (NULL); 5315 } 5316 mp = mp->b_cont; 5317 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) { 5318 freemsg(first_mp); 5319 return (NULL); 5320 } 5321 5322 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 5323 first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY; 5324 mctl_present = B_TRUE; 5325 } else { 5326 ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY); 5327 mp->b_datap->db_struioflag &= ~STRUIO_POLICY; 5328 } 5329 5330 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5331 DB_CKSUMSTART(mp) = 0; 5332 5333 ASSERT(OK_32PTR(mp->b_rptr)); 5334 ipvers = IPH_HDR_VERSION(mp->b_rptr); 5335 if (ipvers == IPV4_VERSION) { 5336 uint16_t *up; 5337 uint32_t ports; 5338 ipha_t *ipha; 5339 5340 ipha = (ipha_t *)mp->b_rptr; 5341 up = (uint16_t *)((uchar_t *)ipha + 5342 IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET); 5343 ports = *(uint32_t *)up; 5344 IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP, 5345 ipha->ipha_dst, ipha->ipha_src, ports); 5346 } else { 5347 uint16_t *up; 5348 uint32_t ports; 5349 uint16_t ip_hdr_len; 5350 uint8_t *nexthdrp; 5351 ip6_t *ip6h; 5352 tcph_t *tcph; 5353 5354 ip6h = (ip6_t *)mp->b_rptr; 5355 if (ip6h->ip6_nxt == IPPROTO_TCP) { 5356 ip_hdr_len = IPV6_HDR_LEN; 5357 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len, 5358 &nexthdrp) || *nexthdrp != IPPROTO_TCP) { 5359 CONN_DEC_REF(econnp); 5360 freemsg(first_mp); 5361 return (NULL); 5362 } 5363 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5364 up = (uint16_t *)tcph->th_lport; 5365 ports = *(uint32_t *)up; 5366 IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP, 5367 ip6h->ip6_dst, ip6h->ip6_src, ports); 5368 } 5369 5370 /* 5371 * The caller already ensured that there is a sqp present. 5372 */ 5373 econnp->conn_sqp = new_sqp; 5374 5375 if (connp->conn_policy != NULL) { 5376 ipsec_in_t *ii; 5377 ii = (ipsec_in_t *)(first_mp->b_rptr); 5378 ASSERT(ii->ipsec_in_policy == NULL); 5379 IPPH_REFHOLD(connp->conn_policy); 5380 ii->ipsec_in_policy = connp->conn_policy; 5381 5382 first_mp->b_datap->db_type = IPSEC_POLICY_SET; 5383 if (!ip_bind_ipsec_policy_set(econnp, first_mp)) { 5384 CONN_DEC_REF(econnp); 5385 freemsg(first_mp); 5386 return (NULL); 5387 } 5388 } 5389 5390 if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) { 5391 CONN_DEC_REF(econnp); 5392 freemsg(first_mp); 5393 return (NULL); 5394 } 5395 5396 /* 5397 * If we know we have some policy, pass the "IPSEC" 5398 * options size TCP uses this adjust the MSS. 5399 */ 5400 econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp); 5401 if (mctl_present) { 5402 freeb(first_mp); 5403 *mpp = mp; 5404 } 5405 5406 return (econnp); 5407 } 5408 5409 /* 5410 * tcp_get_conn/tcp_free_conn 5411 * 5412 * tcp_get_conn is used to get a clean tcp connection structure. 5413 * It tries to reuse the connections put on the freelist by the 5414 * time_wait_collector failing which it goes to kmem_cache. This 5415 * way has two benefits compared to just allocating from and 5416 * freeing to kmem_cache. 
5417 * 1) The time_wait_collector can free (which includes the cleanup) 5418 * outside the squeue. So when the interrupt comes, we have a clean 5419 * connection sitting in the freelist. Obviously, this buys us 5420 * performance. 5421 * 5422 * 2) Defense against DOS attacks. Allocating a tcp/conn in tcp_conn_request 5423 * has multiple disadvantages: tying up the squeue during alloc, and the 5424 * fact that IPSec policy initialization has to happen here, which 5425 * requires us to send an M_CTL and check for it, i.e. real ugliness. 5426 * But allocating the conn/tcp in IP land is also not the best since 5427 * we can't check the 'q' and 'q0' which are protected by squeue and 5428 * blindly allocate memory which might have to be freed here if we are 5429 * not allowed to accept the connection. By using the freelist and 5430 * putting the conn/tcp back in the freelist, we don't pay a penalty for 5431 * allocating memory without checking 'q/q0' and freeing it if we can't 5432 * accept the connection. 5433 * 5434 * Care should be taken to put the conn back in the same squeue's freelist 5435 * from which it was allocated. Best results are obtained if the conn is 5436 * allocated from the listener's squeue and freed to the same. The time wait 5437 * collector will free up the freelist if connections end up sitting 5438 * there for too long. 5439 */ 5440 void * 5441 tcp_get_conn(void *arg, tcp_stack_t *tcps) 5442 { 5443 tcp_t *tcp = NULL; 5444 conn_t *connp = NULL; 5445 squeue_t *sqp = (squeue_t *)arg; 5446 tcp_squeue_priv_t *tcp_time_wait; 5447 netstack_t *ns; 5448 5449 tcp_time_wait = 5450 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 5451 5452 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 5453 tcp = tcp_time_wait->tcp_free_list; 5454 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0)); 5455 if (tcp != NULL) { 5456 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 5457 tcp_time_wait->tcp_free_list_cnt--; 5458 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 5459 tcp->tcp_time_wait_next = NULL; 5460 connp = tcp->tcp_connp; 5461 connp->conn_flags |= IPCL_REUSED; 5462 5463 ASSERT(tcp->tcp_tcps == NULL); 5464 ASSERT(connp->conn_netstack == NULL); 5465 ns = tcps->tcps_netstack; 5466 netstack_hold(ns); 5467 connp->conn_netstack = ns; 5468 tcp->tcp_tcps = tcps; 5469 TCPS_REFHOLD(tcps); 5470 ipcl_globalhash_insert(connp); 5471 return ((void *)connp); 5472 } 5473 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 5474 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, 5475 tcps->tcps_netstack)) == NULL) 5476 return (NULL); 5477 tcp = connp->conn_tcp; 5478 tcp->tcp_tcps = tcps; 5479 TCPS_REFHOLD(tcps); 5480 return ((void *)connp); 5481 } 5482 5483 /* 5484 * Update the cached label for the given tcp_t. This should be called once per 5485 * connection, and before any packets are sent or tcp_process_options is 5486 * invoked. Returns B_FALSE if the correct label could not be constructed.
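 *
 * Typical use (sketch; see tcp_conn_request() below for the real call):
 *
 *	if (is_system_labeled() && !tcp_update_label(eager, cr)) {
 *		... drop the eager; a label could not be constructed ...
 *	}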
5487 */ 5488 static boolean_t 5489 tcp_update_label(tcp_t *tcp, const cred_t *cr) 5490 { 5491 conn_t *connp = tcp->tcp_connp; 5492 5493 if (tcp->tcp_ipversion == IPV4_VERSION) { 5494 uchar_t optbuf[IP_MAX_OPT_LENGTH]; 5495 int added; 5496 5497 if (tsol_compute_label(cr, tcp->tcp_remote, optbuf, 5498 connp->conn_mac_exempt, 5499 tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0) 5500 return (B_FALSE); 5501 5502 added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len); 5503 if (added == -1) 5504 return (B_FALSE); 5505 tcp->tcp_hdr_len += added; 5506 tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added); 5507 tcp->tcp_ip_hdr_len += added; 5508 if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) { 5509 tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3; 5510 added = tsol_prepend_option(optbuf, tcp->tcp_ipha, 5511 tcp->tcp_hdr_len); 5512 if (added == -1) 5513 return (B_FALSE); 5514 tcp->tcp_hdr_len += added; 5515 tcp->tcp_tcph = (tcph_t *) 5516 ((uchar_t *)tcp->tcp_tcph + added); 5517 tcp->tcp_ip_hdr_len += added; 5518 } 5519 } else { 5520 uchar_t optbuf[TSOL_MAX_IPV6_OPTION]; 5521 5522 if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf, 5523 connp->conn_mac_exempt, 5524 tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0) 5525 return (B_FALSE); 5526 if (tsol_update_sticky(&tcp->tcp_sticky_ipp, 5527 &tcp->tcp_label_len, optbuf) != 0) 5528 return (B_FALSE); 5529 if (tcp_build_hdrs(tcp->tcp_rq, tcp) != 0) 5530 return (B_FALSE); 5531 } 5532 5533 connp->conn_ulp_labeled = 1; 5534 5535 return (B_TRUE); 5536 } 5537 5538 /* BEGIN CSTYLED */ 5539 /* 5540 * 5541 * The sockfs ACCEPT path: 5542 * ======================= 5543 * 5544 * The eager is now established in its own perimeter as soon as SYN is 5545 * received in tcp_conn_request(). When sockfs receives conn_ind, it 5546 * completes the accept processing on the acceptor STREAM. The sending 5547 * of conn_ind part is common for both sockfs listener and a TLI/XTI 5548 * listener but a TLI/XTI listener completes the accept processing 5549 * on the listener perimeter. 5550 * 5551 * Common control flow for 3 way handshake: 5552 * ---------------------------------------- 5553 * 5554 * incoming SYN (listener perimeter) -> tcp_rput_data() 5555 * -> tcp_conn_request() 5556 * 5557 * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data() 5558 * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() 5559 * 5560 * Sockfs ACCEPT Path: 5561 * ------------------- 5562 * 5563 * open acceptor stream (tcp_open allocates tcp_wput_accept() 5564 * as STREAM entry point) 5565 * 5566 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept() 5567 * 5568 * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager 5569 * association (we are not behind eager's squeue but sockfs is protecting us 5570 * and no one knows about this stream yet. The STREAMS entry point q->q_info 5571 * is changed to point at tcp_wput(). 5572 * 5573 * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to 5574 * listener (done on listener's perimeter). 5575 * 5576 * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish 5577 * accept. 5578 * 5579 * TLI/XTI client ACCEPT path: 5580 * --------------------------- 5581 * 5582 * soaccept() sends T_CONN_RES on the listener STREAM. 5583 * 5584 * tcp_accept() -> tcp_accept_swap() complete the processing and send 5585 * the bind_mp to eager perimeter to finish accept (tcp_rput_other()). 
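 *
 * For reference, the listener <-> eager linkage set up at SYN time looks
 * roughly like this (simplified from tcp_conn_request() below, with the
 * listener tcp renamed to 'listener' for clarity):
 *
 *	mutex_enter(&listener->tcp_eager_lock);
 *	listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
 *	eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0;
 *	listener->tcp_eager_next_q0 = eager;
 *	eager->tcp_eager_prev_q0 = listener;
 *	eager->tcp_listener = listener;
 *	mutex_exit(&listener->tcp_eager_lock);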
5586 * 5587 * Locks: 5588 * ====== 5589 * 5590 * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0 5591 * and listeners->tcp_eager_next_q. 5592 * 5593 * Referencing: 5594 * ============ 5595 * 5596 * 1) We start out in tcp_conn_request by the eager placing a ref on the 5597 * listener and the listener adding the eager to listeners->tcp_eager_next_q0. 5598 * 5599 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to the listener. Before 5600 * doing so we place a ref on the eager. This ref is finally dropped at the 5601 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the 5602 * reference is dropped by the squeue framework. 5603 * 5604 * 3) The ref on the listener placed in 1) above is dropped in tcp_accept_finish(). 5605 * 5606 * The reference must be released by the same entity that added it. 5607 * In the above scheme, the eager is the entity that adds and releases the 5608 * references. Note that tcp_accept_finish() executes in the squeue of the eager 5609 * (albeit after it is attached to the acceptor stream). Though step 1) executes 5610 * in the listener's squeue, the eager is nascent at this point and the 5611 * reference can be considered to have been added on behalf of the eager. 5612 * 5613 * Eager getting a Reset or listener closing: 5614 * ========================================== 5615 * 5616 * Once the listener and eager are linked, the listener never does the unlink. 5617 * If the listener needs to close, tcp_eager_cleanup() is called which queues 5618 * a message on each eager's perimeter. The eager then does the unlink, clears 5619 * any pointers to the listener's queue and drops the reference to the 5620 * listener. The listener waits in tcp_close outside the squeue until its 5621 * refcount has dropped to 1. This ensures that the listener has waited for 5622 * all eagers to clear their association with the listener. 5623 * 5624 * Similarly, if the eager decides to go away, it can unlink itself and close. 5625 * When the T_CONN_RES comes down, we check if the eager has closed. Note that 5626 * the reference to eager is still valid because of the extra ref we put 5627 * in tcp_send_conn_ind. 5628 * 5629 * The listener can always locate the eager under the protection 5630 * of the listener->tcp_eager_lock, and then do a refhold 5631 * on the eager during the accept processing. 5632 * 5633 * The acceptor stream accesses the eager in the accept processing 5634 * based on the ref placed on the eager before sending the T_CONN_IND. 5635 * The only entity that can negate this refhold is a listener close 5636 * which is mutually exclusive with an active acceptor stream. 5637 * 5638 * Eager's reference on the listener 5639 * =================================== 5640 * 5641 * If the accept happens (even on a closed eager), the eager drops its 5642 * reference on the listener at the start of tcp_accept_finish. If the 5643 * eager is killed due to an incoming RST before the T_CONN_IND is sent up, 5644 * the reference is dropped in tcp_closei_local. If the listener closes, 5645 * the reference is dropped in tcp_eager_kill. In all cases the reference 5646 * is dropped while executing in the eager's context (squeue). 5647 */ 5648 /* END CSTYLED */ 5649 5650 /* Process the SYN packet, mp, directed at the listener 'tcp' */ 5651 5652 /* 5653 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. 5654 * tcp_rput_data will not see any SYN packets.
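 *
 * Conceptually the hand-off from IP looks like this (sketch only, not
 * the exact IP-side code):
 *
 *	CONN_INC_REF(connp);
 *	squeue_fill(connp->conn_sqp, mp, tcp_conn_request, connp,
 *	    SQTAG_TCP_CONN_REQ_1);
 *
 * i.e. the listener's conn is refheld and the SYN is queued on the
 * listener's squeue with tcp_conn_request as the callback; the squeue
 * framework drops that reference once the callback has run.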
5655 */ 5656 /* ARGSUSED */ 5657 void 5658 tcp_conn_request(void *arg, mblk_t *mp, void *arg2) 5659 { 5660 tcph_t *tcph; 5661 uint32_t seg_seq; 5662 tcp_t *eager; 5663 uint_t ipvers; 5664 ipha_t *ipha; 5665 ip6_t *ip6h; 5666 int err; 5667 conn_t *econnp = NULL; 5668 squeue_t *new_sqp; 5669 mblk_t *mp1; 5670 uint_t ip_hdr_len; 5671 conn_t *connp = (conn_t *)arg; 5672 tcp_t *tcp = connp->conn_tcp; 5673 cred_t *credp; 5674 tcp_stack_t *tcps = tcp->tcp_tcps; 5675 ip_stack_t *ipst; 5676 5677 if (tcp->tcp_state != TCPS_LISTEN) 5678 goto error2; 5679 5680 ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0); 5681 5682 mutex_enter(&tcp->tcp_eager_lock); 5683 if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { 5684 mutex_exit(&tcp->tcp_eager_lock); 5685 TCP_STAT(tcps, tcp_listendrop); 5686 BUMP_MIB(&tcps->tcps_mib, tcpListenDrop); 5687 if (tcp->tcp_debug) { 5688 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 5689 "tcp_conn_request: listen backlog (max=%d) " 5690 "overflow (%d pending) on %s", 5691 tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, 5692 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 5693 } 5694 goto error2; 5695 } 5696 5697 if (tcp->tcp_conn_req_cnt_q0 >= 5698 tcp->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { 5699 /* 5700 * Q0 is full. Drop a pending half-open req from the queue 5701 * to make room for the new SYN req. Also mark the time we 5702 * drop a SYN. 5703 * 5704 * A more aggressive defense against SYN attack will 5705 * be to set the "tcp_syn_defense" flag now. 5706 */ 5707 TCP_STAT(tcps, tcp_listendropq0); 5708 tcp->tcp_last_rcv_lbolt = lbolt64; 5709 if (!tcp_drop_q0(tcp)) { 5710 mutex_exit(&tcp->tcp_eager_lock); 5711 BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0); 5712 if (tcp->tcp_debug) { 5713 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 5714 "tcp_conn_request: listen half-open queue " 5715 "(max=%d) full (%d pending) on %s", 5716 tcps->tcps_conn_req_max_q0, 5717 tcp->tcp_conn_req_cnt_q0, 5718 tcp_display(tcp, NULL, 5719 DISP_PORT_ONLY)); 5720 } 5721 goto error2; 5722 } 5723 } 5724 mutex_exit(&tcp->tcp_eager_lock); 5725 5726 /* 5727 * IP adds STRUIO_EAGER and ensures that the received packet is 5728 * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6 5729 * link local address. If IPSec is enabled, db_struioflag has 5730 * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER); 5731 * otherwise an error case if neither of them is set. 5732 */ 5733 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 5734 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5735 DB_CKSUMSTART(mp) = 0; 5736 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 5737 econnp = (conn_t *)tcp_get_conn(arg2, tcps); 5738 if (econnp == NULL) 5739 goto error2; 5740 ASSERT(econnp->conn_netstack == connp->conn_netstack); 5741 econnp->conn_sqp = new_sqp; 5742 } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) { 5743 /* 5744 * mp is updated in tcp_get_ipsec_conn(). 5745 */ 5746 econnp = tcp_get_ipsec_conn(tcp, arg2, &mp); 5747 if (econnp == NULL) { 5748 /* 5749 * mp freed by tcp_get_ipsec_conn. 
5750 */ 5751 return; 5752 } 5753 ASSERT(econnp->conn_netstack == connp->conn_netstack); 5754 } else { 5755 goto error2; 5756 } 5757 5758 ASSERT(DB_TYPE(mp) == M_DATA); 5759 5760 ipvers = IPH_HDR_VERSION(mp->b_rptr); 5761 ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION); 5762 ASSERT(OK_32PTR(mp->b_rptr)); 5763 if (ipvers == IPV4_VERSION) { 5764 ipha = (ipha_t *)mp->b_rptr; 5765 ip_hdr_len = IPH_HDR_LENGTH(ipha); 5766 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5767 } else { 5768 ip6h = (ip6_t *)mp->b_rptr; 5769 ip_hdr_len = ip_hdr_length_v6(mp, ip6h); 5770 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5771 } 5772 5773 if (tcp->tcp_family == AF_INET) { 5774 ASSERT(ipvers == IPV4_VERSION); 5775 err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp); 5776 } else { 5777 err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp); 5778 } 5779 5780 if (err) 5781 goto error3; 5782 5783 eager = econnp->conn_tcp; 5784 5785 /* Inherit various TCP parameters from the listener */ 5786 eager->tcp_naglim = tcp->tcp_naglim; 5787 eager->tcp_first_timer_threshold = 5788 tcp->tcp_first_timer_threshold; 5789 eager->tcp_second_timer_threshold = 5790 tcp->tcp_second_timer_threshold; 5791 5792 eager->tcp_first_ctimer_threshold = 5793 tcp->tcp_first_ctimer_threshold; 5794 eager->tcp_second_ctimer_threshold = 5795 tcp->tcp_second_ctimer_threshold; 5796 5797 /* 5798 * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics. 5799 * If it does not, the eager's receive window will be set to the 5800 * listener's receive window later in this function. 5801 */ 5802 eager->tcp_rwnd = 0; 5803 5804 /* 5805 * Inherit listener's tcp_init_cwnd. Need to do this before 5806 * calling tcp_process_options() where tcp_mss_set() is called 5807 * to set the initial cwnd. 5808 */ 5809 eager->tcp_init_cwnd = tcp->tcp_init_cwnd; 5810 5811 /* 5812 * Zones: tcp_adapt_ire() and tcp_send_data() both need the 5813 * zone id before the accept is completed in tcp_wput_accept(). 5814 */ 5815 econnp->conn_zoneid = connp->conn_zoneid; 5816 econnp->conn_allzones = connp->conn_allzones; 5817 5818 /* Copy nexthop information from listener to eager */ 5819 if (connp->conn_nexthop_set) { 5820 econnp->conn_nexthop_set = connp->conn_nexthop_set; 5821 econnp->conn_nexthop_v4 = connp->conn_nexthop_v4; 5822 } 5823 5824 /* 5825 * TSOL: tsol_input_proc() needs the eager's cred before the 5826 * eager is accepted 5827 */ 5828 econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred; 5829 crhold(credp); 5830 5831 /* 5832 * If the caller has the process-wide flag set, then default to MAC 5833 * exempt mode. This allows read-down to unlabeled hosts. 
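 * (In other words, if the listener's process has set the NET_MAC_AWARE
 * flag checked below, the eager is marked conn_mac_exempt so it may
 * exchange traffic with unlabeled peers.)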
5834 */ 5835 if (getpflags(NET_MAC_AWARE, credp) != 0) 5836 econnp->conn_mac_exempt = B_TRUE; 5837 5838 if (is_system_labeled()) { 5839 cred_t *cr; 5840 5841 if (connp->conn_mlp_type != mlptSingle) { 5842 cr = econnp->conn_peercred = DB_CRED(mp); 5843 if (cr != NULL) 5844 crhold(cr); 5845 else 5846 cr = econnp->conn_cred; 5847 DTRACE_PROBE2(mlp_syn_accept, conn_t *, 5848 econnp, cred_t *, cr) 5849 } else { 5850 cr = econnp->conn_cred; 5851 DTRACE_PROBE2(syn_accept, conn_t *, 5852 econnp, cred_t *, cr) 5853 } 5854 5855 if (!tcp_update_label(eager, cr)) { 5856 DTRACE_PROBE3( 5857 tx__ip__log__error__connrequest__tcp, 5858 char *, "eager connp(1) label on SYN mp(2) failed", 5859 conn_t *, econnp, mblk_t *, mp); 5860 goto error3; 5861 } 5862 } 5863 5864 eager->tcp_hard_binding = B_TRUE; 5865 5866 tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ 5867 TCP_BIND_HASH(eager->tcp_lport)], eager, 0); 5868 5869 CL_INET_CONNECT(eager); 5870 5871 /* 5872 * No need to check for multicast destination since ip will only pass 5873 * up multicasts to those that have expressed interest 5874 * TODO: what about rejecting broadcasts? 5875 * Also check that source is not a multicast or broadcast address. 5876 */ 5877 eager->tcp_state = TCPS_SYN_RCVD; 5878 5879 5880 /* 5881 * There should be no ire in the mp as we are being called after 5882 * receiving the SYN. 5883 */ 5884 ASSERT(tcp_ire_mp(mp) == NULL); 5885 5886 /* 5887 * Adapt our mss, ttl, ... according to information provided in IRE. 5888 */ 5889 5890 if (tcp_adapt_ire(eager, NULL) == 0) { 5891 /* Undo the bind_hash_insert */ 5892 tcp_bind_hash_remove(eager); 5893 goto error3; 5894 } 5895 5896 /* Process all TCP options. */ 5897 tcp_process_options(eager, tcph); 5898 5899 /* Is the other end ECN capable? */ 5900 if (tcps->tcps_ecn_permitted >= 1 && 5901 (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 5902 eager->tcp_ecn_ok = B_TRUE; 5903 } 5904 5905 /* 5906 * listener->tcp_rq->q_hiwat should be the default window size or a 5907 * window size changed via SO_RCVBUF option. First round up the 5908 * eager's tcp_rwnd to the nearest MSS. Then find out the window 5909 * scale option value if needed. Call tcp_rwnd_set() to finish the 5910 * setting. 5911 * 5912 * Note if there is a rpipe metric associated with the remote host, 5913 * we should not inherit receive window size from listener. 5914 */ 5915 eager->tcp_rwnd = MSS_ROUNDUP( 5916 (eager->tcp_rwnd == 0 ? tcp->tcp_rq->q_hiwat : 5917 eager->tcp_rwnd), eager->tcp_mss); 5918 if (eager->tcp_snd_ws_ok) 5919 tcp_set_ws_value(eager); 5920 /* 5921 * Note that this is the only place tcp_rwnd_set() is called for 5922 * accepting a connection. We need to call it here instead of 5923 * after the 3-way handshake because we need to tell the other 5924 * side our rwnd in the SYN-ACK segment. 5925 */ 5926 (void) tcp_rwnd_set(eager, eager->tcp_rwnd); 5927 5928 /* 5929 * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ 5930 * via soaccept()->soinheritoptions() which essentially applies 5931 * all the listener options to the new STREAM. The options that we 5932 * need to take care of are: 5933 * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST, 5934 * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER, 5935 * SO_SNDBUF, SO_RCVBUF. 5936 * 5937 * SO_RCVBUF: tcp_rwnd_set() above takes care of it. 5938 * SO_SNDBUF: Set the tcp_xmit_hiwater for the eager. When 5939 * tcp_maxpsz_set() gets called later from 5940 * tcp_accept_finish(), the option takes effect. 
5941 * 5942 */ 5943 /* Set the TCP options */ 5944 eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater; 5945 eager->tcp_dgram_errind = tcp->tcp_dgram_errind; 5946 eager->tcp_oobinline = tcp->tcp_oobinline; 5947 eager->tcp_reuseaddr = tcp->tcp_reuseaddr; 5948 eager->tcp_broadcast = tcp->tcp_broadcast; 5949 eager->tcp_useloopback = tcp->tcp_useloopback; 5950 eager->tcp_dontroute = tcp->tcp_dontroute; 5951 eager->tcp_linger = tcp->tcp_linger; 5952 eager->tcp_lingertime = tcp->tcp_lingertime; 5953 if (tcp->tcp_ka_enabled) 5954 eager->tcp_ka_enabled = 1; 5955 5956 /* Set the IP options */ 5957 econnp->conn_broadcast = connp->conn_broadcast; 5958 econnp->conn_loopback = connp->conn_loopback; 5959 econnp->conn_dontroute = connp->conn_dontroute; 5960 econnp->conn_reuseaddr = connp->conn_reuseaddr; 5961 5962 /* Put a ref on the listener for the eager. */ 5963 CONN_INC_REF(connp); 5964 mutex_enter(&tcp->tcp_eager_lock); 5965 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 5966 eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 5967 tcp->tcp_eager_next_q0 = eager; 5968 eager->tcp_eager_prev_q0 = tcp; 5969 5970 /* Set tcp_listener before adding it to tcp_conn_fanout */ 5971 eager->tcp_listener = tcp; 5972 eager->tcp_saved_listener = tcp; 5973 5974 /* 5975 * Tag this detached tcp vector for later retrieval 5976 * by our listener client in tcp_accept(). 5977 */ 5978 eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum; 5979 tcp->tcp_conn_req_cnt_q0++; 5980 if (++tcp->tcp_conn_req_seqnum == -1) { 5981 /* 5982 * -1 is "special" and defined in TPI as something 5983 * that should never be used in T_CONN_IND 5984 */ 5985 ++tcp->tcp_conn_req_seqnum; 5986 } 5987 mutex_exit(&tcp->tcp_eager_lock); 5988 5989 if (tcp->tcp_syn_defense) { 5990 /* Don't drop the SYN that comes from a good IP source */ 5991 ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache); 5992 if (addr_cache != NULL && eager->tcp_remote == 5993 addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) { 5994 eager->tcp_dontdrop = B_TRUE; 5995 } 5996 } 5997 5998 /* 5999 * We need to insert the eager in its own perimeter but as soon 6000 * as we do that, we expose the eager to the classifier and 6001 * should not touch any field outside the eager's perimeter. 6002 * So do all the work necessary before inserting the eager 6003 * in its own perimeter. Be optimistic that ipcl_conn_insert() 6004 * will succeed but undo everything if it fails. 6005 */ 6006 seg_seq = ABE32_TO_U32(tcph->th_seq); 6007 eager->tcp_irs = seg_seq; 6008 eager->tcp_rack = seg_seq; 6009 eager->tcp_rnxt = seg_seq + 1; 6010 U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack); 6011 BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens); 6012 eager->tcp_state = TCPS_SYN_RCVD; 6013 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, 6014 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); 6015 if (mp1 == NULL) { 6016 /* 6017 * Increment the ref count as we are going to 6018 * enqueueing an mp in squeue 6019 */ 6020 CONN_INC_REF(econnp); 6021 goto error; 6022 } 6023 DB_CPID(mp1) = tcp->tcp_cpid; 6024 eager->tcp_cpid = tcp->tcp_cpid; 6025 eager->tcp_open_time = lbolt64; 6026 6027 /* 6028 * We need to start the rto timer. In normal case, we start 6029 * the timer after sending the packet on the wire (or at 6030 * least believing that packet was sent by waiting for 6031 * CALL_IP_WPUT() to return). 
Since this is the first packet 6032 * being sent on the wire for the eager, our initial tcp_rto 6033 * is at least tcp_rexmit_interval_min which is a fairly 6034 * large value to allow the algorithm to adjust slowly to large 6035 * fluctuations of RTT during first few transmissions. 6036 * 6037 * Starting the timer first and then sending the packet in this 6038 * case shouldn't make much difference since tcp_rexmit_interval_min 6039 * is of the order of several 100ms and starting the timer 6040 * first and then sending the packet will result in difference 6041 * of few micro seconds. 6042 * 6043 * Without this optimization, we are forced to hold the fanout 6044 * lock across the ipcl_bind_insert() and sending the packet 6045 * so that we don't race against an incoming packet (maybe RST) 6046 * for this eager. 6047 * 6048 * It is necessary to acquire an extra reference on the eager 6049 * at this point and hold it until after tcp_send_data() to 6050 * ensure against an eager close race. 6051 */ 6052 6053 CONN_INC_REF(eager->tcp_connp); 6054 6055 TCP_RECORD_TRACE(eager, mp1, TCP_TRACE_SEND_PKT); 6056 TCP_TIMER_RESTART(eager, eager->tcp_rto); 6057 6058 6059 /* 6060 * Insert the eager in its own perimeter now. We are ready to deal 6061 * with any packets on eager. 6062 */ 6063 if (eager->tcp_ipversion == IPV4_VERSION) { 6064 if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) { 6065 goto error; 6066 } 6067 } else { 6068 if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) { 6069 goto error; 6070 } 6071 } 6072 6073 /* mark conn as fully-bound */ 6074 econnp->conn_fully_bound = B_TRUE; 6075 6076 /* Send the SYN-ACK */ 6077 tcp_send_data(eager, eager->tcp_wq, mp1); 6078 CONN_DEC_REF(eager->tcp_connp); 6079 freemsg(mp); 6080 6081 return; 6082 error: 6083 freemsg(mp1); 6084 eager->tcp_closemp_used = B_TRUE; 6085 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); 6086 squeue_fill(econnp->conn_sqp, &eager->tcp_closemp, tcp_eager_kill, 6087 econnp, SQTAG_TCP_CONN_REQ_2); 6088 6089 /* 6090 * If a connection already exists, send the mp to that connections so 6091 * that it can be appropriately dealt with. 6092 */ 6093 ipst = tcps->tcps_netstack->netstack_ip; 6094 6095 if ((econnp = ipcl_classify(mp, connp->conn_zoneid, ipst)) != NULL) { 6096 if (!IPCL_IS_CONNECTED(econnp)) { 6097 /* 6098 * Something bad happened. ipcl_conn_insert() 6099 * failed because a connection already existed 6100 * in connected hash but we can't find it 6101 * anymore (someone blew it away). Just 6102 * free this message and hopefully remote 6103 * will retransmit at which time the SYN can be 6104 * treated as a new connection or dealth with 6105 * a TH_RST if a connection already exists. 6106 */ 6107 CONN_DEC_REF(econnp); 6108 freemsg(mp); 6109 } else { 6110 squeue_fill(econnp->conn_sqp, mp, tcp_input, 6111 econnp, SQTAG_TCP_CONN_REQ_1); 6112 } 6113 } else { 6114 /* Nobody wants this packet */ 6115 freemsg(mp); 6116 } 6117 return; 6118 error3: 6119 CONN_DEC_REF(econnp); 6120 error2: 6121 freemsg(mp); 6122 } 6123 6124 /* 6125 * In an ideal case of vertical partition in NUMA architecture, its 6126 * beneficial to have the listener and all the incoming connections 6127 * tied to the same squeue. The other constraint is that incoming 6128 * connections should be tied to the squeue attached to interrupted 6129 * CPU for obvious locality reason so this leaves the listener to 6130 * be tied to the same squeue. 
Our only problem is that when listener 6131 * is binding, the CPU that will get interrupted by the NIC whose 6132 * IP address the listener is binding to is not even known. So 6133 * the code below allows us to change that binding at the time the 6134 * CPU is interrupted by virtue of incoming connection's squeue. 6135 * 6136 * This is usefull only in case of a listener bound to a specific IP 6137 * address. For other kind of listeners, they get bound the 6138 * very first time and there is no attempt to rebind them. 6139 */ 6140 void 6141 tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) 6142 { 6143 conn_t *connp = (conn_t *)arg; 6144 squeue_t *sqp = (squeue_t *)arg2; 6145 squeue_t *new_sqp; 6146 uint32_t conn_flags; 6147 6148 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 6149 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 6150 } else { 6151 goto done; 6152 } 6153 6154 if (connp->conn_fanout == NULL) 6155 goto done; 6156 6157 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) { 6158 mutex_enter(&connp->conn_fanout->connf_lock); 6159 mutex_enter(&connp->conn_lock); 6160 /* 6161 * No one from read or write side can access us now 6162 * except for already queued packets on this squeue. 6163 * But since we haven't changed the squeue yet, they 6164 * can't execute. If they are processed after we have 6165 * changed the squeue, they are sent back to the 6166 * correct squeue down below. 6167 * But a listner close can race with processing of 6168 * incoming SYN. If incoming SYN processing changes 6169 * the squeue then the listener close which is waiting 6170 * to enter the squeue would operate on the wrong 6171 * squeue. Hence we don't change the squeue here unless 6172 * the refcount is exactly the minimum refcount. The 6173 * minimum refcount of 4 is counted as - 1 each for 6174 * TCP and IP, 1 for being in the classifier hash, and 6175 * 1 for the mblk being processed. 6176 */ 6177 6178 if (connp->conn_ref != 4 || 6179 connp->conn_tcp->tcp_state != TCPS_LISTEN) { 6180 mutex_exit(&connp->conn_lock); 6181 mutex_exit(&connp->conn_fanout->connf_lock); 6182 goto done; 6183 } 6184 if (connp->conn_sqp != new_sqp) { 6185 while (connp->conn_sqp != new_sqp) 6186 (void) casptr(&connp->conn_sqp, sqp, new_sqp); 6187 } 6188 6189 do { 6190 conn_flags = connp->conn_flags; 6191 conn_flags |= IPCL_FULLY_BOUND; 6192 (void) cas32(&connp->conn_flags, connp->conn_flags, 6193 conn_flags); 6194 } while (!(connp->conn_flags & IPCL_FULLY_BOUND)); 6195 6196 mutex_exit(&connp->conn_fanout->connf_lock); 6197 mutex_exit(&connp->conn_lock); 6198 } 6199 6200 done: 6201 if (connp->conn_sqp != sqp) { 6202 CONN_INC_REF(connp); 6203 squeue_fill(connp->conn_sqp, mp, 6204 connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND); 6205 } else { 6206 tcp_conn_request(connp, mp, sqp); 6207 } 6208 } 6209 6210 /* 6211 * Successful connect request processing begins when our client passes 6212 * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes 6213 * our T_OK_ACK reply message upstream. 
The control flow looks like this: 6214 * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_connect() -> IP 6215 * upstream <- tcp_rput() <- IP 6216 * After various error checks are completed, tcp_connect() lays 6217 * the target address and port into the composite header template, 6218 * preallocates the T_OK_ACK reply message, constructs a full 12-byte bind 6219 * request followed by an IRE request, and passes the three-mblk message 6220 * down to IP looking like this: 6221 * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client 6222 * Processing continues in tcp_rput() when we receive the following message: 6223 * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client 6224 * After consuming the first two mblks, tcp_rput() calls tcp_timer() 6225 * to fire off the connection request, and then passes the T_OK_ACK mblk 6226 * upstream that we filled in below. There are, of course, numerous 6227 * error conditions along the way which truncate the processing described 6228 * above. 6229 */ 6230 static void 6231 tcp_connect(tcp_t *tcp, mblk_t *mp) 6232 { 6233 sin_t *sin; 6234 sin6_t *sin6; 6235 queue_t *q = tcp->tcp_wq; 6236 struct T_conn_req *tcr; 6237 ipaddr_t *dstaddrp; 6238 in_port_t dstport; 6239 uint_t srcid; 6240 6241 tcr = (struct T_conn_req *)mp->b_rptr; 6242 6243 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 6244 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 6245 tcp_err_ack(tcp, mp, TPROTO, 0); 6246 return; 6247 } 6248 6249 /* 6250 * Determine the packet type based on the type of address passed in; 6251 * the request should contain an IPv4 or IPv6 address. 6252 * Make sure that the address family matches the 6253 * family of the address passed down. 6254 */ 6255 switch (tcr->DEST_length) { 6256 default: 6257 tcp_err_ack(tcp, mp, TBADADDR, 0); 6258 return; 6259 6260 case (sizeof (sin_t) - sizeof (sin->sin_zero)): { 6261 /* 6262 * XXX: The check for a valid DEST_length was not there 6263 * in earlier releases and some buggy 6264 * TLI apps (e.g. Sybase) got away with not feeding 6265 * in the sin_zero part of the address. 6266 * We allow that bug to keep those buggy apps humming. 6267 * Test suites require the check on DEST_length. 6268 * We construct a new mblk with a valid DEST_length and 6269 * free the original so the rest of the code does 6270 * not have to keep track of this special shorter 6271 * length address case.
6272 */ 6273 mblk_t *nmp; 6274 struct T_conn_req *ntcr; 6275 sin_t *nsin; 6276 6277 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + 6278 tcr->OPT_length, BPRI_HI); 6279 if (nmp == NULL) { 6280 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 6281 return; 6282 } 6283 ntcr = (struct T_conn_req *)nmp->b_rptr; 6284 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ 6285 ntcr->PRIM_type = T_CONN_REQ; 6286 ntcr->DEST_length = sizeof (sin_t); 6287 ntcr->DEST_offset = sizeof (struct T_conn_req); 6288 6289 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); 6290 *nsin = sin_null; 6291 /* Get pointer to shorter address to copy from original mp */ 6292 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 6293 tcr->DEST_length); /* extract DEST_length worth of sin_t */ 6294 if (sin == NULL || !OK_32PTR((char *)sin)) { 6295 freemsg(nmp); 6296 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6297 return; 6298 } 6299 nsin->sin_family = sin->sin_family; 6300 nsin->sin_port = sin->sin_port; 6301 nsin->sin_addr = sin->sin_addr; 6302 /* Note:nsin->sin_zero zero-fill with sin_null assign above */ 6303 nmp->b_wptr = (uchar_t *)&nsin[1]; 6304 if (tcr->OPT_length != 0) { 6305 ntcr->OPT_length = tcr->OPT_length; 6306 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; 6307 bcopy((uchar_t *)tcr + tcr->OPT_offset, 6308 (uchar_t *)ntcr + ntcr->OPT_offset, 6309 tcr->OPT_length); 6310 nmp->b_wptr += tcr->OPT_length; 6311 } 6312 freemsg(mp); /* original mp freed */ 6313 mp = nmp; /* re-initialize original variables */ 6314 tcr = ntcr; 6315 } 6316 /* FALLTHRU */ 6317 6318 case sizeof (sin_t): 6319 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 6320 sizeof (sin_t)); 6321 if (sin == NULL || !OK_32PTR((char *)sin)) { 6322 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6323 return; 6324 } 6325 if (tcp->tcp_family != AF_INET || 6326 sin->sin_family != AF_INET) { 6327 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6328 return; 6329 } 6330 if (sin->sin_port == 0) { 6331 tcp_err_ack(tcp, mp, TBADADDR, 0); 6332 return; 6333 } 6334 if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) { 6335 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6336 return; 6337 } 6338 6339 break; 6340 6341 case sizeof (sin6_t): 6342 sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset, 6343 sizeof (sin6_t)); 6344 if (sin6 == NULL || !OK_32PTR((char *)sin6)) { 6345 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6346 return; 6347 } 6348 if (tcp->tcp_family != AF_INET6 || 6349 sin6->sin6_family != AF_INET6) { 6350 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6351 return; 6352 } 6353 if (sin6->sin6_port == 0) { 6354 tcp_err_ack(tcp, mp, TBADADDR, 0); 6355 return; 6356 } 6357 break; 6358 } 6359 /* 6360 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we 6361 * should key on their sequence number and cut them loose. 
6362 */ 6363 6364 /* 6365 * If options passed in, feed it for verification and handling 6366 */ 6367 if (tcr->OPT_length != 0) { 6368 mblk_t *ok_mp; 6369 mblk_t *discon_mp; 6370 mblk_t *conn_opts_mp; 6371 int t_error, sys_error, do_disconnect; 6372 6373 conn_opts_mp = NULL; 6374 6375 if (tcp_conprim_opt_process(tcp, mp, 6376 &do_disconnect, &t_error, &sys_error) < 0) { 6377 if (do_disconnect) { 6378 ASSERT(t_error == 0 && sys_error == 0); 6379 discon_mp = mi_tpi_discon_ind(NULL, 6380 ECONNREFUSED, 0); 6381 if (!discon_mp) { 6382 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 6383 TSYSERR, ENOMEM); 6384 return; 6385 } 6386 ok_mp = mi_tpi_ok_ack_alloc(mp); 6387 if (!ok_mp) { 6388 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6389 TSYSERR, ENOMEM); 6390 return; 6391 } 6392 qreply(q, ok_mp); 6393 qreply(q, discon_mp); /* no flush! */ 6394 } else { 6395 ASSERT(t_error != 0); 6396 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, 6397 sys_error); 6398 } 6399 return; 6400 } 6401 /* 6402 * Success in setting options, the mp option buffer represented 6403 * by OPT_length/offset has been potentially modified and 6404 * contains results of option processing. We copy it in 6405 * another mp to save it for potentially influencing returning 6406 * it in T_CONN_CONN. 6407 */ 6408 if (tcr->OPT_length != 0) { /* there are resulting options */ 6409 conn_opts_mp = copyb(mp); 6410 if (!conn_opts_mp) { 6411 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 6412 TSYSERR, ENOMEM); 6413 return; 6414 } 6415 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); 6416 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; 6417 /* 6418 * Note: 6419 * These resulting option negotiation can include any 6420 * end-to-end negotiation options but there no such 6421 * thing (yet?) in our TCP/IP. 6422 */ 6423 } 6424 } 6425 6426 /* 6427 * If we're connecting to an IPv4-mapped IPv6 address, we need to 6428 * make sure that the template IP header in the tcp structure is an 6429 * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We 6430 * need to this before we call tcp_bindi() so that the port lookup 6431 * code will look for ports in the correct port space (IPv4 and 6432 * IPv6 have separate port spaces). 6433 */ 6434 if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION && 6435 IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 6436 int err = 0; 6437 6438 err = tcp_header_init_ipv4(tcp); 6439 if (err != 0) { 6440 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6441 goto connect_failed; 6442 } 6443 if (tcp->tcp_lport != 0) 6444 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 6445 } 6446 6447 if (tcp->tcp_issocket) { 6448 /* 6449 * TCP is _D_SODIRECT and sockfs is directly above so save 6450 * the shared sonode sodirect_t pointer (if any) to enable 6451 * TCP sodirect. 6452 */ 6453 tcp->tcp_sodirect = SOD_QTOSODP(tcp->tcp_rq); 6454 } 6455 6456 switch (tcp->tcp_state) { 6457 case TCPS_IDLE: 6458 /* 6459 * We support quick connect, refer to comments in 6460 * tcp_connect_*() 6461 */ 6462 /* FALLTHRU */ 6463 case TCPS_BOUND: 6464 case TCPS_LISTEN: 6465 if (tcp->tcp_family == AF_INET6) { 6466 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 6467 tcp_connect_ipv6(tcp, mp, 6468 &sin6->sin6_addr, 6469 sin6->sin6_port, sin6->sin6_flowinfo, 6470 sin6->__sin6_src_id, sin6->sin6_scope_id); 6471 return; 6472 } 6473 /* 6474 * Destination adress is mapped IPv6 address. 6475 * Source bound address should be unspecified or 6476 * IPv6 mapped address as well. 
6477 */ 6478 if (!IN6_IS_ADDR_UNSPECIFIED( 6479 &tcp->tcp_bound_source_v6) && 6480 !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) { 6481 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, 6482 EADDRNOTAVAIL); 6483 break; 6484 } 6485 dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); 6486 dstport = sin6->sin6_port; 6487 srcid = sin6->__sin6_src_id; 6488 } else { 6489 dstaddrp = &sin->sin_addr.s_addr; 6490 dstport = sin->sin_port; 6491 srcid = 0; 6492 } 6493 6494 tcp_connect_ipv4(tcp, mp, dstaddrp, dstport, srcid); 6495 return; 6496 default: 6497 mp = mi_tpi_err_ack_alloc(mp, TOUTSTATE, 0); 6498 break; 6499 } 6500 /* 6501 * Note: Code below is the "failure" case 6502 */ 6503 /* return error ack and blow away saved option results if any */ 6504 connect_failed: 6505 if (mp != NULL) 6506 putnext(tcp->tcp_rq, mp); 6507 else { 6508 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6509 TSYSERR, ENOMEM); 6510 } 6511 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6512 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6513 } 6514 6515 /* 6516 * Handle connect to IPv4 destinations, including connections for AF_INET6 6517 * sockets connecting to IPv4 mapped IPv6 destinations. 6518 */ 6519 static void 6520 tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, 6521 uint_t srcid) 6522 { 6523 tcph_t *tcph; 6524 mblk_t *mp1; 6525 ipaddr_t dstaddr = *dstaddrp; 6526 int32_t oldstate; 6527 uint16_t lport; 6528 tcp_stack_t *tcps = tcp->tcp_tcps; 6529 6530 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 6531 6532 /* Check for attempt to connect to INADDR_ANY */ 6533 if (dstaddr == INADDR_ANY) { 6534 /* 6535 * SunOS 4.x and 4.3 BSD allow an application 6536 * to connect a TCP socket to INADDR_ANY. 6537 * When they do this, the kernel picks the 6538 * address of one interface and uses it 6539 * instead. The kernel usually ends up 6540 * picking the address of the loopback 6541 * interface. This is an undocumented feature. 6542 * However, we provide the same thing here 6543 * in order to have source and binary 6544 * compatibility with SunOS 4.x. 6545 * Update the T_CONN_REQ (sin/sin6) since it is used to 6546 * generate the T_CONN_CON. 6547 */ 6548 dstaddr = htonl(INADDR_LOOPBACK); 6549 *dstaddrp = dstaddr; 6550 } 6551 6552 /* Handle __sin6_src_id if socket not bound to an IP address */ 6553 if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) { 6554 ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6, 6555 tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack); 6556 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6, 6557 tcp->tcp_ipha->ipha_src); 6558 } 6559 6560 /* 6561 * Don't let an endpoint connect to itself. Note that 6562 * the test here does not catch the case where the 6563 * source IP addr was left unspecified by the user. In 6564 * this case, the source addr is set in tcp_adapt_ire() 6565 * using the reply to the T_BIND message that we send 6566 * down to IP here and the check is repeated in tcp_rput_other. 6567 */ 6568 if (dstaddr == tcp->tcp_ipha->ipha_src && 6569 dstport == tcp->tcp_lport) { 6570 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6571 goto failed; 6572 } 6573 6574 tcp->tcp_ipha->ipha_dst = dstaddr; 6575 IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6); 6576 6577 /* 6578 * Massage a source route if any putting the first hop 6579 * in iph_dst. Compute a starting value for the checksum which 6580 * takes into account that the original iph_dst should be 6581 * included in the checksum but that ip will include the 6582 * first hop in the source route in the tcp checksum. 
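 *
 * The adjustments below are plain 16-bit one's-complement folds. A
 * small worked example (value chosen purely for illustration): a
 * running sum of 0x1a0f3 folds as
 *	(0x1a0f3 & 0xFFFF) + (0x1a0f3 >> 16) = 0xa0f3 + 0x1 = 0xa0f4
 * and if the subtraction of the first-hop destination halves goes
 * negative, the end-around borrow is repaid by the tcp_sum-- step
 * before the final folds.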
6583 */ 6584 tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha, tcps->tcps_netstack); 6585 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 6586 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + 6587 (tcp->tcp_ipha->ipha_dst & 0xffff)); 6588 if ((int)tcp->tcp_sum < 0) 6589 tcp->tcp_sum--; 6590 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 6591 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 6592 (tcp->tcp_sum >> 16)); 6593 tcph = tcp->tcp_tcph; 6594 *(uint16_t *)tcph->th_fport = dstport; 6595 tcp->tcp_fport = dstport; 6596 6597 oldstate = tcp->tcp_state; 6598 /* 6599 * At this point the remote destination address and remote port fields 6600 * in the tcp-four-tuple have been filled in the tcp structure. Now we 6601 * have to see which state tcp was in so we can take apropriate action. 6602 */ 6603 if (oldstate == TCPS_IDLE) { 6604 /* 6605 * We support a quick connect capability here, allowing 6606 * clients to transition directly from IDLE to SYN_SENT 6607 * tcp_bindi will pick an unused port, insert the connection 6608 * in the bind hash and transition to BOUND state. 6609 */ 6610 lport = tcp_update_next_port(tcps->tcps_next_port_to_try, 6611 tcp, B_TRUE); 6612 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, 6613 B_FALSE, B_FALSE); 6614 if (lport == 0) { 6615 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0); 6616 goto failed; 6617 } 6618 } 6619 tcp->tcp_state = TCPS_SYN_SENT; 6620 6621 /* 6622 * TODO: allow data with connect requests 6623 * by unlinking M_DATA trailers here and 6624 * linking them in behind the T_OK_ACK mblk. 6625 * The tcp_rput() bind ack handler would then 6626 * feed them to tcp_wput_data() rather than call 6627 * tcp_timer(). 6628 */ 6629 mp = mi_tpi_ok_ack_alloc(mp); 6630 if (!mp) { 6631 tcp->tcp_state = oldstate; 6632 goto failed; 6633 } 6634 if (tcp->tcp_family == AF_INET) { 6635 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 6636 sizeof (ipa_conn_t)); 6637 } else { 6638 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 6639 sizeof (ipa6_conn_t)); 6640 } 6641 if (mp1) { 6642 /* 6643 * We need to make sure that the conn_recv is set to a non-null 6644 * value before we insert the conn_t into the classifier table. 6645 * This is to avoid a race with an incoming packet which does 6646 * an ipcl_classify(). 6647 */ 6648 tcp->tcp_connp->conn_recv = tcp_input; 6649 6650 /* Hang onto the T_OK_ACK for later. */ 6651 linkb(mp1, mp); 6652 mblk_setcred(mp1, tcp->tcp_cred); 6653 if (tcp->tcp_family == AF_INET) 6654 mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp); 6655 else { 6656 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, 6657 &tcp->tcp_sticky_ipp); 6658 } 6659 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); 6660 tcp->tcp_active_open = 1; 6661 /* 6662 * If the bind cannot complete immediately 6663 * IP will arrange to call tcp_rput_other 6664 * when the bind completes. 6665 */ 6666 if (mp1 != NULL) 6667 tcp_rput_other(tcp, mp1); 6668 return; 6669 } 6670 /* Error case */ 6671 tcp->tcp_state = oldstate; 6672 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6673 6674 failed: 6675 /* return error ack and blow away saved option results if any */ 6676 if (mp != NULL) 6677 putnext(tcp->tcp_rq, mp); 6678 else { 6679 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6680 TSYSERR, ENOMEM); 6681 } 6682 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6683 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6684 6685 } 6686 6687 /* 6688 * Handle connect to IPv6 destinations. 
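 * This path handles AF_INET6 sockets whose destination is a native
 * IPv6 address; connects to IPv4-mapped IPv6 destinations were already
 * dispatched to tcp_connect_ipv4() by tcp_connect() above.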
6689 */ 6690 static void 6691 tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, 6692 in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id) 6693 { 6694 tcph_t *tcph; 6695 mblk_t *mp1; 6696 ip6_rthdr_t *rth; 6697 int32_t oldstate; 6698 uint16_t lport; 6699 tcp_stack_t *tcps = tcp->tcp_tcps; 6700 6701 ASSERT(tcp->tcp_family == AF_INET6); 6702 6703 /* 6704 * If we're here, it means that the destination address is a native 6705 * IPv6 address. Return an error if tcp_ipversion is not IPv6. A 6706 * reason why it might not be IPv6 is if the socket was bound to an 6707 * IPv4-mapped IPv6 address. 6708 */ 6709 if (tcp->tcp_ipversion != IPV6_VERSION) { 6710 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6711 goto failed; 6712 } 6713 6714 /* 6715 * Interpret a zero destination to mean loopback. 6716 * Update the T_CONN_REQ (sin/sin6) since it is used to 6717 * generate the T_CONN_CON. 6718 */ 6719 if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) { 6720 *dstaddrp = ipv6_loopback; 6721 } 6722 6723 /* Handle __sin6_src_id if socket not bound to an IP address */ 6724 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { 6725 ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src, 6726 tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack); 6727 tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src; 6728 } 6729 6730 /* 6731 * Take care of the scope_id now and add ip6i_t 6732 * if ip6i_t is not already allocated through TCP 6733 * sticky options. At this point tcp_ip6h does not 6734 * have dst info, thus use dstaddrp. 6735 */ 6736 if (scope_id != 0 && 6737 IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { 6738 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 6739 ip6i_t *ip6i; 6740 6741 ipp->ipp_ifindex = scope_id; 6742 ip6i = (ip6i_t *)tcp->tcp_iphc; 6743 6744 if ((ipp->ipp_fields & IPPF_HAS_IP6I) && 6745 ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) { 6746 /* Already allocated */ 6747 ip6i->ip6i_flags |= IP6I_IFINDEX; 6748 ip6i->ip6i_ifindex = ipp->ipp_ifindex; 6749 ipp->ipp_fields |= IPPF_SCOPE_ID; 6750 } else { 6751 int reterr; 6752 6753 ipp->ipp_fields |= IPPF_SCOPE_ID; 6754 if (ipp->ipp_fields & IPPF_HAS_IP6I) 6755 ip2dbg(("tcp_connect_v6: SCOPE_ID set\n")); 6756 reterr = tcp_build_hdrs(tcp->tcp_rq, tcp); 6757 if (reterr != 0) 6758 goto failed; 6759 ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n")); 6760 } 6761 } 6762 6763 /* 6764 * Don't let an endpoint connect to itself. Note that 6765 * the test here does not catch the case where the 6766 * source IP addr was left unspecified by the user. In 6767 * this case, the source addr is set in tcp_adapt_ire() 6768 * using the reply to the T_BIND message that we send 6769 * down to IP here and the check is repeated in tcp_rput_other. 6770 */ 6771 if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) && 6772 (dstport == tcp->tcp_lport)) { 6773 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6774 goto failed; 6775 } 6776 6777 tcp->tcp_ip6h->ip6_dst = *dstaddrp; 6778 tcp->tcp_remote_v6 = *dstaddrp; 6779 tcp->tcp_ip6h->ip6_vcf = 6780 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 6781 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 6782 6783 6784 /* 6785 * Massage a routing header (if present) putting the first hop 6786 * in ip6_dst. Compute a starting value for the checksum which 6787 * takes into account that the original ip6_dst should be 6788 * included in the checksum but that ip will include the 6789 * first hop in the source route in the tcp checksum. 
6790 */ 6791 rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph); 6792 if (rth != NULL) { 6793 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth, 6794 tcps->tcps_netstack); 6795 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 6796 (tcp->tcp_sum >> 16)); 6797 } else { 6798 tcp->tcp_sum = 0; 6799 } 6800 6801 tcph = tcp->tcp_tcph; 6802 *(uint16_t *)tcph->th_fport = dstport; 6803 tcp->tcp_fport = dstport; 6804 6805 oldstate = tcp->tcp_state; 6806 /* 6807 * At this point the remote destination address and remote port fields 6808 * in the tcp-four-tuple have been filled in the tcp structure. Now we 6809 * have to see which state tcp was in so we can take apropriate action. 6810 */ 6811 if (oldstate == TCPS_IDLE) { 6812 /* 6813 * We support a quick connect capability here, allowing 6814 * clients to transition directly from IDLE to SYN_SENT 6815 * tcp_bindi will pick an unused port, insert the connection 6816 * in the bind hash and transition to BOUND state. 6817 */ 6818 lport = tcp_update_next_port(tcps->tcps_next_port_to_try, 6819 tcp, B_TRUE); 6820 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, 6821 B_FALSE, B_FALSE); 6822 if (lport == 0) { 6823 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0); 6824 goto failed; 6825 } 6826 } 6827 tcp->tcp_state = TCPS_SYN_SENT; 6828 /* 6829 * TODO: allow data with connect requests 6830 * by unlinking M_DATA trailers here and 6831 * linking them in behind the T_OK_ACK mblk. 6832 * The tcp_rput() bind ack handler would then 6833 * feed them to tcp_wput_data() rather than call 6834 * tcp_timer(). 6835 */ 6836 mp = mi_tpi_ok_ack_alloc(mp); 6837 if (!mp) { 6838 tcp->tcp_state = oldstate; 6839 goto failed; 6840 } 6841 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t)); 6842 if (mp1) { 6843 /* 6844 * We need to make sure that the conn_recv is set to a non-null 6845 * value before we insert the conn_t into the classifier table. 6846 * This is to avoid a race with an incoming packet which does 6847 * an ipcl_classify(). 6848 */ 6849 tcp->tcp_connp->conn_recv = tcp_input; 6850 6851 /* Hang onto the T_OK_ACK for later. */ 6852 linkb(mp1, mp); 6853 mblk_setcred(mp1, tcp->tcp_cred); 6854 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, 6855 &tcp->tcp_sticky_ipp); 6856 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); 6857 tcp->tcp_active_open = 1; 6858 /* ip_bind_v6() may return ACK or ERROR */ 6859 if (mp1 != NULL) 6860 tcp_rput_other(tcp, mp1); 6861 return; 6862 } 6863 /* Error case */ 6864 tcp->tcp_state = oldstate; 6865 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6866 6867 failed: 6868 /* return error ack and blow away saved option results if any */ 6869 if (mp != NULL) 6870 putnext(tcp->tcp_rq, mp); 6871 else { 6872 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6873 TSYSERR, ENOMEM); 6874 } 6875 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6876 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6877 } 6878 6879 /* 6880 * We need a stream q for detached closing tcp connections 6881 * to use. Our client hereby indicates that this q is the 6882 * one to use. 
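 * (This is the handler for the TCP_IOC_DEFAULT_Q ioctl. The queue is
 * remembered in tcps_g_q and is, for example, what tcp_eager_kill()
 * points a detached eager at when its listener is going away.)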
6883 */ 6884 static void 6885 tcp_def_q_set(tcp_t *tcp, mblk_t *mp) 6886 { 6887 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 6888 queue_t *q = tcp->tcp_wq; 6889 tcp_stack_t *tcps = tcp->tcp_tcps; 6890 6891 #ifdef NS_DEBUG 6892 (void) printf("TCP_IOC_DEFAULT_Q for stack %d\n", 6893 tcps->tcps_netstack->netstack_stackid); 6894 #endif 6895 mp->b_datap->db_type = M_IOCACK; 6896 iocp->ioc_count = 0; 6897 mutex_enter(&tcps->tcps_g_q_lock); 6898 if (tcps->tcps_g_q != NULL) { 6899 mutex_exit(&tcps->tcps_g_q_lock); 6900 iocp->ioc_error = EALREADY; 6901 } else { 6902 mblk_t *mp1; 6903 6904 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0); 6905 if (mp1 == NULL) { 6906 mutex_exit(&tcps->tcps_g_q_lock); 6907 iocp->ioc_error = ENOMEM; 6908 } else { 6909 tcps->tcps_g_q = tcp->tcp_rq; 6910 mutex_exit(&tcps->tcps_g_q_lock); 6911 iocp->ioc_error = 0; 6912 iocp->ioc_rval = 0; 6913 /* 6914 * We are passing tcp_sticky_ipp as NULL 6915 * as it is not useful for tcp_default queue 6916 * 6917 * Set conn_recv just in case. 6918 */ 6919 tcp->tcp_connp->conn_recv = tcp_conn_request; 6920 6921 mp1 = ip_bind_v6(q, mp1, tcp->tcp_connp, NULL); 6922 if (mp1 != NULL) 6923 tcp_rput_other(tcp, mp1); 6924 } 6925 } 6926 qreply(q, mp); 6927 } 6928 6929 /* 6930 * Our client hereby directs us to reject the connection request 6931 * that tcp_conn_request() marked with 'seqnum'. Rejection consists 6932 * of sending the appropriate RST, not an ICMP error. 6933 */ 6934 static void 6935 tcp_disconnect(tcp_t *tcp, mblk_t *mp) 6936 { 6937 tcp_t *ltcp = NULL; 6938 t_scalar_t seqnum; 6939 conn_t *connp; 6940 tcp_stack_t *tcps = tcp->tcp_tcps; 6941 6942 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 6943 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { 6944 tcp_err_ack(tcp, mp, TPROTO, 0); 6945 return; 6946 } 6947 6948 /* 6949 * Right now, upper modules pass down a T_DISCON_REQ to TCP, 6950 * when the stream is in BOUND state. Do not send a reset, 6951 * since the destination IP address is not valid, and it can 6952 * be the initialized value of all zeros (broadcast address). 6953 * 6954 * If TCP has sent down a bind request to IP and has not 6955 * received the reply, reject the request. Otherwise, TCP 6956 * will be confused. 6957 */ 6958 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) { 6959 if (tcp->tcp_debug) { 6960 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 6961 "tcp_disconnect: bad state, %d", tcp->tcp_state); 6962 } 6963 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 6964 return; 6965 } 6966 6967 seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number; 6968 6969 if (seqnum == -1 || tcp->tcp_conn_req_max == 0) { 6970 6971 /* 6972 * According to TPI, for non-listeners, ignore seqnum 6973 * and disconnect. 6974 * Following interpretation of -1 seqnum is historical 6975 * and implied TPI ? (TPI only states that for T_CONN_IND, 6976 * a valid seqnum should not be -1). 6977 * 6978 * -1 means disconnect everything 6979 * regardless even on a listener. 6980 */ 6981 6982 int old_state = tcp->tcp_state; 6983 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 6984 6985 /* 6986 * The connection can't be on the tcp_time_wait_head list 6987 * since it is not detached. 6988 */ 6989 ASSERT(tcp->tcp_time_wait_next == NULL); 6990 ASSERT(tcp->tcp_time_wait_prev == NULL); 6991 ASSERT(tcp->tcp_time_wait_expire == 0); 6992 ltcp = NULL; 6993 /* 6994 * If it used to be a listener, check to make sure no one else 6995 * has taken the port before switching back to LISTEN state. 
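 * Concretely: if this endpoint still has a listen backlog
 * (tcp_conn_req_max != 0) and no other listener now owns the port, it
 * drops back to TCPS_LISTEN; otherwise, if it had progressed beyond
 * BOUND, the backlog is cleared and it drops back to TCPS_BOUND.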
6996 */ 6997 if (tcp->tcp_ipversion == IPV4_VERSION) { 6998 connp = ipcl_lookup_listener_v4(tcp->tcp_lport, 6999 tcp->tcp_ipha->ipha_src, 7000 tcp->tcp_connp->conn_zoneid, ipst); 7001 if (connp != NULL) 7002 ltcp = connp->conn_tcp; 7003 } else { 7004 /* Allow tcp_bound_if listeners? */ 7005 connp = ipcl_lookup_listener_v6(tcp->tcp_lport, 7006 &tcp->tcp_ip6h->ip6_src, 0, 7007 tcp->tcp_connp->conn_zoneid, ipst); 7008 if (connp != NULL) 7009 ltcp = connp->conn_tcp; 7010 } 7011 if (tcp->tcp_conn_req_max && ltcp == NULL) { 7012 tcp->tcp_state = TCPS_LISTEN; 7013 } else if (old_state > TCPS_BOUND) { 7014 tcp->tcp_conn_req_max = 0; 7015 tcp->tcp_state = TCPS_BOUND; 7016 } 7017 if (ltcp != NULL) 7018 CONN_DEC_REF(ltcp->tcp_connp); 7019 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) { 7020 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); 7021 } else if (old_state == TCPS_ESTABLISHED || 7022 old_state == TCPS_CLOSE_WAIT) { 7023 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); 7024 } 7025 7026 if (tcp->tcp_fused) 7027 tcp_unfuse(tcp); 7028 7029 mutex_enter(&tcp->tcp_eager_lock); 7030 if ((tcp->tcp_conn_req_cnt_q0 != 0) || 7031 (tcp->tcp_conn_req_cnt_q != 0)) { 7032 tcp_eager_cleanup(tcp, 0); 7033 } 7034 mutex_exit(&tcp->tcp_eager_lock); 7035 7036 tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt, 7037 tcp->tcp_rnxt, TH_RST | TH_ACK); 7038 7039 tcp_reinit(tcp); 7040 7041 if (old_state >= TCPS_ESTABLISHED) { 7042 /* Send M_FLUSH according to TPI */ 7043 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 7044 } 7045 mp = mi_tpi_ok_ack_alloc(mp); 7046 if (mp) 7047 putnext(tcp->tcp_rq, mp); 7048 return; 7049 } else if (!tcp_eager_blowoff(tcp, seqnum)) { 7050 tcp_err_ack(tcp, mp, TBADSEQ, 0); 7051 return; 7052 } 7053 if (tcp->tcp_state >= TCPS_ESTABLISHED) { 7054 /* Send M_FLUSH according to TPI */ 7055 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 7056 } 7057 mp = mi_tpi_ok_ack_alloc(mp); 7058 if (mp) 7059 putnext(tcp->tcp_rq, mp); 7060 } 7061 7062 /* 7063 * Diagnostic routine used to return a string associated with the tcp state. 7064 * Note that if the caller does not supply a buffer, it will use an internal 7065 * static string. This means that if multiple threads call this function at 7066 * the same time, output can be corrupted... Note also that this function 7067 * does not check the size of the supplied buffer. The caller has to make 7068 * sure that it is big enough. 
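 * A safe caller-supplied buffer is INET6_ADDRSTRLEN * 2 + 80 bytes, the
 * size of the internal buffer. Example output (addresses illustrative):
 * DISP_ADDR_AND_PORT gives something like
 *	"[::ffff:10.0.0.1.32768, ::ffff:10.0.0.2.80] TCP_ESTABLISHED"
 * while DISP_PORT_ONLY gives "[32768, 80] TCP_ESTABLISHED".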
7069 */ 7070 static char * 7071 tcp_display(tcp_t *tcp, char *sup_buf, char format) 7072 { 7073 char buf1[30]; 7074 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; 7075 char *buf; 7076 char *cp; 7077 in6_addr_t local, remote; 7078 char local_addrbuf[INET6_ADDRSTRLEN]; 7079 char remote_addrbuf[INET6_ADDRSTRLEN]; 7080 7081 if (sup_buf != NULL) 7082 buf = sup_buf; 7083 else 7084 buf = priv_buf; 7085 7086 if (tcp == NULL) 7087 return ("NULL_TCP"); 7088 switch (tcp->tcp_state) { 7089 case TCPS_CLOSED: 7090 cp = "TCP_CLOSED"; 7091 break; 7092 case TCPS_IDLE: 7093 cp = "TCP_IDLE"; 7094 break; 7095 case TCPS_BOUND: 7096 cp = "TCP_BOUND"; 7097 break; 7098 case TCPS_LISTEN: 7099 cp = "TCP_LISTEN"; 7100 break; 7101 case TCPS_SYN_SENT: 7102 cp = "TCP_SYN_SENT"; 7103 break; 7104 case TCPS_SYN_RCVD: 7105 cp = "TCP_SYN_RCVD"; 7106 break; 7107 case TCPS_ESTABLISHED: 7108 cp = "TCP_ESTABLISHED"; 7109 break; 7110 case TCPS_CLOSE_WAIT: 7111 cp = "TCP_CLOSE_WAIT"; 7112 break; 7113 case TCPS_FIN_WAIT_1: 7114 cp = "TCP_FIN_WAIT_1"; 7115 break; 7116 case TCPS_CLOSING: 7117 cp = "TCP_CLOSING"; 7118 break; 7119 case TCPS_LAST_ACK: 7120 cp = "TCP_LAST_ACK"; 7121 break; 7122 case TCPS_FIN_WAIT_2: 7123 cp = "TCP_FIN_WAIT_2"; 7124 break; 7125 case TCPS_TIME_WAIT: 7126 cp = "TCP_TIME_WAIT"; 7127 break; 7128 default: 7129 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 7130 cp = buf1; 7131 break; 7132 } 7133 switch (format) { 7134 case DISP_ADDR_AND_PORT: 7135 if (tcp->tcp_ipversion == IPV4_VERSION) { 7136 /* 7137 * Note that we use the remote address in the tcp_b 7138 * structure. This means that it will print out 7139 * the real destination address, not the next hop's 7140 * address if source routing is used. 7141 */ 7142 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local); 7143 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote); 7144 7145 } else { 7146 local = tcp->tcp_ip_src_v6; 7147 remote = tcp->tcp_remote_v6; 7148 } 7149 (void) inet_ntop(AF_INET6, &local, local_addrbuf, 7150 sizeof (local_addrbuf)); 7151 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, 7152 sizeof (remote_addrbuf)); 7153 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", 7154 local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, 7155 ntohs(tcp->tcp_fport), cp); 7156 break; 7157 case DISP_PORT_ONLY: 7158 default: 7159 (void) mi_sprintf(buf, "[%u, %u] %s", 7160 ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); 7161 break; 7162 } 7163 7164 return (buf); 7165 } 7166 7167 /* 7168 * Called via squeue to get on to eager's perimeter. It sends a 7169 * TH_RST if eager is in the fanout table. The listener wants the 7170 * eager to disappear either by means of tcp_eager_blowoff() or 7171 * tcp_eager_cleanup() being called. tcp_eager_kill() can also be 7172 * called (via squeue) if the eager cannot be inserted in the 7173 * fanout table in tcp_conn_request(). 7174 */ 7175 /* ARGSUSED */ 7176 void 7177 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) 7178 { 7179 conn_t *econnp = (conn_t *)arg; 7180 tcp_t *eager = econnp->conn_tcp; 7181 tcp_t *listener = eager->tcp_listener; 7182 tcp_stack_t *tcps = eager->tcp_tcps; 7183 7184 /* 7185 * We could be called because listener is closing. Since 7186 * the eager is using listener's queue's, its not safe. 7187 * Better use the default queue just to send the TH_RST 7188 * out. 
7189 */ 7190 ASSERT(tcps->tcps_g_q != NULL); 7191 eager->tcp_rq = tcps->tcps_g_q; 7192 eager->tcp_wq = WR(tcps->tcps_g_q); 7193 7194 /* 7195 * An eager's conn_fanout will be NULL if it's a duplicate 7196 * for an existing 4-tuple in the conn fanout table. 7197 * We don't want to send an RST out in such a case. 7198 */ 7199 if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) { 7200 tcp_xmit_ctl("tcp_eager_kill, can't wait", 7201 eager, eager->tcp_snxt, 0, TH_RST); 7202 } 7203 7204 /* We are here because listener wants this eager gone */ 7205 if (listener != NULL) { 7206 mutex_enter(&listener->tcp_eager_lock); 7207 tcp_eager_unlink(eager); 7208 if (eager->tcp_tconnind_started) { 7209 /* 7210 * The eager has sent a conn_ind up to the 7211 * listener but the listener decides to close 7212 * instead. We need to drop the extra ref 7213 * placed on eager in tcp_rput_data() before 7214 * sending the conn_ind to listener. 7215 */ 7216 CONN_DEC_REF(econnp); 7217 } 7218 mutex_exit(&listener->tcp_eager_lock); 7219 CONN_DEC_REF(listener->tcp_connp); 7220 } 7221 7222 if (eager->tcp_state > TCPS_BOUND) 7223 tcp_close_detached(eager); 7224 } 7225 7226 /* 7227 * Reset any eager connection hanging off this listener marked 7228 * with 'seqnum' and then reclaim its resources. 7229 */ 7230 static boolean_t 7231 tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) 7232 { 7233 tcp_t *eager; 7234 mblk_t *mp; 7235 tcp_stack_t *tcps = listener->tcp_tcps; 7236 7237 TCP_STAT(tcps, tcp_eager_blowoff_calls); 7238 eager = listener; 7239 mutex_enter(&listener->tcp_eager_lock); 7240 do { 7241 eager = eager->tcp_eager_next_q; 7242 if (eager == NULL) { 7243 mutex_exit(&listener->tcp_eager_lock); 7244 return (B_FALSE); 7245 } 7246 } while (eager->tcp_conn_req_seqnum != seqnum); 7247 7248 if (eager->tcp_closemp_used) { 7249 mutex_exit(&listener->tcp_eager_lock); 7250 return (B_TRUE); 7251 } 7252 eager->tcp_closemp_used = B_TRUE; 7253 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); 7254 CONN_INC_REF(eager->tcp_connp); 7255 mutex_exit(&listener->tcp_eager_lock); 7256 mp = &eager->tcp_closemp; 7257 squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, 7258 eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF); 7259 return (B_TRUE); 7260 } 7261 7262 /* 7263 * Reset any eager connection hanging off this listener 7264 * and then reclaim its resources.
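 * The q0 list (connections still completing the three-way handshake,
 * kept as a circular list anchored at the listener) is always walked;
 * the q list (eagers for which a T_CONN_IND has already been sent up)
 * is walked as well unless q0_only is set. Each eager is dispatched to
 * its own squeue via tcp_eager_kill() using its preallocated
 * tcp_closemp.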
7265 */ 7266 static void 7267 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) 7268 { 7269 tcp_t *eager; 7270 mblk_t *mp; 7271 tcp_stack_t *tcps = listener->tcp_tcps; 7272 7273 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 7274 7275 if (!q0_only) { 7276 /* First cleanup q */ 7277 TCP_STAT(tcps, tcp_eager_blowoff_q); 7278 eager = listener->tcp_eager_next_q; 7279 while (eager != NULL) { 7280 if (!eager->tcp_closemp_used) { 7281 eager->tcp_closemp_used = B_TRUE; 7282 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); 7283 CONN_INC_REF(eager->tcp_connp); 7284 mp = &eager->tcp_closemp; 7285 squeue_fill(eager->tcp_connp->conn_sqp, mp, 7286 tcp_eager_kill, eager->tcp_connp, 7287 SQTAG_TCP_EAGER_CLEANUP); 7288 } 7289 eager = eager->tcp_eager_next_q; 7290 } 7291 } 7292 /* Then cleanup q0 */ 7293 TCP_STAT(tcps, tcp_eager_blowoff_q0); 7294 eager = listener->tcp_eager_next_q0; 7295 while (eager != listener) { 7296 if (!eager->tcp_closemp_used) { 7297 eager->tcp_closemp_used = B_TRUE; 7298 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); 7299 CONN_INC_REF(eager->tcp_connp); 7300 mp = &eager->tcp_closemp; 7301 squeue_fill(eager->tcp_connp->conn_sqp, mp, 7302 tcp_eager_kill, eager->tcp_connp, 7303 SQTAG_TCP_EAGER_CLEANUP_Q0); 7304 } 7305 eager = eager->tcp_eager_next_q0; 7306 } 7307 } 7308 7309 /* 7310 * If we are an eager connection hanging off a listener that hasn't 7311 * formally accepted the connection yet, get off his list and blow off 7312 * any data that we have accumulated. 7313 */ 7314 static void 7315 tcp_eager_unlink(tcp_t *tcp) 7316 { 7317 tcp_t *listener = tcp->tcp_listener; 7318 7319 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 7320 ASSERT(listener != NULL); 7321 if (tcp->tcp_eager_next_q0 != NULL) { 7322 ASSERT(tcp->tcp_eager_prev_q0 != NULL); 7323 7324 /* Remove the eager tcp from q0 */ 7325 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 7326 tcp->tcp_eager_prev_q0; 7327 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 7328 tcp->tcp_eager_next_q0; 7329 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 7330 listener->tcp_conn_req_cnt_q0--; 7331 7332 tcp->tcp_eager_next_q0 = NULL; 7333 tcp->tcp_eager_prev_q0 = NULL; 7334 7335 /* 7336 * Take the eager out, if it is in the list of droppable 7337 * eagers. 7338 */ 7339 MAKE_UNDROPPABLE(tcp); 7340 7341 if (tcp->tcp_syn_rcvd_timeout != 0) { 7342 /* we have timed out before */ 7343 ASSERT(listener->tcp_syn_rcvd_timeout > 0); 7344 listener->tcp_syn_rcvd_timeout--; 7345 } 7346 } else { 7347 tcp_t **tcpp = &listener->tcp_eager_next_q; 7348 tcp_t *prev = NULL; 7349 7350 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 7351 if (tcpp[0] == tcp) { 7352 if (listener->tcp_eager_last_q == tcp) { 7353 /* 7354 * If we are unlinking the last 7355 * element on the list, adjust 7356 * tail pointer. Set tail pointer 7357 * to nil when list is empty. 7358 */ 7359 ASSERT(tcp->tcp_eager_next_q == NULL); 7360 if (listener->tcp_eager_last_q == 7361 listener->tcp_eager_next_q) { 7362 listener->tcp_eager_last_q = 7363 NULL; 7364 } else { 7365 /* 7366 * We won't get here if there 7367 * is only one eager in the 7368 * list. 
7369 */ 7370 ASSERT(prev != NULL); 7371 listener->tcp_eager_last_q = 7372 prev; 7373 } 7374 } 7375 tcpp[0] = tcp->tcp_eager_next_q; 7376 tcp->tcp_eager_next_q = NULL; 7377 tcp->tcp_eager_last_q = NULL; 7378 ASSERT(listener->tcp_conn_req_cnt_q > 0); 7379 listener->tcp_conn_req_cnt_q--; 7380 break; 7381 } 7382 prev = tcpp[0]; 7383 } 7384 } 7385 tcp->tcp_listener = NULL; 7386 } 7387 7388 /* Shorthand to generate and send TPI error acks to our client */ 7389 static void 7390 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) 7391 { 7392 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 7393 putnext(tcp->tcp_rq, mp); 7394 } 7395 7396 /* Shorthand to generate and send TPI error acks to our client */ 7397 static void 7398 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 7399 int t_error, int sys_error) 7400 { 7401 struct T_error_ack *teackp; 7402 7403 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 7404 M_PCPROTO, T_ERROR_ACK)) != NULL) { 7405 teackp = (struct T_error_ack *)mp->b_rptr; 7406 teackp->ERROR_prim = primitive; 7407 teackp->TLI_error = t_error; 7408 teackp->UNIX_error = sys_error; 7409 putnext(tcp->tcp_rq, mp); 7410 } 7411 } 7412 7413 /* 7414 * Note: No locks are held when inspecting tcp_g_*epriv_ports 7415 * but instead the code relies on: 7416 * - the fact that the address of the array and its size never changes 7417 * - the atomic assignment of the elements of the array 7418 */ 7419 /* ARGSUSED */ 7420 static int 7421 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 7422 { 7423 int i; 7424 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 7425 7426 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 7427 if (tcps->tcps_g_epriv_ports[i] != 0) 7428 (void) mi_mpprintf(mp, "%d ", 7429 tcps->tcps_g_epriv_ports[i]); 7430 } 7431 return (0); 7432 } 7433 7434 /* 7435 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple 7436 * threads from changing it at the same time. 7437 */ 7438 /* ARGSUSED */ 7439 static int 7440 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 7441 cred_t *cr) 7442 { 7443 long new_value; 7444 int i; 7445 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 7446 7447 /* 7448 * Fail the request if the new value does not lie within the 7449 * port number limits. 7450 */ 7451 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 7452 new_value <= 0 || new_value >= 65536) { 7453 return (EINVAL); 7454 } 7455 7456 mutex_enter(&tcps->tcps_epriv_port_lock); 7457 /* Check if the value is already in the list */ 7458 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 7459 if (new_value == tcps->tcps_g_epriv_ports[i]) { 7460 mutex_exit(&tcps->tcps_epriv_port_lock); 7461 return (EEXIST); 7462 } 7463 } 7464 /* Find an empty slot */ 7465 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 7466 if (tcps->tcps_g_epriv_ports[i] == 0) 7467 break; 7468 } 7469 if (i == tcps->tcps_g_num_epriv_ports) { 7470 mutex_exit(&tcps->tcps_epriv_port_lock); 7471 return (EOVERFLOW); 7472 } 7473 /* Set the new value */ 7474 tcps->tcps_g_epriv_ports[i] = (uint16_t)new_value; 7475 mutex_exit(&tcps->tcps_epriv_port_lock); 7476 return (0); 7477 } 7478 7479 /* 7480 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple 7481 * threads from changing it at the same time. 
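 * These nd handlers back the extra-privileged-ports tunables, typically
 * driven through ndd(1M); e.g. (the port number is only an example):
 *	ndd -set /dev/tcp tcp_extra_priv_ports_add 4045
 *	ndd -set /dev/tcp tcp_extra_priv_ports_del 4045
 *	ndd -get /dev/tcp tcp_extra_priv_ports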
7482 */ 7483 /* ARGSUSED */ 7484 static int 7485 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 7486 cred_t *cr) 7487 { 7488 long new_value; 7489 int i; 7490 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 7491 7492 /* 7493 * Fail the request if the new value does not lie within the 7494 * port number limits. 7495 */ 7496 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 || 7497 new_value >= 65536) { 7498 return (EINVAL); 7499 } 7500 7501 mutex_enter(&tcps->tcps_epriv_port_lock); 7502 /* Check that the value is already in the list */ 7503 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 7504 if (tcps->tcps_g_epriv_ports[i] == new_value) 7505 break; 7506 } 7507 if (i == tcps->tcps_g_num_epriv_ports) { 7508 mutex_exit(&tcps->tcps_epriv_port_lock); 7509 return (ESRCH); 7510 } 7511 /* Clear the value */ 7512 tcps->tcps_g_epriv_ports[i] = 0; 7513 mutex_exit(&tcps->tcps_epriv_port_lock); 7514 return (0); 7515 } 7516 7517 /* Return the TPI/TLI equivalent of our current tcp_state */ 7518 static int 7519 tcp_tpistate(tcp_t *tcp) 7520 { 7521 switch (tcp->tcp_state) { 7522 case TCPS_IDLE: 7523 return (TS_UNBND); 7524 case TCPS_LISTEN: 7525 /* 7526 * Return whether there are outstanding T_CONN_IND waiting 7527 * for the matching T_CONN_RES. Therefore don't count q0. 7528 */ 7529 if (tcp->tcp_conn_req_cnt_q > 0) 7530 return (TS_WRES_CIND); 7531 else 7532 return (TS_IDLE); 7533 case TCPS_BOUND: 7534 return (TS_IDLE); 7535 case TCPS_SYN_SENT: 7536 return (TS_WCON_CREQ); 7537 case TCPS_SYN_RCVD: 7538 /* 7539 * Note: assumption: this has to the active open SYN_RCVD. 7540 * The passive instance is detached in SYN_RCVD stage of 7541 * incoming connection processing so we cannot get request 7542 * for T_info_ack on it. 7543 */ 7544 return (TS_WACK_CRES); 7545 case TCPS_ESTABLISHED: 7546 return (TS_DATA_XFER); 7547 case TCPS_CLOSE_WAIT: 7548 return (TS_WREQ_ORDREL); 7549 case TCPS_FIN_WAIT_1: 7550 return (TS_WIND_ORDREL); 7551 case TCPS_FIN_WAIT_2: 7552 return (TS_WIND_ORDREL); 7553 7554 case TCPS_CLOSING: 7555 case TCPS_LAST_ACK: 7556 case TCPS_TIME_WAIT: 7557 case TCPS_CLOSED: 7558 /* 7559 * Following TS_WACK_DREQ7 is a rendition of "not 7560 * yet TS_IDLE" TPI state. There is no best match to any 7561 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we 7562 * choose a value chosen that will map to TLI/XTI level 7563 * state of TSTATECHNG (state is process of changing) which 7564 * captures what this dummy state represents. 7565 */ 7566 return (TS_WACK_DREQ7); 7567 default: 7568 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", 7569 tcp->tcp_state, tcp_display(tcp, NULL, 7570 DISP_PORT_ONLY)); 7571 return (TS_UNBND); 7572 } 7573 } 7574 7575 static void 7576 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) 7577 { 7578 tcp_stack_t *tcps = tcp->tcp_tcps; 7579 7580 if (tcp->tcp_family == AF_INET6) 7581 *tia = tcp_g_t_info_ack_v6; 7582 else 7583 *tia = tcp_g_t_info_ack; 7584 tia->CURRENT_state = tcp_tpistate(tcp); 7585 tia->OPT_size = tcp_max_optsize; 7586 if (tcp->tcp_mss == 0) { 7587 /* Not yet set - tcp_open does not set mss */ 7588 if (tcp->tcp_ipversion == IPV4_VERSION) 7589 tia->TIDU_size = tcps->tcps_mss_def_ipv4; 7590 else 7591 tia->TIDU_size = tcps->tcps_mss_def_ipv6; 7592 } else { 7593 tia->TIDU_size = tcp->tcp_mss; 7594 } 7595 /* TODO: Default ETSDU is 1. Is that correct for tcp? */ 7596 } 7597 7598 /* 7599 * This routine responds to T_CAPABILITY_REQ messages. It is called by 7600 * tcp_wput. 
Much of the T_CAPABILITY_ACK information is copied from 7601 * tcp_g_t_info_ack. The current state of the stream is copied from 7602 * tcp_state. 7603 */ 7604 static void 7605 tcp_capability_req(tcp_t *tcp, mblk_t *mp) 7606 { 7607 t_uscalar_t cap_bits1; 7608 struct T_capability_ack *tcap; 7609 7610 if (MBLKL(mp) < sizeof (struct T_capability_req)) { 7611 freemsg(mp); 7612 return; 7613 } 7614 7615 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 7616 7617 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 7618 mp->b_datap->db_type, T_CAPABILITY_ACK); 7619 if (mp == NULL) 7620 return; 7621 7622 tcap = (struct T_capability_ack *)mp->b_rptr; 7623 tcap->CAP_bits1 = 0; 7624 7625 if (cap_bits1 & TC1_INFO) { 7626 tcp_copy_info(&tcap->INFO_ack, tcp); 7627 tcap->CAP_bits1 |= TC1_INFO; 7628 } 7629 7630 if (cap_bits1 & TC1_ACCEPTOR_ID) { 7631 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; 7632 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; 7633 } 7634 7635 putnext(tcp->tcp_rq, mp); 7636 } 7637 7638 /* 7639 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. 7640 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. 7641 * The current state of the stream is copied from tcp_state. 7642 */ 7643 static void 7644 tcp_info_req(tcp_t *tcp, mblk_t *mp) 7645 { 7646 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 7647 T_INFO_ACK); 7648 if (!mp) { 7649 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 7650 return; 7651 } 7652 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); 7653 putnext(tcp->tcp_rq, mp); 7654 } 7655 7656 /* Respond to the TPI addr request */ 7657 static void 7658 tcp_addr_req(tcp_t *tcp, mblk_t *mp) 7659 { 7660 sin_t *sin; 7661 mblk_t *ackmp; 7662 struct T_addr_ack *taa; 7663 7664 /* Make it large enough for worst case */ 7665 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 7666 2 * sizeof (sin6_t), 1); 7667 if (ackmp == NULL) { 7668 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 7669 return; 7670 } 7671 7672 if (tcp->tcp_ipversion == IPV6_VERSION) { 7673 tcp_addr_req_ipv6(tcp, ackmp); 7674 return; 7675 } 7676 taa = (struct T_addr_ack *)ackmp->b_rptr; 7677 7678 bzero(taa, sizeof (struct T_addr_ack)); 7679 ackmp->b_wptr = (uchar_t *)&taa[1]; 7680 7681 taa->PRIM_type = T_ADDR_ACK; 7682 ackmp->b_datap->db_type = M_PCPROTO; 7683 7684 /* 7685 * Note: Following code assumes 32 bit alignment of basic 7686 * data structures like sin_t and struct T_addr_ack. 
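 * The resulting M_PCPROTO message is laid out as
 *	[ struct T_addr_ack | local sin_t | remote sin_t ]
 * with LOCADDR_offset = sizeof (*taa) and REMADDR_offset rounded up to
 * the next 32-bit boundary via ROUNDUP32(); the remote half is filled in
 * only once the connection has reached SYN_RCVD or beyond.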
7687 */ 7688 if (tcp->tcp_state >= TCPS_BOUND) { 7689 /* 7690 * Fill in local address 7691 */ 7692 taa->LOCADDR_length = sizeof (sin_t); 7693 taa->LOCADDR_offset = sizeof (*taa); 7694 7695 sin = (sin_t *)&taa[1]; 7696 7697 /* Fill zeroes and then intialize non-zero fields */ 7698 *sin = sin_null; 7699 7700 sin->sin_family = AF_INET; 7701 7702 sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; 7703 sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport; 7704 7705 ackmp->b_wptr = (uchar_t *)&sin[1]; 7706 7707 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 7708 /* 7709 * Fill in Remote address 7710 */ 7711 taa->REMADDR_length = sizeof (sin_t); 7712 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + 7713 taa->LOCADDR_length); 7714 7715 sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset); 7716 *sin = sin_null; 7717 sin->sin_family = AF_INET; 7718 sin->sin_addr.s_addr = tcp->tcp_remote; 7719 sin->sin_port = tcp->tcp_fport; 7720 7721 ackmp->b_wptr = (uchar_t *)&sin[1]; 7722 } 7723 } 7724 putnext(tcp->tcp_rq, ackmp); 7725 } 7726 7727 /* Assumes that tcp_addr_req gets enough space and alignment */ 7728 static void 7729 tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp) 7730 { 7731 sin6_t *sin6; 7732 struct T_addr_ack *taa; 7733 7734 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 7735 ASSERT(OK_32PTR(ackmp->b_rptr)); 7736 ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) + 7737 2 * sizeof (sin6_t)); 7738 7739 taa = (struct T_addr_ack *)ackmp->b_rptr; 7740 7741 bzero(taa, sizeof (struct T_addr_ack)); 7742 ackmp->b_wptr = (uchar_t *)&taa[1]; 7743 7744 taa->PRIM_type = T_ADDR_ACK; 7745 ackmp->b_datap->db_type = M_PCPROTO; 7746 7747 /* 7748 * Note: Following code assumes 32 bit alignment of basic 7749 * data structures like sin6_t and struct T_addr_ack. 7750 */ 7751 if (tcp->tcp_state >= TCPS_BOUND) { 7752 /* 7753 * Fill in local address 7754 */ 7755 taa->LOCADDR_length = sizeof (sin6_t); 7756 taa->LOCADDR_offset = sizeof (*taa); 7757 7758 sin6 = (sin6_t *)&taa[1]; 7759 *sin6 = sin6_null; 7760 7761 sin6->sin6_family = AF_INET6; 7762 sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; 7763 sin6->sin6_port = tcp->tcp_lport; 7764 7765 ackmp->b_wptr = (uchar_t *)&sin6[1]; 7766 7767 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 7768 /* 7769 * Fill in Remote address 7770 */ 7771 taa->REMADDR_length = sizeof (sin6_t); 7772 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + 7773 taa->LOCADDR_length); 7774 7775 sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset); 7776 *sin6 = sin6_null; 7777 sin6->sin6_family = AF_INET6; 7778 sin6->sin6_flowinfo = 7779 tcp->tcp_ip6h->ip6_vcf & 7780 ~IPV6_VERS_AND_FLOW_MASK; 7781 sin6->sin6_addr = tcp->tcp_remote_v6; 7782 sin6->sin6_port = tcp->tcp_fport; 7783 7784 ackmp->b_wptr = (uchar_t *)&sin6[1]; 7785 } 7786 } 7787 putnext(tcp->tcp_rq, ackmp); 7788 } 7789 7790 /* 7791 * Handle reinitialization of a tcp structure. 7792 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE. 
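 * In broad strokes: cancel the timers, free the transmit, reassembly
 * and receive queues, release urgent-data and kssl state, remove the
 * conn from the connected hash, reinitialize default values, and end
 * up in LISTEN (if the endpoint still has a listen backlog) or BOUND.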
7793 */ 7794 static void 7795 tcp_reinit(tcp_t *tcp) 7796 { 7797 mblk_t *mp; 7798 int err; 7799 tcp_stack_t *tcps = tcp->tcp_tcps; 7800 7801 TCP_STAT(tcps, tcp_reinit_calls); 7802 7803 /* tcp_reinit should never be called for detached tcp_t's */ 7804 ASSERT(tcp->tcp_listener == NULL); 7805 ASSERT((tcp->tcp_family == AF_INET && 7806 tcp->tcp_ipversion == IPV4_VERSION) || 7807 (tcp->tcp_family == AF_INET6 && 7808 (tcp->tcp_ipversion == IPV4_VERSION || 7809 tcp->tcp_ipversion == IPV6_VERSION))); 7810 7811 /* Cancel outstanding timers */ 7812 tcp_timers_stop(tcp); 7813 7814 /* 7815 * Reset everything in the state vector, after updating global 7816 * MIB data from instance counters. 7817 */ 7818 UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs); 7819 tcp->tcp_ibsegs = 0; 7820 UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs); 7821 tcp->tcp_obsegs = 0; 7822 7823 tcp_close_mpp(&tcp->tcp_xmit_head); 7824 if (tcp->tcp_snd_zcopy_aware) 7825 tcp_zcopy_notify(tcp); 7826 tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL; 7827 tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; 7828 mutex_enter(&tcp->tcp_non_sq_lock); 7829 if (tcp->tcp_flow_stopped && 7830 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 7831 tcp_clrqfull(tcp); 7832 } 7833 mutex_exit(&tcp->tcp_non_sq_lock); 7834 tcp_close_mpp(&tcp->tcp_reass_head); 7835 tcp->tcp_reass_tail = NULL; 7836 if (tcp->tcp_rcv_list != NULL) { 7837 /* Free b_next chain */ 7838 tcp_close_mpp(&tcp->tcp_rcv_list); 7839 tcp->tcp_rcv_last_head = NULL; 7840 tcp->tcp_rcv_last_tail = NULL; 7841 tcp->tcp_rcv_cnt = 0; 7842 } 7843 tcp->tcp_rcv_last_tail = NULL; 7844 7845 if ((mp = tcp->tcp_urp_mp) != NULL) { 7846 freemsg(mp); 7847 tcp->tcp_urp_mp = NULL; 7848 } 7849 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 7850 freemsg(mp); 7851 tcp->tcp_urp_mark_mp = NULL; 7852 } 7853 if (tcp->tcp_fused_sigurg_mp != NULL) { 7854 freeb(tcp->tcp_fused_sigurg_mp); 7855 tcp->tcp_fused_sigurg_mp = NULL; 7856 } 7857 7858 /* 7859 * Following is a union with two members which are 7860 * identical types and size so the following cleanup 7861 * is enough. 7862 */ 7863 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 7864 7865 CL_INET_DISCONNECT(tcp); 7866 7867 /* 7868 * The connection can't be on the tcp_time_wait_head list 7869 * since it is not detached. 7870 */ 7871 ASSERT(tcp->tcp_time_wait_next == NULL); 7872 ASSERT(tcp->tcp_time_wait_prev == NULL); 7873 ASSERT(tcp->tcp_time_wait_expire == 0); 7874 7875 if (tcp->tcp_kssl_pending) { 7876 tcp->tcp_kssl_pending = B_FALSE; 7877 7878 /* Don't reset if the initialized by bind. */ 7879 if (tcp->tcp_kssl_ent != NULL) { 7880 kssl_release_ent(tcp->tcp_kssl_ent, NULL, 7881 KSSL_NO_PROXY); 7882 } 7883 } 7884 if (tcp->tcp_kssl_ctx != NULL) { 7885 kssl_release_ctx(tcp->tcp_kssl_ctx); 7886 tcp->tcp_kssl_ctx = NULL; 7887 } 7888 7889 /* 7890 * Reset/preserve other values 7891 */ 7892 tcp_reinit_values(tcp); 7893 ipcl_hash_remove(tcp->tcp_connp); 7894 conn_delete_ire(tcp->tcp_connp, NULL); 7895 tcp_ipsec_cleanup(tcp); 7896 7897 if (tcp->tcp_conn_req_max != 0) { 7898 /* 7899 * This is the case when a TLI program uses the same 7900 * transport end point to accept a connection. This 7901 * makes the TCP both a listener and acceptor. When 7902 * this connection is closed, we need to set the state 7903 * back to TCPS_LISTEN. Make sure that the eager list 7904 * is reinitialized. 7905 * 7906 * Note that this stream is still bound to the four 7907 * tuples of the previous connection in IP. 
If a new 7908 * SYN with different foreign address comes in, IP will 7909 * not find it and will send it to the global queue. In 7910 * the global queue, TCP will do a tcp_lookup_listener() 7911 * to find this stream. This works because this stream 7912 * is only removed from connected hash. 7913 * 7914 */ 7915 tcp->tcp_state = TCPS_LISTEN; 7916 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 7917 tcp->tcp_eager_next_drop_q0 = tcp; 7918 tcp->tcp_eager_prev_drop_q0 = tcp; 7919 tcp->tcp_connp->conn_recv = tcp_conn_request; 7920 if (tcp->tcp_family == AF_INET6) { 7921 ASSERT(tcp->tcp_connp->conn_af_isv6); 7922 (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP, 7923 &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport); 7924 } else { 7925 ASSERT(!tcp->tcp_connp->conn_af_isv6); 7926 (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP, 7927 tcp->tcp_ipha->ipha_src, tcp->tcp_lport); 7928 } 7929 } else { 7930 tcp->tcp_state = TCPS_BOUND; 7931 } 7932 7933 /* 7934 * Initialize to default values 7935 * Can't fail since enough header template space already allocated 7936 * at open(). 7937 */ 7938 err = tcp_init_values(tcp); 7939 ASSERT(err == 0); 7940 /* Restore state in tcp_tcph */ 7941 bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN); 7942 if (tcp->tcp_ipversion == IPV4_VERSION) 7943 tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source; 7944 else 7945 tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6; 7946 /* 7947 * Copy of the src addr. in tcp_t is needed in tcp_t 7948 * since the lookup funcs can only lookup on tcp_t 7949 */ 7950 tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6; 7951 7952 ASSERT(tcp->tcp_ptpbhn != NULL); 7953 tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat; 7954 tcp->tcp_rwnd = tcps->tcps_recv_hiwat; 7955 tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ? 7956 tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4; 7957 } 7958 7959 /* 7960 * Force values to zero that need be zero. 7961 * Do not touch values asociated with the BOUND or LISTEN state 7962 * since the connection will end up in that state after the reinit. 7963 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t 7964 * structure! 7965 */ 7966 static void 7967 tcp_reinit_values(tcp) 7968 tcp_t *tcp; 7969 { 7970 tcp_stack_t *tcps = tcp->tcp_tcps; 7971 7972 #ifndef lint 7973 #define DONTCARE(x) 7974 #define PRESERVE(x) 7975 #else 7976 #define DONTCARE(x) ((x) = (x)) 7977 #define PRESERVE(x) ((x) = (x)) 7978 #endif /* lint */ 7979 7980 PRESERVE(tcp->tcp_bind_hash); 7981 PRESERVE(tcp->tcp_ptpbhn); 7982 PRESERVE(tcp->tcp_acceptor_hash); 7983 PRESERVE(tcp->tcp_ptpahn); 7984 7985 /* Should be ASSERT NULL on these with new code! 
*/ 7986 ASSERT(tcp->tcp_time_wait_next == NULL); 7987 ASSERT(tcp->tcp_time_wait_prev == NULL); 7988 ASSERT(tcp->tcp_time_wait_expire == 0); 7989 PRESERVE(tcp->tcp_state); 7990 PRESERVE(tcp->tcp_rq); 7991 PRESERVE(tcp->tcp_wq); 7992 7993 ASSERT(tcp->tcp_xmit_head == NULL); 7994 ASSERT(tcp->tcp_xmit_last == NULL); 7995 ASSERT(tcp->tcp_unsent == 0); 7996 ASSERT(tcp->tcp_xmit_tail == NULL); 7997 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 7998 7999 tcp->tcp_snxt = 0; /* Displayed in mib */ 8000 tcp->tcp_suna = 0; /* Displayed in mib */ 8001 tcp->tcp_swnd = 0; 8002 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */ 8003 8004 ASSERT(tcp->tcp_ibsegs == 0); 8005 ASSERT(tcp->tcp_obsegs == 0); 8006 8007 if (tcp->tcp_iphc != NULL) { 8008 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 8009 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 8010 } 8011 8012 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */ 8013 DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */ 8014 DONTCARE(tcp->tcp_ipha); 8015 DONTCARE(tcp->tcp_ip6h); 8016 DONTCARE(tcp->tcp_ip_hdr_len); 8017 DONTCARE(tcp->tcp_tcph); 8018 DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */ 8019 tcp->tcp_valid_bits = 0; 8020 8021 DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */ 8022 DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */ 8023 DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */ 8024 tcp->tcp_last_rcv_lbolt = 0; 8025 8026 tcp->tcp_init_cwnd = 0; 8027 8028 tcp->tcp_urp_last_valid = 0; 8029 tcp->tcp_hard_binding = 0; 8030 tcp->tcp_hard_bound = 0; 8031 PRESERVE(tcp->tcp_cred); 8032 PRESERVE(tcp->tcp_cpid); 8033 PRESERVE(tcp->tcp_open_time); 8034 PRESERVE(tcp->tcp_exclbind); 8035 8036 tcp->tcp_fin_acked = 0; 8037 tcp->tcp_fin_rcvd = 0; 8038 tcp->tcp_fin_sent = 0; 8039 tcp->tcp_ordrel_done = 0; 8040 8041 tcp->tcp_debug = 0; 8042 tcp->tcp_dontroute = 0; 8043 tcp->tcp_broadcast = 0; 8044 8045 tcp->tcp_useloopback = 0; 8046 tcp->tcp_reuseaddr = 0; 8047 tcp->tcp_oobinline = 0; 8048 tcp->tcp_dgram_errind = 0; 8049 8050 tcp->tcp_detached = 0; 8051 tcp->tcp_bind_pending = 0; 8052 tcp->tcp_unbind_pending = 0; 8053 tcp->tcp_deferred_clean_death = 0; 8054 8055 tcp->tcp_snd_ws_ok = B_FALSE; 8056 tcp->tcp_snd_ts_ok = B_FALSE; 8057 tcp->tcp_linger = 0; 8058 tcp->tcp_ka_enabled = 0; 8059 tcp->tcp_zero_win_probe = 0; 8060 8061 tcp->tcp_loopback = 0; 8062 tcp->tcp_localnet = 0; 8063 tcp->tcp_syn_defense = 0; 8064 tcp->tcp_set_timer = 0; 8065 8066 tcp->tcp_active_open = 0; 8067 ASSERT(tcp->tcp_timeout == B_FALSE); 8068 tcp->tcp_rexmit = B_FALSE; 8069 tcp->tcp_xmit_zc_clean = B_FALSE; 8070 8071 tcp->tcp_snd_sack_ok = B_FALSE; 8072 PRESERVE(tcp->tcp_recvdstaddr); 8073 tcp->tcp_hwcksum = B_FALSE; 8074 8075 tcp->tcp_ire_ill_check_done = B_FALSE; 8076 DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */ 8077 8078 tcp->tcp_mdt = B_FALSE; 8079 tcp->tcp_mdt_hdr_head = 0; 8080 tcp->tcp_mdt_hdr_tail = 0; 8081 8082 tcp->tcp_conn_def_q0 = 0; 8083 tcp->tcp_ip_forward_progress = B_FALSE; 8084 tcp->tcp_anon_priv_bind = 0; 8085 tcp->tcp_ecn_ok = B_FALSE; 8086 8087 tcp->tcp_cwr = B_FALSE; 8088 tcp->tcp_ecn_echo_on = B_FALSE; 8089 8090 if (tcp->tcp_sack_info != NULL) { 8091 if (tcp->tcp_notsack_list != NULL) { 8092 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 8093 } 8094 kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info); 8095 tcp->tcp_sack_info = NULL; 8096 } 8097 8098 tcp->tcp_rcv_ws = 0; 8099 tcp->tcp_snd_ws = 0; 8100 tcp->tcp_ts_recent = 0; 8101 tcp->tcp_rnxt = 0; /* Displayed in mib */ 8102 
DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */ 8103 tcp->tcp_if_mtu = 0; 8104 8105 ASSERT(tcp->tcp_reass_head == NULL); 8106 ASSERT(tcp->tcp_reass_tail == NULL); 8107 8108 tcp->tcp_cwnd_cnt = 0; 8109 8110 ASSERT(tcp->tcp_rcv_list == NULL); 8111 ASSERT(tcp->tcp_rcv_last_head == NULL); 8112 ASSERT(tcp->tcp_rcv_last_tail == NULL); 8113 ASSERT(tcp->tcp_rcv_cnt == 0); 8114 8115 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */ 8116 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */ 8117 tcp->tcp_csuna = 0; 8118 8119 tcp->tcp_rto = 0; /* Displayed in MIB */ 8120 DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */ 8121 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */ 8122 tcp->tcp_rtt_update = 0; 8123 8124 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 8125 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 8126 8127 tcp->tcp_rack = 0; /* Displayed in mib */ 8128 tcp->tcp_rack_cnt = 0; 8129 tcp->tcp_rack_cur_max = 0; 8130 tcp->tcp_rack_abs_max = 0; 8131 8132 tcp->tcp_max_swnd = 0; 8133 8134 ASSERT(tcp->tcp_listener == NULL); 8135 8136 DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */ 8137 8138 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */ 8139 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */ 8140 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */ 8141 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */ 8142 8143 ASSERT(tcp->tcp_conn_req_cnt_q == 0); 8144 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0); 8145 PRESERVE(tcp->tcp_conn_req_max); 8146 PRESERVE(tcp->tcp_conn_req_seqnum); 8147 8148 DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */ 8149 DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */ 8150 DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */ 8151 DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */ 8152 DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */ 8153 8154 tcp->tcp_lingertime = 0; 8155 8156 DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */ 8157 ASSERT(tcp->tcp_urp_mp == NULL); 8158 ASSERT(tcp->tcp_urp_mark_mp == NULL); 8159 ASSERT(tcp->tcp_fused_sigurg_mp == NULL); 8160 8161 ASSERT(tcp->tcp_eager_next_q == NULL); 8162 ASSERT(tcp->tcp_eager_last_q == NULL); 8163 ASSERT((tcp->tcp_eager_next_q0 == NULL && 8164 tcp->tcp_eager_prev_q0 == NULL) || 8165 tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0); 8166 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 8167 8168 ASSERT((tcp->tcp_eager_next_drop_q0 == NULL && 8169 tcp->tcp_eager_prev_drop_q0 == NULL) || 8170 tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0); 8171 8172 tcp->tcp_client_errno = 0; 8173 8174 DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */ 8175 8176 tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */ 8177 8178 PRESERVE(tcp->tcp_bound_source_v6); 8179 tcp->tcp_last_sent_len = 0; 8180 tcp->tcp_dupack_cnt = 0; 8181 8182 tcp->tcp_fport = 0; /* Displayed in MIB */ 8183 PRESERVE(tcp->tcp_lport); 8184 8185 PRESERVE(tcp->tcp_acceptor_lockp); 8186 8187 ASSERT(tcp->tcp_ordrelid == 0); 8188 PRESERVE(tcp->tcp_acceptor_id); 8189 DONTCARE(tcp->tcp_ipsec_overhead); 8190 8191 /* 8192 * If tcp_tracing flag is ON (i.e. We have a trace buffer 8193 * in tcp structure and now tracing), Re-initialize all 8194 * members of tcp_traceinfo. 
8195 */ 8196 if (tcp->tcp_tracebuf != NULL) { 8197 bzero(tcp->tcp_tracebuf, sizeof (tcptrch_t)); 8198 } 8199 8200 PRESERVE(tcp->tcp_family); 8201 if (tcp->tcp_family == AF_INET6) { 8202 tcp->tcp_ipversion = IPV6_VERSION; 8203 tcp->tcp_mss = tcps->tcps_mss_def_ipv6; 8204 } else { 8205 tcp->tcp_ipversion = IPV4_VERSION; 8206 tcp->tcp_mss = tcps->tcps_mss_def_ipv4; 8207 } 8208 8209 tcp->tcp_bound_if = 0; 8210 tcp->tcp_ipv6_recvancillary = 0; 8211 tcp->tcp_recvifindex = 0; 8212 tcp->tcp_recvhops = 0; 8213 tcp->tcp_closed = 0; 8214 tcp->tcp_cleandeathtag = 0; 8215 if (tcp->tcp_hopopts != NULL) { 8216 mi_free(tcp->tcp_hopopts); 8217 tcp->tcp_hopopts = NULL; 8218 tcp->tcp_hopoptslen = 0; 8219 } 8220 ASSERT(tcp->tcp_hopoptslen == 0); 8221 if (tcp->tcp_dstopts != NULL) { 8222 mi_free(tcp->tcp_dstopts); 8223 tcp->tcp_dstopts = NULL; 8224 tcp->tcp_dstoptslen = 0; 8225 } 8226 ASSERT(tcp->tcp_dstoptslen == 0); 8227 if (tcp->tcp_rtdstopts != NULL) { 8228 mi_free(tcp->tcp_rtdstopts); 8229 tcp->tcp_rtdstopts = NULL; 8230 tcp->tcp_rtdstoptslen = 0; 8231 } 8232 ASSERT(tcp->tcp_rtdstoptslen == 0); 8233 if (tcp->tcp_rthdr != NULL) { 8234 mi_free(tcp->tcp_rthdr); 8235 tcp->tcp_rthdr = NULL; 8236 tcp->tcp_rthdrlen = 0; 8237 } 8238 ASSERT(tcp->tcp_rthdrlen == 0); 8239 PRESERVE(tcp->tcp_drop_opt_ack_cnt); 8240 8241 /* Reset fusion-related fields */ 8242 tcp->tcp_fused = B_FALSE; 8243 tcp->tcp_unfusable = B_FALSE; 8244 tcp->tcp_fused_sigurg = B_FALSE; 8245 tcp->tcp_direct_sockfs = B_FALSE; 8246 tcp->tcp_fuse_syncstr_stopped = B_FALSE; 8247 tcp->tcp_fuse_syncstr_plugged = B_FALSE; 8248 tcp->tcp_loopback_peer = NULL; 8249 tcp->tcp_fuse_rcv_hiwater = 0; 8250 tcp->tcp_fuse_rcv_unread_hiwater = 0; 8251 tcp->tcp_fuse_rcv_unread_cnt = 0; 8252 8253 tcp->tcp_lso = B_FALSE; 8254 8255 tcp->tcp_in_ack_unsent = 0; 8256 tcp->tcp_cork = B_FALSE; 8257 tcp->tcp_tconnind_started = B_FALSE; 8258 8259 PRESERVE(tcp->tcp_squeue_bytes); 8260 8261 ASSERT(tcp->tcp_kssl_ctx == NULL); 8262 ASSERT(!tcp->tcp_kssl_pending); 8263 PRESERVE(tcp->tcp_kssl_ent); 8264 8265 /* Sodirect */ 8266 tcp->tcp_sodirect = NULL; 8267 8268 tcp->tcp_closemp_used = B_FALSE; 8269 8270 #ifdef DEBUG 8271 DONTCARE(tcp->tcmp_stk[0]); 8272 #endif 8273 8274 8275 #undef DONTCARE 8276 #undef PRESERVE 8277 } 8278 8279 /* 8280 * Allocate necessary resources and initialize state vector. 8281 * Guaranteed not to fail so that when an error is returned, 8282 * the caller doesn't need to do any additional cleanup. 8283 */ 8284 int 8285 tcp_init(tcp_t *tcp, queue_t *q) 8286 { 8287 int err; 8288 8289 tcp->tcp_rq = q; 8290 tcp->tcp_wq = WR(q); 8291 tcp->tcp_state = TCPS_IDLE; 8292 if ((err = tcp_init_values(tcp)) != 0) 8293 tcp_timers_stop(tcp); 8294 return (err); 8295 } 8296 8297 static int 8298 tcp_init_values(tcp_t *tcp) 8299 { 8300 int err; 8301 tcp_stack_t *tcps = tcp->tcp_tcps; 8302 8303 ASSERT((tcp->tcp_family == AF_INET && 8304 tcp->tcp_ipversion == IPV4_VERSION) || 8305 (tcp->tcp_family == AF_INET6 && 8306 (tcp->tcp_ipversion == IPV4_VERSION || 8307 tcp->tcp_ipversion == IPV6_VERSION))); 8308 8309 /* 8310 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 8311 * will be close to tcp_rexmit_interval_initial. By doing this, we 8312 * allow the algorithm to adjust slowly to large fluctuations of RTT 8313 * during first few transmissions of a connection as seen in slow 8314 * links. 
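 * For illustration, with tcps_rexmit_interval_initial = I this yields
 * tcp_rtt_sa = 4I and tcp_rtt_sd = I/2, so the RTO computed below,
 * (sa >> 3) + sd + extra + (sa >> 5) + grace, works out to roughly
 * 9I/8 plus the extra and grace terms, i.e. close to I as intended.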
8315 */ 8316 tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2; 8317 tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1; 8318 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 8319 tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 8320 tcps->tcps_conn_grace_period; 8321 if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min) 8322 tcp->tcp_rto = tcps->tcps_rexmit_interval_min; 8323 tcp->tcp_timer_backoff = 0; 8324 tcp->tcp_ms_we_have_waited = 0; 8325 tcp->tcp_last_recv_time = lbolt; 8326 tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_; 8327 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 8328 tcp->tcp_snd_burst = TCP_CWND_INFINITE; 8329 8330 tcp->tcp_maxpsz = tcps->tcps_maxpsz_multiplier; 8331 8332 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval; 8333 tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval; 8334 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval; 8335 /* 8336 * Fix it to tcp_ip_abort_linterval later if it turns out to be a 8337 * passive open. 8338 */ 8339 tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval; 8340 8341 tcp->tcp_naglim = tcps->tcps_naglim_def; 8342 8343 /* NOTE: ISS is now set in tcp_adapt_ire(). */ 8344 8345 tcp->tcp_mdt_hdr_head = 0; 8346 tcp->tcp_mdt_hdr_tail = 0; 8347 8348 /* Reset fusion-related fields */ 8349 tcp->tcp_fused = B_FALSE; 8350 tcp->tcp_unfusable = B_FALSE; 8351 tcp->tcp_fused_sigurg = B_FALSE; 8352 tcp->tcp_direct_sockfs = B_FALSE; 8353 tcp->tcp_fuse_syncstr_stopped = B_FALSE; 8354 tcp->tcp_fuse_syncstr_plugged = B_FALSE; 8355 tcp->tcp_loopback_peer = NULL; 8356 tcp->tcp_fuse_rcv_hiwater = 0; 8357 tcp->tcp_fuse_rcv_unread_hiwater = 0; 8358 tcp->tcp_fuse_rcv_unread_cnt = 0; 8359 8360 /* Sodirect */ 8361 tcp->tcp_sodirect = NULL; 8362 8363 /* Initialize the header template */ 8364 if (tcp->tcp_ipversion == IPV4_VERSION) { 8365 err = tcp_header_init_ipv4(tcp); 8366 } else { 8367 err = tcp_header_init_ipv6(tcp); 8368 } 8369 if (err) 8370 return (err); 8371 8372 /* 8373 * Init the window scale to the max so tcp_rwnd_set() won't pare 8374 * down tcp_rwnd. tcp_adapt_ire() will set the right value later. 8375 */ 8376 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 8377 tcp->tcp_xmit_lowater = tcps->tcps_xmit_lowat; 8378 tcp->tcp_xmit_hiwater = tcps->tcps_xmit_hiwat; 8379 8380 tcp->tcp_cork = B_FALSE; 8381 /* 8382 * Init the tcp_debug option. This value determines whether TCP 8383 * calls strlog() to print out debug messages. Doing this 8384 * initialization here means that this value is not inherited thru 8385 * tcp_reinit(). 8386 */ 8387 tcp->tcp_debug = tcps->tcps_dbg; 8388 8389 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval; 8390 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval; 8391 8392 return (0); 8393 } 8394 8395 /* 8396 * Initialize the IPv4 header. Loses any record of any IP options. 8397 */ 8398 static int 8399 tcp_header_init_ipv4(tcp_t *tcp) 8400 { 8401 tcph_t *tcph; 8402 uint32_t sum; 8403 conn_t *connp; 8404 tcp_stack_t *tcps = tcp->tcp_tcps; 8405 8406 /* 8407 * This is a simple initialization. If there's 8408 * already a template, it should never be too small, 8409 * so reuse it. Otherwise, allocate space for the new one. 
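 * The template lives in tcp_iphc (TCP_MAX_COMBINED_HEADER_LENGTH bytes
 * from tcp_iphc_cache) and initially holds a bare ipha_t followed by a
 * tcph_t; th_sum is pre-seeded with sizeof (tcph_t) plus any tcp_sum
 * adjustment, folded to 16 bits, so IP can finish the pseudo-header
 * checksum in a single pass.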
8410 */ 8411 if (tcp->tcp_iphc == NULL) { 8412 ASSERT(tcp->tcp_iphc_len == 0); 8413 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 8414 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); 8415 if (tcp->tcp_iphc == NULL) { 8416 tcp->tcp_iphc_len = 0; 8417 return (ENOMEM); 8418 } 8419 } 8420 8421 /* options are gone; may need a new label */ 8422 connp = tcp->tcp_connp; 8423 connp->conn_mlp_type = mlptSingle; 8424 connp->conn_ulp_labeled = !is_system_labeled(); 8425 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 8426 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 8427 tcp->tcp_ip6h = NULL; 8428 tcp->tcp_ipversion = IPV4_VERSION; 8429 tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t); 8430 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 8431 tcp->tcp_ip_hdr_len = sizeof (ipha_t); 8432 tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t)); 8433 tcp->tcp_ipha->ipha_version_and_hdr_length 8434 = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS; 8435 tcp->tcp_ipha->ipha_ident = 0; 8436 8437 tcp->tcp_ttl = (uchar_t)tcps->tcps_ipv4_ttl; 8438 tcp->tcp_tos = 0; 8439 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; 8440 tcp->tcp_ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; 8441 tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP; 8442 8443 tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t)); 8444 tcp->tcp_tcph = tcph; 8445 tcph->th_offset_and_rsrvd[0] = (5 << 4); 8446 /* 8447 * IP wants our header length in the checksum field to 8448 * allow it to perform a single pseudo-header+checksum 8449 * calculation on behalf of TCP. 8450 * Include the adjustment for a source route once IP_OPTIONS is set. 8451 */ 8452 sum = sizeof (tcph_t) + tcp->tcp_sum; 8453 sum = (sum >> 16) + (sum & 0xFFFF); 8454 U16_TO_ABE16(sum, tcph->th_sum); 8455 return (0); 8456 } 8457 8458 /* 8459 * Initialize the IPv6 header. Loses any record of any IPv6 extension headers. 8460 */ 8461 static int 8462 tcp_header_init_ipv6(tcp_t *tcp) 8463 { 8464 tcph_t *tcph; 8465 uint32_t sum; 8466 conn_t *connp; 8467 tcp_stack_t *tcps = tcp->tcp_tcps; 8468 8469 /* 8470 * This is a simple initialization. If there's 8471 * already a template, it should never be too small, 8472 * so reuse it. Otherwise, allocate space for the new one. 8473 * Ensure that there is enough space to "downgrade" the tcp_t 8474 * to an IPv4 tcp_t. This requires having space for a full load 8475 * of IPv4 options, as well as a full load of TCP options 8476 * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space 8477 * than a v6 header and a TCP header with a full load of TCP options 8478 * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes). 8479 * We want to avoid reallocation in the "downgraded" case when 8480 * processing outbound IPv4 options. 
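 * Arithmetic check: the IPv6 template needs IPV6_HDR_LEN +
 * TCP_MAX_HDR_LENGTH = 40 + 60 = 100 bytes, while a maximal IPv4 header
 * plus a maximal TCP header needs 60 + 60 = 120 bytes; allocating
 * TCP_MAX_COMBINED_HEADER_LENGTH (120) therefore covers both cases
 * without reallocation.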
8481 */ 8482 if (tcp->tcp_iphc == NULL) { 8483 ASSERT(tcp->tcp_iphc_len == 0); 8484 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 8485 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); 8486 if (tcp->tcp_iphc == NULL) { 8487 tcp->tcp_iphc_len = 0; 8488 return (ENOMEM); 8489 } 8490 } 8491 8492 /* options are gone; may need a new label */ 8493 connp = tcp->tcp_connp; 8494 connp->conn_mlp_type = mlptSingle; 8495 connp->conn_ulp_labeled = !is_system_labeled(); 8496 8497 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 8498 tcp->tcp_ipversion = IPV6_VERSION; 8499 tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t); 8500 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 8501 tcp->tcp_ip_hdr_len = IPV6_HDR_LEN; 8502 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; 8503 tcp->tcp_ipha = NULL; 8504 8505 /* Initialize the header template */ 8506 8507 tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 8508 tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t)); 8509 tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP; 8510 tcp->tcp_ip6h->ip6_hops = (uint8_t)tcps->tcps_ipv6_hoplimit; 8511 8512 tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN); 8513 tcp->tcp_tcph = tcph; 8514 tcph->th_offset_and_rsrvd[0] = (5 << 4); 8515 /* 8516 * IP wants our header length in the checksum field to 8517 * allow it to perform a single psuedo-header+checksum 8518 * calculation on behalf of TCP. 8519 * Include the adjustment for a source route when IPV6_RTHDR is set. 8520 */ 8521 sum = sizeof (tcph_t) + tcp->tcp_sum; 8522 sum = (sum >> 16) + (sum & 0xFFFF); 8523 U16_TO_ABE16(sum, tcph->th_sum); 8524 return (0); 8525 } 8526 8527 /* At minimum we need 8 bytes in the TCP header for the lookup */ 8528 #define ICMP_MIN_TCP_HDR 8 8529 8530 /* 8531 * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages 8532 * passed up by IP. The message is always received on the correct tcp_t. 8533 * Assumes that IP has pulled up everything up to and including the ICMP header. 8534 */ 8535 void 8536 tcp_icmp_error(tcp_t *tcp, mblk_t *mp) 8537 { 8538 icmph_t *icmph; 8539 ipha_t *ipha; 8540 int iph_hdr_length; 8541 tcph_t *tcph; 8542 boolean_t ipsec_mctl = B_FALSE; 8543 boolean_t secure; 8544 mblk_t *first_mp = mp; 8545 uint32_t new_mss; 8546 uint32_t ratio; 8547 size_t mp_size = MBLKL(mp); 8548 uint32_t seg_seq; 8549 tcp_stack_t *tcps = tcp->tcp_tcps; 8550 8551 /* Assume IP provides aligned packets - otherwise toss */ 8552 if (!OK_32PTR(mp->b_rptr)) { 8553 freemsg(mp); 8554 return; 8555 } 8556 8557 /* 8558 * Since ICMP errors are normal data marked with M_CTL when sent 8559 * to TCP or UDP, we have to look for a IPSEC_IN value to identify 8560 * packets starting with an ipsec_info_t, see ipsec_info.h. 8561 */ 8562 if ((mp_size == sizeof (ipsec_info_t)) && 8563 (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) { 8564 ASSERT(mp->b_cont != NULL); 8565 mp = mp->b_cont; 8566 /* IP should have done this */ 8567 ASSERT(OK_32PTR(mp->b_rptr)); 8568 mp_size = MBLKL(mp); 8569 ipsec_mctl = B_TRUE; 8570 } 8571 8572 /* 8573 * Verify that we have a complete outer IP header. If not, drop it. 8574 */ 8575 if (mp_size < sizeof (ipha_t)) { 8576 noticmpv4: 8577 freemsg(first_mp); 8578 return; 8579 } 8580 8581 ipha = (ipha_t *)mp->b_rptr; 8582 /* 8583 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent 8584 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. 
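 * The mblk being parsed looks like
 *	[ outer ipha_t | icmph_t | inner ipha_t | >= ICMP_MIN_TCP_HDR bytes of TCP ]
 * (plus an optional leading IPSEC_IN control block already stepped over
 * above); the checks below simply walk that layout and bail out to
 * noticmpv4 whenever a piece is missing or malformed.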
8585 */ 8586 switch (IPH_HDR_VERSION(ipha)) { 8587 case IPV6_VERSION: 8588 tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl); 8589 return; 8590 case IPV4_VERSION: 8591 break; 8592 default: 8593 goto noticmpv4; 8594 } 8595 8596 /* Skip past the outer IP and ICMP headers */ 8597 iph_hdr_length = IPH_HDR_LENGTH(ipha); 8598 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 8599 /* 8600 * If we don't have the correct outer IP header length or if the ULP 8601 * is not IPPROTO_ICMP or if we don't have a complete inner IP header 8602 * send it upstream. 8603 */ 8604 if (iph_hdr_length < sizeof (ipha_t) || 8605 ipha->ipha_protocol != IPPROTO_ICMP || 8606 (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { 8607 goto noticmpv4; 8608 } 8609 ipha = (ipha_t *)&icmph[1]; 8610 8611 /* Skip past the inner IP and find the ULP header */ 8612 iph_hdr_length = IPH_HDR_LENGTH(ipha); 8613 tcph = (tcph_t *)((char *)ipha + iph_hdr_length); 8614 /* 8615 * If we don't have the correct inner IP header length or if the ULP 8616 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR 8617 * bytes of TCP header, drop it. 8618 */ 8619 if (iph_hdr_length < sizeof (ipha_t) || 8620 ipha->ipha_protocol != IPPROTO_TCP || 8621 (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) { 8622 goto noticmpv4; 8623 } 8624 8625 if (TCP_IS_DETACHED_NONEAGER(tcp)) { 8626 if (ipsec_mctl) { 8627 secure = ipsec_in_is_secure(first_mp); 8628 } else { 8629 secure = B_FALSE; 8630 } 8631 if (secure) { 8632 /* 8633 * If we are willing to accept this in clear 8634 * we don't have to verify policy. 8635 */ 8636 if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) { 8637 if (!tcp_check_policy(tcp, first_mp, 8638 ipha, NULL, secure, ipsec_mctl)) { 8639 /* 8640 * tcp_check_policy called 8641 * ip_drop_packet() on failure. 8642 */ 8643 return; 8644 } 8645 } 8646 } 8647 } else if (ipsec_mctl) { 8648 /* 8649 * This is a hard_bound connection. IP has already 8650 * verified policy. We don't have to do it again. 8651 */ 8652 freeb(first_mp); 8653 first_mp = mp; 8654 ipsec_mctl = B_FALSE; 8655 } 8656 8657 seg_seq = ABE32_TO_U32(tcph->th_seq); 8658 /* 8659 * TCP SHOULD check that the TCP sequence number contained in 8660 * payload of the ICMP error message is within the range 8661 * SND.UNA <= SEG.SEQ < SND.NXT. 8662 */ 8663 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) { 8664 /* 8665 * If the ICMP message is bogus, should we kill the 8666 * connection, or should we just drop the bogus ICMP 8667 * message? It would probably make more sense to just 8668 * drop the message so that if this one managed to get 8669 * in, the real connection should not suffer. 8670 */ 8671 goto noticmpv4; 8672 } 8673 8674 switch (icmph->icmph_type) { 8675 case ICMP_DEST_UNREACHABLE: 8676 switch (icmph->icmph_code) { 8677 case ICMP_FRAGMENTATION_NEEDED: 8678 /* 8679 * Reduce the MSS based on the new MTU. This will 8680 * eliminate any fragmentation locally. 8681 * N.B. There may well be some funny side-effects on 8682 * the local send policy and the remote receive policy. 8683 * Pending further research, we provide 8684 * tcp_ignore_path_mtu just in case this proves 8685 * disastrous somewhere. 8686 * 8687 * After updating the MSS, retransmit part of the 8688 * dropped segment using the new mss by calling 8689 * tcp_wput_data(). Need to adjust all those 8690 * params to make sure tcp_wput_data() work properly. 8691 */ 8692 if (tcps->tcps_ignore_path_mtu) 8693 break; 8694 8695 /* 8696 * Decrease the MSS by time stamp options 8697 * IP options and IPSEC options. 
tcp_hdr_len 8698 * includes time stamp option and IP option 8699 * length. 8700 */ 8701 8702 new_mss = ntohs(icmph->icmph_du_mtu) - 8703 tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead; 8704 8705 /* 8706 * Only update the MSS if the new one is 8707 * smaller than the previous one. This is 8708 * to avoid problems when getting multiple 8709 * ICMP errors for the same MTU. 8710 */ 8711 if (new_mss >= tcp->tcp_mss) 8712 break; 8713 8714 /* 8715 * Stop doing PMTU if new_mss is less than 68 8716 * or less than tcp_mss_min. 8717 * The value 68 comes from rfc 1191. 8718 */ 8719 if (new_mss < MAX(68, tcps->tcps_mss_min)) 8720 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 8721 0; 8722 8723 ratio = tcp->tcp_cwnd / tcp->tcp_mss; 8724 ASSERT(ratio >= 1); 8725 tcp_mss_set(tcp, new_mss, B_TRUE); 8726 8727 /* 8728 * Make sure we have something to 8729 * send. 8730 */ 8731 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && 8732 (tcp->tcp_xmit_head != NULL)) { 8733 /* 8734 * Shrink tcp_cwnd in 8735 * proportion to the old MSS/new MSS. 8736 */ 8737 tcp->tcp_cwnd = ratio * tcp->tcp_mss; 8738 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 8739 (tcp->tcp_unsent == 0)) { 8740 tcp->tcp_rexmit_max = tcp->tcp_fss; 8741 } else { 8742 tcp->tcp_rexmit_max = tcp->tcp_snxt; 8743 } 8744 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 8745 tcp->tcp_rexmit = B_TRUE; 8746 tcp->tcp_dupack_cnt = 0; 8747 tcp->tcp_snd_burst = TCP_CWND_SS; 8748 tcp_ss_rexmit(tcp); 8749 } 8750 break; 8751 case ICMP_PORT_UNREACHABLE: 8752 case ICMP_PROTOCOL_UNREACHABLE: 8753 switch (tcp->tcp_state) { 8754 case TCPS_SYN_SENT: 8755 case TCPS_SYN_RCVD: 8756 /* 8757 * ICMP can snipe away incipient 8758 * TCP connections as long as 8759 * seq number is same as initial 8760 * send seq number. 8761 */ 8762 if (seg_seq == tcp->tcp_iss) { 8763 (void) tcp_clean_death(tcp, 8764 ECONNREFUSED, 6); 8765 } 8766 break; 8767 } 8768 break; 8769 case ICMP_HOST_UNREACHABLE: 8770 case ICMP_NET_UNREACHABLE: 8771 /* Record the error in case we finally time out. */ 8772 if (icmph->icmph_code == ICMP_HOST_UNREACHABLE) 8773 tcp->tcp_client_errno = EHOSTUNREACH; 8774 else 8775 tcp->tcp_client_errno = ENETUNREACH; 8776 if (tcp->tcp_state == TCPS_SYN_RCVD) { 8777 if (tcp->tcp_listener != NULL && 8778 tcp->tcp_listener->tcp_syn_defense) { 8779 /* 8780 * Ditch the half-open connection if we 8781 * suspect a SYN attack is under way. 8782 */ 8783 tcp_ip_ire_mark_advice(tcp); 8784 (void) tcp_clean_death(tcp, 8785 tcp->tcp_client_errno, 7); 8786 } 8787 } 8788 break; 8789 default: 8790 break; 8791 } 8792 break; 8793 case ICMP_SOURCE_QUENCH: { 8794 /* 8795 * use a global boolean to control 8796 * whether TCP should respond to ICMP_SOURCE_QUENCH. 8797 * The default is false. 8798 */ 8799 if (tcp_icmp_source_quench) { 8800 /* 8801 * Reduce the sending rate as if we got a 8802 * retransmit timeout 8803 */ 8804 uint32_t npkt; 8805 8806 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / 8807 tcp->tcp_mss; 8808 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss; 8809 tcp->tcp_cwnd = tcp->tcp_mss; 8810 tcp->tcp_cwnd_cnt = 0; 8811 } 8812 break; 8813 } 8814 } 8815 freemsg(first_mp); 8816 } 8817 8818 /* 8819 * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6 8820 * error messages passed up by IP. 8821 * Assumes that IP has pulled up all the extension headers as well 8822 * as the ICMPv6 header. 
8823 */ 8824 static void 8825 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl) 8826 { 8827 icmp6_t *icmp6; 8828 ip6_t *ip6h; 8829 uint16_t iph_hdr_length; 8830 tcpha_t *tcpha; 8831 uint8_t *nexthdrp; 8832 uint32_t new_mss; 8833 uint32_t ratio; 8834 boolean_t secure; 8835 mblk_t *first_mp = mp; 8836 size_t mp_size; 8837 uint32_t seg_seq; 8838 tcp_stack_t *tcps = tcp->tcp_tcps; 8839 8840 /* 8841 * The caller has determined if this is an IPSEC_IN packet and 8842 * set ipsec_mctl appropriately (see tcp_icmp_error). 8843 */ 8844 if (ipsec_mctl) 8845 mp = mp->b_cont; 8846 8847 mp_size = MBLKL(mp); 8848 8849 /* 8850 * Verify that we have a complete IP header. If not, send it upstream. 8851 */ 8852 if (mp_size < sizeof (ip6_t)) { 8853 noticmpv6: 8854 freemsg(first_mp); 8855 return; 8856 } 8857 8858 /* 8859 * Verify this is an ICMPV6 packet, else send it upstream. 8860 */ 8861 ip6h = (ip6_t *)mp->b_rptr; 8862 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 8863 iph_hdr_length = IPV6_HDR_LEN; 8864 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, 8865 &nexthdrp) || 8866 *nexthdrp != IPPROTO_ICMPV6) { 8867 goto noticmpv6; 8868 } 8869 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 8870 ip6h = (ip6_t *)&icmp6[1]; 8871 /* 8872 * Verify if we have a complete ICMP and inner IP header. 8873 */ 8874 if ((uchar_t *)&ip6h[1] > mp->b_wptr) 8875 goto noticmpv6; 8876 8877 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) 8878 goto noticmpv6; 8879 tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length); 8880 /* 8881 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't 8882 * have at least ICMP_MIN_TCP_HDR bytes of TCP header drop the 8883 * packet. 8884 */ 8885 if ((*nexthdrp != IPPROTO_TCP) || 8886 ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) { 8887 goto noticmpv6; 8888 } 8889 8890 /* 8891 * ICMP errors come on the right queue or come on 8892 * listener/global queue for detached connections and 8893 * get switched to the right queue. If it comes on the 8894 * right queue, policy check has already been done by IP 8895 * and thus free the first_mp without verifying the policy. 8896 * If it has come for a non-hard bound connection, we need 8897 * to verify policy as IP may not have done it. 8898 */ 8899 if (!tcp->tcp_hard_bound) { 8900 if (ipsec_mctl) { 8901 secure = ipsec_in_is_secure(first_mp); 8902 } else { 8903 secure = B_FALSE; 8904 } 8905 if (secure) { 8906 /* 8907 * If we are willing to accept this in clear 8908 * we don't have to verify policy. 8909 */ 8910 if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) { 8911 if (!tcp_check_policy(tcp, first_mp, 8912 NULL, ip6h, secure, ipsec_mctl)) { 8913 /* 8914 * tcp_check_policy called 8915 * ip_drop_packet() on failure. 8916 */ 8917 return; 8918 } 8919 } 8920 } 8921 } else if (ipsec_mctl) { 8922 /* 8923 * This is a hard_bound connection. IP has already 8924 * verified policy. We don't have to do it again. 8925 */ 8926 freeb(first_mp); 8927 first_mp = mp; 8928 ipsec_mctl = B_FALSE; 8929 } 8930 8931 seg_seq = ntohl(tcpha->tha_seq); 8932 /* 8933 * TCP SHOULD check that the TCP sequence number contained in 8934 * payload of the ICMP error message is within the range 8935 * SND.UNA <= SEG.SEQ < SND.NXT. 8936 */ 8937 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) { 8938 /* 8939 * If the ICMP message is bogus, should we kill the 8940 * connection, or should we just drop the bogus ICMP 8941 * message? 
It would probably make more sense to just 8942 * drop the message so that if this one managed to get 8943 * in, the real connection should not suffer. 8944 */ 8945 goto noticmpv6; 8946 } 8947 8948 switch (icmp6->icmp6_type) { 8949 case ICMP6_PACKET_TOO_BIG: 8950 /* 8951 * Reduce the MSS based on the new MTU. This will 8952 * eliminate any fragmentation locally. 8953 * N.B. There may well be some funny side-effects on 8954 * the local send policy and the remote receive policy. 8955 * Pending further research, we provide 8956 * tcp_ignore_path_mtu just in case this proves 8957 * disastrous somewhere. 8958 * 8959 * After updating the MSS, retransmit part of the 8960 * dropped segment using the new mss by calling 8961 * tcp_wput_data(). Need to adjust all those 8962 * params to make sure tcp_wput_data() work properly. 8963 */ 8964 if (tcps->tcps_ignore_path_mtu) 8965 break; 8966 8967 /* 8968 * Decrease the MSS by time stamp options 8969 * IP options and IPSEC options. tcp_hdr_len 8970 * includes time stamp option and IP option 8971 * length. 8972 */ 8973 new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len - 8974 tcp->tcp_ipsec_overhead; 8975 8976 /* 8977 * Only update the MSS if the new one is 8978 * smaller than the previous one. This is 8979 * to avoid problems when getting multiple 8980 * ICMP errors for the same MTU. 8981 */ 8982 if (new_mss >= tcp->tcp_mss) 8983 break; 8984 8985 ratio = tcp->tcp_cwnd / tcp->tcp_mss; 8986 ASSERT(ratio >= 1); 8987 tcp_mss_set(tcp, new_mss, B_TRUE); 8988 8989 /* 8990 * Make sure we have something to 8991 * send. 8992 */ 8993 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && 8994 (tcp->tcp_xmit_head != NULL)) { 8995 /* 8996 * Shrink tcp_cwnd in 8997 * proportion to the old MSS/new MSS. 8998 */ 8999 tcp->tcp_cwnd = ratio * tcp->tcp_mss; 9000 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 9001 (tcp->tcp_unsent == 0)) { 9002 tcp->tcp_rexmit_max = tcp->tcp_fss; 9003 } else { 9004 tcp->tcp_rexmit_max = tcp->tcp_snxt; 9005 } 9006 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 9007 tcp->tcp_rexmit = B_TRUE; 9008 tcp->tcp_dupack_cnt = 0; 9009 tcp->tcp_snd_burst = TCP_CWND_SS; 9010 tcp_ss_rexmit(tcp); 9011 } 9012 break; 9013 9014 case ICMP6_DST_UNREACH: 9015 switch (icmp6->icmp6_code) { 9016 case ICMP6_DST_UNREACH_NOPORT: 9017 if (((tcp->tcp_state == TCPS_SYN_SENT) || 9018 (tcp->tcp_state == TCPS_SYN_RCVD)) && 9019 (seg_seq == tcp->tcp_iss)) { 9020 (void) tcp_clean_death(tcp, 9021 ECONNREFUSED, 8); 9022 } 9023 break; 9024 9025 case ICMP6_DST_UNREACH_ADMIN: 9026 case ICMP6_DST_UNREACH_NOROUTE: 9027 case ICMP6_DST_UNREACH_BEYONDSCOPE: 9028 case ICMP6_DST_UNREACH_ADDR: 9029 /* Record the error in case we finally time out. */ 9030 tcp->tcp_client_errno = EHOSTUNREACH; 9031 if (((tcp->tcp_state == TCPS_SYN_SENT) || 9032 (tcp->tcp_state == TCPS_SYN_RCVD)) && 9033 (seg_seq == tcp->tcp_iss)) { 9034 if (tcp->tcp_listener != NULL && 9035 tcp->tcp_listener->tcp_syn_defense) { 9036 /* 9037 * Ditch the half-open connection if we 9038 * suspect a SYN attack is under way. 
9039 */ 9040 tcp_ip_ire_mark_advice(tcp); 9041 (void) tcp_clean_death(tcp, 9042 tcp->tcp_client_errno, 9); 9043 } 9044 } 9045 9046 9047 break; 9048 default: 9049 break; 9050 } 9051 break; 9052 9053 case ICMP6_PARAM_PROB: 9054 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ 9055 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && 9056 (uchar_t *)ip6h + icmp6->icmp6_pptr == 9057 (uchar_t *)nexthdrp) { 9058 if (tcp->tcp_state == TCPS_SYN_SENT || 9059 tcp->tcp_state == TCPS_SYN_RCVD) { 9060 (void) tcp_clean_death(tcp, 9061 ECONNREFUSED, 10); 9062 } 9063 break; 9064 } 9065 break; 9066 9067 case ICMP6_TIME_EXCEEDED: 9068 default: 9069 break; 9070 } 9071 freemsg(first_mp); 9072 } 9073 9074 /* 9075 * IP recognizes seven kinds of bind requests: 9076 * 9077 * - A zero-length address binds only to the protocol number. 9078 * 9079 * - A 4-byte address is treated as a request to 9080 * validate that the address is a valid local IPv4 9081 * address, appropriate for an application to bind to. 9082 * IP does the verification, but does not make any note 9083 * of the address at this time. 9084 * 9085 * - A 16-byte address contains is treated as a request 9086 * to validate a local IPv6 address, as the 4-byte 9087 * address case above. 9088 * 9089 * - A 16-byte sockaddr_in to validate the local IPv4 address and also 9090 * use it for the inbound fanout of packets. 9091 * 9092 * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also 9093 * use it for the inbound fanout of packets. 9094 * 9095 * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout 9096 * information consisting of local and remote addresses 9097 * and ports. In this case, the addresses are both 9098 * validated as appropriate for this operation, and, if 9099 * so, the information is retained for use in the 9100 * inbound fanout. 9101 * 9102 * - A 36-byte address address (ipa6_conn_t) containing complete IPv6 9103 * fanout information, like the 12-byte case above. 9104 * 9105 * IP will also fill in the IRE request mblk with information 9106 * regarding our peer. In all cases, we notify IP of our protocol 9107 * type by appending a single protocol byte to the bind request. 
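 *
 * As an illustration of what tcp_ip_bind_mp() below builds for the
 * full IPv4 fanout case (a sketch of the message layout, not an
 * additional interface):
 *
 *	M_PROTO mblk:	[struct T_bind_req][ipa_conn_t][IPPROTO_TCP byte]
 *	b_cont:		[IRE_DB_REQ_TYPE mblk of sizeof (ire_t) bytes]
 *
 * The other address lengths differ only in what sits between the
 * T_bind_req and the trailing protocol byte.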
9108 */ 9109 static mblk_t * 9110 tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, t_scalar_t addr_length) 9111 { 9112 char *cp; 9113 mblk_t *mp; 9114 struct T_bind_req *tbr; 9115 ipa_conn_t *ac; 9116 ipa6_conn_t *ac6; 9117 sin_t *sin; 9118 sin6_t *sin6; 9119 9120 ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ); 9121 ASSERT((tcp->tcp_family == AF_INET && 9122 tcp->tcp_ipversion == IPV4_VERSION) || 9123 (tcp->tcp_family == AF_INET6 && 9124 (tcp->tcp_ipversion == IPV4_VERSION || 9125 tcp->tcp_ipversion == IPV6_VERSION))); 9126 9127 mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI); 9128 if (!mp) 9129 return (mp); 9130 mp->b_datap->db_type = M_PROTO; 9131 tbr = (struct T_bind_req *)mp->b_rptr; 9132 tbr->PRIM_type = bind_prim; 9133 tbr->ADDR_offset = sizeof (*tbr); 9134 tbr->CONIND_number = 0; 9135 tbr->ADDR_length = addr_length; 9136 cp = (char *)&tbr[1]; 9137 switch (addr_length) { 9138 case sizeof (ipa_conn_t): 9139 ASSERT(tcp->tcp_family == AF_INET); 9140 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 9141 9142 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); 9143 if (mp->b_cont == NULL) { 9144 freemsg(mp); 9145 return (NULL); 9146 } 9147 mp->b_cont->b_wptr += sizeof (ire_t); 9148 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; 9149 9150 /* cp known to be 32 bit aligned */ 9151 ac = (ipa_conn_t *)cp; 9152 ac->ac_laddr = tcp->tcp_ipha->ipha_src; 9153 ac->ac_faddr = tcp->tcp_remote; 9154 ac->ac_fport = tcp->tcp_fport; 9155 ac->ac_lport = tcp->tcp_lport; 9156 tcp->tcp_hard_binding = 1; 9157 break; 9158 9159 case sizeof (ipa6_conn_t): 9160 ASSERT(tcp->tcp_family == AF_INET6); 9161 9162 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); 9163 if (mp->b_cont == NULL) { 9164 freemsg(mp); 9165 return (NULL); 9166 } 9167 mp->b_cont->b_wptr += sizeof (ire_t); 9168 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; 9169 9170 /* cp known to be 32 bit aligned */ 9171 ac6 = (ipa6_conn_t *)cp; 9172 if (tcp->tcp_ipversion == IPV4_VERSION) { 9173 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 9174 &ac6->ac6_laddr); 9175 } else { 9176 ac6->ac6_laddr = tcp->tcp_ip6h->ip6_src; 9177 } 9178 ac6->ac6_faddr = tcp->tcp_remote_v6; 9179 ac6->ac6_fport = tcp->tcp_fport; 9180 ac6->ac6_lport = tcp->tcp_lport; 9181 tcp->tcp_hard_binding = 1; 9182 break; 9183 9184 case sizeof (sin_t): 9185 /* 9186 * NOTE: IPV6_ADDR_LEN also has same size. 9187 * Use family to discriminate. 9188 */ 9189 if (tcp->tcp_family == AF_INET) { 9190 sin = (sin_t *)cp; 9191 9192 *sin = sin_null; 9193 sin->sin_family = AF_INET; 9194 sin->sin_addr.s_addr = tcp->tcp_bound_source; 9195 sin->sin_port = tcp->tcp_lport; 9196 break; 9197 } else { 9198 *(in6_addr_t *)cp = tcp->tcp_bound_source_v6; 9199 } 9200 break; 9201 9202 case sizeof (sin6_t): 9203 ASSERT(tcp->tcp_family == AF_INET6); 9204 sin6 = (sin6_t *)cp; 9205 9206 *sin6 = sin6_null; 9207 sin6->sin6_family = AF_INET6; 9208 sin6->sin6_addr = tcp->tcp_bound_source_v6; 9209 sin6->sin6_port = tcp->tcp_lport; 9210 break; 9211 9212 case IP_ADDR_LEN: 9213 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 9214 *(uint32_t *)cp = tcp->tcp_ipha->ipha_src; 9215 break; 9216 9217 } 9218 /* Add protocol number to end */ 9219 cp[addr_length] = (char)IPPROTO_TCP; 9220 mp->b_wptr = (uchar_t *)&cp[addr_length + 1]; 9221 return (mp); 9222 } 9223 9224 /* 9225 * Notify IP that we are having trouble with this connection. IP should 9226 * blow the IRE away and start over. 
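 *
 * Roughly, the M_IOCTL message built below looks like this
 * (illustrative sketch only):
 *
 *	M_IOCTL (IP_IOCTL), b_cont:
 *	    [ipid_t: IP_IOC_IRE_DELETE_NO_REPLY, IRE_CACHE][ipha_dst]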
9227 */ 9228 static void 9229 tcp_ip_notify(tcp_t *tcp) 9230 { 9231 struct iocblk *iocp; 9232 ipid_t *ipid; 9233 mblk_t *mp; 9234 9235 /* IPv6 has NUD thus notification to delete the IRE is not needed */ 9236 if (tcp->tcp_ipversion == IPV6_VERSION) 9237 return; 9238 9239 mp = mkiocb(IP_IOCTL); 9240 if (mp == NULL) 9241 return; 9242 9243 iocp = (struct iocblk *)mp->b_rptr; 9244 iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst); 9245 9246 mp->b_cont = allocb(iocp->ioc_count, BPRI_HI); 9247 if (!mp->b_cont) { 9248 freeb(mp); 9249 return; 9250 } 9251 9252 ipid = (ipid_t *)mp->b_cont->b_rptr; 9253 mp->b_cont->b_wptr += iocp->ioc_count; 9254 bzero(ipid, sizeof (*ipid)); 9255 ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY; 9256 ipid->ipid_ire_type = IRE_CACHE; 9257 ipid->ipid_addr_offset = sizeof (ipid_t); 9258 ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst); 9259 /* 9260 * Note: in the case of source routing we want to blow away the 9261 * route to the first source route hop. 9262 */ 9263 bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1], 9264 sizeof (tcp->tcp_ipha->ipha_dst)); 9265 9266 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 9267 } 9268 9269 /* Unlink and return any mblk that looks like it contains an ire */ 9270 static mblk_t * 9271 tcp_ire_mp(mblk_t *mp) 9272 { 9273 mblk_t *prev_mp; 9274 9275 for (;;) { 9276 prev_mp = mp; 9277 mp = mp->b_cont; 9278 if (mp == NULL) 9279 break; 9280 switch (DB_TYPE(mp)) { 9281 case IRE_DB_TYPE: 9282 case IRE_DB_REQ_TYPE: 9283 if (prev_mp != NULL) 9284 prev_mp->b_cont = mp->b_cont; 9285 mp->b_cont = NULL; 9286 return (mp); 9287 default: 9288 break; 9289 } 9290 } 9291 return (mp); 9292 } 9293 9294 /* 9295 * Timer callback routine for keepalive probe. We do a fake resend of 9296 * last ACKed byte. Then set a timer using RTO. When the timer expires, 9297 * check to see if we have heard anything from the other end for the last 9298 * RTO period. If we have, set the timer to expire for another 9299 * tcp_keepalive_intrvl and check again. If we have not, set a timer using 9300 * RTO << 1 and check again when it expires. Keep exponentially increasing 9301 * the timeout if we have not heard from the other side. If for more than 9302 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything, 9303 * kill the connection unless the keepalive abort threshold is 0. In 9304 * that case, we will probe "forever." 9305 */ 9306 static void 9307 tcp_keepalive_killer(void *arg) 9308 { 9309 mblk_t *mp; 9310 conn_t *connp = (conn_t *)arg; 9311 tcp_t *tcp = connp->conn_tcp; 9312 int32_t firetime; 9313 int32_t idletime; 9314 int32_t ka_intrvl; 9315 tcp_stack_t *tcps = tcp->tcp_tcps; 9316 9317 tcp->tcp_ka_tid = 0; 9318 9319 if (tcp->tcp_fused) 9320 return; 9321 9322 BUMP_MIB(&tcps->tcps_mib, tcpTimKeepalive); 9323 ka_intrvl = tcp->tcp_ka_interval; 9324 9325 /* 9326 * Keepalive probe should only be sent if the application has not 9327 * done a close on the connection. 9328 */ 9329 if (tcp->tcp_state > TCPS_CLOSE_WAIT) { 9330 return; 9331 } 9332 /* Timer fired too early, restart it. */ 9333 if (tcp->tcp_state < TCPS_ESTABLISHED) { 9334 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 9335 MSEC_TO_TICK(ka_intrvl)); 9336 return; 9337 } 9338 9339 idletime = TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time); 9340 /* 9341 * If we have not heard from the other side for a long 9342 * time, kill the connection unless the keepalive abort 9343 * threshold is 0. In that case, we will probe "forever." 
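 * For example (illustrative values, not necessarily the tunable
 * defaults): with ka_intrvl of 7,200,000 ms and tcp_ka_abort_thres
 * of 480,000 ms, an idletime above 7,680,000 ms trips the check
 * below and the connection is torn down with ETIMEDOUT (or with a
 * previously recorded tcp_client_errno).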
9344 */ 9345 if (tcp->tcp_ka_abort_thres != 0 && 9346 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { 9347 BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveDrop); 9348 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? 9349 tcp->tcp_client_errno : ETIMEDOUT, 11); 9350 return; 9351 } 9352 9353 if (tcp->tcp_snxt == tcp->tcp_suna && 9354 idletime >= ka_intrvl) { 9355 /* Fake resend of last ACKed byte. */ 9356 mblk_t *mp1 = allocb(1, BPRI_LO); 9357 9358 if (mp1 != NULL) { 9359 *mp1->b_wptr++ = '\0'; 9360 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL, 9361 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE); 9362 freeb(mp1); 9363 /* 9364 * if allocation failed, fall through to start the 9365 * timer back. 9366 */ 9367 if (mp != NULL) { 9368 TCP_RECORD_TRACE(tcp, mp, 9369 TCP_TRACE_SEND_PKT); 9370 tcp_send_data(tcp, tcp->tcp_wq, mp); 9371 BUMP_MIB(&tcps->tcps_mib, 9372 tcpTimKeepaliveProbe); 9373 if (tcp->tcp_ka_last_intrvl != 0) { 9374 int max; 9375 /* 9376 * We should probe again at least 9377 * in ka_intrvl, but not more than 9378 * tcp_rexmit_interval_max. 9379 */ 9380 max = tcps->tcps_rexmit_interval_max; 9381 firetime = MIN(ka_intrvl - 1, 9382 tcp->tcp_ka_last_intrvl << 1); 9383 if (firetime > max) 9384 firetime = max; 9385 } else { 9386 firetime = tcp->tcp_rto; 9387 } 9388 tcp->tcp_ka_tid = TCP_TIMER(tcp, 9389 tcp_keepalive_killer, 9390 MSEC_TO_TICK(firetime)); 9391 tcp->tcp_ka_last_intrvl = firetime; 9392 return; 9393 } 9394 } 9395 } else { 9396 tcp->tcp_ka_last_intrvl = 0; 9397 } 9398 9399 /* firetime can be negative if (mp1 == NULL || mp == NULL) */ 9400 if ((firetime = ka_intrvl - idletime) < 0) { 9401 firetime = ka_intrvl; 9402 } 9403 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 9404 MSEC_TO_TICK(firetime)); 9405 } 9406 9407 int 9408 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) 9409 { 9410 queue_t *q = tcp->tcp_rq; 9411 int32_t mss = tcp->tcp_mss; 9412 int maxpsz; 9413 9414 if (TCP_IS_DETACHED(tcp)) 9415 return (mss); 9416 9417 if (tcp->tcp_fused) { 9418 maxpsz = tcp_fuse_maxpsz_set(tcp); 9419 mss = INFPSZ; 9420 } else if (tcp->tcp_mdt || tcp->tcp_lso || tcp->tcp_maxpsz == 0) { 9421 /* 9422 * Set the sd_qn_maxpsz according to the socket send buffer 9423 * size, and sd_maxblk to INFPSZ (-1). This will essentially 9424 * instruct the stream head to copyin user data into contiguous 9425 * kernel-allocated buffers without breaking it up into smaller 9426 * chunks. We round up the buffer size to the nearest SMSS. 9427 */ 9428 maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss); 9429 if (tcp->tcp_kssl_ctx == NULL) 9430 mss = INFPSZ; 9431 else 9432 mss = SSL3_MAX_RECORD_LEN; 9433 } else { 9434 /* 9435 * Set sd_qn_maxpsz to approx half the (receivers) buffer 9436 * (and a multiple of the mss). This instructs the stream 9437 * head to break down larger than SMSS writes into SMSS- 9438 * size mblks, up to tcp_maxpsz_multiplier mblks at a time. 9439 */ 9440 maxpsz = tcp->tcp_maxpsz * mss; 9441 if (maxpsz > tcp->tcp_xmit_hiwater/2) { 9442 maxpsz = tcp->tcp_xmit_hiwater/2; 9443 /* Round up to nearest mss */ 9444 maxpsz = MSS_ROUNDUP(maxpsz, mss); 9445 } 9446 } 9447 (void) setmaxps(q, maxpsz); 9448 tcp->tcp_wq->q_maxpsz = maxpsz; 9449 9450 if (set_maxblk) 9451 (void) mi_set_sth_maxblk(q, mss); 9452 9453 return (mss); 9454 } 9455 9456 /* 9457 * Extract option values from a tcp header. We put any found values into the 9458 * tcpopt struct and return a bitmask saying which options were found. 
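 *
 * A minimal sketch of how a caller might consume the result
 * (illustrative only):
 *
 *	tcp_opt_t	tcpopt;
 *
 *	tcpopt.tcp = NULL;		(not interested in SACK blocks)
 *	options = tcp_parse_options(tcph, &tcpopt);
 *	if (options & TCP_OPT_MSS_PRESENT)
 *		mss = tcpopt.tcp_opt_mss;
 *	if (options & TCP_OPT_WSCALE_PRESENT)
 *		snd_ws = tcpopt.tcp_opt_wscale;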
9459 */ 9460 static int 9461 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) 9462 { 9463 uchar_t *endp; 9464 int len; 9465 uint32_t mss; 9466 uchar_t *up = (uchar_t *)tcph; 9467 int found = 0; 9468 int32_t sack_len; 9469 tcp_seq sack_begin, sack_end; 9470 tcp_t *tcp; 9471 9472 endp = up + TCP_HDR_LENGTH(tcph); 9473 up += TCP_MIN_HEADER_LENGTH; 9474 while (up < endp) { 9475 len = endp - up; 9476 switch (*up) { 9477 case TCPOPT_EOL: 9478 break; 9479 9480 case TCPOPT_NOP: 9481 up++; 9482 continue; 9483 9484 case TCPOPT_MAXSEG: 9485 if (len < TCPOPT_MAXSEG_LEN || 9486 up[1] != TCPOPT_MAXSEG_LEN) 9487 break; 9488 9489 mss = BE16_TO_U16(up+2); 9490 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 9491 tcpopt->tcp_opt_mss = mss; 9492 found |= TCP_OPT_MSS_PRESENT; 9493 9494 up += TCPOPT_MAXSEG_LEN; 9495 continue; 9496 9497 case TCPOPT_WSCALE: 9498 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 9499 break; 9500 9501 if (up[2] > TCP_MAX_WINSHIFT) 9502 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 9503 else 9504 tcpopt->tcp_opt_wscale = up[2]; 9505 found |= TCP_OPT_WSCALE_PRESENT; 9506 9507 up += TCPOPT_WS_LEN; 9508 continue; 9509 9510 case TCPOPT_SACK_PERMITTED: 9511 if (len < TCPOPT_SACK_OK_LEN || 9512 up[1] != TCPOPT_SACK_OK_LEN) 9513 break; 9514 found |= TCP_OPT_SACK_OK_PRESENT; 9515 up += TCPOPT_SACK_OK_LEN; 9516 continue; 9517 9518 case TCPOPT_SACK: 9519 if (len <= 2 || up[1] <= 2 || len < up[1]) 9520 break; 9521 9522 /* If TCP is not interested in SACK blks... */ 9523 if ((tcp = tcpopt->tcp) == NULL) { 9524 up += up[1]; 9525 continue; 9526 } 9527 sack_len = up[1] - TCPOPT_HEADER_LEN; 9528 up += TCPOPT_HEADER_LEN; 9529 9530 /* 9531 * If the list is empty, allocate one and assume 9532 * nothing is sack'ed. 9533 */ 9534 ASSERT(tcp->tcp_sack_info != NULL); 9535 if (tcp->tcp_notsack_list == NULL) { 9536 tcp_notsack_update(&(tcp->tcp_notsack_list), 9537 tcp->tcp_suna, tcp->tcp_snxt, 9538 &(tcp->tcp_num_notsack_blk), 9539 &(tcp->tcp_cnt_notsack_list)); 9540 9541 /* 9542 * Make sure tcp_notsack_list is not NULL. 9543 * This happens when kmem_alloc(KM_NOSLEEP) 9544 * returns NULL. 9545 */ 9546 if (tcp->tcp_notsack_list == NULL) { 9547 up += sack_len; 9548 continue; 9549 } 9550 tcp->tcp_fack = tcp->tcp_suna; 9551 } 9552 9553 while (sack_len > 0) { 9554 if (up + 8 > endp) { 9555 up = endp; 9556 break; 9557 } 9558 sack_begin = BE32_TO_U32(up); 9559 up += 4; 9560 sack_end = BE32_TO_U32(up); 9561 up += 4; 9562 sack_len -= 8; 9563 /* 9564 * Bounds checking. Make sure the SACK 9565 * info is within tcp_suna and tcp_snxt. 9566 * If this SACK blk is out of bound, ignore 9567 * it but continue to parse the following 9568 * blks. 
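 * For example (illustrative sequence numbers): with tcp_suna of
 * 1000 and tcp_snxt of 5000, a block [2000, 3000) is inserted into
 * the notsack list, while [6000, 7000) is silently skipped.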
9569 */ 9570 if (SEQ_LEQ(sack_end, sack_begin) || 9571 SEQ_LT(sack_begin, tcp->tcp_suna) || 9572 SEQ_GT(sack_end, tcp->tcp_snxt)) { 9573 continue; 9574 } 9575 tcp_notsack_insert(&(tcp->tcp_notsack_list), 9576 sack_begin, sack_end, 9577 &(tcp->tcp_num_notsack_blk), 9578 &(tcp->tcp_cnt_notsack_list)); 9579 if (SEQ_GT(sack_end, tcp->tcp_fack)) { 9580 tcp->tcp_fack = sack_end; 9581 } 9582 } 9583 found |= TCP_OPT_SACK_PRESENT; 9584 continue; 9585 9586 case TCPOPT_TSTAMP: 9587 if (len < TCPOPT_TSTAMP_LEN || 9588 up[1] != TCPOPT_TSTAMP_LEN) 9589 break; 9590 9591 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); 9592 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); 9593 9594 found |= TCP_OPT_TSTAMP_PRESENT; 9595 9596 up += TCPOPT_TSTAMP_LEN; 9597 continue; 9598 9599 default: 9600 if (len <= 1 || len < (int)up[1] || up[1] == 0) 9601 break; 9602 up += up[1]; 9603 continue; 9604 } 9605 break; 9606 } 9607 return (found); 9608 } 9609 9610 /* 9611 * Set the mss associated with a particular tcp based on its current value, 9612 * and a new one passed in. Observe minimums and maximums, and reset 9613 * other state variables that we want to view as multiples of mss. 9614 * 9615 * This function is called mainly because values like tcp_mss, tcp_cwnd, 9616 * highwater marks etc. need to be initialized or adjusted. 9617 * 1) From tcp_process_options() when the other side's SYN/SYN-ACK 9618 * packet arrives. 9619 * 2) We need to set a new MSS when ICMP_FRAGMENTATION_NEEDED or 9620 * ICMP6_PACKET_TOO_BIG arrives. 9621 * 3) From tcp_paws_check() if the other side stops sending the timestamp, 9622 * to increase the MSS to use the extra bytes available. 9623 * 9624 * Callers except tcp_paws_check() ensure that they only reduce mss. 9625 */ 9626 static void 9627 tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss) 9628 { 9629 uint32_t mss_max; 9630 tcp_stack_t *tcps = tcp->tcp_tcps; 9631 9632 if (tcp->tcp_ipversion == IPV4_VERSION) 9633 mss_max = tcps->tcps_mss_max_ipv4; 9634 else 9635 mss_max = tcps->tcps_mss_max_ipv6; 9636 9637 if (mss < tcps->tcps_mss_min) 9638 mss = tcps->tcps_mss_min; 9639 if (mss > mss_max) 9640 mss = mss_max; 9641 /* 9642 * Unless naglim has been set by our client to 9643 * a non-mss value, force naglim to track mss. 9644 * This can help to aggregate small writes. 9645 */ 9646 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) 9647 tcp->tcp_naglim = mss; 9648 /* 9649 * TCP should be able to buffer at least 4 MSS data for obvious 9650 * performance reason. 9651 */ 9652 if ((mss << 2) > tcp->tcp_xmit_hiwater) 9653 tcp->tcp_xmit_hiwater = mss << 2; 9654 9655 if (do_ss) { 9656 /* 9657 * Either the tcp_cwnd is as yet uninitialized, or mss is 9658 * changing due to a reduction in MTU, presumably as a 9659 * result of a new path component, reset cwnd to its 9660 * "initial" value, as a multiple of the new mss. 9661 */ 9662 SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_initial); 9663 } else { 9664 /* 9665 * Called by tcp_paws_check(), the mss increased 9666 * marginally to allow use of space previously taken 9667 * by the timestamp option. It would be inappropriate 9668 * to apply slow start or tcp_init_cwnd values to 9669 * tcp_cwnd, simply adjust to a multiple of the new mss. 
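 * For example (illustrative numbers): with an old mss of 1448, a
 * cwnd of 14480 (ten segments) and a new mss of 1460, the
 * adjustment below yields a cwnd of 10 * 1460 = 14600.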
9670 */ 9671 tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; 9672 tcp->tcp_cwnd_cnt = 0; 9673 } 9674 tcp->tcp_mss = mss; 9675 (void) tcp_maxpsz_set(tcp, B_TRUE); 9676 } 9677 9678 /* For /dev/tcp aka AF_INET open */ 9679 static int 9680 tcp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9681 { 9682 return (tcp_open(q, devp, flag, sflag, credp, B_FALSE)); 9683 } 9684 9685 /* For /dev/tcp6 aka AF_INET6 open */ 9686 static int 9687 tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9688 { 9689 return (tcp_open(q, devp, flag, sflag, credp, B_TRUE)); 9690 } 9691 9692 static int 9693 tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, 9694 boolean_t isv6) 9695 { 9696 tcp_t *tcp = NULL; 9697 conn_t *connp; 9698 int err; 9699 vmem_t *minor_arena = NULL; 9700 dev_t conn_dev; 9701 zoneid_t zoneid; 9702 tcp_stack_t *tcps = NULL; 9703 9704 if (q->q_ptr != NULL) 9705 return (0); 9706 9707 if (sflag == MODOPEN) 9708 return (EINVAL); 9709 9710 if (!(flag & SO_ACCEPTOR)) { 9711 /* 9712 * Special case for install: miniroot needs to be able to 9713 * access files via NFS as though it were always in the 9714 * global zone. 9715 */ 9716 if (credp == kcred && nfs_global_client_only != 0) { 9717 zoneid = GLOBAL_ZONEID; 9718 tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)-> 9719 netstack_tcp; 9720 ASSERT(tcps != NULL); 9721 } else { 9722 netstack_t *ns; 9723 9724 ns = netstack_find_by_cred(credp); 9725 ASSERT(ns != NULL); 9726 tcps = ns->netstack_tcp; 9727 ASSERT(tcps != NULL); 9728 9729 /* 9730 * For exclusive stacks we set the zoneid to zero 9731 * to make TCP operate as if in the global zone. 9732 */ 9733 if (tcps->tcps_netstack->netstack_stackid != 9734 GLOBAL_NETSTACKID) 9735 zoneid = GLOBAL_ZONEID; 9736 else 9737 zoneid = crgetzoneid(credp); 9738 } 9739 /* 9740 * For stackid zero this is done from strplumb.c, but 9741 * non-zero stackids are handled here. 9742 */ 9743 if (tcps->tcps_g_q == NULL && 9744 tcps->tcps_netstack->netstack_stackid != 9745 GLOBAL_NETSTACKID) { 9746 tcp_g_q_setup(tcps); 9747 } 9748 } 9749 9750 if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && 9751 ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) { 9752 minor_arena = ip_minor_arena_la; 9753 } else { 9754 /* 9755 * Either minor numbers in the large arena were exhausted 9756 * or a non socket application is doing the open. 9757 * Try to allocate from the small arena. 9758 */ 9759 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) { 9760 if (tcps != NULL) 9761 netstack_rele(tcps->tcps_netstack); 9762 return (EBUSY); 9763 } 9764 minor_arena = ip_minor_arena_sa; 9765 } 9766 ASSERT(minor_arena != NULL); 9767 9768 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 9769 9770 if (flag & SO_ACCEPTOR) { 9771 /* No netstack_find_by_cred, hence no netstack_rele needed */ 9772 ASSERT(tcps == NULL); 9773 q->q_qinfo = &tcp_acceptor_rinit; 9774 /* 9775 * the conn_dev and minor_arena will be subsequently used by 9776 * tcp_wput_accept() and tcpclose_accept() to figure out the 9777 * minor device number for this connection from the q_ptr. 9778 */ 9779 RD(q)->q_ptr = (void *)conn_dev; 9780 WR(q)->q_qinfo = &tcp_acceptor_winit; 9781 WR(q)->q_ptr = (void *)minor_arena; 9782 qprocson(q); 9783 return (0); 9784 } 9785 9786 connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt), tcps); 9787 /* 9788 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt, 9789 * so we drop it by one. 
9790 */ 9791 netstack_rele(tcps->tcps_netstack); 9792 if (connp == NULL) { 9793 inet_minor_free(minor_arena, conn_dev); 9794 q->q_ptr = NULL; 9795 return (ENOSR); 9796 } 9797 connp->conn_sqp = IP_SQUEUE_GET(lbolt); 9798 tcp = connp->conn_tcp; 9799 9800 q->q_ptr = WR(q)->q_ptr = connp; 9801 if (isv6) { 9802 connp->conn_flags |= (IPCL_TCP6|IPCL_ISV6); 9803 connp->conn_send = ip_output_v6; 9804 connp->conn_af_isv6 = B_TRUE; 9805 connp->conn_pkt_isv6 = B_TRUE; 9806 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; 9807 tcp->tcp_ipversion = IPV6_VERSION; 9808 tcp->tcp_family = AF_INET6; 9809 tcp->tcp_mss = tcps->tcps_mss_def_ipv6; 9810 } else { 9811 connp->conn_flags |= IPCL_TCP4; 9812 connp->conn_send = ip_output; 9813 connp->conn_af_isv6 = B_FALSE; 9814 connp->conn_pkt_isv6 = B_FALSE; 9815 tcp->tcp_ipversion = IPV4_VERSION; 9816 tcp->tcp_family = AF_INET; 9817 tcp->tcp_mss = tcps->tcps_mss_def_ipv4; 9818 } 9819 9820 /* 9821 * TCP keeps a copy of cred for cache locality reasons but 9822 * we put a reference only once. If connp->conn_cred 9823 * becomes invalid, tcp_cred should also be set to NULL. 9824 */ 9825 tcp->tcp_cred = connp->conn_cred = credp; 9826 crhold(connp->conn_cred); 9827 tcp->tcp_cpid = curproc->p_pid; 9828 tcp->tcp_open_time = lbolt64; 9829 connp->conn_zoneid = zoneid; 9830 connp->conn_mlp_type = mlptSingle; 9831 connp->conn_ulp_labeled = !is_system_labeled(); 9832 ASSERT(connp->conn_netstack == tcps->tcps_netstack); 9833 ASSERT(tcp->tcp_tcps == tcps); 9834 9835 /* 9836 * If the caller has the process-wide flag set, then default to MAC 9837 * exempt mode. This allows read-down to unlabeled hosts. 9838 */ 9839 if (getpflags(NET_MAC_AWARE, credp) != 0) 9840 connp->conn_mac_exempt = B_TRUE; 9841 9842 connp->conn_dev = conn_dev; 9843 connp->conn_minor_arena = minor_arena; 9844 9845 ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6); 9846 ASSERT(WR(q)->q_qinfo == &tcp_winit); 9847 9848 if (flag & SO_SOCKSTR) { 9849 /* 9850 * No need to insert a socket in tcp acceptor hash. 9851 * If it was a socket acceptor stream, we dealt with 9852 * it above. A socket listener can never accept a 9853 * connection and doesn't need acceptor_id. 9854 */ 9855 connp->conn_flags |= IPCL_SOCKET; 9856 tcp->tcp_issocket = 1; 9857 WR(q)->q_qinfo = &tcp_sock_winit; 9858 } else { 9859 #ifdef _ILP32 9860 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); 9861 #else 9862 tcp->tcp_acceptor_id = conn_dev; 9863 #endif /* _ILP32 */ 9864 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 9865 } 9866 9867 if (tcps->tcps_trace) 9868 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_SLEEP); 9869 9870 err = tcp_init(tcp, q); 9871 if (err != 0) { 9872 inet_minor_free(connp->conn_minor_arena, connp->conn_dev); 9873 tcp_acceptor_hash_remove(tcp); 9874 CONN_DEC_REF(connp); 9875 q->q_ptr = WR(q)->q_ptr = NULL; 9876 return (err); 9877 } 9878 9879 RD(q)->q_hiwat = tcps->tcps_recv_hiwat; 9880 tcp->tcp_rwnd = tcps->tcps_recv_hiwat; 9881 9882 /* Non-zero default values */ 9883 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 9884 /* 9885 * Put the ref for TCP. Ref for IP was already put 9886 * by ipcl_conn_create. 
Also Make the conn_t globally 9887 * visible to walkers 9888 */ 9889 mutex_enter(&connp->conn_lock); 9890 CONN_INC_REF_LOCKED(connp); 9891 ASSERT(connp->conn_ref == 2); 9892 connp->conn_state_flags &= ~CONN_INCIPIENT; 9893 mutex_exit(&connp->conn_lock); 9894 9895 qprocson(q); 9896 return (0); 9897 } 9898 9899 /* 9900 * Some TCP options can be "set" by requesting them in the option 9901 * buffer. This is needed for XTI feature test though we do not 9902 * allow it in general. We interpret that this mechanism is more 9903 * applicable to OSI protocols and need not be allowed in general. 9904 * This routine filters out options for which it is not allowed (most) 9905 * and lets through those (few) for which it is. [ The XTI interface 9906 * test suite specifics will imply that any XTI_GENERIC level XTI_* if 9907 * ever implemented will have to be allowed here ]. 9908 */ 9909 static boolean_t 9910 tcp_allow_connopt_set(int level, int name) 9911 { 9912 9913 switch (level) { 9914 case IPPROTO_TCP: 9915 switch (name) { 9916 case TCP_NODELAY: 9917 return (B_TRUE); 9918 default: 9919 return (B_FALSE); 9920 } 9921 /*NOTREACHED*/ 9922 default: 9923 return (B_FALSE); 9924 } 9925 /*NOTREACHED*/ 9926 } 9927 9928 /* 9929 * This routine gets default values of certain options whose default 9930 * values are maintained by protocol specific code 9931 */ 9932 /* ARGSUSED */ 9933 int 9934 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 9935 { 9936 int32_t *i1 = (int32_t *)ptr; 9937 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 9938 9939 switch (level) { 9940 case IPPROTO_TCP: 9941 switch (name) { 9942 case TCP_NOTIFY_THRESHOLD: 9943 *i1 = tcps->tcps_ip_notify_interval; 9944 break; 9945 case TCP_ABORT_THRESHOLD: 9946 *i1 = tcps->tcps_ip_abort_interval; 9947 break; 9948 case TCP_CONN_NOTIFY_THRESHOLD: 9949 *i1 = tcps->tcps_ip_notify_cinterval; 9950 break; 9951 case TCP_CONN_ABORT_THRESHOLD: 9952 *i1 = tcps->tcps_ip_abort_cinterval; 9953 break; 9954 default: 9955 return (-1); 9956 } 9957 break; 9958 case IPPROTO_IP: 9959 switch (name) { 9960 case IP_TTL: 9961 *i1 = tcps->tcps_ipv4_ttl; 9962 break; 9963 default: 9964 return (-1); 9965 } 9966 break; 9967 case IPPROTO_IPV6: 9968 switch (name) { 9969 case IPV6_UNICAST_HOPS: 9970 *i1 = tcps->tcps_ipv6_hoplimit; 9971 break; 9972 default: 9973 return (-1); 9974 } 9975 break; 9976 default: 9977 return (-1); 9978 } 9979 return (sizeof (int)); 9980 } 9981 9982 9983 /* 9984 * TCP routine to get the values of options. 9985 */ 9986 int 9987 tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 9988 { 9989 int *i1 = (int *)ptr; 9990 conn_t *connp = Q_TO_CONN(q); 9991 tcp_t *tcp = connp->conn_tcp; 9992 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 9993 9994 switch (level) { 9995 case SOL_SOCKET: 9996 switch (name) { 9997 case SO_LINGER: { 9998 struct linger *lgr = (struct linger *)ptr; 9999 10000 lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0; 10001 lgr->l_linger = tcp->tcp_lingertime; 10002 } 10003 return (sizeof (struct linger)); 10004 case SO_DEBUG: 10005 *i1 = tcp->tcp_debug ? SO_DEBUG : 0; 10006 break; 10007 case SO_KEEPALIVE: 10008 *i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0; 10009 break; 10010 case SO_DONTROUTE: 10011 *i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0; 10012 break; 10013 case SO_USELOOPBACK: 10014 *i1 = tcp->tcp_useloopback ? SO_USELOOPBACK : 0; 10015 break; 10016 case SO_BROADCAST: 10017 *i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0; 10018 break; 10019 case SO_REUSEADDR: 10020 *i1 = tcp->tcp_reuseaddr ? 
SO_REUSEADDR : 0; 10021 break; 10022 case SO_OOBINLINE: 10023 *i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0; 10024 break; 10025 case SO_DGRAM_ERRIND: 10026 *i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0; 10027 break; 10028 case SO_TYPE: 10029 *i1 = SOCK_STREAM; 10030 break; 10031 case SO_SNDBUF: 10032 *i1 = tcp->tcp_xmit_hiwater; 10033 break; 10034 case SO_RCVBUF: 10035 *i1 = RD(q)->q_hiwat; 10036 break; 10037 case SO_SND_COPYAVOID: 10038 *i1 = tcp->tcp_snd_zcopy_on ? 10039 SO_SND_COPYAVOID : 0; 10040 break; 10041 case SO_ALLZONES: 10042 *i1 = connp->conn_allzones ? 1 : 0; 10043 break; 10044 case SO_ANON_MLP: 10045 *i1 = connp->conn_anon_mlp; 10046 break; 10047 case SO_MAC_EXEMPT: 10048 *i1 = connp->conn_mac_exempt; 10049 break; 10050 case SO_EXCLBIND: 10051 *i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0; 10052 break; 10053 case SO_PROTOTYPE: 10054 *i1 = IPPROTO_TCP; 10055 break; 10056 case SO_DOMAIN: 10057 *i1 = tcp->tcp_family; 10058 break; 10059 default: 10060 return (-1); 10061 } 10062 break; 10063 case IPPROTO_TCP: 10064 switch (name) { 10065 case TCP_NODELAY: 10066 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; 10067 break; 10068 case TCP_MAXSEG: 10069 *i1 = tcp->tcp_mss; 10070 break; 10071 case TCP_NOTIFY_THRESHOLD: 10072 *i1 = (int)tcp->tcp_first_timer_threshold; 10073 break; 10074 case TCP_ABORT_THRESHOLD: 10075 *i1 = tcp->tcp_second_timer_threshold; 10076 break; 10077 case TCP_CONN_NOTIFY_THRESHOLD: 10078 *i1 = tcp->tcp_first_ctimer_threshold; 10079 break; 10080 case TCP_CONN_ABORT_THRESHOLD: 10081 *i1 = tcp->tcp_second_ctimer_threshold; 10082 break; 10083 case TCP_RECVDSTADDR: 10084 *i1 = tcp->tcp_recvdstaddr; 10085 break; 10086 case TCP_ANONPRIVBIND: 10087 *i1 = tcp->tcp_anon_priv_bind; 10088 break; 10089 case TCP_EXCLBIND: 10090 *i1 = tcp->tcp_exclbind ? TCP_EXCLBIND : 0; 10091 break; 10092 case TCP_INIT_CWND: 10093 *i1 = tcp->tcp_init_cwnd; 10094 break; 10095 case TCP_KEEPALIVE_THRESHOLD: 10096 *i1 = tcp->tcp_ka_interval; 10097 break; 10098 case TCP_KEEPALIVE_ABORT_THRESHOLD: 10099 *i1 = tcp->tcp_ka_abort_thres; 10100 break; 10101 case TCP_CORK: 10102 *i1 = tcp->tcp_cork; 10103 break; 10104 default: 10105 return (-1); 10106 } 10107 break; 10108 case IPPROTO_IP: 10109 if (tcp->tcp_family != AF_INET) 10110 return (-1); 10111 switch (name) { 10112 case IP_OPTIONS: 10113 case T_IP_OPTIONS: { 10114 /* 10115 * This is compatible with BSD in that in only return 10116 * the reverse source route with the final destination 10117 * as the last entry. The first 4 bytes of the option 10118 * will contain the final destination. 10119 */ 10120 int opt_len; 10121 10122 opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha; 10123 opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH; 10124 ASSERT(opt_len >= 0); 10125 /* Caller ensures enough space */ 10126 if (opt_len > 0) { 10127 /* 10128 * TODO: Do we have to handle getsockopt on an 10129 * initiator as well? 10130 */ 10131 return (ip_opt_get_user(tcp->tcp_ipha, ptr)); 10132 } 10133 return (0); 10134 } 10135 case IP_TOS: 10136 case T_IP_TOS: 10137 *i1 = (int)tcp->tcp_ipha->ipha_type_of_service; 10138 break; 10139 case IP_TTL: 10140 *i1 = (int)tcp->tcp_ipha->ipha_ttl; 10141 break; 10142 case IP_NEXTHOP: 10143 /* Handled at IP level */ 10144 return (-EINVAL); 10145 default: 10146 return (-1); 10147 } 10148 break; 10149 case IPPROTO_IPV6: 10150 /* 10151 * IPPROTO_IPV6 options are only supported for sockets 10152 * that are using IPv6 on the wire. 
10153 */ 10154 if (tcp->tcp_ipversion != IPV6_VERSION) { 10155 return (-1); 10156 } 10157 switch (name) { 10158 case IPV6_UNICAST_HOPS: 10159 *i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops; 10160 break; /* goto sizeof (int) option return */ 10161 case IPV6_BOUND_IF: 10162 /* Zero if not set */ 10163 *i1 = tcp->tcp_bound_if; 10164 break; /* goto sizeof (int) option return */ 10165 case IPV6_RECVPKTINFO: 10166 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) 10167 *i1 = 1; 10168 else 10169 *i1 = 0; 10170 break; /* goto sizeof (int) option return */ 10171 case IPV6_RECVTCLASS: 10172 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS) 10173 *i1 = 1; 10174 else 10175 *i1 = 0; 10176 break; /* goto sizeof (int) option return */ 10177 case IPV6_RECVHOPLIMIT: 10178 if (tcp->tcp_ipv6_recvancillary & 10179 TCP_IPV6_RECVHOPLIMIT) 10180 *i1 = 1; 10181 else 10182 *i1 = 0; 10183 break; /* goto sizeof (int) option return */ 10184 case IPV6_RECVHOPOPTS: 10185 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) 10186 *i1 = 1; 10187 else 10188 *i1 = 0; 10189 break; /* goto sizeof (int) option return */ 10190 case IPV6_RECVDSTOPTS: 10191 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS) 10192 *i1 = 1; 10193 else 10194 *i1 = 0; 10195 break; /* goto sizeof (int) option return */ 10196 case _OLD_IPV6_RECVDSTOPTS: 10197 if (tcp->tcp_ipv6_recvancillary & 10198 TCP_OLD_IPV6_RECVDSTOPTS) 10199 *i1 = 1; 10200 else 10201 *i1 = 0; 10202 break; /* goto sizeof (int) option return */ 10203 case IPV6_RECVRTHDR: 10204 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) 10205 *i1 = 1; 10206 else 10207 *i1 = 0; 10208 break; /* goto sizeof (int) option return */ 10209 case IPV6_RECVRTHDRDSTOPTS: 10210 if (tcp->tcp_ipv6_recvancillary & 10211 TCP_IPV6_RECVRTDSTOPTS) 10212 *i1 = 1; 10213 else 10214 *i1 = 0; 10215 break; /* goto sizeof (int) option return */ 10216 case IPV6_PKTINFO: { 10217 /* XXX assumes that caller has room for max size! 
*/ 10218 struct in6_pktinfo *pkti; 10219 10220 pkti = (struct in6_pktinfo *)ptr; 10221 if (ipp->ipp_fields & IPPF_IFINDEX) 10222 pkti->ipi6_ifindex = ipp->ipp_ifindex; 10223 else 10224 pkti->ipi6_ifindex = 0; 10225 if (ipp->ipp_fields & IPPF_ADDR) 10226 pkti->ipi6_addr = ipp->ipp_addr; 10227 else 10228 pkti->ipi6_addr = ipv6_all_zeros; 10229 return (sizeof (struct in6_pktinfo)); 10230 } 10231 case IPV6_TCLASS: 10232 if (ipp->ipp_fields & IPPF_TCLASS) 10233 *i1 = ipp->ipp_tclass; 10234 else 10235 *i1 = IPV6_FLOW_TCLASS( 10236 IPV6_DEFAULT_VERS_AND_FLOW); 10237 break; /* goto sizeof (int) option return */ 10238 case IPV6_NEXTHOP: { 10239 sin6_t *sin6 = (sin6_t *)ptr; 10240 10241 if (!(ipp->ipp_fields & IPPF_NEXTHOP)) 10242 return (0); 10243 *sin6 = sin6_null; 10244 sin6->sin6_family = AF_INET6; 10245 sin6->sin6_addr = ipp->ipp_nexthop; 10246 return (sizeof (sin6_t)); 10247 } 10248 case IPV6_HOPOPTS: 10249 if (!(ipp->ipp_fields & IPPF_HOPOPTS)) 10250 return (0); 10251 if (ipp->ipp_hopoptslen <= tcp->tcp_label_len) 10252 return (0); 10253 bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len, 10254 ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len); 10255 if (tcp->tcp_label_len > 0) { 10256 ptr[0] = ((char *)ipp->ipp_hopopts)[0]; 10257 ptr[1] = (ipp->ipp_hopoptslen - 10258 tcp->tcp_label_len + 7) / 8 - 1; 10259 } 10260 return (ipp->ipp_hopoptslen - tcp->tcp_label_len); 10261 case IPV6_RTHDRDSTOPTS: 10262 if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) 10263 return (0); 10264 bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); 10265 return (ipp->ipp_rtdstoptslen); 10266 case IPV6_RTHDR: 10267 if (!(ipp->ipp_fields & IPPF_RTHDR)) 10268 return (0); 10269 bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); 10270 return (ipp->ipp_rthdrlen); 10271 case IPV6_DSTOPTS: 10272 if (!(ipp->ipp_fields & IPPF_DSTOPTS)) 10273 return (0); 10274 bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); 10275 return (ipp->ipp_dstoptslen); 10276 case IPV6_SRC_PREFERENCES: 10277 return (ip6_get_src_preferences(connp, 10278 (uint32_t *)ptr)); 10279 case IPV6_PATHMTU: { 10280 struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr; 10281 10282 if (tcp->tcp_state < TCPS_ESTABLISHED) 10283 return (-1); 10284 10285 return (ip_fill_mtuinfo(&connp->conn_remv6, 10286 connp->conn_fport, mtuinfo, 10287 connp->conn_netstack)); 10288 } 10289 default: 10290 return (-1); 10291 } 10292 break; 10293 default: 10294 return (-1); 10295 } 10296 return (sizeof (int)); 10297 } 10298 10299 /* 10300 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 10301 * Parameters are assumed to be verified by the caller. 10302 */ 10303 /* ARGSUSED */ 10304 int 10305 tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, 10306 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 10307 void *thisdg_attrs, cred_t *cr, mblk_t *mblk) 10308 { 10309 conn_t *connp = Q_TO_CONN(q); 10310 tcp_t *tcp = connp->conn_tcp; 10311 int *i1 = (int *)invalp; 10312 boolean_t onoff = (*i1 == 0) ? 0 : 1; 10313 boolean_t checkonly; 10314 int reterr; 10315 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 10316 10317 switch (optset_context) { 10318 case SETFN_OPTCOM_CHECKONLY: 10319 checkonly = B_TRUE; 10320 /* 10321 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 10322 * inlen != 0 implies value supplied and 10323 * we have to "pretend" to set it. 10324 * inlen == 0 implies that there is no 10325 * value part in T_CHECK request and just validation 10326 * done elsewhere should be enough, we just return here. 
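 * For example, a T_CHECK of SO_KEEPALIVE with a value present is
 * only validated here; it neither starts nor cancels the keepalive
 * timer. Only a subsequent NEGOTIATE actually changes
 * tcp_ka_enabled.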
10327 */ 10328 if (inlen == 0) { 10329 *outlenp = 0; 10330 return (0); 10331 } 10332 break; 10333 case SETFN_OPTCOM_NEGOTIATE: 10334 checkonly = B_FALSE; 10335 break; 10336 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 10337 case SETFN_CONN_NEGOTIATE: 10338 checkonly = B_FALSE; 10339 /* 10340 * Negotiating local and "association-related" options 10341 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 10342 * primitives is allowed by XTI, but we choose 10343 * to not implement this style negotiation for Internet 10344 * protocols (We interpret it is a must for OSI world but 10345 * optional for Internet protocols) for all options. 10346 * [ Will do only for the few options that enable test 10347 * suites that our XTI implementation of this feature 10348 * works for transports that do allow it ] 10349 */ 10350 if (!tcp_allow_connopt_set(level, name)) { 10351 *outlenp = 0; 10352 return (EINVAL); 10353 } 10354 break; 10355 default: 10356 /* 10357 * We should never get here 10358 */ 10359 *outlenp = 0; 10360 return (EINVAL); 10361 } 10362 10363 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 10364 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 10365 10366 /* 10367 * For TCP, we should have no ancillary data sent down 10368 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 10369 * has to be zero. 10370 */ 10371 ASSERT(thisdg_attrs == NULL); 10372 10373 /* 10374 * For fixed length options, no sanity check 10375 * of passed in length is done. It is assumed *_optcom_req() 10376 * routines do the right thing. 10377 */ 10378 10379 switch (level) { 10380 case SOL_SOCKET: 10381 switch (name) { 10382 case SO_LINGER: { 10383 struct linger *lgr = (struct linger *)invalp; 10384 10385 if (!checkonly) { 10386 if (lgr->l_onoff) { 10387 tcp->tcp_linger = 1; 10388 tcp->tcp_lingertime = lgr->l_linger; 10389 } else { 10390 tcp->tcp_linger = 0; 10391 tcp->tcp_lingertime = 0; 10392 } 10393 /* struct copy */ 10394 *(struct linger *)outvalp = *lgr; 10395 } else { 10396 if (!lgr->l_onoff) { 10397 ((struct linger *) 10398 outvalp)->l_onoff = 0; 10399 ((struct linger *) 10400 outvalp)->l_linger = 0; 10401 } else { 10402 /* struct copy */ 10403 *(struct linger *)outvalp = *lgr; 10404 } 10405 } 10406 *outlenp = sizeof (struct linger); 10407 return (0); 10408 } 10409 case SO_DEBUG: 10410 if (!checkonly) 10411 tcp->tcp_debug = onoff; 10412 break; 10413 case SO_KEEPALIVE: 10414 if (checkonly) { 10415 /* T_CHECK case */ 10416 break; 10417 } 10418 10419 if (!onoff) { 10420 if (tcp->tcp_ka_enabled) { 10421 if (tcp->tcp_ka_tid != 0) { 10422 (void) TCP_TIMER_CANCEL(tcp, 10423 tcp->tcp_ka_tid); 10424 tcp->tcp_ka_tid = 0; 10425 } 10426 tcp->tcp_ka_enabled = 0; 10427 } 10428 break; 10429 } 10430 if (!tcp->tcp_ka_enabled) { 10431 /* Crank up the keepalive timer */ 10432 tcp->tcp_ka_last_intrvl = 0; 10433 tcp->tcp_ka_tid = TCP_TIMER(tcp, 10434 tcp_keepalive_killer, 10435 MSEC_TO_TICK(tcp->tcp_ka_interval)); 10436 tcp->tcp_ka_enabled = 1; 10437 } 10438 break; 10439 case SO_DONTROUTE: 10440 /* 10441 * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are 10442 * only of interest to IP. We track them here only so 10443 * that we can report their current value. 
10444 */ 10445 if (!checkonly) { 10446 tcp->tcp_dontroute = onoff; 10447 tcp->tcp_connp->conn_dontroute = onoff; 10448 } 10449 break; 10450 case SO_USELOOPBACK: 10451 if (!checkonly) { 10452 tcp->tcp_useloopback = onoff; 10453 tcp->tcp_connp->conn_loopback = onoff; 10454 } 10455 break; 10456 case SO_BROADCAST: 10457 if (!checkonly) { 10458 tcp->tcp_broadcast = onoff; 10459 tcp->tcp_connp->conn_broadcast = onoff; 10460 } 10461 break; 10462 case SO_REUSEADDR: 10463 if (!checkonly) { 10464 tcp->tcp_reuseaddr = onoff; 10465 tcp->tcp_connp->conn_reuseaddr = onoff; 10466 } 10467 break; 10468 case SO_OOBINLINE: 10469 if (!checkonly) 10470 tcp->tcp_oobinline = onoff; 10471 break; 10472 case SO_DGRAM_ERRIND: 10473 if (!checkonly) 10474 tcp->tcp_dgram_errind = onoff; 10475 break; 10476 case SO_SNDBUF: { 10477 if (*i1 > tcps->tcps_max_buf) { 10478 *outlenp = 0; 10479 return (ENOBUFS); 10480 } 10481 if (checkonly) 10482 break; 10483 10484 tcp->tcp_xmit_hiwater = *i1; 10485 if (tcps->tcps_snd_lowat_fraction != 0) 10486 tcp->tcp_xmit_lowater = 10487 tcp->tcp_xmit_hiwater / 10488 tcps->tcps_snd_lowat_fraction; 10489 (void) tcp_maxpsz_set(tcp, B_TRUE); 10490 /* 10491 * If we are flow-controlled, recheck the condition. 10492 * There are apps that increase SO_SNDBUF size when 10493 * flow-controlled (EWOULDBLOCK), and expect the flow 10494 * control condition to be lifted right away. 10495 */ 10496 mutex_enter(&tcp->tcp_non_sq_lock); 10497 if (tcp->tcp_flow_stopped && 10498 TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) { 10499 tcp_clrqfull(tcp); 10500 } 10501 mutex_exit(&tcp->tcp_non_sq_lock); 10502 break; 10503 } 10504 case SO_RCVBUF: 10505 if (*i1 > tcps->tcps_max_buf) { 10506 *outlenp = 0; 10507 return (ENOBUFS); 10508 } 10509 /* Silently ignore zero */ 10510 if (!checkonly && *i1 != 0) { 10511 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 10512 (void) tcp_rwnd_set(tcp, *i1); 10513 } 10514 /* 10515 * XXX should we return the rwnd here 10516 * and tcp_opt_get ? 10517 */ 10518 break; 10519 case SO_SND_COPYAVOID: 10520 if (!checkonly) { 10521 /* we only allow enable at most once for now */ 10522 if (tcp->tcp_loopback || 10523 (tcp->tcp_kssl_ctx != NULL) || 10524 (!tcp->tcp_snd_zcopy_aware && 10525 (onoff != 1 || !tcp_zcopy_check(tcp)))) { 10526 *outlenp = 0; 10527 return (EOPNOTSUPP); 10528 } 10529 tcp->tcp_snd_zcopy_aware = 1; 10530 } 10531 break; 10532 case SO_ALLZONES: 10533 /* Pass option along to IP level for handling */ 10534 return (-EINVAL); 10535 case SO_ANON_MLP: 10536 /* Pass option along to IP level for handling */ 10537 return (-EINVAL); 10538 case SO_MAC_EXEMPT: 10539 /* Pass option along to IP level for handling */ 10540 return (-EINVAL); 10541 case SO_EXCLBIND: 10542 if (!checkonly) 10543 tcp->tcp_exclbind = onoff; 10544 break; 10545 default: 10546 *outlenp = 0; 10547 return (EINVAL); 10548 } 10549 break; 10550 case IPPROTO_TCP: 10551 switch (name) { 10552 case TCP_NODELAY: 10553 if (!checkonly) 10554 tcp->tcp_naglim = *i1 ? 
1 : tcp->tcp_mss; 10555 break; 10556 case TCP_NOTIFY_THRESHOLD: 10557 if (!checkonly) 10558 tcp->tcp_first_timer_threshold = *i1; 10559 break; 10560 case TCP_ABORT_THRESHOLD: 10561 if (!checkonly) 10562 tcp->tcp_second_timer_threshold = *i1; 10563 break; 10564 case TCP_CONN_NOTIFY_THRESHOLD: 10565 if (!checkonly) 10566 tcp->tcp_first_ctimer_threshold = *i1; 10567 break; 10568 case TCP_CONN_ABORT_THRESHOLD: 10569 if (!checkonly) 10570 tcp->tcp_second_ctimer_threshold = *i1; 10571 break; 10572 case TCP_RECVDSTADDR: 10573 if (tcp->tcp_state > TCPS_LISTEN) 10574 return (EOPNOTSUPP); 10575 if (!checkonly) 10576 tcp->tcp_recvdstaddr = onoff; 10577 break; 10578 case TCP_ANONPRIVBIND: 10579 if ((reterr = secpolicy_net_privaddr(cr, 0, 10580 IPPROTO_TCP)) != 0) { 10581 *outlenp = 0; 10582 return (reterr); 10583 } 10584 if (!checkonly) { 10585 tcp->tcp_anon_priv_bind = onoff; 10586 } 10587 break; 10588 case TCP_EXCLBIND: 10589 if (!checkonly) 10590 tcp->tcp_exclbind = onoff; 10591 break; /* goto sizeof (int) option return */ 10592 case TCP_INIT_CWND: { 10593 uint32_t init_cwnd = *((uint32_t *)invalp); 10594 10595 if (checkonly) 10596 break; 10597 10598 /* 10599 * Only allow socket with network configuration 10600 * privilege to set the initial cwnd to be larger 10601 * than allowed by RFC 3390. 10602 */ 10603 if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 10604 tcp->tcp_init_cwnd = init_cwnd; 10605 break; 10606 } 10607 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) { 10608 *outlenp = 0; 10609 return (reterr); 10610 } 10611 if (init_cwnd > TCP_MAX_INIT_CWND) { 10612 *outlenp = 0; 10613 return (EINVAL); 10614 } 10615 tcp->tcp_init_cwnd = init_cwnd; 10616 break; 10617 } 10618 case TCP_KEEPALIVE_THRESHOLD: 10619 if (checkonly) 10620 break; 10621 10622 if (*i1 < tcps->tcps_keepalive_interval_low || 10623 *i1 > tcps->tcps_keepalive_interval_high) { 10624 *outlenp = 0; 10625 return (EINVAL); 10626 } 10627 if (*i1 != tcp->tcp_ka_interval) { 10628 tcp->tcp_ka_interval = *i1; 10629 /* 10630 * Check if we need to restart the 10631 * keepalive timer. 10632 */ 10633 if (tcp->tcp_ka_tid != 0) { 10634 ASSERT(tcp->tcp_ka_enabled); 10635 (void) TCP_TIMER_CANCEL(tcp, 10636 tcp->tcp_ka_tid); 10637 tcp->tcp_ka_last_intrvl = 0; 10638 tcp->tcp_ka_tid = TCP_TIMER(tcp, 10639 tcp_keepalive_killer, 10640 MSEC_TO_TICK(tcp->tcp_ka_interval)); 10641 } 10642 } 10643 break; 10644 case TCP_KEEPALIVE_ABORT_THRESHOLD: 10645 if (!checkonly) { 10646 if (*i1 < 10647 tcps->tcps_keepalive_abort_interval_low || 10648 *i1 > 10649 tcps->tcps_keepalive_abort_interval_high) { 10650 *outlenp = 0; 10651 return (EINVAL); 10652 } 10653 tcp->tcp_ka_abort_thres = *i1; 10654 } 10655 break; 10656 case TCP_CORK: 10657 if (!checkonly) { 10658 /* 10659 * if tcp->tcp_cork was set and is now 10660 * being unset, we have to make sure that 10661 * the remaining data gets sent out. 
Also 10662 * unset tcp->tcp_cork so that tcp_wput_data() 10663 * can send data even if it is less than mss 10664 */ 10665 if (tcp->tcp_cork && onoff == 0 && 10666 tcp->tcp_unsent > 0) { 10667 tcp->tcp_cork = B_FALSE; 10668 tcp_wput_data(tcp, NULL, B_FALSE); 10669 } 10670 tcp->tcp_cork = onoff; 10671 } 10672 break; 10673 default: 10674 *outlenp = 0; 10675 return (EINVAL); 10676 } 10677 break; 10678 case IPPROTO_IP: 10679 if (tcp->tcp_family != AF_INET) { 10680 *outlenp = 0; 10681 return (ENOPROTOOPT); 10682 } 10683 switch (name) { 10684 case IP_OPTIONS: 10685 case T_IP_OPTIONS: 10686 reterr = tcp_opt_set_header(tcp, checkonly, 10687 invalp, inlen); 10688 if (reterr) { 10689 *outlenp = 0; 10690 return (reterr); 10691 } 10692 /* OK return - copy input buffer into output buffer */ 10693 if (invalp != outvalp) { 10694 /* don't trust bcopy for identical src/dst */ 10695 bcopy(invalp, outvalp, inlen); 10696 } 10697 *outlenp = inlen; 10698 return (0); 10699 case IP_TOS: 10700 case T_IP_TOS: 10701 if (!checkonly) { 10702 tcp->tcp_ipha->ipha_type_of_service = 10703 (uchar_t)*i1; 10704 tcp->tcp_tos = (uchar_t)*i1; 10705 } 10706 break; 10707 case IP_TTL: 10708 if (!checkonly) { 10709 tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1; 10710 tcp->tcp_ttl = (uchar_t)*i1; 10711 } 10712 break; 10713 case IP_BOUND_IF: 10714 case IP_NEXTHOP: 10715 /* Handled at the IP level */ 10716 return (-EINVAL); 10717 case IP_SEC_OPT: 10718 /* 10719 * We should not allow policy setting after 10720 * we start listening for connections. 10721 */ 10722 if (tcp->tcp_state == TCPS_LISTEN) { 10723 return (EINVAL); 10724 } else { 10725 /* Handled at the IP level */ 10726 return (-EINVAL); 10727 } 10728 default: 10729 *outlenp = 0; 10730 return (EINVAL); 10731 } 10732 break; 10733 case IPPROTO_IPV6: { 10734 ip6_pkt_t *ipp; 10735 10736 /* 10737 * IPPROTO_IPV6 options are only supported for sockets 10738 * that are using IPv6 on the wire. 10739 */ 10740 if (tcp->tcp_ipversion != IPV6_VERSION) { 10741 *outlenp = 0; 10742 return (ENOPROTOOPT); 10743 } 10744 /* 10745 * Only sticky options; no ancillary data 10746 */ 10747 ASSERT(thisdg_attrs == NULL); 10748 ipp = &tcp->tcp_sticky_ipp; 10749 10750 switch (name) { 10751 case IPV6_UNICAST_HOPS: 10752 /* -1 means use default */ 10753 if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { 10754 *outlenp = 0; 10755 return (EINVAL); 10756 } 10757 if (!checkonly) { 10758 if (*i1 == -1) { 10759 tcp->tcp_ip6h->ip6_hops = 10760 ipp->ipp_unicast_hops = 10761 (uint8_t)tcps->tcps_ipv6_hoplimit; 10762 ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; 10763 /* Pass modified value to IP. 
*/ 10764 *i1 = tcp->tcp_ip6h->ip6_hops; 10765 } else { 10766 tcp->tcp_ip6h->ip6_hops = 10767 ipp->ipp_unicast_hops = 10768 (uint8_t)*i1; 10769 ipp->ipp_fields |= IPPF_UNICAST_HOPS; 10770 } 10771 reterr = tcp_build_hdrs(q, tcp); 10772 if (reterr != 0) 10773 return (reterr); 10774 } 10775 break; 10776 case IPV6_BOUND_IF: 10777 if (!checkonly) { 10778 int error = 0; 10779 10780 tcp->tcp_bound_if = *i1; 10781 error = ip_opt_set_ill(tcp->tcp_connp, *i1, 10782 B_TRUE, checkonly, level, name, mblk); 10783 if (error != 0) { 10784 *outlenp = 0; 10785 return (error); 10786 } 10787 } 10788 break; 10789 /* 10790 * Set boolean switches for ancillary data delivery 10791 */ 10792 case IPV6_RECVPKTINFO: 10793 if (!checkonly) { 10794 if (onoff) 10795 tcp->tcp_ipv6_recvancillary |= 10796 TCP_IPV6_RECVPKTINFO; 10797 else 10798 tcp->tcp_ipv6_recvancillary &= 10799 ~TCP_IPV6_RECVPKTINFO; 10800 /* Force it to be sent up with the next msg */ 10801 tcp->tcp_recvifindex = 0; 10802 } 10803 break; 10804 case IPV6_RECVTCLASS: 10805 if (!checkonly) { 10806 if (onoff) 10807 tcp->tcp_ipv6_recvancillary |= 10808 TCP_IPV6_RECVTCLASS; 10809 else 10810 tcp->tcp_ipv6_recvancillary &= 10811 ~TCP_IPV6_RECVTCLASS; 10812 } 10813 break; 10814 case IPV6_RECVHOPLIMIT: 10815 if (!checkonly) { 10816 if (onoff) 10817 tcp->tcp_ipv6_recvancillary |= 10818 TCP_IPV6_RECVHOPLIMIT; 10819 else 10820 tcp->tcp_ipv6_recvancillary &= 10821 ~TCP_IPV6_RECVHOPLIMIT; 10822 /* Force it to be sent up with the next msg */ 10823 tcp->tcp_recvhops = 0xffffffffU; 10824 } 10825 break; 10826 case IPV6_RECVHOPOPTS: 10827 if (!checkonly) { 10828 if (onoff) 10829 tcp->tcp_ipv6_recvancillary |= 10830 TCP_IPV6_RECVHOPOPTS; 10831 else 10832 tcp->tcp_ipv6_recvancillary &= 10833 ~TCP_IPV6_RECVHOPOPTS; 10834 } 10835 break; 10836 case IPV6_RECVDSTOPTS: 10837 if (!checkonly) { 10838 if (onoff) 10839 tcp->tcp_ipv6_recvancillary |= 10840 TCP_IPV6_RECVDSTOPTS; 10841 else 10842 tcp->tcp_ipv6_recvancillary &= 10843 ~TCP_IPV6_RECVDSTOPTS; 10844 } 10845 break; 10846 case _OLD_IPV6_RECVDSTOPTS: 10847 if (!checkonly) { 10848 if (onoff) 10849 tcp->tcp_ipv6_recvancillary |= 10850 TCP_OLD_IPV6_RECVDSTOPTS; 10851 else 10852 tcp->tcp_ipv6_recvancillary &= 10853 ~TCP_OLD_IPV6_RECVDSTOPTS; 10854 } 10855 break; 10856 case IPV6_RECVRTHDR: 10857 if (!checkonly) { 10858 if (onoff) 10859 tcp->tcp_ipv6_recvancillary |= 10860 TCP_IPV6_RECVRTHDR; 10861 else 10862 tcp->tcp_ipv6_recvancillary &= 10863 ~TCP_IPV6_RECVRTHDR; 10864 } 10865 break; 10866 case IPV6_RECVRTHDRDSTOPTS: 10867 if (!checkonly) { 10868 if (onoff) 10869 tcp->tcp_ipv6_recvancillary |= 10870 TCP_IPV6_RECVRTDSTOPTS; 10871 else 10872 tcp->tcp_ipv6_recvancillary &= 10873 ~TCP_IPV6_RECVRTDSTOPTS; 10874 } 10875 break; 10876 case IPV6_PKTINFO: 10877 if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) 10878 return (EINVAL); 10879 if (checkonly) 10880 break; 10881 10882 if (inlen == 0) { 10883 ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); 10884 } else { 10885 struct in6_pktinfo *pkti; 10886 10887 pkti = (struct in6_pktinfo *)invalp; 10888 /* 10889 * RFC 3542 states that ipi6_addr must be 10890 * the unspecified address when setting the 10891 * IPV6_PKTINFO sticky socket option on a 10892 * TCP socket. 10893 */ 10894 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 10895 return (EINVAL); 10896 /* 10897 * ip6_set_pktinfo() validates the source 10898 * address and interface index. 
10899 */ 10900 reterr = ip6_set_pktinfo(cr, tcp->tcp_connp, 10901 pkti, mblk); 10902 if (reterr != 0) 10903 return (reterr); 10904 ipp->ipp_ifindex = pkti->ipi6_ifindex; 10905 ipp->ipp_addr = pkti->ipi6_addr; 10906 if (ipp->ipp_ifindex != 0) 10907 ipp->ipp_fields |= IPPF_IFINDEX; 10908 else 10909 ipp->ipp_fields &= ~IPPF_IFINDEX; 10910 if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) 10911 ipp->ipp_fields |= IPPF_ADDR; 10912 else 10913 ipp->ipp_fields &= ~IPPF_ADDR; 10914 } 10915 reterr = tcp_build_hdrs(q, tcp); 10916 if (reterr != 0) 10917 return (reterr); 10918 break; 10919 case IPV6_TCLASS: 10920 if (inlen != 0 && inlen != sizeof (int)) 10921 return (EINVAL); 10922 if (checkonly) 10923 break; 10924 10925 if (inlen == 0) { 10926 ipp->ipp_fields &= ~IPPF_TCLASS; 10927 } else { 10928 if (*i1 > 255 || *i1 < -1) 10929 return (EINVAL); 10930 if (*i1 == -1) { 10931 ipp->ipp_tclass = 0; 10932 *i1 = 0; 10933 } else { 10934 ipp->ipp_tclass = *i1; 10935 } 10936 ipp->ipp_fields |= IPPF_TCLASS; 10937 } 10938 reterr = tcp_build_hdrs(q, tcp); 10939 if (reterr != 0) 10940 return (reterr); 10941 break; 10942 case IPV6_NEXTHOP: 10943 /* 10944 * IP will verify that the nexthop is reachable 10945 * and fail for sticky options. 10946 */ 10947 if (inlen != 0 && inlen != sizeof (sin6_t)) 10948 return (EINVAL); 10949 if (checkonly) 10950 break; 10951 10952 if (inlen == 0) { 10953 ipp->ipp_fields &= ~IPPF_NEXTHOP; 10954 } else { 10955 sin6_t *sin6 = (sin6_t *)invalp; 10956 10957 if (sin6->sin6_family != AF_INET6) 10958 return (EAFNOSUPPORT); 10959 if (IN6_IS_ADDR_V4MAPPED( 10960 &sin6->sin6_addr)) 10961 return (EADDRNOTAVAIL); 10962 ipp->ipp_nexthop = sin6->sin6_addr; 10963 if (!IN6_IS_ADDR_UNSPECIFIED( 10964 &ipp->ipp_nexthop)) 10965 ipp->ipp_fields |= IPPF_NEXTHOP; 10966 else 10967 ipp->ipp_fields &= ~IPPF_NEXTHOP; 10968 } 10969 reterr = tcp_build_hdrs(q, tcp); 10970 if (reterr != 0) 10971 return (reterr); 10972 break; 10973 case IPV6_HOPOPTS: { 10974 ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; 10975 10976 /* 10977 * Sanity checks - minimum size, size a multiple of 10978 * eight bytes, and matching size passed in. 10979 */ 10980 if (inlen != 0 && 10981 inlen != (8 * (hopts->ip6h_len + 1))) 10982 return (EINVAL); 10983 10984 if (checkonly) 10985 break; 10986 10987 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10988 (uchar_t **)&ipp->ipp_hopopts, 10989 &ipp->ipp_hopoptslen, tcp->tcp_label_len); 10990 if (reterr != 0) 10991 return (reterr); 10992 if (ipp->ipp_hopoptslen == 0) 10993 ipp->ipp_fields &= ~IPPF_HOPOPTS; 10994 else 10995 ipp->ipp_fields |= IPPF_HOPOPTS; 10996 reterr = tcp_build_hdrs(q, tcp); 10997 if (reterr != 0) 10998 return (reterr); 10999 break; 11000 } 11001 case IPV6_RTHDRDSTOPTS: { 11002 ip6_dest_t *dopts = (ip6_dest_t *)invalp; 11003 11004 /* 11005 * Sanity checks - minimum size, size a multiple of 11006 * eight bytes, and matching size passed in. 
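 * (Illustrative example, not part of the original comment: an option
 * buffer whose ip6d_len field is 1 encodes 8 * (1 + 1) = 16 bytes of
 * extension header, so the caller must pass inlen == 16, or 0 to
 * clear the option.)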
11007 */ 11008 if (inlen != 0 && 11009 inlen != (8 * (dopts->ip6d_len + 1))) 11010 return (EINVAL); 11011 11012 if (checkonly) 11013 break; 11014 11015 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 11016 (uchar_t **)&ipp->ipp_rtdstopts, 11017 &ipp->ipp_rtdstoptslen, 0); 11018 if (reterr != 0) 11019 return (reterr); 11020 if (ipp->ipp_rtdstoptslen == 0) 11021 ipp->ipp_fields &= ~IPPF_RTDSTOPTS; 11022 else 11023 ipp->ipp_fields |= IPPF_RTDSTOPTS; 11024 reterr = tcp_build_hdrs(q, tcp); 11025 if (reterr != 0) 11026 return (reterr); 11027 break; 11028 } 11029 case IPV6_DSTOPTS: { 11030 ip6_dest_t *dopts = (ip6_dest_t *)invalp; 11031 11032 /* 11033 * Sanity checks - minimum size, size a multiple of 11034 * eight bytes, and matching size passed in. 11035 */ 11036 if (inlen != 0 && 11037 inlen != (8 * (dopts->ip6d_len + 1))) 11038 return (EINVAL); 11039 11040 if (checkonly) 11041 break; 11042 11043 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 11044 (uchar_t **)&ipp->ipp_dstopts, 11045 &ipp->ipp_dstoptslen, 0); 11046 if (reterr != 0) 11047 return (reterr); 11048 if (ipp->ipp_dstoptslen == 0) 11049 ipp->ipp_fields &= ~IPPF_DSTOPTS; 11050 else 11051 ipp->ipp_fields |= IPPF_DSTOPTS; 11052 reterr = tcp_build_hdrs(q, tcp); 11053 if (reterr != 0) 11054 return (reterr); 11055 break; 11056 } 11057 case IPV6_RTHDR: { 11058 ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; 11059 11060 /* 11061 * Sanity checks - minimum size, size a multiple of 11062 * eight bytes, and matching size passed in. 11063 */ 11064 if (inlen != 0 && 11065 inlen != (8 * (rt->ip6r_len + 1))) 11066 return (EINVAL); 11067 11068 if (checkonly) 11069 break; 11070 11071 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 11072 (uchar_t **)&ipp->ipp_rthdr, 11073 &ipp->ipp_rthdrlen, 0); 11074 if (reterr != 0) 11075 return (reterr); 11076 if (ipp->ipp_rthdrlen == 0) 11077 ipp->ipp_fields &= ~IPPF_RTHDR; 11078 else 11079 ipp->ipp_fields |= IPPF_RTHDR; 11080 reterr = tcp_build_hdrs(q, tcp); 11081 if (reterr != 0) 11082 return (reterr); 11083 break; 11084 } 11085 case IPV6_V6ONLY: 11086 if (!checkonly) 11087 tcp->tcp_connp->conn_ipv6_v6only = onoff; 11088 break; 11089 case IPV6_USE_MIN_MTU: 11090 if (inlen != sizeof (int)) 11091 return (EINVAL); 11092 11093 if (*i1 < -1 || *i1 > 1) 11094 return (EINVAL); 11095 11096 if (checkonly) 11097 break; 11098 11099 ipp->ipp_fields |= IPPF_USE_MIN_MTU; 11100 ipp->ipp_use_min_mtu = *i1; 11101 break; 11102 case IPV6_BOUND_PIF: 11103 /* Handled at the IP level */ 11104 return (-EINVAL); 11105 case IPV6_SEC_OPT: 11106 /* 11107 * We should not allow policy setting after 11108 * we start listening for connections. 
11109 */ 11110 if (tcp->tcp_state == TCPS_LISTEN) { 11111 return (EINVAL); 11112 } else { 11113 /* Handled at the IP level */ 11114 return (-EINVAL); 11115 } 11116 case IPV6_SRC_PREFERENCES: 11117 if (inlen != sizeof (uint32_t)) 11118 return (EINVAL); 11119 reterr = ip6_set_src_preferences(tcp->tcp_connp, 11120 *(uint32_t *)invalp); 11121 if (reterr != 0) { 11122 *outlenp = 0; 11123 return (reterr); 11124 } 11125 break; 11126 default: 11127 *outlenp = 0; 11128 return (EINVAL); 11129 } 11130 break; 11131 } /* end IPPROTO_IPV6 */ 11132 default: 11133 *outlenp = 0; 11134 return (EINVAL); 11135 } 11136 /* 11137 * Common case of OK return with outval same as inval 11138 */ 11139 if (invalp != outvalp) { 11140 /* don't trust bcopy for identical src/dst */ 11141 (void) bcopy(invalp, outvalp, inlen); 11142 } 11143 *outlenp = inlen; 11144 return (0); 11145 } 11146 11147 /* 11148 * Update tcp_sticky_hdrs based on tcp_sticky_ipp. 11149 * The headers include ip6i_t (if needed), ip6_t, any sticky extension 11150 * headers, and the maximum size tcp header (to avoid reallocation 11151 * on the fly for additional tcp options). 11152 * Returns failure if can't allocate memory. 11153 */ 11154 static int 11155 tcp_build_hdrs(queue_t *q, tcp_t *tcp) 11156 { 11157 char *hdrs; 11158 uint_t hdrs_len; 11159 ip6i_t *ip6i; 11160 char buf[TCP_MAX_HDR_LENGTH]; 11161 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 11162 in6_addr_t src, dst; 11163 tcp_stack_t *tcps = tcp->tcp_tcps; 11164 11165 /* 11166 * save the existing tcp header and source/dest IP addresses 11167 */ 11168 bcopy(tcp->tcp_tcph, buf, tcp->tcp_tcp_hdr_len); 11169 src = tcp->tcp_ip6h->ip6_src; 11170 dst = tcp->tcp_ip6h->ip6_dst; 11171 hdrs_len = ip_total_hdrs_len_v6(ipp) + TCP_MAX_HDR_LENGTH; 11172 ASSERT(hdrs_len != 0); 11173 if (hdrs_len > tcp->tcp_iphc_len) { 11174 /* Need to reallocate */ 11175 hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP); 11176 if (hdrs == NULL) 11177 return (ENOMEM); 11178 if (tcp->tcp_iphc != NULL) { 11179 if (tcp->tcp_hdr_grown) { 11180 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 11181 } else { 11182 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 11183 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 11184 } 11185 tcp->tcp_iphc_len = 0; 11186 } 11187 ASSERT(tcp->tcp_iphc_len == 0); 11188 tcp->tcp_iphc = hdrs; 11189 tcp->tcp_iphc_len = hdrs_len; 11190 tcp->tcp_hdr_grown = B_TRUE; 11191 } 11192 ip_build_hdrs_v6((uchar_t *)tcp->tcp_iphc, 11193 hdrs_len - TCP_MAX_HDR_LENGTH, ipp, IPPROTO_TCP); 11194 11195 /* Set header fields not in ipp */ 11196 if (ipp->ipp_fields & IPPF_HAS_IP6I) { 11197 ip6i = (ip6i_t *)tcp->tcp_iphc; 11198 tcp->tcp_ip6h = (ip6_t *)&ip6i[1]; 11199 } else { 11200 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; 11201 } 11202 /* 11203 * tcp->tcp_ip_hdr_len will include ip6i_t if there is one. 11204 * 11205 * tcp->tcp_tcp_hdr_len doesn't change here. 11206 */ 11207 tcp->tcp_ip_hdr_len = hdrs_len - TCP_MAX_HDR_LENGTH; 11208 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + tcp->tcp_ip_hdr_len); 11209 tcp->tcp_hdr_len = tcp->tcp_ip_hdr_len + tcp->tcp_tcp_hdr_len; 11210 11211 bcopy(buf, tcp->tcp_tcph, tcp->tcp_tcp_hdr_len); 11212 11213 tcp->tcp_ip6h->ip6_src = src; 11214 tcp->tcp_ip6h->ip6_dst = dst; 11215 11216 /* 11217 * If the hop limit was not set by ip_build_hdrs_v6(), set it to 11218 * the default value for TCP. 
11219 */ 11220 if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS)) 11221 tcp->tcp_ip6h->ip6_hops = tcps->tcps_ipv6_hoplimit; 11222 11223 /* 11224 * If we're setting extension headers after a connection 11225 * has been established, and if we have a routing header 11226 * among the extension headers, call ip_massage_options_v6 to 11227 * manipulate the routing header/ip6_dst set the checksum 11228 * difference in the tcp header template. 11229 * (This happens in tcp_connect_ipv6 if the routing header 11230 * is set prior to the connect.) 11231 * Set the tcp_sum to zero first in case we've cleared a 11232 * routing header or don't have one at all. 11233 */ 11234 tcp->tcp_sum = 0; 11235 if ((tcp->tcp_state >= TCPS_SYN_SENT) && 11236 (tcp->tcp_ipp_fields & IPPF_RTHDR)) { 11237 ip6_rthdr_t *rth = ip_find_rthdr_v6(tcp->tcp_ip6h, 11238 (uint8_t *)tcp->tcp_tcph); 11239 if (rth != NULL) { 11240 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, 11241 rth, tcps->tcps_netstack); 11242 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 11243 (tcp->tcp_sum >> 16)); 11244 } 11245 } 11246 11247 /* Try to get everything in a single mblk */ 11248 (void) mi_set_sth_wroff(RD(q), hdrs_len + tcps->tcps_wroff_xtra); 11249 return (0); 11250 } 11251 11252 /* 11253 * Transfer any source route option from ipha to buf/dst in reversed form. 11254 */ 11255 static int 11256 tcp_opt_rev_src_route(ipha_t *ipha, char *buf, uchar_t *dst) 11257 { 11258 ipoptp_t opts; 11259 uchar_t *opt; 11260 uint8_t optval; 11261 uint8_t optlen; 11262 uint32_t len = 0; 11263 11264 for (optval = ipoptp_first(&opts, ipha); 11265 optval != IPOPT_EOL; 11266 optval = ipoptp_next(&opts)) { 11267 opt = opts.ipoptp_cur; 11268 optlen = opts.ipoptp_len; 11269 switch (optval) { 11270 int off1, off2; 11271 case IPOPT_SSRR: 11272 case IPOPT_LSRR: 11273 11274 /* Reverse source route */ 11275 /* 11276 * First entry should be the next to last one in the 11277 * current source route (the last entry is our 11278 * address.) 11279 * The last entry should be the final destination. 11280 */ 11281 buf[IPOPT_OPTVAL] = (uint8_t)optval; 11282 buf[IPOPT_OLEN] = (uint8_t)optlen; 11283 off1 = IPOPT_MINOFF_SR - 1; 11284 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; 11285 if (off2 < 0) { 11286 /* No entries in source route */ 11287 break; 11288 } 11289 bcopy(opt + off2, dst, IP_ADDR_LEN); 11290 /* 11291 * Note: use src since ipha has not had its src 11292 * and dst reversed (it is in the state it was 11293 * received. 11294 */ 11295 bcopy(&ipha->ipha_src, buf + off2, 11296 IP_ADDR_LEN); 11297 off2 -= IP_ADDR_LEN; 11298 11299 while (off2 > 0) { 11300 bcopy(opt + off2, buf + off1, 11301 IP_ADDR_LEN); 11302 off1 += IP_ADDR_LEN; 11303 off2 -= IP_ADDR_LEN; 11304 } 11305 buf[IPOPT_OFFSET] = IPOPT_MINOFF_SR; 11306 buf += optlen; 11307 len += optlen; 11308 break; 11309 } 11310 } 11311 done: 11312 /* Pad the resulting options */ 11313 while (len & 0x3) { 11314 *buf++ = IPOPT_EOL; 11315 len++; 11316 } 11317 return (len); 11318 } 11319 11320 11321 /* 11322 * Extract and revert a source route from ipha (if any) 11323 * and then update the relevant fields in both tcp_t and the standard header. 
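 * (Sketch of the transformation, added for illustration and not part of
 * the original comment: replies are source-routed back through the
 * recorded hops in reverse order, with the original sender's address
 * becoming the final destination of the reversed route.)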
11324 */ 11325 static void 11326 tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha) 11327 { 11328 char buf[TCP_MAX_HDR_LENGTH]; 11329 uint_t tcph_len; 11330 int len; 11331 11332 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 11333 len = IPH_HDR_LENGTH(ipha); 11334 if (len == IP_SIMPLE_HDR_LENGTH) 11335 /* Nothing to do */ 11336 return; 11337 if (len > IP_SIMPLE_HDR_LENGTH + TCP_MAX_IP_OPTIONS_LENGTH || 11338 (len & 0x3)) 11339 return; 11340 11341 tcph_len = tcp->tcp_tcp_hdr_len; 11342 bcopy(tcp->tcp_tcph, buf, tcph_len); 11343 tcp->tcp_sum = (tcp->tcp_ipha->ipha_dst >> 16) + 11344 (tcp->tcp_ipha->ipha_dst & 0xffff); 11345 len = tcp_opt_rev_src_route(ipha, (char *)tcp->tcp_ipha + 11346 IP_SIMPLE_HDR_LENGTH, (uchar_t *)&tcp->tcp_ipha->ipha_dst); 11347 len += IP_SIMPLE_HDR_LENGTH; 11348 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + 11349 (tcp->tcp_ipha->ipha_dst & 0xffff)); 11350 if ((int)tcp->tcp_sum < 0) 11351 tcp->tcp_sum--; 11352 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 11353 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16)); 11354 tcp->tcp_tcph = (tcph_t *)((char *)tcp->tcp_ipha + len); 11355 bcopy(buf, tcp->tcp_tcph, tcph_len); 11356 tcp->tcp_ip_hdr_len = len; 11357 tcp->tcp_ipha->ipha_version_and_hdr_length = 11358 (IP_VERSION << 4) | (len >> 2); 11359 len += tcph_len; 11360 tcp->tcp_hdr_len = len; 11361 } 11362 11363 /* 11364 * Copy the standard header into its new location, 11365 * lay in the new options and then update the relevant 11366 * fields in both tcp_t and the standard header. 11367 */ 11368 static int 11369 tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) 11370 { 11371 uint_t tcph_len; 11372 uint8_t *ip_optp; 11373 tcph_t *new_tcph; 11374 tcp_stack_t *tcps = tcp->tcp_tcps; 11375 11376 if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) 11377 return (EINVAL); 11378 11379 if (len > IP_MAX_OPT_LENGTH - tcp->tcp_label_len) 11380 return (EINVAL); 11381 11382 if (checkonly) { 11383 /* 11384 * do not really set, just pretend to - T_CHECK 11385 */ 11386 return (0); 11387 } 11388 11389 ip_optp = (uint8_t *)tcp->tcp_ipha + IP_SIMPLE_HDR_LENGTH; 11390 if (tcp->tcp_label_len > 0) { 11391 int padlen; 11392 uint8_t opt; 11393 11394 /* convert list termination to no-ops */ 11395 padlen = tcp->tcp_label_len - ip_optp[IPOPT_OLEN]; 11396 ip_optp += ip_optp[IPOPT_OLEN]; 11397 opt = len > 0 ? IPOPT_NOP : IPOPT_EOL; 11398 while (--padlen >= 0) 11399 *ip_optp++ = opt; 11400 } 11401 tcph_len = tcp->tcp_tcp_hdr_len; 11402 new_tcph = (tcph_t *)(ip_optp + len); 11403 ovbcopy(tcp->tcp_tcph, new_tcph, tcph_len); 11404 tcp->tcp_tcph = new_tcph; 11405 bcopy(ptr, ip_optp, len); 11406 11407 len += IP_SIMPLE_HDR_LENGTH + tcp->tcp_label_len; 11408 11409 tcp->tcp_ip_hdr_len = len; 11410 tcp->tcp_ipha->ipha_version_and_hdr_length = 11411 (IP_VERSION << 4) | (len >> 2); 11412 tcp->tcp_hdr_len = len + tcph_len; 11413 if (!TCP_IS_DETACHED(tcp)) { 11414 /* Always allocate room for all options. 
*/ 11415 (void) mi_set_sth_wroff(tcp->tcp_rq, 11416 TCP_MAX_COMBINED_HEADER_LENGTH + tcps->tcps_wroff_xtra); 11417 } 11418 return (0); 11419 } 11420 11421 /* Get callback routine passed to nd_load by tcp_param_register */ 11422 /* ARGSUSED */ 11423 static int 11424 tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 11425 { 11426 tcpparam_t *tcppa = (tcpparam_t *)cp; 11427 11428 (void) mi_mpprintf(mp, "%u", tcppa->tcp_param_val); 11429 return (0); 11430 } 11431 11432 /* 11433 * Walk through the param array specified registering each element with the 11434 * named dispatch handler. 11435 */ 11436 static boolean_t 11437 tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps) 11438 { 11439 for (; cnt-- > 0; tcppa++) { 11440 if (tcppa->tcp_param_name && tcppa->tcp_param_name[0]) { 11441 if (!nd_load(ndp, tcppa->tcp_param_name, 11442 tcp_param_get, tcp_param_set, 11443 (caddr_t)tcppa)) { 11444 nd_free(ndp); 11445 return (B_FALSE); 11446 } 11447 } 11448 } 11449 tcps->tcps_wroff_xtra_param = kmem_zalloc(sizeof (tcpparam_t), 11450 KM_SLEEP); 11451 bcopy(&lcl_tcp_wroff_xtra_param, tcps->tcps_wroff_xtra_param, 11452 sizeof (tcpparam_t)); 11453 if (!nd_load(ndp, tcps->tcps_wroff_xtra_param->tcp_param_name, 11454 tcp_param_get, tcp_param_set_aligned, 11455 (caddr_t)tcps->tcps_wroff_xtra_param)) { 11456 nd_free(ndp); 11457 return (B_FALSE); 11458 } 11459 tcps->tcps_mdt_head_param = kmem_zalloc(sizeof (tcpparam_t), 11460 KM_SLEEP); 11461 bcopy(&lcl_tcp_mdt_head_param, tcps->tcps_mdt_head_param, 11462 sizeof (tcpparam_t)); 11463 if (!nd_load(ndp, tcps->tcps_mdt_head_param->tcp_param_name, 11464 tcp_param_get, tcp_param_set_aligned, 11465 (caddr_t)tcps->tcps_mdt_head_param)) { 11466 nd_free(ndp); 11467 return (B_FALSE); 11468 } 11469 tcps->tcps_mdt_tail_param = kmem_zalloc(sizeof (tcpparam_t), 11470 KM_SLEEP); 11471 bcopy(&lcl_tcp_mdt_tail_param, tcps->tcps_mdt_tail_param, 11472 sizeof (tcpparam_t)); 11473 if (!nd_load(ndp, tcps->tcps_mdt_tail_param->tcp_param_name, 11474 tcp_param_get, tcp_param_set_aligned, 11475 (caddr_t)tcps->tcps_mdt_tail_param)) { 11476 nd_free(ndp); 11477 return (B_FALSE); 11478 } 11479 tcps->tcps_mdt_max_pbufs_param = kmem_zalloc(sizeof (tcpparam_t), 11480 KM_SLEEP); 11481 bcopy(&lcl_tcp_mdt_max_pbufs_param, tcps->tcps_mdt_max_pbufs_param, 11482 sizeof (tcpparam_t)); 11483 if (!nd_load(ndp, tcps->tcps_mdt_max_pbufs_param->tcp_param_name, 11484 tcp_param_get, tcp_param_set_aligned, 11485 (caddr_t)tcps->tcps_mdt_max_pbufs_param)) { 11486 nd_free(ndp); 11487 return (B_FALSE); 11488 } 11489 if (!nd_load(ndp, "tcp_extra_priv_ports", 11490 tcp_extra_priv_ports_get, NULL, NULL)) { 11491 nd_free(ndp); 11492 return (B_FALSE); 11493 } 11494 if (!nd_load(ndp, "tcp_extra_priv_ports_add", 11495 NULL, tcp_extra_priv_ports_add, NULL)) { 11496 nd_free(ndp); 11497 return (B_FALSE); 11498 } 11499 if (!nd_load(ndp, "tcp_extra_priv_ports_del", 11500 NULL, tcp_extra_priv_ports_del, NULL)) { 11501 nd_free(ndp); 11502 return (B_FALSE); 11503 } 11504 if (!nd_load(ndp, "tcp_status", tcp_status_report, NULL, 11505 NULL)) { 11506 nd_free(ndp); 11507 return (B_FALSE); 11508 } 11509 if (!nd_load(ndp, "tcp_bind_hash", tcp_bind_hash_report, 11510 NULL, NULL)) { 11511 nd_free(ndp); 11512 return (B_FALSE); 11513 } 11514 if (!nd_load(ndp, "tcp_listen_hash", 11515 tcp_listen_hash_report, NULL, NULL)) { 11516 nd_free(ndp); 11517 return (B_FALSE); 11518 } 11519 if (!nd_load(ndp, "tcp_conn_hash", tcp_conn_hash_report, 11520 NULL, NULL)) { 11521 nd_free(ndp); 11522 return (B_FALSE); 11523 
} 11524 if (!nd_load(ndp, "tcp_acceptor_hash", 11525 tcp_acceptor_hash_report, NULL, NULL)) { 11526 nd_free(ndp); 11527 return (B_FALSE); 11528 } 11529 if (!nd_load(ndp, "tcp_host_param", tcp_host_param_report, 11530 tcp_host_param_set, NULL)) { 11531 nd_free(ndp); 11532 return (B_FALSE); 11533 } 11534 if (!nd_load(ndp, "tcp_host_param_ipv6", 11535 tcp_host_param_report, tcp_host_param_set_ipv6, NULL)) { 11536 nd_free(ndp); 11537 return (B_FALSE); 11538 } 11539 if (!nd_load(ndp, "tcp_1948_phrase", NULL, 11540 tcp_1948_phrase_set, NULL)) { 11541 nd_free(ndp); 11542 return (B_FALSE); 11543 } 11544 if (!nd_load(ndp, "tcp_reserved_port_list", 11545 tcp_reserved_port_list, NULL, NULL)) { 11546 nd_free(ndp); 11547 return (B_FALSE); 11548 } 11549 /* 11550 * Dummy ndd variables - only to convey obsolescence information 11551 * through printing of their name (no get or set routines) 11552 * XXX Remove in future releases ? 11553 */ 11554 if (!nd_load(ndp, 11555 "tcp_close_wait_interval(obsoleted - " 11556 "use tcp_time_wait_interval)", NULL, NULL, NULL)) { 11557 nd_free(ndp); 11558 return (B_FALSE); 11559 } 11560 return (B_TRUE); 11561 } 11562 11563 /* ndd set routine for tcp_wroff_xtra, tcp_mdt_hdr_{head,tail}_min. */ 11564 /* ARGSUSED */ 11565 static int 11566 tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 11567 cred_t *cr) 11568 { 11569 long new_value; 11570 tcpparam_t *tcppa = (tcpparam_t *)cp; 11571 11572 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11573 new_value < tcppa->tcp_param_min || 11574 new_value > tcppa->tcp_param_max) { 11575 return (EINVAL); 11576 } 11577 /* 11578 * Need to make sure new_value is a multiple of 4. If it is not, 11579 * round it up. For future 64 bit requirement, we actually make it 11580 * a multiple of 8. 11581 */ 11582 if (new_value & 0x7) { 11583 new_value = (new_value & ~0x7) + 0x8; 11584 } 11585 tcppa->tcp_param_val = new_value; 11586 return (0); 11587 } 11588 11589 /* Set callback routine passed to nd_load by tcp_param_register */ 11590 /* ARGSUSED */ 11591 static int 11592 tcp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 11593 { 11594 long new_value; 11595 tcpparam_t *tcppa = (tcpparam_t *)cp; 11596 11597 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11598 new_value < tcppa->tcp_param_min || 11599 new_value > tcppa->tcp_param_max) { 11600 return (EINVAL); 11601 } 11602 tcppa->tcp_param_val = new_value; 11603 return (0); 11604 } 11605 11606 /* 11607 * Add a new piece to the tcp reassembly queue. If the gap at the beginning 11608 * is filled, return as much as we can. The message passed in may be 11609 * multi-part, chained using b_cont. "start" is the starting sequence 11610 * number for this piece. 11611 */ 11612 static mblk_t * 11613 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 11614 { 11615 uint32_t end; 11616 mblk_t *mp1; 11617 mblk_t *mp2; 11618 mblk_t *next_mp; 11619 uint32_t u1; 11620 tcp_stack_t *tcps = tcp->tcp_tcps; 11621 11622 /* Walk through all the new pieces. */ 11623 do { 11624 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 11625 (uintptr_t)INT_MAX); 11626 end = start + (int)(mp->b_wptr - mp->b_rptr); 11627 next_mp = mp->b_cont; 11628 if (start == end) { 11629 /* Empty. Blast it. 
*/ 11630 freeb(mp); 11631 continue; 11632 } 11633 mp->b_cont = NULL; 11634 TCP_REASS_SET_SEQ(mp, start); 11635 TCP_REASS_SET_END(mp, end); 11636 mp1 = tcp->tcp_reass_tail; 11637 if (!mp1) { 11638 tcp->tcp_reass_tail = mp; 11639 tcp->tcp_reass_head = mp; 11640 BUMP_MIB(&tcps->tcps_mib, tcpInDataUnorderSegs); 11641 UPDATE_MIB(&tcps->tcps_mib, 11642 tcpInDataUnorderBytes, end - start); 11643 continue; 11644 } 11645 /* New stuff completely beyond tail? */ 11646 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 11647 /* Link it on end. */ 11648 mp1->b_cont = mp; 11649 tcp->tcp_reass_tail = mp; 11650 BUMP_MIB(&tcps->tcps_mib, tcpInDataUnorderSegs); 11651 UPDATE_MIB(&tcps->tcps_mib, 11652 tcpInDataUnorderBytes, end - start); 11653 continue; 11654 } 11655 mp1 = tcp->tcp_reass_head; 11656 u1 = TCP_REASS_SEQ(mp1); 11657 /* New stuff at the front? */ 11658 if (SEQ_LT(start, u1)) { 11659 /* Yes... Check for overlap. */ 11660 mp->b_cont = mp1; 11661 tcp->tcp_reass_head = mp; 11662 tcp_reass_elim_overlap(tcp, mp); 11663 continue; 11664 } 11665 /* 11666 * The new piece fits somewhere between the head and tail. 11667 * We find our slot, where mp1 precedes us and mp2 trails. 11668 */ 11669 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 11670 u1 = TCP_REASS_SEQ(mp2); 11671 if (SEQ_LEQ(start, u1)) 11672 break; 11673 } 11674 /* Link ourselves in */ 11675 mp->b_cont = mp2; 11676 mp1->b_cont = mp; 11677 11678 /* Trim overlap with following mblk(s) first */ 11679 tcp_reass_elim_overlap(tcp, mp); 11680 11681 /* Trim overlap with preceding mblk */ 11682 tcp_reass_elim_overlap(tcp, mp1); 11683 11684 } while (start = end, mp = next_mp); 11685 mp1 = tcp->tcp_reass_head; 11686 /* Anything ready to go? */ 11687 if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 11688 return (NULL); 11689 /* Eat what we can off the queue */ 11690 for (;;) { 11691 mp = mp1->b_cont; 11692 end = TCP_REASS_END(mp1); 11693 TCP_REASS_SET_SEQ(mp1, 0); 11694 TCP_REASS_SET_END(mp1, 0); 11695 if (!mp) { 11696 tcp->tcp_reass_tail = NULL; 11697 break; 11698 } 11699 if (end != TCP_REASS_SEQ(mp)) { 11700 mp1->b_cont = NULL; 11701 break; 11702 } 11703 mp1 = mp; 11704 } 11705 mp1 = tcp->tcp_reass_head; 11706 tcp->tcp_reass_head = mp; 11707 return (mp1); 11708 } 11709 11710 /* Eliminate any overlap that mp may have over later mblks */ 11711 static void 11712 tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 11713 { 11714 uint32_t end; 11715 mblk_t *mp1; 11716 uint32_t u1; 11717 tcp_stack_t *tcps = tcp->tcp_tcps; 11718 11719 end = TCP_REASS_END(mp); 11720 while ((mp1 = mp->b_cont) != NULL) { 11721 u1 = TCP_REASS_SEQ(mp1); 11722 if (!SEQ_GT(end, u1)) 11723 break; 11724 if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 11725 mp->b_wptr -= end - u1; 11726 TCP_REASS_SET_END(mp, u1); 11727 BUMP_MIB(&tcps->tcps_mib, tcpInDataPartDupSegs); 11728 UPDATE_MIB(&tcps->tcps_mib, 11729 tcpInDataPartDupBytes, end - u1); 11730 break; 11731 } 11732 mp->b_cont = mp1->b_cont; 11733 TCP_REASS_SET_SEQ(mp1, 0); 11734 TCP_REASS_SET_END(mp1, 0); 11735 freeb(mp1); 11736 BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); 11737 UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, end - u1); 11738 } 11739 if (!mp1) 11740 tcp->tcp_reass_tail = mp; 11741 } 11742 11743 /* 11744 * Send up all messages queued on tcp_rcv_list. 
11745 */ 11746 static uint_t 11747 tcp_rcv_drain(queue_t *q, tcp_t *tcp) 11748 { 11749 mblk_t *mp; 11750 uint_t ret = 0; 11751 uint_t thwin; 11752 #ifdef DEBUG 11753 uint_t cnt = 0; 11754 #endif 11755 tcp_stack_t *tcps = tcp->tcp_tcps; 11756 11757 /* Can't drain on an eager connection */ 11758 if (tcp->tcp_listener != NULL) 11759 return (ret); 11760 11761 /* Can't be sodirect enabled */ 11762 ASSERT(SOD_NOT_ENABLED(tcp)); 11763 11764 /* 11765 * Handle two cases here: we are currently fused or we were 11766 * previously fused and have some urgent data to be delivered 11767 * upstream. The latter happens because we either ran out of 11768 * memory or were detached and therefore sending the SIGURG was 11769 * deferred until this point. In either case we pass control 11770 * over to tcp_fuse_rcv_drain() since it may need to complete 11771 * some work. 11772 */ 11773 if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { 11774 ASSERT(tcp->tcp_fused_sigurg_mp != NULL); 11775 if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : 11776 &tcp->tcp_fused_sigurg_mp)) 11777 return (ret); 11778 } 11779 11780 while ((mp = tcp->tcp_rcv_list) != NULL) { 11781 tcp->tcp_rcv_list = mp->b_next; 11782 mp->b_next = NULL; 11783 #ifdef DEBUG 11784 cnt += msgdsize(mp); 11785 #endif 11786 /* Does this need SSL processing first? */ 11787 if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { 11788 DTRACE_PROBE1(kssl_mblk__ksslinput_rcvdrain, 11789 mblk_t *, mp); 11790 tcp_kssl_input(tcp, mp); 11791 continue; 11792 } 11793 putnext(q, mp); 11794 } 11795 ASSERT(cnt == tcp->tcp_rcv_cnt); 11796 tcp->tcp_rcv_last_head = NULL; 11797 tcp->tcp_rcv_last_tail = NULL; 11798 tcp->tcp_rcv_cnt = 0; 11799 11800 /* Learn the latest rwnd information that we sent to the other side. */ 11801 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 11802 << tcp->tcp_rcv_ws; 11803 /* This is peer's calculated send window (our receive window). */ 11804 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 11805 /* 11806 * Increase the receive window to max. But we need to do receiver 11807 * SWS avoidance. This means that we need to check the increase of 11808 * of receive window is at least 1 MSS. 11809 */ 11810 if (canputnext(q) && (q->q_hiwat - thwin >= tcp->tcp_mss)) { 11811 /* 11812 * If the window that the other side knows is less than max 11813 * deferred acks segments, send an update immediately. 11814 */ 11815 if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { 11816 BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); 11817 ret = TH_ACK_NEEDED; 11818 } 11819 tcp->tcp_rwnd = q->q_hiwat; 11820 } 11821 /* No need for the push timer now. */ 11822 if (tcp->tcp_push_tid != 0) { 11823 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 11824 tcp->tcp_push_tid = 0; 11825 } 11826 return (ret); 11827 } 11828 11829 /* 11830 * Queue data on tcp_rcv_list which is a b_next chain. 11831 * tcp_rcv_last_head/tail is the last element of this chain. 11832 * Each element of the chain is a b_cont chain. 11833 * 11834 * M_DATA messages are added to the current element. 11835 * Other messages are added as new (b_next) elements. 
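 * (Illustrative example, not part of the original comment: if three
 * M_DATA segments arrive followed by one non-M_DATA message, the list
 * ends up with two b_next elements -- the first is a b_cont chain
 * holding the three data mblks, the second holds the other message on
 * its own.)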
11836 */ 11837 void 11838 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) 11839 { 11840 ASSERT(seg_len == msgdsize(mp)); 11841 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); 11842 11843 if (tcp->tcp_rcv_list == NULL) { 11844 ASSERT(tcp->tcp_rcv_last_head == NULL); 11845 tcp->tcp_rcv_list = mp; 11846 tcp->tcp_rcv_last_head = mp; 11847 } else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) { 11848 tcp->tcp_rcv_last_tail->b_cont = mp; 11849 } else { 11850 tcp->tcp_rcv_last_head->b_next = mp; 11851 tcp->tcp_rcv_last_head = mp; 11852 } 11853 11854 while (mp->b_cont) 11855 mp = mp->b_cont; 11856 11857 tcp->tcp_rcv_last_tail = mp; 11858 tcp->tcp_rcv_cnt += seg_len; 11859 tcp->tcp_rwnd -= seg_len; 11860 } 11861 11862 /* 11863 * The tcp_rcv_sod_XXX() functions enqueue data directly to the socket 11864 * above, in addition when uioa is enabled schedule an asynchronous uio 11865 * prior to enqueuing. They implement the combinhed semantics of the 11866 * tcp_rcv_XXX() functions, tcp_rcv_list push logic, and STREAMS putnext() 11867 * canputnext(), i.e. flow-control with backenable. 11868 * 11869 * tcp_sod_wakeup() is called where tcp_rcv_drain() would be called in the 11870 * non sodirect connection but as there are no tcp_tcv_list mblk_t's we deal 11871 * with the rcv_wnd and push timer and call the sodirect wakeup function. 11872 * 11873 * Must be called with sodp->sod_lock held and will return with the lock 11874 * released. 11875 */ 11876 static uint_t 11877 tcp_rcv_sod_wakeup(tcp_t *tcp, sodirect_t *sodp) 11878 { 11879 queue_t *q = tcp->tcp_rq; 11880 uint_t thwin; 11881 tcp_stack_t *tcps = tcp->tcp_tcps; 11882 uint_t ret = 0; 11883 11884 /* Can't be an eager connection */ 11885 ASSERT(tcp->tcp_listener == NULL); 11886 11887 /* Caller must have lock held */ 11888 ASSERT(MUTEX_HELD(sodp->sod_lock)); 11889 11890 /* Sodirect mode so must not be a tcp_rcv_list */ 11891 ASSERT(tcp->tcp_rcv_list == NULL); 11892 11893 if (SOD_QFULL(sodp)) { 11894 /* Q is full, mark Q for need backenable */ 11895 SOD_QSETBE(sodp); 11896 } 11897 /* Last advertised rwnd, i.e. rwnd last sent in a packet */ 11898 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 11899 << tcp->tcp_rcv_ws; 11900 /* This is peer's calculated send window (our available rwnd). */ 11901 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 11902 /* 11903 * Increase the receive window to max. But we need to do receiver 11904 * SWS avoidance. This means that we need to check the increase of 11905 * of receive window is at least 1 MSS. 11906 */ 11907 if (!SOD_QFULL(sodp) && (q->q_hiwat - thwin >= tcp->tcp_mss)) { 11908 /* 11909 * If the window that the other side knows is less than max 11910 * deferred acks segments, send an update immediately. 11911 */ 11912 if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { 11913 BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); 11914 ret = TH_ACK_NEEDED; 11915 } 11916 tcp->tcp_rwnd = q->q_hiwat; 11917 } 11918 11919 if (!SOD_QEMPTY(sodp)) { 11920 /* Wakeup to socket */ 11921 sodp->sod_state &= SOD_WAKE_CLR; 11922 sodp->sod_state |= SOD_WAKE_DONE; 11923 (sodp->sod_wakeup)(sodp); 11924 /* wakeup() does the mutex_ext() */ 11925 } else { 11926 /* Q is empty, no need to wake */ 11927 sodp->sod_state &= SOD_WAKE_CLR; 11928 sodp->sod_state |= SOD_WAKE_NOT; 11929 mutex_exit(sodp->sod_lock); 11930 } 11931 11932 /* No need for the push timer now. 
*/ 11933 if (tcp->tcp_push_tid != 0) { 11934 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 11935 tcp->tcp_push_tid = 0; 11936 } 11937 11938 return (ret); 11939 } 11940 11941 /* 11942 * Called where tcp_rcv_enqueue()/putnext(RD(q)) would be. For M_DATA 11943 * mblk_t's if uioa enabled then start a uioa asynchronous copy directly 11944 * to the user-land buffer and flag the mblk_t as such. 11945 * 11946 * Also, handle tcp_rwnd. 11947 */ 11948 uint_t 11949 tcp_rcv_sod_enqueue(tcp_t *tcp, sodirect_t *sodp, mblk_t *mp, uint_t seg_len) 11950 { 11951 uioa_t *uioap = &sodp->sod_uioa; 11952 boolean_t qfull; 11953 uint_t thwin; 11954 11955 /* Can't be an eager connection */ 11956 ASSERT(tcp->tcp_listener == NULL); 11957 11958 /* Caller must have lock held */ 11959 ASSERT(MUTEX_HELD(sodp->sod_lock)); 11960 11961 /* Sodirect mode so must not be a tcp_rcv_list */ 11962 ASSERT(tcp->tcp_rcv_list == NULL); 11963 11964 /* Passed in segment length must be equal to mblk_t chain data size */ 11965 ASSERT(seg_len == msgdsize(mp)); 11966 11967 if (DB_TYPE(mp) != M_DATA) { 11968 /* Only process M_DATA mblk_t's */ 11969 goto enq; 11970 } 11971 if (uioap->uioa_state & UIOA_ENABLED) { 11972 /* Uioa is enabled */ 11973 mblk_t *mp1 = mp; 11974 11975 if (seg_len > uioap->uio_resid) { 11976 /* 11977 * There isn't enough uio space for the mblk_t chain 11978 * so disable uioa such that this and any additional 11979 * mblk_t data is handled by the socket and schedule 11980 * the socket for wakeup to finish this uioa. 11981 */ 11982 uioap->uioa_state &= UIOA_CLR; 11983 uioap->uioa_state |= UIOA_FINI; 11984 if (sodp->sod_state & SOD_WAKE_NOT) { 11985 sodp->sod_state &= SOD_WAKE_CLR; 11986 sodp->sod_state |= SOD_WAKE_NEED; 11987 } 11988 goto enq; 11989 } 11990 do { 11991 uint32_t len = MBLKL(mp1); 11992 11993 if (!uioamove(mp1->b_rptr, len, UIO_READ, uioap)) { 11994 /* Scheduled, mark dblk_t as such */ 11995 DB_FLAGS(mp1) |= DBLK_UIOA; 11996 } else { 11997 /* Error, turn off async processing */ 11998 uioap->uioa_state &= UIOA_CLR; 11999 uioap->uioa_state |= UIOA_FINI; 12000 break; 12001 } 12002 } while ((mp1 = mp1->b_cont) != NULL); 12003 12004 if (mp1 != NULL || uioap->uio_resid == 0) { 12005 /* 12006 * Not all mblk_t(s) uioamoved (error) or all uio 12007 * space has been consumed so schedule the socket 12008 * for wakeup to finish this uio. 12009 */ 12010 sodp->sod_state &= SOD_WAKE_CLR; 12011 sodp->sod_state |= SOD_WAKE_NEED; 12012 } 12013 } else if (uioap->uioa_state & UIOA_FINI) { 12014 /* 12015 * Post UIO_ENABLED waiting for socket to finish processing 12016 * so just enqueue and update tcp_rwnd. 12017 */ 12018 if (SOD_QFULL(sodp)) 12019 tcp->tcp_rwnd -= seg_len; 12020 } else if (sodp->sod_want > 0) { 12021 /* 12022 * Uioa isn't enabled but sodirect has a pending read(). 12023 */ 12024 if (SOD_QCNT(sodp) + seg_len >= sodp->sod_want) { 12025 if (sodp->sod_state & SOD_WAKE_NOT) { 12026 /* Schedule socket for wakeup */ 12027 sodp->sod_state &= SOD_WAKE_CLR; 12028 sodp->sod_state |= SOD_WAKE_NEED; 12029 } 12030 tcp->tcp_rwnd -= seg_len; 12031 } 12032 } else if (SOD_QCNT(sodp) + seg_len >= tcp->tcp_rq->q_hiwat >> 3) { 12033 /* 12034 * No pending sodirect read() so used the default 12035 * TCP push logic to guess that a push is needed. 
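 * (Illustrative numbers, not part of the original comment: with a
 * stream high-water mark of 48K the socket is scheduled for wakeup
 * once roughly 6K -- one eighth of q_hiwat -- is queued.)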
12036 */ 12037 if (sodp->sod_state & SOD_WAKE_NOT) { 12038 /* Schedule socket for wakeup */ 12039 sodp->sod_state &= SOD_WAKE_CLR; 12040 sodp->sod_state |= SOD_WAKE_NEED; 12041 } 12042 tcp->tcp_rwnd -= seg_len; 12043 } else { 12044 /* Just update tcp_rwnd */ 12045 tcp->tcp_rwnd -= seg_len; 12046 } 12047 enq: 12048 qfull = SOD_QFULL(sodp); 12049 12050 (sodp->sod_enqueue)(sodp, mp); 12051 12052 if (! qfull && SOD_QFULL(sodp)) { 12053 /* Wasn't QFULL, now QFULL, need back-enable */ 12054 SOD_QSETBE(sodp); 12055 } 12056 12057 /* 12058 * Check to see if remote avail swnd < mss due to delayed ACK, 12059 * first get advertised rwnd. 12060 */ 12061 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)); 12062 /* Minus delayed ACK count */ 12063 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 12064 if (thwin < tcp->tcp_mss) { 12065 /* Remote avail swnd < mss, need ACK now */ 12066 return (TH_ACK_NEEDED); 12067 } 12068 12069 return (0); 12070 } 12071 12072 /* 12073 * DEFAULT TCP ENTRY POINT via squeue on READ side. 12074 * 12075 * This is the default entry function into TCP on the read side. TCP is 12076 * always entered via squeue i.e. using squeue's for mutual exclusion. 12077 * When classifier does a lookup to find the tcp, it also puts a reference 12078 * on the conn structure associated so the tcp is guaranteed to exist 12079 * when we come here. We still need to check the state because it might 12080 * as well has been closed. The squeue processing function i.e. squeue_enter, 12081 * squeue_enter_nodrain, or squeue_drain is responsible for doing the 12082 * CONN_DEC_REF. 12083 * 12084 * Apart from the default entry point, IP also sends packets directly to 12085 * tcp_rput_data for AF_INET fast path and tcp_conn_request for incoming 12086 * connections. 12087 */ 12088 void 12089 tcp_input(void *arg, mblk_t *mp, void *arg2) 12090 { 12091 conn_t *connp = (conn_t *)arg; 12092 tcp_t *tcp = (tcp_t *)connp->conn_tcp; 12093 12094 /* arg2 is the sqp */ 12095 ASSERT(arg2 != NULL); 12096 ASSERT(mp != NULL); 12097 12098 /* 12099 * Don't accept any input on a closed tcp as this TCP logically does 12100 * not exist on the system. Don't proceed further with this TCP. 12101 * For eg. this packet could trigger another close of this tcp 12102 * which would be disastrous for tcp_refcnt. tcp_close_detached / 12103 * tcp_clean_death / tcp_closei_local must be called at most once 12104 * on a TCP. In this case we need to refeed the packet into the 12105 * classifier and figure out where the packet should go. Need to 12106 * preserve the recv_ill somehow. Until we figure that out, for 12107 * now just drop the packet if we can't classify the packet. 12108 */ 12109 if (tcp->tcp_state == TCPS_CLOSED || 12110 tcp->tcp_state == TCPS_BOUND) { 12111 conn_t *new_connp; 12112 ip_stack_t *ipst = tcp->tcp_tcps->tcps_netstack->netstack_ip; 12113 12114 new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst); 12115 if (new_connp != NULL) { 12116 tcp_reinput(new_connp, mp, arg2); 12117 return; 12118 } 12119 /* We failed to classify. For now just drop the packet */ 12120 freemsg(mp); 12121 return; 12122 } 12123 12124 if (DB_TYPE(mp) == M_DATA) 12125 tcp_rput_data(connp, mp, arg2); 12126 else 12127 tcp_rput_common(tcp, mp); 12128 } 12129 12130 /* 12131 * The read side put procedure. 12132 * The packets passed up by ip are assume to be aligned according to 12133 * OK_32PTR and the IP+TCP headers fitting in the first mblk. 
12134 */ 12135 static void 12136 tcp_rput_common(tcp_t *tcp, mblk_t *mp) 12137 { 12138 /* 12139 * tcp_rput_data() does not expect M_CTL except for the case 12140 * where tcp_ipv6_recvancillary is set and we get a IN_PKTINFO 12141 * type. Need to make sure that any other M_CTLs don't make 12142 * it to tcp_rput_data since it is not expecting any and doesn't 12143 * check for it. 12144 */ 12145 if (DB_TYPE(mp) == M_CTL) { 12146 switch (*(uint32_t *)(mp->b_rptr)) { 12147 case TCP_IOC_ABORT_CONN: 12148 /* 12149 * Handle connection abort request. 12150 */ 12151 tcp_ioctl_abort_handler(tcp, mp); 12152 return; 12153 case IPSEC_IN: 12154 /* 12155 * Only secure icmp arrive in TCP and they 12156 * don't go through data path. 12157 */ 12158 tcp_icmp_error(tcp, mp); 12159 return; 12160 case IN_PKTINFO: 12161 /* 12162 * Handle IPV6_RECVPKTINFO socket option on AF_INET6 12163 * sockets that are receiving IPv4 traffic. tcp 12164 */ 12165 ASSERT(tcp->tcp_family == AF_INET6); 12166 ASSERT(tcp->tcp_ipv6_recvancillary & 12167 TCP_IPV6_RECVPKTINFO); 12168 tcp_rput_data(tcp->tcp_connp, mp, 12169 tcp->tcp_connp->conn_sqp); 12170 return; 12171 case MDT_IOC_INFO_UPDATE: 12172 /* 12173 * Handle Multidata information update; the 12174 * following routine will free the message. 12175 */ 12176 if (tcp->tcp_connp->conn_mdt_ok) { 12177 tcp_mdt_update(tcp, 12178 &((ip_mdt_info_t *)mp->b_rptr)->mdt_capab, 12179 B_FALSE); 12180 } 12181 freemsg(mp); 12182 return; 12183 case LSO_IOC_INFO_UPDATE: 12184 /* 12185 * Handle LSO information update; the following 12186 * routine will free the message. 12187 */ 12188 if (tcp->tcp_connp->conn_lso_ok) { 12189 tcp_lso_update(tcp, 12190 &((ip_lso_info_t *)mp->b_rptr)->lso_capab); 12191 } 12192 freemsg(mp); 12193 return; 12194 default: 12195 /* 12196 * tcp_icmp_err() will process the M_CTL packets. 12197 * Non-ICMP packets, if any, will be discarded in 12198 * tcp_icmp_err(). We will process the ICMP packet 12199 * even if we are TCP_IS_DETACHED_NONEAGER as the 12200 * incoming ICMP packet may result in changing 12201 * the tcp_mss, which we would need if we have 12202 * packets to retransmit. 12203 */ 12204 tcp_icmp_error(tcp, mp); 12205 return; 12206 } 12207 } 12208 12209 /* No point processing the message if tcp is already closed */ 12210 if (TCP_IS_DETACHED_NONEAGER(tcp)) { 12211 freemsg(mp); 12212 return; 12213 } 12214 12215 tcp_rput_other(tcp, mp); 12216 } 12217 12218 12219 /* The minimum of smoothed mean deviation in RTO calculation. */ 12220 #define TCP_SD_MIN 400 12221 12222 /* 12223 * Set RTO for this connection. The formula is from Jacobson and Karels' 12224 * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 12225 * are the same as those in Appendix A.2 of that paper. 12226 * 12227 * m = new measurement 12228 * sa = smoothed RTT average (8 * average estimates). 12229 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). 12230 */ 12231 static void 12232 tcp_set_rto(tcp_t *tcp, clock_t rtt) 12233 { 12234 long m = TICK_TO_MSEC(rtt); 12235 clock_t sa = tcp->tcp_rtt_sa; 12236 clock_t sv = tcp->tcp_rtt_sd; 12237 clock_t rto; 12238 tcp_stack_t *tcps = tcp->tcp_tcps; 12239 12240 BUMP_MIB(&tcps->tcps_mib, tcpRttUpdate); 12241 tcp->tcp_rtt_update++; 12242 12243 /* tcp_rtt_sa is not 0 means this is a new sample. */ 12244 if (sa != 0) { 12245 /* 12246 * Update average estimator: 12247 * new rtt = 7/8 old rtt + 1/8 Error 12248 */ 12249 12250 /* m is now Error in estimate. 
 */
		m -= sa >> 3;
		if ((sa += m) <= 0) {
			/*
			 * Don't allow the smoothed average to be negative.
			 * We use 0 to denote reinitialization of the
			 * variables.
			 */
			sa = 1;
		}

		/*
		 * Update deviation estimator:
		 *	new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
		 */
		if (m < 0)
			m = -m;
		m -= sv >> 2;
		sv += m;
	} else {
		/*
		 * This follows BSD's implementation. So the reinitialized
		 * RTO is 3 * m. We cannot go less than 2 because if the
		 * link is bandwidth dominated, doubling the window size
		 * during slow start means doubling the RTT. We want to be
		 * more conservative when we reinitialize our estimates. 3
		 * is just a convenient number.
		 */
		sa = m << 3;
		sv = m << 1;
	}
	if (sv < TCP_SD_MIN) {
		/*
		 * We do not know whether sa captures the delayed ACK
		 * effect; in a long train of segments, a receiver
		 * does not delay its ACKs. So set the minimum of sv
		 * to be TCP_SD_MIN, which defaults to 400 ms, twice
		 * the BSD DATO. That means the minimum of the mean
		 * deviation is 100 ms.
		 */
		sv = TCP_SD_MIN;
	}
	tcp->tcp_rtt_sa = sa;
	tcp->tcp_rtt_sd = sv;
	/*
	 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
	 *
	 * Add tcp_rexmit_interval_extra in case of an extreme environment
	 * where the algorithm fails to work. The default value of
	 * tcp_rexmit_interval_extra should be 0.
	 *
	 * As we use a finer grained clock than BSD and update
	 * RTO for every ACK, add in another .25 of RTT to the
	 * deviation of RTO to accommodate burstiness of 1/4 of
	 * window size.
	 */
	rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);

	if (rto > tcps->tcps_rexmit_interval_max) {
		tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
	} else if (rto < tcps->tcps_rexmit_interval_min) {
		tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
	} else {
		tcp->tcp_rto = rto;
	}

	/* Now, we can reset tcp_timer_backoff to use the new RTO... */
	tcp->tcp_timer_backoff = 0;
}

/*
 * tcp_get_seg_mp() is called to get the pointer to a segment in the
 * send queue which starts at the given seq. no.
 *
 * Parameters:
 *	tcp_t *tcp: the tcp instance pointer.
 *	uint32_t seq: the starting seq. no of the requested segment.
 *	int32_t *off: after the execution, *off will be the offset to
 *		the returned mblk which points to the requested seq no.
 *		It is the caller's responsibility to send in a non-null off.
 *
 * Return:
 *	A mblk_t pointer pointing to the requested segment in the send queue.
 */
static mblk_t *
tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off)
{
	int32_t cnt;
	mblk_t *mp;

	/* Defensive coding. Make sure we don't send incorrect data.
*/ 12342 if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt)) 12343 return (NULL); 12344 12345 cnt = seq - tcp->tcp_suna; 12346 mp = tcp->tcp_xmit_head; 12347 while (cnt > 0 && mp != NULL) { 12348 cnt -= mp->b_wptr - mp->b_rptr; 12349 if (cnt < 0) { 12350 cnt += mp->b_wptr - mp->b_rptr; 12351 break; 12352 } 12353 mp = mp->b_cont; 12354 } 12355 ASSERT(mp != NULL); 12356 *off = cnt; 12357 return (mp); 12358 } 12359 12360 /* 12361 * This function handles all retransmissions if SACK is enabled for this 12362 * connection. First it calculates how many segments can be retransmitted 12363 * based on tcp_pipe. Then it goes thru the notsack list to find eligible 12364 * segments. A segment is eligible if sack_cnt for that segment is greater 12365 * than or equal tcp_dupack_fast_retransmit. After it has retransmitted 12366 * all eligible segments, it checks to see if TCP can send some new segments 12367 * (fast recovery). If it can, set the appropriate flag for tcp_rput_data(). 12368 * 12369 * Parameters: 12370 * tcp_t *tcp: the tcp structure of the connection. 12371 * uint_t *flags: in return, appropriate value will be set for 12372 * tcp_rput_data(). 12373 */ 12374 static void 12375 tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) 12376 { 12377 notsack_blk_t *notsack_blk; 12378 int32_t usable_swnd; 12379 int32_t mss; 12380 uint32_t seg_len; 12381 mblk_t *xmit_mp; 12382 tcp_stack_t *tcps = tcp->tcp_tcps; 12383 12384 ASSERT(tcp->tcp_sack_info != NULL); 12385 ASSERT(tcp->tcp_notsack_list != NULL); 12386 ASSERT(tcp->tcp_rexmit == B_FALSE); 12387 12388 /* Defensive coding in case there is a bug... */ 12389 if (tcp->tcp_notsack_list == NULL) { 12390 return; 12391 } 12392 notsack_blk = tcp->tcp_notsack_list; 12393 mss = tcp->tcp_mss; 12394 12395 /* 12396 * Limit the num of outstanding data in the network to be 12397 * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 12398 */ 12399 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 12400 12401 /* At least retransmit 1 MSS of data. */ 12402 if (usable_swnd <= 0) { 12403 usable_swnd = mss; 12404 } 12405 12406 /* Make sure no new RTT samples will be taken. */ 12407 tcp->tcp_csuna = tcp->tcp_snxt; 12408 12409 notsack_blk = tcp->tcp_notsack_list; 12410 while (usable_swnd > 0) { 12411 mblk_t *snxt_mp, *tmp_mp; 12412 tcp_seq begin = tcp->tcp_sack_snxt; 12413 tcp_seq end; 12414 int32_t off; 12415 12416 for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { 12417 if (SEQ_GT(notsack_blk->end, begin) && 12418 (notsack_blk->sack_cnt >= 12419 tcps->tcps_dupack_fast_retransmit)) { 12420 end = notsack_blk->end; 12421 if (SEQ_LT(begin, notsack_blk->begin)) { 12422 begin = notsack_blk->begin; 12423 } 12424 break; 12425 } 12426 } 12427 /* 12428 * All holes are filled. Manipulate tcp_cwnd to send more 12429 * if we can. Note that after the SACK recovery, tcp_cwnd is 12430 * set to tcp_cwnd_ssthresh. 12431 */ 12432 if (notsack_blk == NULL) { 12433 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 12434 if (usable_swnd <= 0 || tcp->tcp_unsent == 0) { 12435 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; 12436 ASSERT(tcp->tcp_cwnd > 0); 12437 return; 12438 } else { 12439 usable_swnd = usable_swnd / mss; 12440 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + 12441 MAX(usable_swnd * mss, mss); 12442 *flags |= TH_XMIT_NEEDED; 12443 return; 12444 } 12445 } 12446 12447 /* 12448 * Note that we may send more than usable_swnd allows here 12449 * because of round off, but no more than 1 MSS of data. 
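 * (Hypothetical numbers for illustration, not part of the original
 * comment: if usable_swnd has dropped to 400 bytes but the first
 * eligible hole is a full mss of 1460 bytes, the whole 1460-byte
 * segment is still retransmitted, overshooting the limit by 1060
 * bytes, which is still less than one MSS.)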
12450 */ 12451 seg_len = end - begin; 12452 if (seg_len > mss) 12453 seg_len = mss; 12454 snxt_mp = tcp_get_seg_mp(tcp, begin, &off); 12455 ASSERT(snxt_mp != NULL); 12456 /* This should not happen. Defensive coding again... */ 12457 if (snxt_mp == NULL) { 12458 return; 12459 } 12460 12461 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, 12462 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); 12463 if (xmit_mp == NULL) 12464 return; 12465 12466 usable_swnd -= seg_len; 12467 tcp->tcp_pipe += seg_len; 12468 tcp->tcp_sack_snxt = begin + seg_len; 12469 TCP_RECORD_TRACE(tcp, xmit_mp, TCP_TRACE_SEND_PKT); 12470 tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); 12471 12472 /* 12473 * Update the send timestamp to avoid false retransmission. 12474 */ 12475 snxt_mp->b_prev = (mblk_t *)lbolt; 12476 12477 BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); 12478 UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, seg_len); 12479 BUMP_MIB(&tcps->tcps_mib, tcpOutSackRetransSegs); 12480 /* 12481 * Update tcp_rexmit_max to extend this SACK recovery phase. 12482 * This happens when new data sent during fast recovery is 12483 * also lost. If TCP retransmits those new data, it needs 12484 * to extend SACK recover phase to avoid starting another 12485 * fast retransmit/recovery unnecessarily. 12486 */ 12487 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { 12488 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; 12489 } 12490 } 12491 } 12492 12493 /* 12494 * This function handles policy checking at TCP level for non-hard_bound/ 12495 * detached connections. 12496 */ 12497 static boolean_t 12498 tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, 12499 boolean_t secure, boolean_t mctl_present) 12500 { 12501 ipsec_latch_t *ipl = NULL; 12502 ipsec_action_t *act = NULL; 12503 mblk_t *data_mp; 12504 ipsec_in_t *ii; 12505 const char *reason; 12506 kstat_named_t *counter; 12507 tcp_stack_t *tcps = tcp->tcp_tcps; 12508 ipsec_stack_t *ipss; 12509 ip_stack_t *ipst; 12510 12511 ASSERT(mctl_present || !secure); 12512 12513 ASSERT((ipha == NULL && ip6h != NULL) || 12514 (ip6h == NULL && ipha != NULL)); 12515 12516 /* 12517 * We don't necessarily have an ipsec_in_act action to verify 12518 * policy because of assymetrical policy where we have only 12519 * outbound policy and no inbound policy (possible with global 12520 * policy). 12521 */ 12522 if (!secure) { 12523 if (act == NULL || act->ipa_act.ipa_type == IPSEC_ACT_BYPASS || 12524 act->ipa_act.ipa_type == IPSEC_ACT_CLEAR) 12525 return (B_TRUE); 12526 ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH, 12527 "tcp_check_policy", ipha, ip6h, secure, 12528 tcps->tcps_netstack); 12529 ipss = tcps->tcps_netstack->netstack_ipsec; 12530 12531 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 12532 DROPPER(ipss, ipds_tcp_clear), 12533 &tcps->tcps_dropper); 12534 return (B_FALSE); 12535 } 12536 12537 /* 12538 * We have a secure packet. 12539 */ 12540 if (act == NULL) { 12541 ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED, 12542 "tcp_check_policy", ipha, ip6h, secure, 12543 tcps->tcps_netstack); 12544 ipss = tcps->tcps_netstack->netstack_ipsec; 12545 12546 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 12547 DROPPER(ipss, ipds_tcp_secure), 12548 &tcps->tcps_dropper); 12549 return (B_FALSE); 12550 } 12551 12552 /* 12553 * XXX This whole routine is currently incorrect. ipl should 12554 * be set to the latch pointer, but is currently not set, so 12555 * we initialize it to NULL to avoid picking up random garbage. 
12556 */ 12557 if (ipl == NULL) 12558 return (B_TRUE); 12559 12560 data_mp = first_mp->b_cont; 12561 12562 ii = (ipsec_in_t *)first_mp->b_rptr; 12563 12564 ipst = tcps->tcps_netstack->netstack_ip; 12565 12566 if (ipsec_check_ipsecin_latch(ii, data_mp, ipl, ipha, ip6h, &reason, 12567 &counter, tcp->tcp_connp)) { 12568 BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); 12569 return (B_TRUE); 12570 } 12571 (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, 12572 "tcp inbound policy mismatch: %s, packet dropped\n", 12573 reason); 12574 BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); 12575 12576 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, 12577 &tcps->tcps_dropper); 12578 return (B_FALSE); 12579 } 12580 12581 /* 12582 * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start 12583 * retransmission after a timeout. 12584 * 12585 * To limit the number of duplicate segments, we limit the number of segment 12586 * to be sent in one time to tcp_snd_burst, the burst variable. 12587 */ 12588 static void 12589 tcp_ss_rexmit(tcp_t *tcp) 12590 { 12591 uint32_t snxt; 12592 uint32_t smax; 12593 int32_t win; 12594 int32_t mss; 12595 int32_t off; 12596 int32_t burst = tcp->tcp_snd_burst; 12597 mblk_t *snxt_mp; 12598 tcp_stack_t *tcps = tcp->tcp_tcps; 12599 12600 /* 12601 * Note that tcp_rexmit can be set even though TCP has retransmitted 12602 * all unack'ed segments. 12603 */ 12604 if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) { 12605 smax = tcp->tcp_rexmit_max; 12606 snxt = tcp->tcp_rexmit_nxt; 12607 if (SEQ_LT(snxt, tcp->tcp_suna)) { 12608 snxt = tcp->tcp_suna; 12609 } 12610 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd); 12611 win -= snxt - tcp->tcp_suna; 12612 mss = tcp->tcp_mss; 12613 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off); 12614 12615 while (SEQ_LT(snxt, smax) && (win > 0) && 12616 (burst > 0) && (snxt_mp != NULL)) { 12617 mblk_t *xmit_mp; 12618 mblk_t *old_snxt_mp = snxt_mp; 12619 uint32_t cnt = mss; 12620 12621 if (win < cnt) { 12622 cnt = win; 12623 } 12624 if (SEQ_GT(snxt + cnt, smax)) { 12625 cnt = smax - snxt; 12626 } 12627 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off, 12628 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE); 12629 if (xmit_mp == NULL) 12630 return; 12631 12632 tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); 12633 12634 snxt += cnt; 12635 win -= cnt; 12636 /* 12637 * Update the send timestamp to avoid false 12638 * retransmission. 12639 */ 12640 old_snxt_mp->b_prev = (mblk_t *)lbolt; 12641 BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); 12642 UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, cnt); 12643 12644 tcp->tcp_rexmit_nxt = snxt; 12645 burst--; 12646 } 12647 /* 12648 * If we have transmitted all we have at the time 12649 * we started the retranmission, we can leave 12650 * the rest of the job to tcp_wput_data(). But we 12651 * need to check the send window first. If the 12652 * win is not 0, go on with tcp_wput_data(). 12653 */ 12654 if (SEQ_LT(snxt, smax) || win == 0) { 12655 return; 12656 } 12657 } 12658 /* Only call tcp_wput_data() if there is data to be sent. */ 12659 if (tcp->tcp_unsent) { 12660 tcp_wput_data(tcp, NULL, B_FALSE); 12661 } 12662 } 12663 12664 /* 12665 * Process all TCP option in SYN segment. Note that this function should 12666 * be called after tcp_adapt_ire() is called so that the necessary info 12667 * from IRE is already set in the tcp structure. 12668 * 12669 * This function sets up the correct tcp_mss value according to the 12670 * MSS option value and our header size. 
It also sets up the window scale 12671 * and timestamp values, and initialize SACK info blocks. But it does not 12672 * change receive window size after setting the tcp_mss value. The caller 12673 * should do the appropriate change. 12674 */ 12675 void 12676 tcp_process_options(tcp_t *tcp, tcph_t *tcph) 12677 { 12678 int options; 12679 tcp_opt_t tcpopt; 12680 uint32_t mss_max; 12681 char *tmp_tcph; 12682 tcp_stack_t *tcps = tcp->tcp_tcps; 12683 12684 tcpopt.tcp = NULL; 12685 options = tcp_parse_options(tcph, &tcpopt); 12686 12687 /* 12688 * Process MSS option. Note that MSS option value does not account 12689 * for IP or TCP options. This means that it is equal to MTU - minimum 12690 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 12691 * IPv6. 12692 */ 12693 if (!(options & TCP_OPT_MSS_PRESENT)) { 12694 if (tcp->tcp_ipversion == IPV4_VERSION) 12695 tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; 12696 else 12697 tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; 12698 } else { 12699 if (tcp->tcp_ipversion == IPV4_VERSION) 12700 mss_max = tcps->tcps_mss_max_ipv4; 12701 else 12702 mss_max = tcps->tcps_mss_max_ipv6; 12703 if (tcpopt.tcp_opt_mss < tcps->tcps_mss_min) 12704 tcpopt.tcp_opt_mss = tcps->tcps_mss_min; 12705 else if (tcpopt.tcp_opt_mss > mss_max) 12706 tcpopt.tcp_opt_mss = mss_max; 12707 } 12708 12709 /* Process Window Scale option. */ 12710 if (options & TCP_OPT_WSCALE_PRESENT) { 12711 tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 12712 tcp->tcp_snd_ws_ok = B_TRUE; 12713 } else { 12714 tcp->tcp_snd_ws = B_FALSE; 12715 tcp->tcp_snd_ws_ok = B_FALSE; 12716 tcp->tcp_rcv_ws = B_FALSE; 12717 } 12718 12719 /* Process Timestamp option. */ 12720 if ((options & TCP_OPT_TSTAMP_PRESENT) && 12721 (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { 12722 tmp_tcph = (char *)tcp->tcp_tcph; 12723 12724 tcp->tcp_snd_ts_ok = B_TRUE; 12725 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 12726 tcp->tcp_last_rcv_lbolt = lbolt64; 12727 ASSERT(OK_32PTR(tmp_tcph)); 12728 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 12729 12730 /* Fill in our template header with basic timestamp option. */ 12731 tmp_tcph += tcp->tcp_tcp_hdr_len; 12732 tmp_tcph[0] = TCPOPT_NOP; 12733 tmp_tcph[1] = TCPOPT_NOP; 12734 tmp_tcph[2] = TCPOPT_TSTAMP; 12735 tmp_tcph[3] = TCPOPT_TSTAMP_LEN; 12736 tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN; 12737 tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN; 12738 tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4); 12739 } else { 12740 tcp->tcp_snd_ts_ok = B_FALSE; 12741 } 12742 12743 /* 12744 * Process SACK options. If SACK is enabled for this connection, 12745 * then allocate the SACK info structure. Note the following ways 12746 * when tcp_snd_sack_ok is set to true. 12747 * 12748 * For active connection: in tcp_adapt_ire() called in 12749 * tcp_rput_other(), or in tcp_rput_other() when tcp_sack_permitted 12750 * is checked. 12751 * 12752 * For passive connection: in tcp_adapt_ire() called in 12753 * tcp_accept_comm(). 12754 * 12755 * That's the reason why the extra TCP_IS_DETACHED() check is there. 12756 * That check makes sure that if we did not send a SACK OK option, 12757 * we will not enable SACK for this connection even though the other 12758 * side sends us SACK OK option. For active connection, the SACK 12759 * info structure has already been allocated. So we need to free 12760 * it if SACK is disabled. 
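 *
 * As a back-of-the-envelope illustration (editorial note, not part of
 * the original comment): a TCP header can carry at most 40 bytes of
 * options, a SACK option costs 2 bytes of kind/length plus 8 bytes per
 * block, and the timestamp option (with its two NOP pads) consumes 12
 * bytes.  With timestamps only 28 option bytes remain, so at most
 * 3 SACK blocks fit (2 + 3 * 8 = 26); without timestamps 4 blocks fit
 * (2 + 4 * 8 = 34 <= 40).  That is why tcp_max_sack_blk is set to 3 or
 * 4 below.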
12761 */ 12762 if ((options & TCP_OPT_SACK_OK_PRESENT) && 12763 (tcp->tcp_snd_sack_ok || 12764 (tcps->tcps_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) { 12765 /* This should be true only in the passive case. */ 12766 if (tcp->tcp_sack_info == NULL) { 12767 ASSERT(TCP_IS_DETACHED(tcp)); 12768 tcp->tcp_sack_info = 12769 kmem_cache_alloc(tcp_sack_info_cache, KM_NOSLEEP); 12770 } 12771 if (tcp->tcp_sack_info == NULL) { 12772 tcp->tcp_snd_sack_ok = B_FALSE; 12773 } else { 12774 tcp->tcp_snd_sack_ok = B_TRUE; 12775 if (tcp->tcp_snd_ts_ok) { 12776 tcp->tcp_max_sack_blk = 3; 12777 } else { 12778 tcp->tcp_max_sack_blk = 4; 12779 } 12780 } 12781 } else { 12782 /* 12783 * Resetting tcp_snd_sack_ok to B_FALSE so that 12784 * no SACK info will be used for this 12785 * connection. This assumes that SACK usage 12786 * permission is negotiated. This may need 12787 * to be changed once this is clarified. 12788 */ 12789 if (tcp->tcp_sack_info != NULL) { 12790 ASSERT(tcp->tcp_notsack_list == NULL); 12791 kmem_cache_free(tcp_sack_info_cache, 12792 tcp->tcp_sack_info); 12793 tcp->tcp_sack_info = NULL; 12794 } 12795 tcp->tcp_snd_sack_ok = B_FALSE; 12796 } 12797 12798 /* 12799 * Now we know the exact TCP/IP header length, subtract 12800 * that from tcp_mss to get our side's MSS. 12801 */ 12802 tcp->tcp_mss -= tcp->tcp_hdr_len; 12803 /* 12804 * Here we assume that the other side's header size will be equal to 12805 * our header size. We calculate the real MSS accordingly. Need to 12806 * take into additional stuffs IPsec puts in. 12807 * 12808 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) 12809 */ 12810 tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len + tcp->tcp_ipsec_overhead - 12811 ((tcp->tcp_ipversion == IPV4_VERSION ? 12812 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); 12813 12814 /* 12815 * Set MSS to the smaller one of both ends of the connection. 12816 * We should not have called tcp_mss_set() before, but our 12817 * side of the MSS should have been set to a proper value 12818 * by tcp_adapt_ire(). tcp_mss_set() will also set up the 12819 * STREAM head parameters properly. 12820 * 12821 * If we have a larger-than-16-bit window but the other side 12822 * didn't want to do window scale, tcp_rwnd_set() will take 12823 * care of that. 12824 */ 12825 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss), B_TRUE); 12826 } 12827 12828 /* 12829 * Sends the T_CONN_IND to the listener. The caller calls this 12830 * functions via squeue to get inside the listener's perimeter 12831 * once the 3 way hand shake is done a T_CONN_IND needs to be 12832 * sent. As an optimization, the caller can call this directly 12833 * if listener's perimeter is same as eager's. 12834 */ 12835 /* ARGSUSED */ 12836 void 12837 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) 12838 { 12839 conn_t *lconnp = (conn_t *)arg; 12840 tcp_t *listener = lconnp->conn_tcp; 12841 tcp_t *tcp; 12842 struct T_conn_ind *conn_ind; 12843 ipaddr_t *addr_cache; 12844 boolean_t need_send_conn_ind = B_FALSE; 12845 tcp_stack_t *tcps = listener->tcp_tcps; 12846 12847 /* retrieve the eager */ 12848 conn_ind = (struct T_conn_ind *)mp->b_rptr; 12849 ASSERT(conn_ind->OPT_offset != 0 && 12850 conn_ind->OPT_length == sizeof (intptr_t)); 12851 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 12852 conn_ind->OPT_length); 12853 12854 /* 12855 * TLI/XTI applications will get confused by 12856 * sending eager as an option since it violates 12857 * the option semantics. 
So remove the eager as 12858 * option since the TLI/XTI app doesn't need it anyway. 12859 */ 12860 if (!TCP_IS_SOCKET(listener)) { 12861 conn_ind->OPT_length = 0; 12862 conn_ind->OPT_offset = 0; 12863 } 12864 if (listener->tcp_state == TCPS_CLOSED || 12865 TCP_IS_DETACHED(listener)) { 12866 /* 12867 * If the listener has closed, it would have caused a 12868 * cleanup/blowoff to happen for the eager. We 12869 * just need to return. 12870 */ 12871 freemsg(mp); 12872 return; 12873 } 12874 12875 12876 /* 12877 * If the conn_req_q is full, defer passing up the 12878 * T_CONN_IND until space is available after t_accept() 12879 * processing. 12880 */ 12881 mutex_enter(&listener->tcp_eager_lock); 12882 12883 /* 12884 * Take the eager out of the list of droppable eagers, if it is 12885 * there, since the 3-way handshake is now over. 12886 */ 12887 MAKE_UNDROPPABLE(tcp); 12888 12889 if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) { 12890 tcp_t *tail; 12891 12892 /* 12893 * The eager already has an extra ref put in tcp_rput_data 12894 * so that it stays till accept comes back even though it 12895 * might get into TCPS_CLOSED as a result of a TH_RST etc. 12896 */ 12897 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 12898 listener->tcp_conn_req_cnt_q0--; 12899 listener->tcp_conn_req_cnt_q++; 12900 12901 /* Move from SYN_RCVD to ESTABLISHED list */ 12902 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 12903 tcp->tcp_eager_prev_q0; 12904 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 12905 tcp->tcp_eager_next_q0; 12906 tcp->tcp_eager_prev_q0 = NULL; 12907 tcp->tcp_eager_next_q0 = NULL; 12908 12909 /* 12910 * Insert at end of the queue because sockfs 12911 * sends down T_CONN_RES in chronological 12912 * order. Leaving the older conn indications 12913 * at the front of the queue helps reduce search 12914 * time. 12915 */ 12916 tail = listener->tcp_eager_last_q; 12917 if (tail != NULL) 12918 tail->tcp_eager_next_q = tcp; 12919 else 12920 listener->tcp_eager_next_q = tcp; 12921 listener->tcp_eager_last_q = tcp; 12922 tcp->tcp_eager_next_q = NULL; 12923 /* 12924 * Delay sending up the T_conn_ind until we are 12925 * done with the eager. Once we have sent up 12926 * the T_conn_ind, the accept can potentially complete 12927 * any time and release the refhold we have on the eager. 12928 */ 12929 need_send_conn_ind = B_TRUE; 12930 } else { 12931 /* 12932 * Defer the connection on q0 and set the deferred 12933 * connection bit to true. 12934 */ 12935 tcp->tcp_conn_def_q0 = B_TRUE; 12936 12937 /* take tcp out of q0 ... */ 12938 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 12939 tcp->tcp_eager_next_q0; 12940 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 12941 tcp->tcp_eager_prev_q0; 12942 12943 /* ... and place it at the end of q0 */ 12944 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 12945 tcp->tcp_eager_next_q0 = listener; 12946 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 12947 listener->tcp_eager_prev_q0 = tcp; 12948 tcp->tcp_conn.tcp_eager_conn_ind = mp; 12949 } 12950 12951 /* we have timed out before */ 12952 if (tcp->tcp_syn_rcvd_timeout != 0) { 12953 tcp->tcp_syn_rcvd_timeout = 0; 12954 listener->tcp_syn_rcvd_timeout--; 12955 if (listener->tcp_syn_defense && 12956 listener->tcp_syn_rcvd_timeout <= 12957 (tcps->tcps_conn_req_max_q0 >> 5) && 12958 10*MINUTES < TICK_TO_MSEC(lbolt64 - 12959 listener->tcp_last_rcv_lbolt)) { 12960 /* 12961 * Turn off the defense mode if we 12962 * believe the SYN attack is over.
12963 */ 12964 listener->tcp_syn_defense = B_FALSE; 12965 if (listener->tcp_ip_addr_cache) { 12966 kmem_free((void *)listener->tcp_ip_addr_cache, 12967 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 12968 listener->tcp_ip_addr_cache = NULL; 12969 } 12970 } 12971 } 12972 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 12973 if (addr_cache != NULL) { 12974 /* 12975 * We have finished a 3-way handshake with this 12976 * remote host. This proves the IP addr is good. 12977 * Cache it! 12978 */ 12979 addr_cache[IP_ADDR_CACHE_HASH( 12980 tcp->tcp_remote)] = tcp->tcp_remote; 12981 } 12982 mutex_exit(&listener->tcp_eager_lock); 12983 if (need_send_conn_ind) 12984 putnext(listener->tcp_rq, mp); 12985 } 12986 12987 mblk_t * 12988 tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, 12989 uint_t *ifindexp, ip6_pkt_t *ippp) 12990 { 12991 ip_pktinfo_t *pinfo; 12992 ip6_t *ip6h; 12993 uchar_t *rptr; 12994 mblk_t *first_mp = mp; 12995 boolean_t mctl_present = B_FALSE; 12996 uint_t ifindex = 0; 12997 ip6_pkt_t ipp; 12998 uint_t ipvers; 12999 uint_t ip_hdr_len; 13000 tcp_stack_t *tcps = tcp->tcp_tcps; 13001 13002 rptr = mp->b_rptr; 13003 ASSERT(OK_32PTR(rptr)); 13004 ASSERT(tcp != NULL); 13005 ipp.ipp_fields = 0; 13006 13007 switch (DB_TYPE(mp)) { 13008 case M_CTL: 13009 mp = mp->b_cont; 13010 if (mp == NULL) { 13011 freemsg(first_mp); 13012 return (NULL); 13013 } 13014 if (DB_TYPE(mp) != M_DATA) { 13015 freemsg(first_mp); 13016 return (NULL); 13017 } 13018 mctl_present = B_TRUE; 13019 break; 13020 case M_DATA: 13021 break; 13022 default: 13023 cmn_err(CE_NOTE, "tcp_find_pktinfo: unknown db_type"); 13024 freemsg(mp); 13025 return (NULL); 13026 } 13027 ipvers = IPH_HDR_VERSION(rptr); 13028 if (ipvers == IPV4_VERSION) { 13029 if (tcp == NULL) { 13030 ip_hdr_len = IPH_HDR_LENGTH(rptr); 13031 goto done; 13032 } 13033 13034 ipp.ipp_fields |= IPPF_HOPLIMIT; 13035 ipp.ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl; 13036 13037 /* 13038 * If we have IN_PKTINFO in an M_CTL and tcp_ipv6_recvancillary 13039 * has TCP_IPV6_RECVPKTINFO set, pass the I/F index along in ipp.
13040 */ 13041 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) && 13042 mctl_present) { 13043 pinfo = (ip_pktinfo_t *)first_mp->b_rptr; 13044 if ((MBLKL(first_mp) == sizeof (ip_pktinfo_t)) && 13045 (pinfo->ip_pkt_ulp_type == IN_PKTINFO) && 13046 (pinfo->ip_pkt_flags & IPF_RECVIF)) { 13047 ipp.ipp_fields |= IPPF_IFINDEX; 13048 ipp.ipp_ifindex = pinfo->ip_pkt_ifindex; 13049 ifindex = pinfo->ip_pkt_ifindex; 13050 } 13051 freeb(first_mp); 13052 mctl_present = B_FALSE; 13053 } 13054 ip_hdr_len = IPH_HDR_LENGTH(rptr); 13055 } else { 13056 ip6h = (ip6_t *)rptr; 13057 13058 ASSERT(ipvers == IPV6_VERSION); 13059 ipp.ipp_fields = IPPF_HOPLIMIT | IPPF_TCLASS; 13060 ipp.ipp_tclass = (ip6h->ip6_flow & 0x0FF00000) >> 20; 13061 ipp.ipp_hoplimit = ip6h->ip6_hops; 13062 13063 if (ip6h->ip6_nxt != IPPROTO_TCP) { 13064 uint8_t nexthdrp; 13065 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 13066 13067 /* Look for ifindex information */ 13068 if (ip6h->ip6_nxt == IPPROTO_RAW) { 13069 ip6i_t *ip6i = (ip6i_t *)ip6h; 13070 if ((uchar_t *)&ip6i[1] > mp->b_wptr) { 13071 BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); 13072 freemsg(first_mp); 13073 return (NULL); 13074 } 13075 13076 if (ip6i->ip6i_flags & IP6I_IFINDEX) { 13077 ASSERT(ip6i->ip6i_ifindex != 0); 13078 ipp.ipp_fields |= IPPF_IFINDEX; 13079 ipp.ipp_ifindex = ip6i->ip6i_ifindex; 13080 ifindex = ip6i->ip6i_ifindex; 13081 } 13082 rptr = (uchar_t *)&ip6i[1]; 13083 mp->b_rptr = rptr; 13084 if (rptr == mp->b_wptr) { 13085 mblk_t *mp1; 13086 mp1 = mp->b_cont; 13087 freeb(mp); 13088 mp = mp1; 13089 rptr = mp->b_rptr; 13090 } 13091 if (MBLKL(mp) < IPV6_HDR_LEN + 13092 sizeof (tcph_t)) { 13093 BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); 13094 freemsg(first_mp); 13095 return (NULL); 13096 } 13097 ip6h = (ip6_t *)rptr; 13098 } 13099 13100 /* 13101 * Find any potentially interesting extension headers 13102 * as well as the length of the IPv6 + extension 13103 * headers. 13104 */ 13105 ip_hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp); 13106 /* Verify if this is a TCP packet */ 13107 if (nexthdrp != IPPROTO_TCP) { 13108 BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); 13109 freemsg(first_mp); 13110 return (NULL); 13111 } 13112 } else { 13113 ip_hdr_len = IPV6_HDR_LEN; 13114 } 13115 } 13116 13117 done: 13118 if (ipversp != NULL) 13119 *ipversp = ipvers; 13120 if (ip_hdr_lenp != NULL) 13121 *ip_hdr_lenp = ip_hdr_len; 13122 if (ippp != NULL) 13123 *ippp = ipp; 13124 if (ifindexp != NULL) 13125 *ifindexp = ifindex; 13126 if (mctl_present) { 13127 freeb(first_mp); 13128 } 13129 return (mp); 13130 } 13131 13132 /* 13133 * Handle M_DATA messages from IP. Its called directly from IP via 13134 * squeue for AF_INET type sockets fast path. No M_CTL are expected 13135 * in this path. 13136 * 13137 * For everything else (including AF_INET6 sockets with 'tcp_ipversion' 13138 * v4 and v6), we are called through tcp_input() and a M_CTL can 13139 * be present for options but tcp_find_pktinfo() deals with it. We 13140 * only expect M_DATA packets after tcp_find_pktinfo() is done. 13141 * 13142 * The first argument is always the connp/tcp to which the mp belongs. 13143 * There are no exceptions to this rule. The caller has already put 13144 * a reference on this connp/tcp and once tcp_rput_data() returns, 13145 * the squeue will do the refrele. 13146 * 13147 * The TH_SYN for the listener directly go to tcp_conn_request via 13148 * squeue. 
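 *
 * In outline (an editorial summary of the code below, not an exhaustive
 * list): locate the TCP header, calling tcp_find_pktinfo() for anything
 * that is not an AF_INET fast-path packet; special-case TCPS_TIME_WAIT;
 * run the state-specific checks for SYN_SENT, SYN_RCVD, LISTEN,
 * CLOSED/BOUND and IDLE; trim the segment to the receive window using
 * the gap/rgap computation; do the PAWS, ECN and SACK bookkeeping and
 * any reassembly; handle urgent data; process the ACK (process_ack);
 * and finally decide what to transmit or acknowledge (xmit_check and
 * ack_check).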
13149 * 13150 * sqp: NULL = recursive, sqp != NULL means called from squeue 13151 */ 13152 void 13153 tcp_rput_data(void *arg, mblk_t *mp, void *arg2) 13154 { 13155 int32_t bytes_acked; 13156 int32_t gap; 13157 mblk_t *mp1; 13158 uint_t flags; 13159 uint32_t new_swnd = 0; 13160 uchar_t *iphdr; 13161 uchar_t *rptr; 13162 int32_t rgap; 13163 uint32_t seg_ack; 13164 int seg_len; 13165 uint_t ip_hdr_len; 13166 uint32_t seg_seq; 13167 tcph_t *tcph; 13168 int urp; 13169 tcp_opt_t tcpopt; 13170 uint_t ipvers; 13171 ip6_pkt_t ipp; 13172 boolean_t ofo_seg = B_FALSE; /* Out of order segment */ 13173 uint32_t cwnd; 13174 uint32_t add; 13175 int npkt; 13176 int mss; 13177 conn_t *connp = (conn_t *)arg; 13178 squeue_t *sqp = (squeue_t *)arg2; 13179 tcp_t *tcp = connp->conn_tcp; 13180 tcp_stack_t *tcps = tcp->tcp_tcps; 13181 13182 /* 13183 * RST from fused tcp loopback peer should trigger an unfuse. 13184 */ 13185 if (tcp->tcp_fused) { 13186 TCP_STAT(tcps, tcp_fusion_aborted); 13187 tcp_unfuse(tcp); 13188 } 13189 13190 iphdr = mp->b_rptr; 13191 rptr = mp->b_rptr; 13192 ASSERT(OK_32PTR(rptr)); 13193 13194 /* 13195 * An AF_INET socket is not capable of receiving any pktinfo. Do inline 13196 * processing here. For rest call tcp_find_pktinfo to fill up the 13197 * necessary information. 13198 */ 13199 if (IPCL_IS_TCP4(connp)) { 13200 ipvers = IPV4_VERSION; 13201 ip_hdr_len = IPH_HDR_LENGTH(rptr); 13202 } else { 13203 mp = tcp_find_pktinfo(tcp, mp, &ipvers, &ip_hdr_len, 13204 NULL, &ipp); 13205 if (mp == NULL) { 13206 TCP_STAT(tcps, tcp_rput_v6_error); 13207 return; 13208 } 13209 iphdr = mp->b_rptr; 13210 rptr = mp->b_rptr; 13211 } 13212 ASSERT(DB_TYPE(mp) == M_DATA); 13213 13214 tcph = (tcph_t *)&rptr[ip_hdr_len]; 13215 seg_seq = ABE32_TO_U32(tcph->th_seq); 13216 seg_ack = ABE32_TO_U32(tcph->th_ack); 13217 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 13218 seg_len = (int)(mp->b_wptr - rptr) - 13219 (ip_hdr_len + TCP_HDR_LENGTH(tcph)); 13220 if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { 13221 do { 13222 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 13223 (uintptr_t)INT_MAX); 13224 seg_len += (int)(mp1->b_wptr - mp1->b_rptr); 13225 } while ((mp1 = mp1->b_cont) != NULL && 13226 mp1->b_datap->db_type == M_DATA); 13227 } 13228 13229 if (tcp->tcp_state == TCPS_TIME_WAIT) { 13230 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 13231 seg_len, tcph); 13232 return; 13233 } 13234 13235 if (sqp != NULL) { 13236 /* 13237 * This is the correct place to update tcp_last_recv_time. Note 13238 * that it is also updated for tcp structure that belongs to 13239 * global and listener queues which do not really need updating. 13240 * But that should not cause any harm. And it is updated for 13241 * all kinds of incoming segments, not only for data segments. 13242 */ 13243 tcp->tcp_last_recv_time = lbolt; 13244 } 13245 13246 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 13247 13248 BUMP_LOCAL(tcp->tcp_ibsegs); 13249 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT); 13250 13251 if ((flags & TH_URG) && sqp != NULL) { 13252 /* 13253 * TCP can't handle urgent pointers that arrive before 13254 * the connection has been accept()ed since it can't 13255 * buffer OOB data. Discard segment if this happens. 13256 * 13257 * We can't just rely on a non-null tcp_listener to indicate 13258 * that the accept() has completed since unlinking of the 13259 * eager and completion of the accept are not atomic. 
13260 * tcp_detached, when it is not set (B_FALSE) indicates 13261 * that the accept() has completed. 13262 * 13263 * Nor can it reassemble urgent pointers, so discard 13264 * if it's not the next segment expected. 13265 * 13266 * Otherwise, collapse chain into one mblk (discard if 13267 * that fails). This makes sure the headers, retransmitted 13268 * data, and new data all are in the same mblk. 13269 */ 13270 ASSERT(mp != NULL); 13271 if (tcp->tcp_detached || !pullupmsg(mp, -1)) { 13272 freemsg(mp); 13273 return; 13274 } 13275 /* Update pointers into message */ 13276 iphdr = rptr = mp->b_rptr; 13277 tcph = (tcph_t *)&rptr[ip_hdr_len]; 13278 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { 13279 /* 13280 * Since we can't handle any data with this urgent 13281 * pointer that is out of sequence, we expunge 13282 * the data. This allows us to still register 13283 * the urgent mark and generate the M_PCSIG, 13284 * which we can do. 13285 */ 13286 mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); 13287 seg_len = 0; 13288 } 13289 } 13290 13291 switch (tcp->tcp_state) { 13292 case TCPS_SYN_SENT: 13293 if (flags & TH_ACK) { 13294 /* 13295 * Note that our stack cannot send data before a 13296 * connection is established, therefore the 13297 * following check is valid. Otherwise, it has 13298 * to be changed. 13299 */ 13300 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 13301 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 13302 freemsg(mp); 13303 if (flags & TH_RST) 13304 return; 13305 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 13306 tcp, seg_ack, 0, TH_RST); 13307 return; 13308 } 13309 ASSERT(tcp->tcp_suna + 1 == seg_ack); 13310 } 13311 if (flags & TH_RST) { 13312 freemsg(mp); 13313 if (flags & TH_ACK) 13314 (void) tcp_clean_death(tcp, 13315 ECONNREFUSED, 13); 13316 return; 13317 } 13318 if (!(flags & TH_SYN)) { 13319 freemsg(mp); 13320 return; 13321 } 13322 13323 /* Process all TCP options. */ 13324 tcp_process_options(tcp, tcph); 13325 /* 13326 * The following changes our rwnd to be a multiple of the 13327 * MIN(peer MSS, our MSS) for performance reason. 13328 */ 13329 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat, 13330 tcp->tcp_mss)); 13331 13332 /* Is the other end ECN capable? */ 13333 if (tcp->tcp_ecn_ok) { 13334 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 13335 tcp->tcp_ecn_ok = B_FALSE; 13336 } 13337 } 13338 /* 13339 * Clear ECN flags because it may interfere with later 13340 * processing. 13341 */ 13342 flags &= ~(TH_ECE|TH_CWR); 13343 13344 tcp->tcp_irs = seg_seq; 13345 tcp->tcp_rack = seg_seq; 13346 tcp->tcp_rnxt = seg_seq + 1; 13347 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 13348 if (!TCP_IS_DETACHED(tcp)) { 13349 /* Allocate room for SACK options if needed. */ 13350 if (tcp->tcp_snd_sack_ok) { 13351 (void) mi_set_sth_wroff(tcp->tcp_rq, 13352 tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + 13353 (tcp->tcp_loopback ? 0 : 13354 tcps->tcps_wroff_xtra)); 13355 } else { 13356 (void) mi_set_sth_wroff(tcp->tcp_rq, 13357 tcp->tcp_hdr_len + 13358 (tcp->tcp_loopback ? 0 : 13359 tcps->tcps_wroff_xtra)); 13360 } 13361 } 13362 if (flags & TH_ACK) { 13363 /* 13364 * If we can't get the confirmation upstream, pretend 13365 * we didn't even see this one. 13366 * 13367 * XXX: how can we pretend we didn't see it if we 13368 * have updated rnxt et. al. 13369 * 13370 * For loopback we defer sending up the T_CONN_CON 13371 * until after some checks below. 13372 */ 13373 mp1 = NULL; 13374 if (!tcp_conn_con(tcp, iphdr, tcph, mp, 13375 tcp->tcp_loopback ? 
&mp1 : NULL)) { 13376 freemsg(mp); 13377 return; 13378 } 13379 /* SYN was acked - making progress */ 13380 if (tcp->tcp_ipversion == IPV6_VERSION) 13381 tcp->tcp_ip_forward_progress = B_TRUE; 13382 13383 /* One for the SYN */ 13384 tcp->tcp_suna = tcp->tcp_iss + 1; 13385 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 13386 tcp->tcp_state = TCPS_ESTABLISHED; 13387 13388 /* 13389 * If SYN was retransmitted, need to reset all 13390 * retransmission info. This is because this 13391 * segment will be treated as a dup ACK. 13392 */ 13393 if (tcp->tcp_rexmit) { 13394 tcp->tcp_rexmit = B_FALSE; 13395 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 13396 tcp->tcp_rexmit_max = tcp->tcp_snxt; 13397 tcp->tcp_snd_burst = tcp->tcp_localnet ? 13398 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 13399 tcp->tcp_ms_we_have_waited = 0; 13400 13401 /* 13402 * Set tcp_cwnd back to 1 MSS, per 13403 * recommendation from 13404 * draft-floyd-incr-init-win-01.txt, 13405 * Increasing TCP's Initial Window. 13406 */ 13407 tcp->tcp_cwnd = tcp->tcp_mss; 13408 } 13409 13410 tcp->tcp_swl1 = seg_seq; 13411 tcp->tcp_swl2 = seg_ack; 13412 13413 new_swnd = BE16_TO_U16(tcph->th_win); 13414 tcp->tcp_swnd = new_swnd; 13415 if (new_swnd > tcp->tcp_max_swnd) 13416 tcp->tcp_max_swnd = new_swnd; 13417 13418 /* 13419 * Always send the three-way handshake ack immediately 13420 * in order to make the connection complete as soon as 13421 * possible on the accepting host. 13422 */ 13423 flags |= TH_ACK_NEEDED; 13424 13425 /* 13426 * Special case for loopback. At this point we have 13427 * received SYN-ACK from the remote endpoint. In 13428 * order to ensure that both endpoints reach the 13429 * fused state prior to any data exchange, the final 13430 * ACK needs to be sent before we indicate T_CONN_CON 13431 * to the module upstream. 13432 */ 13433 if (tcp->tcp_loopback) { 13434 mblk_t *ack_mp; 13435 13436 ASSERT(!tcp->tcp_unfusable); 13437 ASSERT(mp1 != NULL); 13438 /* 13439 * For loopback, we always get a pure SYN-ACK 13440 * and only need to send back the final ACK 13441 * with no data (this is because the other 13442 * tcp is ours and we don't do T/TCP). This 13443 * final ACK triggers the passive side to 13444 * perform fusion in ESTABLISHED state. 13445 */ 13446 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) { 13447 if (tcp->tcp_ack_tid != 0) { 13448 (void) TCP_TIMER_CANCEL(tcp, 13449 tcp->tcp_ack_tid); 13450 tcp->tcp_ack_tid = 0; 13451 } 13452 TCP_RECORD_TRACE(tcp, ack_mp, 13453 TCP_TRACE_SEND_PKT); 13454 tcp_send_data(tcp, tcp->tcp_wq, ack_mp); 13455 BUMP_LOCAL(tcp->tcp_obsegs); 13456 BUMP_MIB(&tcps->tcps_mib, tcpOutAck); 13457 13458 /* Send up T_CONN_CON */ 13459 putnext(tcp->tcp_rq, mp1); 13460 13461 freemsg(mp); 13462 return; 13463 } 13464 /* 13465 * Forget fusion; we need to handle more 13466 * complex cases below. Send the deferred 13467 * T_CONN_CON message upstream and proceed 13468 * as usual. Mark this tcp as not capable 13469 * of fusion. 13470 */ 13471 TCP_STAT(tcps, tcp_fusion_unfusable); 13472 tcp->tcp_unfusable = B_TRUE; 13473 putnext(tcp->tcp_rq, mp1); 13474 } 13475 13476 /* 13477 * Check to see if there is data to be sent. If 13478 * yes, set the transmit flag. Then check to see 13479 * if received data processing needs to be done. 13480 * If not, go straight to xmit_check. This short 13481 * cut is OK as we don't support T/TCP. 
13482 */ 13483 if (tcp->tcp_unsent) 13484 flags |= TH_XMIT_NEEDED; 13485 13486 if (seg_len == 0 && !(flags & TH_URG)) { 13487 freemsg(mp); 13488 goto xmit_check; 13489 } 13490 13491 flags &= ~TH_SYN; 13492 seg_seq++; 13493 break; 13494 } 13495 tcp->tcp_state = TCPS_SYN_RCVD; 13496 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 13497 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 13498 if (mp1) { 13499 DB_CPID(mp1) = tcp->tcp_cpid; 13500 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 13501 tcp_send_data(tcp, tcp->tcp_wq, mp1); 13502 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 13503 } 13504 freemsg(mp); 13505 return; 13506 case TCPS_SYN_RCVD: 13507 if (flags & TH_ACK) { 13508 /* 13509 * In this state, a SYN|ACK packet is either bogus 13510 * because the other side must be ACKing our SYN which 13511 * indicates it has seen the ACK for their SYN and 13512 * shouldn't retransmit it or we're crossing SYNs 13513 * on active open. 13514 */ 13515 if ((flags & TH_SYN) && !tcp->tcp_active_open) { 13516 freemsg(mp); 13517 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn", 13518 tcp, seg_ack, 0, TH_RST); 13519 return; 13520 } 13521 /* 13522 * NOTE: RFC 793 pg. 72 says this should be 13523 * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt 13524 * but that would mean we have an ack that ignored 13525 * our SYN. 13526 */ 13527 if (SEQ_LEQ(seg_ack, tcp->tcp_suna) || 13528 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 13529 freemsg(mp); 13530 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 13531 tcp, seg_ack, 0, TH_RST); 13532 return; 13533 } 13534 } 13535 break; 13536 case TCPS_LISTEN: 13537 /* 13538 * Only a TLI listener can come through this path when a 13539 * acceptor is going back to be a listener and a packet 13540 * for the acceptor hits the classifier. For a socket 13541 * listener, this can never happen because a listener 13542 * can never accept connection on itself and hence a 13543 * socket acceptor can not go back to being a listener. 13544 */ 13545 ASSERT(!TCP_IS_SOCKET(tcp)); 13546 /*FALLTHRU*/ 13547 case TCPS_CLOSED: 13548 case TCPS_BOUND: { 13549 conn_t *new_connp; 13550 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 13551 13552 new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst); 13553 if (new_connp != NULL) { 13554 tcp_reinput(new_connp, mp, connp->conn_sqp); 13555 return; 13556 } 13557 /* We failed to classify. For now just drop the packet */ 13558 freemsg(mp); 13559 return; 13560 } 13561 case TCPS_IDLE: 13562 /* 13563 * Handle the case where the tcp_clean_death() has happened 13564 * on a connection (application hasn't closed yet) but a packet 13565 * was already queued on squeue before tcp_clean_death() 13566 * was processed. Calling tcp_clean_death() twice on same 13567 * connection can result in weird behaviour. 13568 */ 13569 freemsg(mp); 13570 return; 13571 default: 13572 break; 13573 } 13574 13575 /* 13576 * Already on the correct queue/perimeter. 13577 * If this is a detached connection and not an eager 13578 * connection hanging off a listener then new data 13579 * (past the FIN) will cause a reset. 13580 * We do a special check here where it 13581 * is out of the main line, rather than check 13582 * if we are detached every time we see new 13583 * data down below. 13584 */ 13585 if (TCP_IS_DETACHED_NONEAGER(tcp) && 13586 (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) { 13587 BUMP_MIB(&tcps->tcps_mib, tcpInClosed); 13588 TCP_RECORD_TRACE(tcp, 13589 mp, TCP_TRACE_RECV_PKT); 13590 13591 freemsg(mp); 13592 /* 13593 * This could be an SSL closure alert. 
We're detached so just 13594 * acknowledge it this last time. 13595 */ 13596 if (tcp->tcp_kssl_ctx != NULL) { 13597 kssl_release_ctx(tcp->tcp_kssl_ctx); 13598 tcp->tcp_kssl_ctx = NULL; 13599 13600 tcp->tcp_rnxt += seg_len; 13601 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 13602 flags |= TH_ACK_NEEDED; 13603 goto ack_check; 13604 } 13605 13606 tcp_xmit_ctl("new data when detached", tcp, 13607 tcp->tcp_snxt, 0, TH_RST); 13608 (void) tcp_clean_death(tcp, EPROTO, 12); 13609 return; 13610 } 13611 13612 mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); 13613 urp = BE16_TO_U16(tcph->th_urp) - TCP_OLD_URP_INTERPRETATION; 13614 new_swnd = BE16_TO_U16(tcph->th_win) << 13615 ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); 13616 13617 if (tcp->tcp_snd_ts_ok) { 13618 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 13619 /* 13620 * This segment is not acceptable. 13621 * Drop it and send back an ACK. 13622 */ 13623 freemsg(mp); 13624 flags |= TH_ACK_NEEDED; 13625 goto ack_check; 13626 } 13627 } else if (tcp->tcp_snd_sack_ok) { 13628 ASSERT(tcp->tcp_sack_info != NULL); 13629 tcpopt.tcp = tcp; 13630 /* 13631 * SACK info in already updated in tcp_parse_options. Ignore 13632 * all other TCP options... 13633 */ 13634 (void) tcp_parse_options(tcph, &tcpopt); 13635 } 13636 try_again:; 13637 mss = tcp->tcp_mss; 13638 gap = seg_seq - tcp->tcp_rnxt; 13639 rgap = tcp->tcp_rwnd - (gap + seg_len); 13640 /* 13641 * gap is the amount of sequence space between what we expect to see 13642 * and what we got for seg_seq. A positive value for gap means 13643 * something got lost. A negative value means we got some old stuff. 13644 */ 13645 if (gap < 0) { 13646 /* Old stuff present. Is the SYN in there? */ 13647 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && 13648 (seg_len != 0)) { 13649 flags &= ~TH_SYN; 13650 seg_seq++; 13651 urp--; 13652 /* Recompute the gaps after noting the SYN. */ 13653 goto try_again; 13654 } 13655 BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); 13656 UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, 13657 (seg_len > -gap ? -gap : seg_len)); 13658 /* Remove the old stuff from seg_len. */ 13659 seg_len += gap; 13660 /* 13661 * Anything left? 13662 * Make sure to check for unack'd FIN when rest of data 13663 * has been previously ack'd. 13664 */ 13665 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 13666 /* 13667 * Resets are only valid if they lie within our offered 13668 * window. If the RST bit is set, we just ignore this 13669 * segment. 13670 */ 13671 if (flags & TH_RST) { 13672 freemsg(mp); 13673 return; 13674 } 13675 13676 /* 13677 * The arriving of dup data packets indicate that we 13678 * may have postponed an ack for too long, or the other 13679 * side's RTT estimate is out of shape. Start acking 13680 * more often. 13681 */ 13682 if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) && 13683 tcp->tcp_rack_cnt >= 1 && 13684 tcp->tcp_rack_abs_max > 2) { 13685 tcp->tcp_rack_abs_max--; 13686 } 13687 tcp->tcp_rack_cur_max = 1; 13688 13689 /* 13690 * This segment is "unacceptable". None of its 13691 * sequence space lies within our advertized window. 13692 * 13693 * Adjust seg_len to the original value for tracing. 
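 *
 * A small worked example (editorial, with invented numbers): if
 * tcp_rnxt is 1000 and a segment arrives with seg_seq 800 and
 * seg_len 150, then gap = 800 - 1000 = -200 and seg_len + gap = -50,
 * i.e. the segment ends 50 bytes short of anything new, so every byte
 * in it has already been received.  We end up here, restore seg_len
 * for the trace message, and use only the ACK information it carries.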
13694 */ 13695 seg_len -= gap; 13696 if (tcp->tcp_debug) { 13697 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13698 "tcp_rput: unacceptable, gap %d, rgap %d, " 13699 "flags 0x%x, seg_seq %u, seg_ack %u, " 13700 "seg_len %d, rnxt %u, snxt %u, %s", 13701 gap, rgap, flags, seg_seq, seg_ack, 13702 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 13703 tcp_display(tcp, NULL, 13704 DISP_ADDR_AND_PORT)); 13705 } 13706 13707 /* 13708 * Arrange to send an ACK in response to the 13709 * unacceptable segment per RFC 793 page 69. There 13710 * is only one small difference between ours and the 13711 * acceptability test in the RFC - we accept ACK-only 13712 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 13713 * will be generated. 13714 * 13715 * Note that we have to ACK an ACK-only packet at least 13716 * for stacks that send 0-length keep-alives with 13717 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 13718 * section 4.2.3.6. As long as we don't ever generate 13719 * an unacceptable packet in response to an incoming 13720 * packet that is unacceptable, it should not cause 13721 * "ACK wars". 13722 */ 13723 flags |= TH_ACK_NEEDED; 13724 13725 /* 13726 * Continue processing this segment in order to use the 13727 * ACK information it contains, but skip all other 13728 * sequence-number processing. Processing the ACK 13729 * information is necessary in order to 13730 * re-synchronize connections that may have lost 13731 * synchronization. 13732 * 13733 * We clear seg_len and flag fields related to 13734 * sequence number processing as they are not 13735 * to be trusted for an unacceptable segment. 13736 */ 13737 seg_len = 0; 13738 flags &= ~(TH_SYN | TH_FIN | TH_URG); 13739 goto process_ack; 13740 } 13741 13742 /* Fix seg_seq, and chew the gap off the front. */ 13743 seg_seq = tcp->tcp_rnxt; 13744 urp += gap; 13745 do { 13746 mblk_t *mp2; 13747 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 13748 (uintptr_t)UINT_MAX); 13749 gap += (uint_t)(mp->b_wptr - mp->b_rptr); 13750 if (gap > 0) { 13751 mp->b_rptr = mp->b_wptr - gap; 13752 break; 13753 } 13754 mp2 = mp; 13755 mp = mp->b_cont; 13756 freeb(mp2); 13757 } while (gap < 0); 13758 /* 13759 * If the urgent data has already been acknowledged, we 13760 * should ignore TH_URG below 13761 */ 13762 if (urp < 0) 13763 flags &= ~TH_URG; 13764 } 13765 /* 13766 * rgap is the amount of stuff received out of window. A negative 13767 * value is the amount out of window. 13768 */ 13769 if (rgap < 0) { 13770 mblk_t *mp2; 13771 13772 if (tcp->tcp_rwnd == 0) { 13773 BUMP_MIB(&tcps->tcps_mib, tcpInWinProbe); 13774 } else { 13775 BUMP_MIB(&tcps->tcps_mib, tcpInDataPastWinSegs); 13776 UPDATE_MIB(&tcps->tcps_mib, 13777 tcpInDataPastWinBytes, -rgap); 13778 } 13779 13780 /* 13781 * seg_len does not include the FIN, so if more than 13782 * just the FIN is out of window, we act like we don't 13783 * see it. (If just the FIN is out of window, rgap 13784 * will be zero and we will go ahead and acknowledge 13785 * the FIN.) 13786 */ 13787 flags &= ~TH_FIN; 13788 13789 /* Fix seg_len and make sure there is something left. */ 13790 seg_len += rgap; 13791 if (seg_len <= 0) { 13792 /* 13793 * Resets are only valid if they lie within our offered 13794 * window. If the RST bit is set, we just ignore this 13795 * segment. 13796 */ 13797 if (flags & TH_RST) { 13798 freemsg(mp); 13799 return; 13800 } 13801 13802 /* Per RFC 793, we need to send back an ACK. */ 13803 flags |= TH_ACK_NEEDED; 13804 13805 /* 13806 * Send SIGURG as soon as possible i.e. 
even 13807 * if the TH_URG was delivered in a window probe 13808 * packet (which will be unacceptable). 13809 * 13810 * We generate a signal if none has been generated 13811 * for this connection or if this is a new urgent 13812 * byte. Also send a zero-length "unmarked" message 13813 * to inform SIOCATMARK that this is not the mark. 13814 * 13815 * tcp_urp_last_valid is cleared when the T_exdata_ind 13816 * is sent up. This plus the check for old data 13817 * (gap >= 0) handles the wraparound of the sequence 13818 * number space without having to always track the 13819 * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks 13820 * this max in its rcv_up variable). 13821 * 13822 * This prevents duplicate SIGURGS due to a "late" 13823 * zero-window probe when the T_EXDATA_IND has already 13824 * been sent up. 13825 */ 13826 if ((flags & TH_URG) && 13827 (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, 13828 tcp->tcp_urp_last))) { 13829 mp1 = allocb(0, BPRI_MED); 13830 if (mp1 == NULL) { 13831 freemsg(mp); 13832 return; 13833 } 13834 if (!TCP_IS_DETACHED(tcp) && 13835 !putnextctl1(tcp->tcp_rq, M_PCSIG, 13836 SIGURG)) { 13837 /* Try again on the rexmit. */ 13838 freemsg(mp1); 13839 freemsg(mp); 13840 return; 13841 } 13842 /* 13843 * If the next byte would be the mark 13844 * then mark with MARKNEXT else mark 13845 * with NOTMARKNEXT. 13846 */ 13847 if (gap == 0 && urp == 0) 13848 mp1->b_flag |= MSGMARKNEXT; 13849 else 13850 mp1->b_flag |= MSGNOTMARKNEXT; 13851 freemsg(tcp->tcp_urp_mark_mp); 13852 tcp->tcp_urp_mark_mp = mp1; 13853 flags |= TH_SEND_URP_MARK; 13854 tcp->tcp_urp_last_valid = B_TRUE; 13855 tcp->tcp_urp_last = urp + seg_seq; 13856 } 13857 /* 13858 * If this is a zero window probe, continue to 13859 * process the ACK part. But we need to set seg_len 13860 * to 0 to avoid data processing. Otherwise just 13861 * drop the segment and send back an ACK. 13862 */ 13863 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 13864 flags &= ~(TH_SYN | TH_URG); 13865 seg_len = 0; 13866 goto process_ack; 13867 } else { 13868 freemsg(mp); 13869 goto ack_check; 13870 } 13871 } 13872 /* Pitch out of window stuff off the end. */ 13873 rgap = seg_len; 13874 mp2 = mp; 13875 do { 13876 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 13877 (uintptr_t)INT_MAX); 13878 rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 13879 if (rgap < 0) { 13880 mp2->b_wptr += rgap; 13881 if ((mp1 = mp2->b_cont) != NULL) { 13882 mp2->b_cont = NULL; 13883 freemsg(mp1); 13884 } 13885 break; 13886 } 13887 } while ((mp2 = mp2->b_cont) != NULL); 13888 } 13889 ok:; 13890 /* 13891 * TCP should check ECN info for segments inside the window only. 13892 * Therefore the check should be done here. 13893 */ 13894 if (tcp->tcp_ecn_ok) { 13895 if (flags & TH_CWR) { 13896 tcp->tcp_ecn_echo_on = B_FALSE; 13897 } 13898 /* 13899 * Note that both ECN_CE and CWR can be set in the 13900 * same segment. In this case, we once again turn 13901 * on ECN_ECHO. 13902 */ 13903 if (tcp->tcp_ipversion == IPV4_VERSION) { 13904 uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; 13905 13906 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 13907 tcp->tcp_ecn_echo_on = B_TRUE; 13908 } 13909 } else { 13910 uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf; 13911 13912 if ((vcf & htonl(IPH_ECN_CE << 20)) == 13913 htonl(IPH_ECN_CE << 20)) { 13914 tcp->tcp_ecn_echo_on = B_TRUE; 13915 } 13916 } 13917 } 13918 13919 /* 13920 * Check whether we can update tcp_ts_recent. This test is 13921 * NOT the one in RFC 1323 3.4. 
It is from Braden, 1993, "TCP 13922 * Extensions for High Performance: An Update", Internet Draft. 13923 */ 13924 if (tcp->tcp_snd_ts_ok && 13925 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 13926 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 13927 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 13928 tcp->tcp_last_rcv_lbolt = lbolt64; 13929 } 13930 13931 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 13932 /* 13933 * FIN in an out of order segment. We record this in 13934 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. 13935 * Clear the FIN so that any check on FIN flag will fail. 13936 * Remember that FIN also counts in the sequence number 13937 * space. So we need to ack out of order FIN only segments. 13938 */ 13939 if (flags & TH_FIN) { 13940 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 13941 tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 13942 flags &= ~TH_FIN; 13943 flags |= TH_ACK_NEEDED; 13944 } 13945 if (seg_len > 0) { 13946 /* Fill in the SACK blk list. */ 13947 if (tcp->tcp_snd_sack_ok) { 13948 ASSERT(tcp->tcp_sack_info != NULL); 13949 tcp_sack_insert(tcp->tcp_sack_list, 13950 seg_seq, seg_seq + seg_len, 13951 &(tcp->tcp_num_sack_blk)); 13952 } 13953 13954 /* 13955 * Attempt reassembly and see if we have something 13956 * ready to go. 13957 */ 13958 mp = tcp_reass(tcp, mp, seg_seq); 13959 /* Always ack out of order packets */ 13960 flags |= TH_ACK_NEEDED | TH_PUSH; 13961 if (mp) { 13962 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 13963 (uintptr_t)INT_MAX); 13964 seg_len = mp->b_cont ? msgdsize(mp) : 13965 (int)(mp->b_wptr - mp->b_rptr); 13966 seg_seq = tcp->tcp_rnxt; 13967 /* 13968 * A gap is filled and the seq num and len 13969 * of the gap match that of a previously 13970 * received FIN, put the FIN flag back in. 13971 */ 13972 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 13973 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 13974 flags |= TH_FIN; 13975 tcp->tcp_valid_bits &= 13976 ~TCP_OFO_FIN_VALID; 13977 } 13978 } else { 13979 /* 13980 * Keep going even with NULL mp. 13981 * There may be a useful ACK or something else 13982 * we don't want to miss. 13983 * 13984 * But TCP should not perform fast retransmit 13985 * because of the ack number. TCP uses 13986 * seg_len == 0 to determine if it is a pure 13987 * ACK. And this is not a pure ACK. 13988 */ 13989 seg_len = 0; 13990 ofo_seg = B_TRUE; 13991 } 13992 } 13993 } else if (seg_len > 0) { 13994 BUMP_MIB(&tcps->tcps_mib, tcpInDataInorderSegs); 13995 UPDATE_MIB(&tcps->tcps_mib, tcpInDataInorderBytes, seg_len); 13996 /* 13997 * If an out of order FIN was received before, and the seq 13998 * num and len of the new segment match that of the FIN, 13999 * put the FIN flag back in. 
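 *
 * Illustrative example (numbers invented for this note): with tcp_rnxt
 * at 1000, suppose a bare FIN arrived out of order with seq 1500; its
 * sequence was remembered in tcp_ofo_fin_seq (1500) and the FIN bit
 * stripped.  When the missing bytes 1000-1499 later arrive in order,
 * seg_seq + seg_len == 1500 matches the remembered value and the FIN
 * is put back so that it is processed together with the data.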
14000 */ 14001 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 14002 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 14003 flags |= TH_FIN; 14004 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 14005 } 14006 } 14007 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 14008 if (flags & TH_RST) { 14009 freemsg(mp); 14010 switch (tcp->tcp_state) { 14011 case TCPS_SYN_RCVD: 14012 (void) tcp_clean_death(tcp, ECONNREFUSED, 14); 14013 break; 14014 case TCPS_ESTABLISHED: 14015 case TCPS_FIN_WAIT_1: 14016 case TCPS_FIN_WAIT_2: 14017 case TCPS_CLOSE_WAIT: 14018 (void) tcp_clean_death(tcp, ECONNRESET, 15); 14019 break; 14020 case TCPS_CLOSING: 14021 case TCPS_LAST_ACK: 14022 (void) tcp_clean_death(tcp, 0, 16); 14023 break; 14024 default: 14025 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 14026 (void) tcp_clean_death(tcp, ENXIO, 17); 14027 break; 14028 } 14029 return; 14030 } 14031 if (flags & TH_SYN) { 14032 /* 14033 * See RFC 793, Page 71 14034 * 14035 * The seq number must be in the window as it should 14036 * be "fixed" above. If it is outside window, it should 14037 * be already rejected. Note that we allow seg_seq to be 14038 * rnxt + rwnd because we want to accept 0 window probe. 14039 */ 14040 ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 14041 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 14042 freemsg(mp); 14043 /* 14044 * If the ACK flag is not set, just use our snxt as the 14045 * seq number of the RST segment. 14046 */ 14047 if (!(flags & TH_ACK)) { 14048 seg_ack = tcp->tcp_snxt; 14049 } 14050 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 14051 TH_RST|TH_ACK); 14052 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 14053 (void) tcp_clean_death(tcp, ECONNRESET, 18); 14054 return; 14055 } 14056 /* 14057 * urp could be -1 when the urp field in the packet is 0 14058 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent 14059 * byte was at seg_seq - 1, in which case we ignore the urgent flag. 14060 */ 14061 if (flags & TH_URG && urp >= 0) { 14062 if (!tcp->tcp_urp_last_valid || 14063 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { 14064 /* 14065 * If we haven't generated the signal yet for this 14066 * urgent pointer value, do it now. Also, send up a 14067 * zero-length M_DATA indicating whether or not this is 14068 * the mark. The latter is not needed when a 14069 * T_EXDATA_IND is sent up. However, if there are 14070 * allocation failures this code relies on the sender 14071 * retransmitting and the socket code for determining 14072 * the mark should not block waiting for the peer to 14073 * transmit. Thus, for simplicity we always send up the 14074 * mark indication. 14075 */ 14076 mp1 = allocb(0, BPRI_MED); 14077 if (mp1 == NULL) { 14078 freemsg(mp); 14079 return; 14080 } 14081 if (!TCP_IS_DETACHED(tcp) && 14082 !putnextctl1(tcp->tcp_rq, M_PCSIG, SIGURG)) { 14083 /* Try again on the rexmit. */ 14084 freemsg(mp1); 14085 freemsg(mp); 14086 return; 14087 } 14088 /* 14089 * Mark with NOTMARKNEXT for now. 14090 * The code below will change this to MARKNEXT 14091 * if we are at the mark. 14092 * 14093 * If there are allocation failures (e.g. in dupmsg 14094 * below) the next time tcp_rput_data sees the urgent 14095 * segment it will send up the MSG*MARKNEXT message. 
14096 */ 14097 mp1->b_flag |= MSGNOTMARKNEXT; 14098 freemsg(tcp->tcp_urp_mark_mp); 14099 tcp->tcp_urp_mark_mp = mp1; 14100 flags |= TH_SEND_URP_MARK; 14101 #ifdef DEBUG 14102 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14103 "tcp_rput: sent M_PCSIG 2 seq %x urp %x " 14104 "last %x, %s", 14105 seg_seq, urp, tcp->tcp_urp_last, 14106 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 14107 #endif /* DEBUG */ 14108 tcp->tcp_urp_last_valid = B_TRUE; 14109 tcp->tcp_urp_last = urp + seg_seq; 14110 } else if (tcp->tcp_urp_mark_mp != NULL) { 14111 /* 14112 * An allocation failure prevented the previous 14113 * tcp_rput_data from sending up the allocated 14114 * MSG*MARKNEXT message - send it up this time 14115 * around. 14116 */ 14117 flags |= TH_SEND_URP_MARK; 14118 } 14119 14120 /* 14121 * If the urgent byte is in this segment, make sure that it is 14122 * all by itself. This makes it much easier to deal with the 14123 * possibility of an allocation failure on the T_exdata_ind. 14124 * Note that seg_len is the number of bytes in the segment, and 14125 * urp is the offset into the segment of the urgent byte. 14126 * urp < seg_len means that the urgent byte is in this segment. 14127 */ 14128 if (urp < seg_len) { 14129 if (seg_len != 1) { 14130 uint32_t tmp_rnxt; 14131 /* 14132 * Break it up and feed it back in. 14133 * Re-attach the IP header. 14134 */ 14135 mp->b_rptr = iphdr; 14136 if (urp > 0) { 14137 /* 14138 * There is stuff before the urgent 14139 * byte. 14140 */ 14141 mp1 = dupmsg(mp); 14142 if (!mp1) { 14143 /* 14144 * Trim from urgent byte on. 14145 * The rest will come back. 14146 */ 14147 (void) adjmsg(mp, 14148 urp - seg_len); 14149 tcp_rput_data(connp, 14150 mp, NULL); 14151 return; 14152 } 14153 (void) adjmsg(mp1, urp - seg_len); 14154 /* Feed this piece back in. */ 14155 tmp_rnxt = tcp->tcp_rnxt; 14156 tcp_rput_data(connp, mp1, NULL); 14157 /* 14158 * If the data passed back in was not 14159 * processed (ie: bad ACK) sending 14160 * the remainder back in will cause a 14161 * loop. In this case, drop the 14162 * packet and let the sender try 14163 * sending a good packet. 14164 */ 14165 if (tmp_rnxt == tcp->tcp_rnxt) { 14166 freemsg(mp); 14167 return; 14168 } 14169 } 14170 if (urp != seg_len - 1) { 14171 uint32_t tmp_rnxt; 14172 /* 14173 * There is stuff after the urgent 14174 * byte. 14175 */ 14176 mp1 = dupmsg(mp); 14177 if (!mp1) { 14178 /* 14179 * Trim everything beyond the 14180 * urgent byte. The rest will 14181 * come back. 14182 */ 14183 (void) adjmsg(mp, 14184 urp + 1 - seg_len); 14185 tcp_rput_data(connp, 14186 mp, NULL); 14187 return; 14188 } 14189 (void) adjmsg(mp1, urp + 1 - seg_len); 14190 tmp_rnxt = tcp->tcp_rnxt; 14191 tcp_rput_data(connp, mp1, NULL); 14192 /* 14193 * If the data passed back in was not 14194 * processed (ie: bad ACK) sending 14195 * the remainder back in will cause a 14196 * loop. In this case, drop the 14197 * packet and let the sender try 14198 * sending a good packet. 14199 */ 14200 if (tmp_rnxt == tcp->tcp_rnxt) { 14201 freemsg(mp); 14202 return; 14203 } 14204 } 14205 tcp_rput_data(connp, mp, NULL); 14206 return; 14207 } 14208 /* 14209 * This segment contains only the urgent byte. We 14210 * have to allocate the T_exdata_ind, if we can. 14211 */ 14212 if (!tcp->tcp_urp_mp) { 14213 struct T_exdata_ind *tei; 14214 mp1 = allocb(sizeof (struct T_exdata_ind), 14215 BPRI_MED); 14216 if (!mp1) { 14217 /* 14218 * Sigh... It'll be back. 14219 * Generate any MSG*MARK message now. 
14220 */ 14221 freemsg(mp); 14222 seg_len = 0; 14223 if (flags & TH_SEND_URP_MARK) { 14224 14225 14226 ASSERT(tcp->tcp_urp_mark_mp); 14227 tcp->tcp_urp_mark_mp->b_flag &= 14228 ~MSGNOTMARKNEXT; 14229 tcp->tcp_urp_mark_mp->b_flag |= 14230 MSGMARKNEXT; 14231 } 14232 goto ack_check; 14233 } 14234 mp1->b_datap->db_type = M_PROTO; 14235 tei = (struct T_exdata_ind *)mp1->b_rptr; 14236 tei->PRIM_type = T_EXDATA_IND; 14237 tei->MORE_flag = 0; 14238 mp1->b_wptr = (uchar_t *)&tei[1]; 14239 tcp->tcp_urp_mp = mp1; 14240 #ifdef DEBUG 14241 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14242 "tcp_rput: allocated exdata_ind %s", 14243 tcp_display(tcp, NULL, 14244 DISP_PORT_ONLY)); 14245 #endif /* DEBUG */ 14246 /* 14247 * There is no need to send a separate MSG*MARK 14248 * message since the T_EXDATA_IND will be sent 14249 * now. 14250 */ 14251 flags &= ~TH_SEND_URP_MARK; 14252 freemsg(tcp->tcp_urp_mark_mp); 14253 tcp->tcp_urp_mark_mp = NULL; 14254 } 14255 /* 14256 * Now we are all set. On the next putnext upstream, 14257 * tcp_urp_mp will be non-NULL and will get prepended 14258 * to what has to be this piece containing the urgent 14259 * byte. If for any reason we abort this segment below, 14260 * if it comes back, we will have this ready, or it 14261 * will get blown off in close. 14262 */ 14263 } else if (urp == seg_len) { 14264 /* 14265 * The urgent byte is the next byte after this sequence 14266 * number. If there is data it is marked with 14267 * MSGMARKNEXT and any tcp_urp_mark_mp is discarded 14268 * since it is not needed. Otherwise, if the code 14269 * above just allocated a zero-length tcp_urp_mark_mp 14270 * message, that message is tagged with MSGMARKNEXT. 14271 * Sending up these MSGMARKNEXT messages makes 14272 * SIOCATMARK work correctly even though 14273 * the T_EXDATA_IND will not be sent up until the 14274 * urgent byte arrives. 
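 *
 * Editorial aside (not from the original source): from the application's
 * point of view this is what allows the usual pattern of
 *
 *	int atmark;
 *
 *	if (ioctl(fd, SIOCATMARK, &atmark) == 0 && atmark)
 *		(void) recv(fd, buf, sizeof (buf), MSG_OOB);
 *
 * to report "at the mark" as soon as everything up to the urgent byte
 * has been read, even though the urgent byte itself has not yet arrived.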
14275 */ 14276 if (seg_len != 0) { 14277 flags |= TH_MARKNEXT_NEEDED; 14278 freemsg(tcp->tcp_urp_mark_mp); 14279 tcp->tcp_urp_mark_mp = NULL; 14280 flags &= ~TH_SEND_URP_MARK; 14281 } else if (tcp->tcp_urp_mark_mp != NULL) { 14282 flags |= TH_SEND_URP_MARK; 14283 tcp->tcp_urp_mark_mp->b_flag &= 14284 ~MSGNOTMARKNEXT; 14285 tcp->tcp_urp_mark_mp->b_flag |= MSGMARKNEXT; 14286 } 14287 #ifdef DEBUG 14288 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14289 "tcp_rput: AT MARK, len %d, flags 0x%x, %s", 14290 seg_len, flags, 14291 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 14292 #endif /* DEBUG */ 14293 } else { 14294 /* Data left until we hit mark */ 14295 #ifdef DEBUG 14296 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14297 "tcp_rput: URP %d bytes left, %s", 14298 urp - seg_len, tcp_display(tcp, NULL, 14299 DISP_PORT_ONLY)); 14300 #endif /* DEBUG */ 14301 } 14302 } 14303 14304 process_ack: 14305 if (!(flags & TH_ACK)) { 14306 freemsg(mp); 14307 goto xmit_check; 14308 } 14309 } 14310 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 14311 14312 if (tcp->tcp_ipversion == IPV6_VERSION && bytes_acked > 0) 14313 tcp->tcp_ip_forward_progress = B_TRUE; 14314 if (tcp->tcp_state == TCPS_SYN_RCVD) { 14315 if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) && 14316 ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) { 14317 /* 3-way handshake complete - pass up the T_CONN_IND */ 14318 tcp_t *listener = tcp->tcp_listener; 14319 mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind; 14320 14321 tcp->tcp_tconnind_started = B_TRUE; 14322 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 14323 /* 14324 * We are here means eager is fine but it can 14325 * get a TH_RST at any point between now and till 14326 * accept completes and disappear. We need to 14327 * ensure that reference to eager is valid after 14328 * we get out of eager's perimeter. So we do 14329 * an extra refhold. 14330 */ 14331 CONN_INC_REF(connp); 14332 14333 /* 14334 * The listener also exists because of the refhold 14335 * done in tcp_conn_request. Its possible that it 14336 * might have closed. We will check that once we 14337 * get inside listeners context. 14338 */ 14339 CONN_INC_REF(listener->tcp_connp); 14340 if (listener->tcp_connp->conn_sqp == 14341 connp->conn_sqp) { 14342 tcp_send_conn_ind(listener->tcp_connp, mp, 14343 listener->tcp_connp->conn_sqp); 14344 CONN_DEC_REF(listener->tcp_connp); 14345 } else if (!tcp->tcp_loopback) { 14346 squeue_fill(listener->tcp_connp->conn_sqp, mp, 14347 tcp_send_conn_ind, 14348 listener->tcp_connp, SQTAG_TCP_CONN_IND); 14349 } else { 14350 squeue_enter(listener->tcp_connp->conn_sqp, mp, 14351 tcp_send_conn_ind, listener->tcp_connp, 14352 SQTAG_TCP_CONN_IND); 14353 } 14354 } 14355 14356 if (tcp->tcp_active_open) { 14357 /* 14358 * We are seeing the final ack in the three way 14359 * hand shake of a active open'ed connection 14360 * so we must send up a T_CONN_CON 14361 */ 14362 if (!tcp_conn_con(tcp, iphdr, tcph, mp, NULL)) { 14363 freemsg(mp); 14364 return; 14365 } 14366 /* 14367 * Don't fuse the loopback endpoints for 14368 * simultaneous active opens. 
14369 */ 14370 if (tcp->tcp_loopback) { 14371 TCP_STAT(tcps, tcp_fusion_unfusable); 14372 tcp->tcp_unfusable = B_TRUE; 14373 } 14374 } 14375 14376 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 14377 bytes_acked--; 14378 /* SYN was acked - making progress */ 14379 if (tcp->tcp_ipversion == IPV6_VERSION) 14380 tcp->tcp_ip_forward_progress = B_TRUE; 14381 14382 /* 14383 * If SYN was retransmitted, need to reset all 14384 * retransmission info as this segment will be 14385 * treated as a dup ACK. 14386 */ 14387 if (tcp->tcp_rexmit) { 14388 tcp->tcp_rexmit = B_FALSE; 14389 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 14390 tcp->tcp_rexmit_max = tcp->tcp_snxt; 14391 tcp->tcp_snd_burst = tcp->tcp_localnet ? 14392 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 14393 tcp->tcp_ms_we_have_waited = 0; 14394 tcp->tcp_cwnd = mss; 14395 } 14396 14397 /* 14398 * We set the send window to zero here. 14399 * This is needed if there is data to be 14400 * processed already on the queue. 14401 * Later (at swnd_update label), the 14402 * "new_swnd > tcp_swnd" condition is satisfied 14403 * the XMIT_NEEDED flag is set in the current 14404 * (SYN_RCVD) state. This ensures tcp_wput_data() is 14405 * called if there is already data on queue in 14406 * this state. 14407 */ 14408 tcp->tcp_swnd = 0; 14409 14410 if (new_swnd > tcp->tcp_max_swnd) 14411 tcp->tcp_max_swnd = new_swnd; 14412 tcp->tcp_swl1 = seg_seq; 14413 tcp->tcp_swl2 = seg_ack; 14414 tcp->tcp_state = TCPS_ESTABLISHED; 14415 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 14416 14417 /* Fuse when both sides are in ESTABLISHED state */ 14418 if (tcp->tcp_loopback && do_tcp_fusion) 14419 tcp_fuse(tcp, iphdr, tcph); 14420 14421 } 14422 /* This code follows 4.4BSD-Lite2 mostly. */ 14423 if (bytes_acked < 0) 14424 goto est; 14425 14426 /* 14427 * If TCP is ECN capable and the congestion experience bit is 14428 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 14429 * done once per window (or more loosely, per RTT). 14430 */ 14431 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 14432 tcp->tcp_cwr = B_FALSE; 14433 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 14434 if (!tcp->tcp_cwr) { 14435 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss; 14436 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 14437 tcp->tcp_cwnd = npkt * mss; 14438 /* 14439 * If the cwnd is 0, use the timer to clock out 14440 * new segments. This is required by the ECN spec. 14441 */ 14442 if (npkt == 0) { 14443 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 14444 /* 14445 * This makes sure that when the ACK comes 14446 * back, we will increase tcp_cwnd by 1 MSS. 14447 */ 14448 tcp->tcp_cwnd_cnt = 0; 14449 } 14450 tcp->tcp_cwr = B_TRUE; 14451 /* 14452 * This marks the end of the current window of in 14453 * flight data. That is why we don't use 14454 * tcp_suna + tcp_swnd. Only data in flight can 14455 * provide ECN info. 14456 */ 14457 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 14458 tcp->tcp_ecn_cwr_sent = B_FALSE; 14459 } 14460 } 14461 14462 mp1 = tcp->tcp_xmit_head; 14463 if (bytes_acked == 0) { 14464 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 14465 int dupack_cnt; 14466 14467 BUMP_MIB(&tcps->tcps_mib, tcpInDupAck); 14468 /* 14469 * Fast retransmit. When we have seen exactly three 14470 * identical ACKs while we have unacked data 14471 * outstanding we take it as a hint that our peer 14472 * dropped something. 14473 * 14474 * If TCP is retransmitting, don't do fast retransmit. 14475 */ 14476 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt && 14477 ! 
tcp->tcp_rexmit) {
14478 /* Do Limited Transmit */
14479 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) <
14480 tcps->tcps_dupack_fast_retransmit) {
14481 /*
14482 * RFC 3042
14483 *
14484 * What we need to do is temporarily
14485 * increase tcp_cwnd so that new
14486 * data can be sent if it is allowed
14487 * by the receive window (tcp_rwnd).
14488 * tcp_wput_data() will take care of
14489 * the rest.
14490 *
14491 * If the connection is SACK capable,
14492 * only do limited xmit when there
14493 * is SACK info.
14494 *
14495 * Note how tcp_cwnd is incremented.
14496 * The first dup ACK will increase
14497 * it by 1 MSS. The second dup ACK
14498 * will increase it by 2 MSS. This
14499 * means that only 1 new segment will
14500 * be sent for each dup ACK.
14501 */
14502 if (tcp->tcp_unsent > 0 &&
14503 (!tcp->tcp_snd_sack_ok ||
14504 (tcp->tcp_snd_sack_ok &&
14505 tcp->tcp_notsack_list != NULL))) {
14506 tcp->tcp_cwnd += mss <<
14507 (tcp->tcp_dupack_cnt - 1);
14508 flags |= TH_LIMIT_XMIT;
14509 }
14510 } else if (dupack_cnt ==
14511 tcps->tcps_dupack_fast_retransmit) {
14512
14513 /*
14514 * If we have reduced tcp_ssthresh
14515 * because of ECN, do not reduce it again
14516 * unless it is already one window of data
14517 * away. After one window of data, tcp_cwr
14518 * should then be cleared. Note that
14519 * for a non-ECN-capable connection, tcp_cwr
14520 * should always be false.
14521 *
14522 * Adjust cwnd since the duplicate
14523 * ack indicates that a packet was
14524 * dropped (due to congestion).
14525 */
14526 if (!tcp->tcp_cwr) {
14527 npkt = ((tcp->tcp_snxt -
14528 tcp->tcp_suna) >> 1) / mss;
14529 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
14530 mss;
14531 tcp->tcp_cwnd = (npkt +
14532 tcp->tcp_dupack_cnt) * mss;
14533 }
14534 if (tcp->tcp_ecn_ok) {
14535 tcp->tcp_cwr = B_TRUE;
14536 tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
14537 tcp->tcp_ecn_cwr_sent = B_FALSE;
14538 }
14539
14540 /*
14541 * We do Hoe's algorithm. Refer to her
14542 * paper "Improving the Start-up Behavior
14543 * of a Congestion Control Scheme for TCP,"
14544 * which appeared in SIGCOMM '96.
14545 *
14546 * Save the highest seq no we have sent so far.
14547 * Be careful about the invisible FIN byte.
14548 */
14549 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
14550 (tcp->tcp_unsent == 0)) {
14551 tcp->tcp_rexmit_max = tcp->tcp_fss;
14552 } else {
14553 tcp->tcp_rexmit_max = tcp->tcp_snxt;
14554 }
14555
14556 /*
14557 * Do not allow bursty traffic during
14558 * fast recovery. Refer to Fall and Floyd's
14559 * paper "Simulation-based Comparisons of
14560 * Tahoe, Reno and SACK TCP" (in CCR?).
14561 * This is a best current practice.
14562 */
14563 tcp->tcp_snd_burst = TCP_CWND_SS;
14564
14565 /*
14566 * For SACK:
14567 * Calculate tcp_pipe, which is the
14568 * estimated number of bytes in the
14569 * network.
14570 *
14571 * tcp_fack is the highest sack'ed seq num
14572 * TCP has received.
14573 *
14574 * tcp_pipe is explained in the above quoted
14575 * Fall and Floyd's paper. tcp_fack is
14576 * explained in Mathis and Mahdavi's
14577 * "Forward Acknowledgment: Refining TCP
14578 * Congestion Control" in SIGCOMM '96.
14579 */
14580 if (tcp->tcp_snd_sack_ok) {
14581 ASSERT(tcp->tcp_sack_info != NULL);
14582 if (tcp->tcp_notsack_list != NULL) {
14583 tcp->tcp_pipe = tcp->tcp_snxt -
14584 tcp->tcp_fack;
14585 tcp->tcp_sack_snxt = seg_ack;
14586 flags |= TH_NEED_SACK_REXMIT;
14587 } else {
14588 /*
14589 * Always initialize tcp_pipe
14590 * even though we don't have
14591 * any SACK info.
If later 14592 * we get SACK info and 14593 * tcp_pipe is not initialized, 14594 * funny things will happen. 14595 */ 14596 tcp->tcp_pipe = 14597 tcp->tcp_cwnd_ssthresh; 14598 } 14599 } else { 14600 flags |= TH_REXMIT_NEEDED; 14601 } /* tcp_snd_sack_ok */ 14602 14603 } else { 14604 /* 14605 * Here we perform congestion 14606 * avoidance, but NOT slow start. 14607 * This is known as the Fast 14608 * Recovery Algorithm. 14609 */ 14610 if (tcp->tcp_snd_sack_ok && 14611 tcp->tcp_notsack_list != NULL) { 14612 flags |= TH_NEED_SACK_REXMIT; 14613 tcp->tcp_pipe -= mss; 14614 if (tcp->tcp_pipe < 0) 14615 tcp->tcp_pipe = 0; 14616 } else { 14617 /* 14618 * We know that one more packet has 14619 * left the pipe thus we can update 14620 * cwnd. 14621 */ 14622 cwnd = tcp->tcp_cwnd + mss; 14623 if (cwnd > tcp->tcp_cwnd_max) 14624 cwnd = tcp->tcp_cwnd_max; 14625 tcp->tcp_cwnd = cwnd; 14626 if (tcp->tcp_unsent > 0) 14627 flags |= TH_XMIT_NEEDED; 14628 } 14629 } 14630 } 14631 } else if (tcp->tcp_zero_win_probe) { 14632 /* 14633 * If the window has opened, need to arrange 14634 * to send additional data. 14635 */ 14636 if (new_swnd != 0) { 14637 /* tcp_suna != tcp_snxt */ 14638 /* Packet contains a window update */ 14639 BUMP_MIB(&tcps->tcps_mib, tcpInWinUpdate); 14640 tcp->tcp_zero_win_probe = 0; 14641 tcp->tcp_timer_backoff = 0; 14642 tcp->tcp_ms_we_have_waited = 0; 14643 14644 /* 14645 * Transmit starting with tcp_suna since 14646 * the one byte probe is not ack'ed. 14647 * If TCP has sent more than one identical 14648 * probe, tcp_rexmit will be set. That means 14649 * tcp_ss_rexmit() will send out the one 14650 * byte along with new data. Otherwise, 14651 * fake the retransmission. 14652 */ 14653 flags |= TH_XMIT_NEEDED; 14654 if (!tcp->tcp_rexmit) { 14655 tcp->tcp_rexmit = B_TRUE; 14656 tcp->tcp_dupack_cnt = 0; 14657 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 14658 tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 14659 } 14660 } 14661 } 14662 goto swnd_update; 14663 } 14664 14665 /* 14666 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. 14667 * If the ACK value acks something that we have not yet sent, it might 14668 * be an old duplicate segment. Send an ACK to re-synchronize the 14669 * other side. 14670 * Note: reset in response to unacceptable ACK in SYN_RECEIVE 14671 * state is handled above, so we can always just drop the segment and 14672 * send an ACK here. 14673 * 14674 * Should we send ACKs in response to ACK only segments? 14675 */ 14676 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 14677 BUMP_MIB(&tcps->tcps_mib, tcpInAckUnsent); 14678 /* drop the received segment */ 14679 freemsg(mp); 14680 14681 /* 14682 * Send back an ACK. If tcp_drop_ack_unsent_cnt is 14683 * greater than 0, check if the number of such 14684 * bogus ACks is greater than that count. If yes, 14685 * don't send back any ACK. This prevents TCP from 14686 * getting into an ACK storm if somehow an attacker 14687 * successfully spoofs an acceptable segment to our 14688 * peer. 
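 *
 * [Editorial illustration, not part of the original source.] The check
 * below amounts to a simple counter-based rate limit on ACKs sent in
 * response to such bogus segments. A hypothetical user-space sketch
 * (names invented for this example):
 *
 *	// Returns 1 if an ACK should still be sent for a segment that
 *	// acks data we never sent, 0 if it should be suppressed.
 *	static int
 *	ack_unsent_should_respond(unsigned int *in_ack_unsent,
 *	    unsigned int drop_threshold)
 *	{
 *		if (drop_threshold == 0)
 *			return (1);	// rate limiting disabled
 *		if (++(*in_ack_unsent) > drop_threshold)
 *			return (0);	// too many already, stay quiet
 *		return (1);
 *	}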
14689 */
14690 if (tcp_drop_ack_unsent_cnt > 0 &&
14691 ++tcp->tcp_in_ack_unsent > tcp_drop_ack_unsent_cnt) {
14692 TCP_STAT(tcps, tcp_in_ack_unsent_drop);
14693 return;
14694 }
14695 mp = tcp_ack_mp(tcp);
14696 if (mp != NULL) {
14697 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT);
14698 BUMP_LOCAL(tcp->tcp_obsegs);
14699 BUMP_MIB(&tcps->tcps_mib, tcpOutAck);
14700 tcp_send_data(tcp, tcp->tcp_wq, mp);
14701 }
14702 return;
14703 }
14704
14705 /*
14706 * TCP gets a new ACK, update the notsack'ed list to delete those
14707 * blocks that are covered by this ACK.
14708 */
14709 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) {
14710 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack,
14711 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list));
14712 }
14713
14714 /*
14715 * If we got an ACK after fast retransmit, check to see
14716 * if it is a partial ACK. If it is not and the congestion
14717 * window was inflated to account for the other side's
14718 * cached packets, retract it. If it is, do Hoe's algorithm.
14719 */
14720 if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) {
14721 ASSERT(tcp->tcp_rexmit == B_FALSE);
14722 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) {
14723 tcp->tcp_dupack_cnt = 0;
14724 /*
14725 * Restore the original tcp_cwnd_ssthresh after
14726 * the fast retransmit phase.
14727 */
14728 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) {
14729 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh;
14730 }
14731 tcp->tcp_rexmit_max = seg_ack;
14732 tcp->tcp_cwnd_cnt = 0;
14733 tcp->tcp_snd_burst = tcp->tcp_localnet ?
14734 TCP_CWND_INFINITE : TCP_CWND_NORMAL;
14735
14736 /*
14737 * Remove all notsack info to avoid confusion with
14738 * the next fast retransmit/recovery phase.
14739 */
14740 if (tcp->tcp_snd_sack_ok &&
14741 tcp->tcp_notsack_list != NULL) {
14742 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
14743 }
14744 } else {
14745 if (tcp->tcp_snd_sack_ok &&
14746 tcp->tcp_notsack_list != NULL) {
14747 flags |= TH_NEED_SACK_REXMIT;
14748 tcp->tcp_pipe -= mss;
14749 if (tcp->tcp_pipe < 0)
14750 tcp->tcp_pipe = 0;
14751 } else {
14752 /*
14753 * Hoe's algorithm:
14754 *
14755 * Retransmit the unack'ed segment and
14756 * restart fast recovery. Note that we
14757 * need to scale back tcp_cwnd to the
14758 * original value when we started fast
14759 * recovery. This is to prevent overly
14760 * aggressive behaviour in sending new
14761 * segments.
14762 */
14763 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh +
14764 tcps->tcps_dupack_fast_retransmit * mss;
14765 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd;
14766 flags |= TH_REXMIT_NEEDED;
14767 }
14768 }
14769 } else {
14770 tcp->tcp_dupack_cnt = 0;
14771 if (tcp->tcp_rexmit) {
14772 /*
14773 * TCP is retransmitting. If the ACK acks all
14774 * outstanding data, update tcp_rexmit_max and
14775 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt
14776 * to the correct value.
14777 *
14778 * Note that SEQ_LEQ() is used. This is to avoid
14779 * unnecessary fast retransmit caused by dup ACKs
14780 * received when TCP does slow start retransmission
14781 * after a timeout. During this phase, TCP may
14782 * send out segments which are already received.
14783 * This causes dup ACKs to be sent back.
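 *
 * [Editorial illustration, not part of the original source.] Both the
 * partial-ACK handling above and the timeout-recovery case here hinge
 * on comparing the cumulative ACK against the highest sequence number
 * outstanding when recovery started (tcp_rexmit_max). A hypothetical
 * sketch, using the same modulo-2^32 arithmetic as the SEQ_GEQ macro:
 *
 *	// Returns 1 for a "full" ACK that ends the recovery episode,
 *	// 0 for a partial ACK that merely advances it.
 *	static int
 *	ack_ends_recovery(unsigned int seg_ack, unsigned int rexmit_max)
 *	{
 *		return ((int)(seg_ack - rexmit_max) >= 0);
 *	}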
14784 */ 14785 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 14786 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 14787 tcp->tcp_rexmit_nxt = seg_ack; 14788 } 14789 if (seg_ack != tcp->tcp_rexmit_max) { 14790 flags |= TH_XMIT_NEEDED; 14791 } 14792 } else { 14793 tcp->tcp_rexmit = B_FALSE; 14794 tcp->tcp_xmit_zc_clean = B_FALSE; 14795 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 14796 tcp->tcp_snd_burst = tcp->tcp_localnet ? 14797 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 14798 } 14799 tcp->tcp_ms_we_have_waited = 0; 14800 } 14801 } 14802 14803 BUMP_MIB(&tcps->tcps_mib, tcpInAckSegs); 14804 UPDATE_MIB(&tcps->tcps_mib, tcpInAckBytes, bytes_acked); 14805 tcp->tcp_suna = seg_ack; 14806 if (tcp->tcp_zero_win_probe != 0) { 14807 tcp->tcp_zero_win_probe = 0; 14808 tcp->tcp_timer_backoff = 0; 14809 } 14810 14811 /* 14812 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 14813 * Note that it cannot be the SYN being ack'ed. The code flow 14814 * will not reach here. 14815 */ 14816 if (mp1 == NULL) { 14817 goto fin_acked; 14818 } 14819 14820 /* 14821 * Update the congestion window. 14822 * 14823 * If TCP is not ECN capable or TCP is ECN capable but the 14824 * congestion experience bit is not set, increase the tcp_cwnd as 14825 * usual. 14826 */ 14827 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 14828 cwnd = tcp->tcp_cwnd; 14829 add = mss; 14830 14831 if (cwnd >= tcp->tcp_cwnd_ssthresh) { 14832 /* 14833 * This is to prevent an increase of less than 1 MSS of 14834 * tcp_cwnd. With partial increase, tcp_wput_data() 14835 * may send out tinygrams in order to preserve mblk 14836 * boundaries. 14837 * 14838 * By initializing tcp_cwnd_cnt to new tcp_cwnd and 14839 * decrementing it by 1 MSS for every ACKs, tcp_cwnd is 14840 * increased by 1 MSS for every RTTs. 14841 */ 14842 if (tcp->tcp_cwnd_cnt <= 0) { 14843 tcp->tcp_cwnd_cnt = cwnd + add; 14844 } else { 14845 tcp->tcp_cwnd_cnt -= add; 14846 add = 0; 14847 } 14848 } 14849 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); 14850 } 14851 14852 /* See if the latest urgent data has been acknowledged */ 14853 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && 14854 SEQ_GT(seg_ack, tcp->tcp_urg)) 14855 tcp->tcp_valid_bits &= ~TCP_URG_VALID; 14856 14857 /* Can we update the RTT estimates? */ 14858 if (tcp->tcp_snd_ts_ok) { 14859 /* Ignore zero timestamp echo-reply. */ 14860 if (tcpopt.tcp_opt_ts_ecr != 0) { 14861 tcp_set_rto(tcp, (int32_t)lbolt - 14862 (int32_t)tcpopt.tcp_opt_ts_ecr); 14863 } 14864 14865 /* If needed, restart the timer. */ 14866 if (tcp->tcp_set_timer == 1) { 14867 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 14868 tcp->tcp_set_timer = 0; 14869 } 14870 /* 14871 * Update tcp_csuna in case the other side stops sending 14872 * us timestamps. 14873 */ 14874 tcp->tcp_csuna = tcp->tcp_snxt; 14875 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 14876 /* 14877 * An ACK sequence we haven't seen before, so get the RTT 14878 * and update the RTO. But first check if the timestamp is 14879 * valid to use. 
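 *
 * [Editorial illustration, not part of the original source.] The RTT
 * sample taken below is simply "now minus the tick recorded when the
 * timed segment was sent" (stashed in the mblk's b_prev), and it is
 * only used when this ACK newly covers the sequence being timed
 * (stashed in b_next). A hypothetical user-space sketch of that
 * decision:
 *
 *	// 'timed_seq' is the sequence number up to which the timing
 *	// applies; 'send_time' is the tick recorded at transmit time.
 *	static int
 *	rtt_sample(unsigned int seg_ack, unsigned int timed_seq,
 *	    long now, long send_time, long *rttp)
 *	{
 *		if ((int)(seg_ack - timed_seq) <= 0)
 *			return (0);	// timed data not yet covered
 *		*rttp = now - send_time;
 *		return (1);
 *	}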
14880 */
14881 if ((mp1->b_next != NULL) &&
14882 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
14883 tcp_set_rto(tcp, (int32_t)lbolt -
14884 (int32_t)(intptr_t)mp1->b_prev);
14885 else
14886 BUMP_MIB(&tcps->tcps_mib, tcpRttNoUpdate);
14887
14888 /* Remember the last sequence to be ACKed */
14889 tcp->tcp_csuna = seg_ack;
14890 if (tcp->tcp_set_timer == 1) {
14891 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
14892 tcp->tcp_set_timer = 0;
14893 }
14894 } else {
14895 BUMP_MIB(&tcps->tcps_mib, tcpRttNoUpdate);
14896 }
14897
14898 /* Eat acknowledged bytes off the xmit queue. */
14899 for (;;) {
14900 mblk_t *mp2;
14901 uchar_t *wptr;
14902
14903 wptr = mp1->b_wptr;
14904 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX);
14905 bytes_acked -= (int)(wptr - mp1->b_rptr);
14906 if (bytes_acked < 0) {
14907 mp1->b_rptr = wptr + bytes_acked;
14908 /*
14909 * Set a new timestamp if all the bytes timed by the
14910 * old timestamp have been ack'ed.
14911 */
14912 if (SEQ_GT(seg_ack,
14913 (uint32_t)(uintptr_t)(mp1->b_next))) {
14914 mp1->b_prev = (mblk_t *)(uintptr_t)lbolt;
14915 mp1->b_next = NULL;
14916 }
14917 break;
14918 }
14919 mp1->b_next = NULL;
14920 mp1->b_prev = NULL;
14921 mp2 = mp1;
14922 mp1 = mp1->b_cont;
14923
14924 /*
14925 * This notification is required for some zero-copy
14926 * clients to maintain a copy semantic. After the data
14927 * is ack'ed, the client is safe to modify or reuse the buffer.
14928 */
14929 if (tcp->tcp_snd_zcopy_aware &&
14930 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY))
14931 tcp_zcopy_notify(tcp);
14932 freeb(mp2);
14933 if (bytes_acked == 0) {
14934 if (mp1 == NULL) {
14935 /* Everything is ack'ed, clear the tail. */
14936 tcp->tcp_xmit_tail = NULL;
14937 /*
14938 * Cancel the timer unless we are still
14939 * waiting for an ACK for the FIN packet.
14940 */
14941 if (tcp->tcp_timer_tid != 0 &&
14942 tcp->tcp_snxt == tcp->tcp_suna) {
14943 (void) TCP_TIMER_CANCEL(tcp,
14944 tcp->tcp_timer_tid);
14945 tcp->tcp_timer_tid = 0;
14946 }
14947 goto pre_swnd_update;
14948 }
14949 if (mp2 != tcp->tcp_xmit_tail)
14950 break;
14951 tcp->tcp_xmit_tail = mp1;
14952 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
14953 (uintptr_t)INT_MAX);
14954 tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr -
14955 mp1->b_rptr);
14956 break;
14957 }
14958 if (mp1 == NULL) {
14959 /*
14960 * More was acked but there is nothing more
14961 * outstanding. This means that the FIN was
14962 * just acked or that we're talking to a clown.
14963 */
14964 fin_acked:
14965 ASSERT(tcp->tcp_fin_sent);
14966 tcp->tcp_xmit_tail = NULL;
14967 if (tcp->tcp_fin_sent) {
14968 /* FIN was acked - making progress */
14969 if (tcp->tcp_ipversion == IPV6_VERSION &&
14970 !tcp->tcp_fin_acked)
14971 tcp->tcp_ip_forward_progress = B_TRUE;
14972 tcp->tcp_fin_acked = B_TRUE;
14973 if (tcp->tcp_linger_tid != 0 &&
14974 TCP_TIMER_CANCEL(tcp,
14975 tcp->tcp_linger_tid) >= 0) {
14976 tcp_stop_lingering(tcp);
14977 freemsg(mp);
14978 mp = NULL;
14979 }
14980 } else {
14981 /*
14982 * We should never get here because
14983 * we have already checked that the
14984 * number of bytes ack'ed should be
14985 * smaller than or equal to what we
14986 * have sent so far (it is the
14987 * acceptability check of the ACK).
14988 * We can only get here if the send
14989 * queue is corrupted.
14990 *
14991 * Terminate the connection and
14992 * panic the system. It is better
14993 * for us to panic than to continue
14994 * and risk further damage.
14995 */
14996 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
14997 tcp->tcp_rnxt, TH_RST|TH_ACK);
14998 panic("Memory corruption "
14999 "detected for connection %s.",
15000 tcp_display(tcp, NULL,
15001 DISP_ADDR_AND_PORT));
15002 /*NOTREACHED*/
15003 }
15004 goto pre_swnd_update;
15005 }
15006 ASSERT(mp2 != tcp->tcp_xmit_tail);
15007 }
15008 if (tcp->tcp_unsent) {
15009 flags |= TH_XMIT_NEEDED;
15010 }
15011 pre_swnd_update:
15012 tcp->tcp_xmit_head = mp1;
15013 swnd_update:
15014 /*
15015 * The following check is different from most other implementations.
15016 * For bi-directional transfer, when segments are dropped, the
15017 * "normal" check will not accept a window update in those
15018 * retransmitted segments. Failing to do that, TCP may send out
15019 * segments which are outside the receiver's window. As TCP accepts
15020 * the ack in those retransmitted segments, if the window update in
15021 * the same segment is not accepted, TCP will incorrectly calculate
15022 * that it can send more segments. This can create a deadlock
15023 * with the receiver if its window becomes zero.
15024 */
15025 if (SEQ_LT(tcp->tcp_swl2, seg_ack) ||
15026 SEQ_LT(tcp->tcp_swl1, seg_seq) ||
15027 (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) {
15028 /*
15029 * The criteria for an update are:
15030 *
15031 * 1. the segment acknowledges some data. Or
15032 * 2. the segment is new, i.e. it has a higher seq num. Or
15033 * 3. the segment is not old and the advertised window is
15034 * larger than the previous advertised window.
15035 */
15036 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
15037 flags |= TH_XMIT_NEEDED;
15038 tcp->tcp_swnd = new_swnd;
15039 if (new_swnd > tcp->tcp_max_swnd)
15040 tcp->tcp_max_swnd = new_swnd;
15041 tcp->tcp_swl1 = seg_seq;
15042 tcp->tcp_swl2 = seg_ack;
15043 }
15044 est:
15045 if (tcp->tcp_state > TCPS_ESTABLISHED) {
15046
15047 switch (tcp->tcp_state) {
15048 case TCPS_FIN_WAIT_1:
15049 if (tcp->tcp_fin_acked) {
15050 tcp->tcp_state = TCPS_FIN_WAIT_2;
15051 /*
15052 * We implement the non-standard BSD/SunOS
15053 * FIN_WAIT_2 flushing algorithm.
15054 * If there is no user attached to this
15055 * TCP endpoint, then this TCP struct
15056 * could hang around forever in FIN_WAIT_2
15057 * state if the peer forgets to send us
15058 * a FIN. To prevent this, we wait only
15059 * 2*MSL (a convenient time value) for
15060 * the FIN to arrive. If it doesn't show up,
15061 * we flush the TCP endpoint. This algorithm,
15062 * though a violation of RFC-793, has worked
15063 * for over 10 years in BSD systems.
15064 * Note: SunOS 4.x waits 675 seconds before
15065 * flushing the FIN_WAIT_2 connection.
15066 */
15067 TCP_TIMER_RESTART(tcp,
15068 tcps->tcps_fin_wait_2_flush_interval);
15069 }
15070 break;
15071 case TCPS_FIN_WAIT_2:
15072 break; /* Shutdown hook? */
15073 case TCPS_LAST_ACK:
15074 freemsg(mp);
15075 if (tcp->tcp_fin_acked) {
15076 (void) tcp_clean_death(tcp, 0, 19);
15077 return;
15078 }
15079 goto xmit_check;
15080 case TCPS_CLOSING:
15081 if (tcp->tcp_fin_acked) {
15082 tcp->tcp_state = TCPS_TIME_WAIT;
15083 /*
15084 * Unconditionally clear the exclusive binding
15085 * bit so this TIME-WAIT connection won't
15086 * interfere with new ones.
15087 */ 15088 tcp->tcp_exclbind = 0; 15089 if (!TCP_IS_DETACHED(tcp)) { 15090 TCP_TIMER_RESTART(tcp, 15091 tcps->tcps_time_wait_interval); 15092 } else { 15093 tcp_time_wait_append(tcp); 15094 TCP_DBGSTAT(tcps, tcp_rput_time_wait); 15095 } 15096 } 15097 /*FALLTHRU*/ 15098 case TCPS_CLOSE_WAIT: 15099 freemsg(mp); 15100 goto xmit_check; 15101 default: 15102 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 15103 break; 15104 } 15105 } 15106 if (flags & TH_FIN) { 15107 /* Make sure we ack the fin */ 15108 flags |= TH_ACK_NEEDED; 15109 if (!tcp->tcp_fin_rcvd) { 15110 tcp->tcp_fin_rcvd = B_TRUE; 15111 tcp->tcp_rnxt++; 15112 tcph = tcp->tcp_tcph; 15113 U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); 15114 15115 /* 15116 * Generate the ordrel_ind at the end unless we 15117 * are an eager guy. 15118 * In the eager case tcp_rsrv will do this when run 15119 * after tcp_accept is done. 15120 */ 15121 if (tcp->tcp_listener == NULL && 15122 !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding)) 15123 flags |= TH_ORDREL_NEEDED; 15124 switch (tcp->tcp_state) { 15125 case TCPS_SYN_RCVD: 15126 case TCPS_ESTABLISHED: 15127 tcp->tcp_state = TCPS_CLOSE_WAIT; 15128 /* Keepalive? */ 15129 break; 15130 case TCPS_FIN_WAIT_1: 15131 if (!tcp->tcp_fin_acked) { 15132 tcp->tcp_state = TCPS_CLOSING; 15133 break; 15134 } 15135 /* FALLTHRU */ 15136 case TCPS_FIN_WAIT_2: 15137 tcp->tcp_state = TCPS_TIME_WAIT; 15138 /* 15139 * Unconditionally clear the exclusive binding 15140 * bit so this TIME-WAIT connection won't 15141 * interfere with new ones. 15142 */ 15143 tcp->tcp_exclbind = 0; 15144 if (!TCP_IS_DETACHED(tcp)) { 15145 TCP_TIMER_RESTART(tcp, 15146 tcps->tcps_time_wait_interval); 15147 } else { 15148 tcp_time_wait_append(tcp); 15149 TCP_DBGSTAT(tcps, tcp_rput_time_wait); 15150 } 15151 if (seg_len) { 15152 /* 15153 * implies data piggybacked on FIN. 15154 * break to handle data. 15155 */ 15156 break; 15157 } 15158 freemsg(mp); 15159 goto ack_check; 15160 } 15161 } 15162 } 15163 if (mp == NULL) 15164 goto xmit_check; 15165 if (seg_len == 0) { 15166 freemsg(mp); 15167 goto xmit_check; 15168 } 15169 if (mp->b_rptr == mp->b_wptr) { 15170 /* 15171 * The header has been consumed, so we remove the 15172 * zero-length mblk here. 15173 */ 15174 mp1 = mp; 15175 mp = mp->b_cont; 15176 freeb(mp1); 15177 } 15178 tcph = tcp->tcp_tcph; 15179 tcp->tcp_rack_cnt++; 15180 { 15181 uint32_t cur_max; 15182 15183 cur_max = tcp->tcp_rack_cur_max; 15184 if (tcp->tcp_rack_cnt >= cur_max) { 15185 /* 15186 * We have more unacked data than we should - send 15187 * an ACK now. 15188 */ 15189 flags |= TH_ACK_NEEDED; 15190 cur_max++; 15191 if (cur_max > tcp->tcp_rack_abs_max) 15192 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 15193 else 15194 tcp->tcp_rack_cur_max = cur_max; 15195 } else if (TCP_IS_DETACHED(tcp)) { 15196 /* We don't have an ACK timer for detached TCP. */ 15197 flags |= TH_ACK_NEEDED; 15198 } else if (seg_len < mss) { 15199 /* 15200 * If we get a segment that is less than an mss, and we 15201 * already have unacknowledged data, and the amount 15202 * unacknowledged is not a multiple of mss, then we 15203 * better generate an ACK now. Otherwise, this may be 15204 * the tail piece of a transaction, and we would rather 15205 * wait for the response. 
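 *
 * [Editorial illustration, not part of the original source.] A
 * hypothetical sketch of that decision, where 'udif' is the amount of
 * received data not yet acknowledged:
 *
 *	// Returns 1 to ACK immediately, 0 to fall back to the
 *	// delayed-ACK timer.
 *	static int
 *	small_segment_ack_now(unsigned int udif, unsigned int mss)
 *	{
 *		return (udif != 0 && (udif % mss) != 0);
 *	}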
15206 */ 15207 uint32_t udif; 15208 ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <= 15209 (uintptr_t)INT_MAX); 15210 udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack); 15211 if (udif && (udif % mss)) 15212 flags |= TH_ACK_NEEDED; 15213 else 15214 flags |= TH_ACK_TIMER_NEEDED; 15215 } else { 15216 /* Start delayed ack timer */ 15217 flags |= TH_ACK_TIMER_NEEDED; 15218 } 15219 } 15220 tcp->tcp_rnxt += seg_len; 15221 U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); 15222 15223 /* Update SACK list */ 15224 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 15225 tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, 15226 &(tcp->tcp_num_sack_blk)); 15227 } 15228 15229 if (tcp->tcp_urp_mp) { 15230 tcp->tcp_urp_mp->b_cont = mp; 15231 mp = tcp->tcp_urp_mp; 15232 tcp->tcp_urp_mp = NULL; 15233 /* Ready for a new signal. */ 15234 tcp->tcp_urp_last_valid = B_FALSE; 15235 #ifdef DEBUG 15236 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 15237 "tcp_rput: sending exdata_ind %s", 15238 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 15239 #endif /* DEBUG */ 15240 } 15241 15242 /* 15243 * Check for ancillary data changes compared to last segment. 15244 */ 15245 if (tcp->tcp_ipv6_recvancillary != 0) { 15246 mp = tcp_rput_add_ancillary(tcp, mp, &ipp); 15247 if (mp == NULL) 15248 return; 15249 } 15250 15251 if (tcp->tcp_listener || tcp->tcp_hard_binding) { 15252 /* 15253 * Side queue inbound data until the accept happens. 15254 * tcp_accept/tcp_rput drains this when the accept happens. 15255 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or 15256 * T_EXDATA_IND) it is queued on b_next. 15257 * XXX Make urgent data use this. Requires: 15258 * Removing tcp_listener check for TH_URG 15259 * Making M_PCPROTO and MARK messages skip the eager case 15260 */ 15261 15262 if (tcp->tcp_kssl_pending) { 15263 DTRACE_PROBE1(kssl_mblk__ksslinput_pending, 15264 mblk_t *, mp); 15265 tcp_kssl_input(tcp, mp); 15266 } else { 15267 tcp_rcv_enqueue(tcp, mp, seg_len); 15268 } 15269 } else { 15270 sodirect_t *sodp = tcp->tcp_sodirect; 15271 15272 /* 15273 * If an sodirect connection and an enabled sodirect_t then 15274 * sodp will be set to point to the tcp_t/sonode_t shared 15275 * sodirect_t and the sodirect_t's lock will be held. 15276 */ 15277 if (sodp != NULL) { 15278 mutex_enter(sodp->sod_lock); 15279 if (!(sodp->sod_state & SOD_ENABLED)) { 15280 mutex_exit(sodp->sod_lock); 15281 sodp = NULL; 15282 } else if (tcp->tcp_kssl_ctx != NULL && 15283 DB_TYPE(mp) == M_DATA) { 15284 mutex_exit(sodp->sod_lock); 15285 sodp = NULL; 15286 } 15287 } 15288 if (mp->b_datap->db_type != M_DATA || 15289 (flags & TH_MARKNEXT_NEEDED)) { 15290 if (sodp != NULL) { 15291 if (!SOD_QEMPTY(sodp) && 15292 (sodp->sod_state & SOD_WAKE_NOT)) { 15293 flags |= tcp_rcv_sod_wakeup(tcp, sodp); 15294 /* sod_wakeup() did the mutex_exit() */ 15295 mutex_enter(sodp->sod_lock); 15296 } 15297 } else if (tcp->tcp_rcv_list != NULL) { 15298 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 15299 } 15300 ASSERT(tcp->tcp_rcv_list == NULL || 15301 tcp->tcp_fused_sigurg); 15302 15303 if (flags & TH_MARKNEXT_NEEDED) { 15304 #ifdef DEBUG 15305 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 15306 "tcp_rput: sending MSGMARKNEXT %s", 15307 tcp_display(tcp, NULL, 15308 DISP_PORT_ONLY)); 15309 #endif /* DEBUG */ 15310 mp->b_flag |= MSGMARKNEXT; 15311 flags &= ~TH_MARKNEXT_NEEDED; 15312 } 15313 15314 /* Does this need SSL processing first? 
*/ 15315 if ((tcp->tcp_kssl_ctx != NULL) && 15316 (DB_TYPE(mp) == M_DATA)) { 15317 DTRACE_PROBE1(kssl_mblk__ksslinput_data1, 15318 mblk_t *, mp); 15319 tcp_kssl_input(tcp, mp); 15320 } else { 15321 if (sodp) { 15322 /* 15323 * Done with sodirect, use putnext 15324 * to push this non M_DATA headed 15325 * mblk_t chain. 15326 */ 15327 mutex_exit(sodp->sod_lock); 15328 } 15329 putnext(tcp->tcp_rq, mp); 15330 if (!canputnext(tcp->tcp_rq)) 15331 tcp->tcp_rwnd -= seg_len; 15332 } 15333 } else if ((tcp->tcp_kssl_ctx != NULL) && 15334 (DB_TYPE(mp) == M_DATA)) { 15335 /* Do SSL processing first */ 15336 DTRACE_PROBE1(kssl_mblk__ksslinput_data2, 15337 mblk_t *, mp); 15338 tcp_kssl_input(tcp, mp); 15339 } else if (sodp != NULL) { 15340 /* 15341 * Sodirect so all mblk_t's are queued on the 15342 * socket directly, check for wakeup of blocked 15343 * reader (if any), and last if flow-controled. 15344 */ 15345 flags |= tcp_rcv_sod_enqueue(tcp, sodp, mp, seg_len); 15346 if ((sodp->sod_state & SOD_WAKE_NEED) || 15347 (flags & (TH_PUSH|TH_FIN))) { 15348 flags |= tcp_rcv_sod_wakeup(tcp, sodp); 15349 /* sod_wakeup() did the mutex_exit() */ 15350 } else { 15351 if (SOD_QFULL(sodp)) { 15352 /* Q is full, need backenable */ 15353 SOD_QSETBE(sodp); 15354 } 15355 mutex_exit(sodp->sod_lock); 15356 } 15357 } else if ((flags & (TH_PUSH|TH_FIN)) || 15358 tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_rq->q_hiwat >> 3) { 15359 if (tcp->tcp_rcv_list != NULL) { 15360 /* 15361 * Enqueue the new segment first and then 15362 * call tcp_rcv_drain() to send all data 15363 * up. The other way to do this is to 15364 * send all queued data up and then call 15365 * putnext() to send the new segment up. 15366 * This way can remove the else part later 15367 * on. 15368 * 15369 * We don't this to avoid one more call to 15370 * canputnext() as tcp_rcv_drain() needs to 15371 * call canputnext(). 15372 */ 15373 tcp_rcv_enqueue(tcp, mp, seg_len); 15374 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 15375 } else { 15376 putnext(tcp->tcp_rq, mp); 15377 if (!canputnext(tcp->tcp_rq)) 15378 tcp->tcp_rwnd -= seg_len; 15379 } 15380 } else { 15381 /* 15382 * Enqueue all packets when processing an mblk 15383 * from the co queue and also enqueue normal packets. 15384 */ 15385 tcp_rcv_enqueue(tcp, mp, seg_len); 15386 } 15387 /* 15388 * Make sure the timer is running if we have data waiting 15389 * for a push bit. This provides resiliency against 15390 * implementations that do not correctly generate push bits. 15391 * 15392 * Note, for sodirect if Q isn't empty and there's not a 15393 * pending wakeup then we need a timer. Also note that sodp 15394 * is assumed to be still valid after exit()ing the sod_lock 15395 * above and while the SOD state can change it can only change 15396 * such that the Q is empty now even though data was added 15397 * above. 15398 */ 15399 if (((sodp != NULL && !SOD_QEMPTY(sodp) && 15400 (sodp->sod_state & SOD_WAKE_NOT)) || 15401 (sodp == NULL && tcp->tcp_rcv_list != NULL)) && 15402 tcp->tcp_push_tid == 0) { 15403 /* 15404 * The connection may be closed at this point, so don't 15405 * do anything for a detached tcp. 15406 */ 15407 if (!TCP_IS_DETACHED(tcp)) 15408 tcp->tcp_push_tid = TCP_TIMER(tcp, 15409 tcp_push_timer, 15410 MSEC_TO_TICK( 15411 tcps->tcps_push_timer_interval)); 15412 } 15413 } 15414 15415 xmit_check: 15416 /* Is there anything left to do? 
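 *
 * [Editorial illustration, not part of the original source.] Looking
 * back at the non-sodirect delivery choice a few lines above: data is
 * pushed upstream right away on PUSH or FIN, or once the receive list
 * holds at least 1/8 of the stream's high-water mark, and is queued
 * otherwise. A hypothetical sketch of that policy:
 *
 *	// Returns 1 to deliver the receive list now, 0 to keep queueing.
 *	static int
 *	deliver_upstream_now(int push_or_fin, unsigned int rcv_cnt,
 *	    unsigned int seg_len, unsigned int q_hiwat)
 *	{
 *		return (push_or_fin ||
 *		    rcv_cnt + seg_len >= (q_hiwat >> 3));
 *	}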
*/ 15417 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 15418 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| 15419 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED| 15420 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 15421 goto done; 15422 15423 /* Any transmit work to do and a non-zero window? */ 15424 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| 15425 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { 15426 if (flags & TH_REXMIT_NEEDED) { 15427 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; 15428 15429 BUMP_MIB(&tcps->tcps_mib, tcpOutFastRetrans); 15430 if (snd_size > mss) 15431 snd_size = mss; 15432 if (snd_size > tcp->tcp_swnd) 15433 snd_size = tcp->tcp_swnd; 15434 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, 15435 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, 15436 B_TRUE); 15437 15438 if (mp1 != NULL) { 15439 tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt; 15440 tcp->tcp_csuna = tcp->tcp_snxt; 15441 BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); 15442 UPDATE_MIB(&tcps->tcps_mib, 15443 tcpRetransBytes, snd_size); 15444 TCP_RECORD_TRACE(tcp, mp1, 15445 TCP_TRACE_SEND_PKT); 15446 tcp_send_data(tcp, tcp->tcp_wq, mp1); 15447 } 15448 } 15449 if (flags & TH_NEED_SACK_REXMIT) { 15450 tcp_sack_rxmit(tcp, &flags); 15451 } 15452 /* 15453 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send 15454 * out new segment. Note that tcp_rexmit should not be 15455 * set, otherwise TH_LIMIT_XMIT should not be set. 15456 */ 15457 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { 15458 if (!tcp->tcp_rexmit) { 15459 tcp_wput_data(tcp, NULL, B_FALSE); 15460 } else { 15461 tcp_ss_rexmit(tcp); 15462 } 15463 } 15464 /* 15465 * Adjust tcp_cwnd back to normal value after sending 15466 * new data segments. 15467 */ 15468 if (flags & TH_LIMIT_XMIT) { 15469 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); 15470 /* 15471 * This will restart the timer. Restarting the 15472 * timer is used to avoid a timeout before the 15473 * limited transmitted segment's ACK gets back. 15474 */ 15475 if (tcp->tcp_xmit_head != NULL) 15476 tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt; 15477 } 15478 15479 /* Anything more to do? */ 15480 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| 15481 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 15482 goto done; 15483 } 15484 ack_check: 15485 if (flags & TH_SEND_URP_MARK) { 15486 ASSERT(tcp->tcp_urp_mark_mp); 15487 /* 15488 * Send up any queued data and then send the mark message 15489 */ 15490 sodirect_t *sodp; 15491 15492 SOD_PTR_ENTER(tcp, sodp); 15493 15494 mp1 = tcp->tcp_urp_mark_mp; 15495 tcp->tcp_urp_mark_mp = NULL; 15496 if (sodp != NULL) { 15497 15498 ASSERT(tcp->tcp_rcv_list == NULL); 15499 15500 flags |= tcp_rcv_sod_wakeup(tcp, sodp); 15501 /* sod_wakeup() does the mutex_exit() */ 15502 } else if (tcp->tcp_rcv_list != NULL) { 15503 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 15504 15505 ASSERT(tcp->tcp_rcv_list == NULL || 15506 tcp->tcp_fused_sigurg); 15507 15508 } 15509 putnext(tcp->tcp_rq, mp1); 15510 #ifdef DEBUG 15511 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 15512 "tcp_rput: sending zero-length %s %s", 15513 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" : 15514 "MSGNOTMARKNEXT"), 15515 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 15516 #endif /* DEBUG */ 15517 flags &= ~TH_SEND_URP_MARK; 15518 } 15519 if (flags & TH_ACK_NEEDED) { 15520 /* 15521 * Time to send an ack for some reason. 
15522 */ 15523 mp1 = tcp_ack_mp(tcp); 15524 15525 if (mp1 != NULL) { 15526 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 15527 tcp_send_data(tcp, tcp->tcp_wq, mp1); 15528 BUMP_LOCAL(tcp->tcp_obsegs); 15529 BUMP_MIB(&tcps->tcps_mib, tcpOutAck); 15530 } 15531 if (tcp->tcp_ack_tid != 0) { 15532 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 15533 tcp->tcp_ack_tid = 0; 15534 } 15535 } 15536 if (flags & TH_ACK_TIMER_NEEDED) { 15537 /* 15538 * Arrange for deferred ACK or push wait timeout. 15539 * Start timer if it is not already running. 15540 */ 15541 if (tcp->tcp_ack_tid == 0) { 15542 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer, 15543 MSEC_TO_TICK(tcp->tcp_localnet ? 15544 (clock_t)tcps->tcps_local_dack_interval : 15545 (clock_t)tcps->tcps_deferred_ack_interval)); 15546 } 15547 } 15548 if (flags & TH_ORDREL_NEEDED) { 15549 /* 15550 * Send up the ordrel_ind unless we are an eager guy. 15551 * In the eager case tcp_rsrv will do this when run 15552 * after tcp_accept is done. 15553 */ 15554 sodirect_t *sodp; 15555 15556 ASSERT(tcp->tcp_listener == NULL); 15557 15558 SOD_PTR_ENTER(tcp, sodp); 15559 if (sodp != NULL) { 15560 /* No more sodirect */ 15561 tcp->tcp_sodirect = NULL; 15562 if (!SOD_QEMPTY(sodp)) { 15563 /* Mblk(s) to process, notify */ 15564 flags |= tcp_rcv_sod_wakeup(tcp, sodp); 15565 /* sod_wakeup() does the mutex_exit() */ 15566 } else { 15567 /* Nothing to process */ 15568 mutex_exit(sodp->sod_lock); 15569 } 15570 } else if (tcp->tcp_rcv_list != NULL) { 15571 /* 15572 * Push any mblk(s) enqueued from co processing. 15573 */ 15574 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 15575 15576 ASSERT(tcp->tcp_rcv_list == NULL || 15577 tcp->tcp_fused_sigurg); 15578 } 15579 15580 if ((mp1 = mi_tpi_ordrel_ind()) != NULL) { 15581 tcp->tcp_ordrel_done = B_TRUE; 15582 putnext(tcp->tcp_rq, mp1); 15583 if (tcp->tcp_deferred_clean_death) { 15584 /* 15585 * tcp_clean_death was deferred 15586 * for T_ORDREL_IND - do it now 15587 */ 15588 (void) tcp_clean_death(tcp, 15589 tcp->tcp_client_errno, 20); 15590 tcp->tcp_deferred_clean_death = B_FALSE; 15591 } 15592 } else { 15593 /* 15594 * Run the orderly release in the 15595 * service routine. 15596 */ 15597 qenable(tcp->tcp_rq); 15598 /* 15599 * Caveat(XXX): The machine may be so 15600 * overloaded that tcp_rsrv() is not scheduled 15601 * until after the endpoint has transitioned 15602 * to TCPS_TIME_WAIT 15603 * and tcp_time_wait_interval expires. Then 15604 * tcp_timer() will blow away state in tcp_t 15605 * and T_ORDREL_IND will never be delivered 15606 * upstream. Unlikely but potentially 15607 * a problem. 15608 */ 15609 } 15610 } 15611 done: 15612 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 15613 } 15614 15615 /* 15616 * This function does PAWS protection check. Returns B_TRUE if the 15617 * segment passes the PAWS test, else returns B_FALSE. 15618 */ 15619 boolean_t 15620 tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) 15621 { 15622 uint8_t flags; 15623 int options; 15624 uint8_t *up; 15625 15626 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 15627 /* 15628 * If timestamp option is aligned nicely, get values inline, 15629 * otherwise call general routine to parse. Only do that 15630 * if timestamp is the only option. 
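 *
 * [Editorial illustration, not part of the original source.] The
 * "nicely aligned" case below is the RFC 1323 recommended layout:
 * exactly 12 option bytes, NOP, NOP, then the 10-byte timestamp
 * option, so the header is the 20-byte fixed part plus 12 option
 * bytes whose first word matches the precomputed
 * TCPOPT_NOP_NOP_TSTAMP constant. A hypothetical byte-wise sketch of
 * the same precondition (the kernel compares one aligned 32-bit word
 * instead):
 *
 *	static int
 *	ts_only_fast_path(unsigned int hdr_len, const unsigned char *opt)
 *	{
 *		return (hdr_len == 20 + 12 &&
 *		    opt[0] == 1 && opt[1] == 1 &&	// NOP, NOP
 *		    opt[2] == 8 && opt[3] == 10);	// TS kind, TS length
 *	}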
15631 */ 15632 if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH + 15633 TCPOPT_REAL_TS_LEN && 15634 OK_32PTR((up = ((uint8_t *)tcph) + 15635 TCP_MIN_HEADER_LENGTH)) && 15636 *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { 15637 tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); 15638 tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); 15639 15640 options = TCP_OPT_TSTAMP_PRESENT; 15641 } else { 15642 if (tcp->tcp_snd_sack_ok) { 15643 tcpoptp->tcp = tcp; 15644 } else { 15645 tcpoptp->tcp = NULL; 15646 } 15647 options = tcp_parse_options(tcph, tcpoptp); 15648 } 15649 15650 if (options & TCP_OPT_TSTAMP_PRESENT) { 15651 /* 15652 * Do PAWS per RFC 1323 section 4.2. Accept RST 15653 * regardless of the timestamp, page 18 RFC 1323.bis. 15654 */ 15655 if ((flags & TH_RST) == 0 && 15656 TSTMP_LT(tcpoptp->tcp_opt_ts_val, 15657 tcp->tcp_ts_recent)) { 15658 if (TSTMP_LT(lbolt64, tcp->tcp_last_rcv_lbolt + 15659 PAWS_TIMEOUT)) { 15660 /* This segment is not acceptable. */ 15661 return (B_FALSE); 15662 } else { 15663 /* 15664 * Connection has been idle for 15665 * too long. Reset the timestamp 15666 * and assume the segment is valid. 15667 */ 15668 tcp->tcp_ts_recent = 15669 tcpoptp->tcp_opt_ts_val; 15670 } 15671 } 15672 } else { 15673 /* 15674 * If we don't get a timestamp on every packet, we 15675 * figure we can't really trust 'em, so we stop sending 15676 * and parsing them. 15677 */ 15678 tcp->tcp_snd_ts_ok = B_FALSE; 15679 15680 tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 15681 tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 15682 tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); 15683 /* 15684 * Adjust the tcp_mss accordingly. We also need to 15685 * adjust tcp_cwnd here in accordance with the new mss. 15686 * But we avoid doing a slow start here so as to not 15687 * to lose on the transfer rate built up so far. 15688 */ 15689 tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN, B_FALSE); 15690 if (tcp->tcp_snd_sack_ok) { 15691 ASSERT(tcp->tcp_sack_info != NULL); 15692 tcp->tcp_max_sack_blk = 4; 15693 } 15694 } 15695 return (B_TRUE); 15696 } 15697 15698 /* 15699 * Attach ancillary data to a received TCP segments for the 15700 * ancillary pieces requested by the application that are 15701 * different than they were in the previous data segment. 15702 * 15703 * Save the "current" values once memory allocation is ok so that 15704 * when memory allocation fails we can just wait for the next data segment. 15705 */ 15706 static mblk_t * 15707 tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) 15708 { 15709 struct T_optdata_ind *todi; 15710 int optlen; 15711 uchar_t *optptr; 15712 struct T_opthdr *toh; 15713 uint_t addflag; /* Which pieces to add */ 15714 mblk_t *mp1; 15715 15716 optlen = 0; 15717 addflag = 0; 15718 /* If app asked for pktinfo and the index has changed ... */ 15719 if ((ipp->ipp_fields & IPPF_IFINDEX) && 15720 ipp->ipp_ifindex != tcp->tcp_recvifindex && 15721 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)) { 15722 optlen += sizeof (struct T_opthdr) + 15723 sizeof (struct in6_pktinfo); 15724 addflag |= TCP_IPV6_RECVPKTINFO; 15725 } 15726 /* If app asked for hoplimit and it has changed ... */ 15727 if ((ipp->ipp_fields & IPPF_HOPLIMIT) && 15728 ipp->ipp_hoplimit != tcp->tcp_recvhops && 15729 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPLIMIT)) { 15730 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 15731 addflag |= TCP_IPV6_RECVHOPLIMIT; 15732 } 15733 /* If app asked for tclass and it has changed ... 
*/ 15734 if ((ipp->ipp_fields & IPPF_TCLASS) && 15735 ipp->ipp_tclass != tcp->tcp_recvtclass && 15736 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)) { 15737 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 15738 addflag |= TCP_IPV6_RECVTCLASS; 15739 } 15740 /* 15741 * If app asked for hopbyhop headers and it has changed ... 15742 * For security labels, note that (1) security labels can't change on 15743 * a connected socket at all, (2) we're connected to at most one peer, 15744 * (3) if anything changes, then it must be some other extra option. 15745 */ 15746 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) && 15747 ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, 15748 (ipp->ipp_fields & IPPF_HOPOPTS), 15749 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { 15750 optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen - 15751 tcp->tcp_label_len; 15752 addflag |= TCP_IPV6_RECVHOPOPTS; 15753 if (!ip_allocbuf((void **)&tcp->tcp_hopopts, 15754 &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), 15755 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) 15756 return (mp); 15757 } 15758 /* If app asked for dst headers before routing headers ... */ 15759 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTDSTOPTS) && 15760 ip_cmpbuf(tcp->tcp_rtdstopts, tcp->tcp_rtdstoptslen, 15761 (ipp->ipp_fields & IPPF_RTDSTOPTS), 15762 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) { 15763 optlen += sizeof (struct T_opthdr) + 15764 ipp->ipp_rtdstoptslen; 15765 addflag |= TCP_IPV6_RECVRTDSTOPTS; 15766 if (!ip_allocbuf((void **)&tcp->tcp_rtdstopts, 15767 &tcp->tcp_rtdstoptslen, (ipp->ipp_fields & IPPF_RTDSTOPTS), 15768 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) 15769 return (mp); 15770 } 15771 /* If app asked for routing headers and it has changed ... */ 15772 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) && 15773 ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, 15774 (ipp->ipp_fields & IPPF_RTHDR), 15775 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { 15776 optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; 15777 addflag |= TCP_IPV6_RECVRTHDR; 15778 if (!ip_allocbuf((void **)&tcp->tcp_rthdr, 15779 &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), 15780 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) 15781 return (mp); 15782 } 15783 /* If app asked for dest headers and it has changed ... */ 15784 if ((tcp->tcp_ipv6_recvancillary & 15785 (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) && 15786 ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, 15787 (ipp->ipp_fields & IPPF_DSTOPTS), 15788 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { 15789 optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; 15790 addflag |= TCP_IPV6_RECVDSTOPTS; 15791 if (!ip_allocbuf((void **)&tcp->tcp_dstopts, 15792 &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), 15793 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) 15794 return (mp); 15795 } 15796 15797 if (optlen == 0) { 15798 /* Nothing to add */ 15799 return (mp); 15800 } 15801 mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED); 15802 if (mp1 == NULL) { 15803 /* 15804 * Defer sending ancillary data until the next TCP segment 15805 * arrives. 
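 *
 * [Editorial illustration, not part of the original source.] Each
 * option that follows is appended as a small header (level, name,
 * total length, status) followed by its payload, with the write
 * pointer kept 32-bit aligned. A hypothetical user-space sketch of
 * one append step (assumes <string.h>; the struct below only stands
 * in for T_opthdr and does not claim its exact layout):
 *
 *	struct opt_hdr_sketch {
 *		unsigned int level, name, len, status;
 *	};
 *
 *	static unsigned char *
 *	append_opt(unsigned char *dst, unsigned int level, unsigned int name,
 *	    const void *payload, unsigned int paylen)
 *	{
 *		struct opt_hdr_sketch h = { level, name,
 *		    (unsigned int)sizeof (struct opt_hdr_sketch) + paylen, 0 };
 *
 *		(void) memcpy(dst, &h, sizeof (h));
 *		(void) memcpy(dst + sizeof (h), payload, paylen);
 *		return (dst + sizeof (h) + paylen);
 *	}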
15806 */ 15807 return (mp); 15808 } 15809 mp1->b_cont = mp; 15810 mp = mp1; 15811 mp->b_wptr += sizeof (*todi) + optlen; 15812 mp->b_datap->db_type = M_PROTO; 15813 todi = (struct T_optdata_ind *)mp->b_rptr; 15814 todi->PRIM_type = T_OPTDATA_IND; 15815 todi->DATA_flag = 1; /* MORE data */ 15816 todi->OPT_length = optlen; 15817 todi->OPT_offset = sizeof (*todi); 15818 optptr = (uchar_t *)&todi[1]; 15819 /* 15820 * If app asked for pktinfo and the index has changed ... 15821 * Note that the local address never changes for the connection. 15822 */ 15823 if (addflag & TCP_IPV6_RECVPKTINFO) { 15824 struct in6_pktinfo *pkti; 15825 15826 toh = (struct T_opthdr *)optptr; 15827 toh->level = IPPROTO_IPV6; 15828 toh->name = IPV6_PKTINFO; 15829 toh->len = sizeof (*toh) + sizeof (*pkti); 15830 toh->status = 0; 15831 optptr += sizeof (*toh); 15832 pkti = (struct in6_pktinfo *)optptr; 15833 if (tcp->tcp_ipversion == IPV6_VERSION) 15834 pkti->ipi6_addr = tcp->tcp_ip6h->ip6_src; 15835 else 15836 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 15837 &pkti->ipi6_addr); 15838 pkti->ipi6_ifindex = ipp->ipp_ifindex; 15839 optptr += sizeof (*pkti); 15840 ASSERT(OK_32PTR(optptr)); 15841 /* Save as "last" value */ 15842 tcp->tcp_recvifindex = ipp->ipp_ifindex; 15843 } 15844 /* If app asked for hoplimit and it has changed ... */ 15845 if (addflag & TCP_IPV6_RECVHOPLIMIT) { 15846 toh = (struct T_opthdr *)optptr; 15847 toh->level = IPPROTO_IPV6; 15848 toh->name = IPV6_HOPLIMIT; 15849 toh->len = sizeof (*toh) + sizeof (uint_t); 15850 toh->status = 0; 15851 optptr += sizeof (*toh); 15852 *(uint_t *)optptr = ipp->ipp_hoplimit; 15853 optptr += sizeof (uint_t); 15854 ASSERT(OK_32PTR(optptr)); 15855 /* Save as "last" value */ 15856 tcp->tcp_recvhops = ipp->ipp_hoplimit; 15857 } 15858 /* If app asked for tclass and it has changed ... 
*/ 15859 if (addflag & TCP_IPV6_RECVTCLASS) { 15860 toh = (struct T_opthdr *)optptr; 15861 toh->level = IPPROTO_IPV6; 15862 toh->name = IPV6_TCLASS; 15863 toh->len = sizeof (*toh) + sizeof (uint_t); 15864 toh->status = 0; 15865 optptr += sizeof (*toh); 15866 *(uint_t *)optptr = ipp->ipp_tclass; 15867 optptr += sizeof (uint_t); 15868 ASSERT(OK_32PTR(optptr)); 15869 /* Save as "last" value */ 15870 tcp->tcp_recvtclass = ipp->ipp_tclass; 15871 } 15872 if (addflag & TCP_IPV6_RECVHOPOPTS) { 15873 toh = (struct T_opthdr *)optptr; 15874 toh->level = IPPROTO_IPV6; 15875 toh->name = IPV6_HOPOPTS; 15876 toh->len = sizeof (*toh) + ipp->ipp_hopoptslen - 15877 tcp->tcp_label_len; 15878 toh->status = 0; 15879 optptr += sizeof (*toh); 15880 bcopy((uchar_t *)ipp->ipp_hopopts + tcp->tcp_label_len, optptr, 15881 ipp->ipp_hopoptslen - tcp->tcp_label_len); 15882 optptr += ipp->ipp_hopoptslen - tcp->tcp_label_len; 15883 ASSERT(OK_32PTR(optptr)); 15884 /* Save as last value */ 15885 ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, 15886 (ipp->ipp_fields & IPPF_HOPOPTS), 15887 ipp->ipp_hopopts, ipp->ipp_hopoptslen); 15888 } 15889 if (addflag & TCP_IPV6_RECVRTDSTOPTS) { 15890 toh = (struct T_opthdr *)optptr; 15891 toh->level = IPPROTO_IPV6; 15892 toh->name = IPV6_RTHDRDSTOPTS; 15893 toh->len = sizeof (*toh) + ipp->ipp_rtdstoptslen; 15894 toh->status = 0; 15895 optptr += sizeof (*toh); 15896 bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen); 15897 optptr += ipp->ipp_rtdstoptslen; 15898 ASSERT(OK_32PTR(optptr)); 15899 /* Save as last value */ 15900 ip_savebuf((void **)&tcp->tcp_rtdstopts, 15901 &tcp->tcp_rtdstoptslen, 15902 (ipp->ipp_fields & IPPF_RTDSTOPTS), 15903 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); 15904 } 15905 if (addflag & TCP_IPV6_RECVRTHDR) { 15906 toh = (struct T_opthdr *)optptr; 15907 toh->level = IPPROTO_IPV6; 15908 toh->name = IPV6_RTHDR; 15909 toh->len = sizeof (*toh) + ipp->ipp_rthdrlen; 15910 toh->status = 0; 15911 optptr += sizeof (*toh); 15912 bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen); 15913 optptr += ipp->ipp_rthdrlen; 15914 ASSERT(OK_32PTR(optptr)); 15915 /* Save as last value */ 15916 ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, 15917 (ipp->ipp_fields & IPPF_RTHDR), 15918 ipp->ipp_rthdr, ipp->ipp_rthdrlen); 15919 } 15920 if (addflag & (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) { 15921 toh = (struct T_opthdr *)optptr; 15922 toh->level = IPPROTO_IPV6; 15923 toh->name = IPV6_DSTOPTS; 15924 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen; 15925 toh->status = 0; 15926 optptr += sizeof (*toh); 15927 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen); 15928 optptr += ipp->ipp_dstoptslen; 15929 ASSERT(OK_32PTR(optptr)); 15930 /* Save as last value */ 15931 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, 15932 (ipp->ipp_fields & IPPF_DSTOPTS), 15933 ipp->ipp_dstopts, ipp->ipp_dstoptslen); 15934 } 15935 ASSERT(optptr == mp->b_wptr); 15936 return (mp); 15937 } 15938 15939 15940 /* 15941 * Handle a *T_BIND_REQ that has failed either due to a T_ERROR_ACK 15942 * or a "bad" IRE detected by tcp_adapt_ire. 15943 * We can't tell if the failure was due to the laddr or the faddr 15944 * thus we clear out all addresses and ports. 
15945 */ 15946 static void 15947 tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error) 15948 { 15949 queue_t *q = tcp->tcp_rq; 15950 tcph_t *tcph; 15951 struct T_error_ack *tea; 15952 conn_t *connp = tcp->tcp_connp; 15953 15954 15955 ASSERT(mp->b_datap->db_type == M_PCPROTO); 15956 15957 if (mp->b_cont) { 15958 freemsg(mp->b_cont); 15959 mp->b_cont = NULL; 15960 } 15961 tea = (struct T_error_ack *)mp->b_rptr; 15962 switch (tea->PRIM_type) { 15963 case T_BIND_ACK: 15964 /* 15965 * Need to unbind with classifier since we were just told that 15966 * our bind succeeded. 15967 */ 15968 tcp->tcp_hard_bound = B_FALSE; 15969 tcp->tcp_hard_binding = B_FALSE; 15970 15971 ipcl_hash_remove(connp); 15972 /* Reuse the mblk if possible */ 15973 ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >= 15974 sizeof (*tea)); 15975 mp->b_rptr = mp->b_datap->db_base; 15976 mp->b_wptr = mp->b_rptr + sizeof (*tea); 15977 tea = (struct T_error_ack *)mp->b_rptr; 15978 tea->PRIM_type = T_ERROR_ACK; 15979 tea->TLI_error = TSYSERR; 15980 tea->UNIX_error = error; 15981 if (tcp->tcp_state >= TCPS_SYN_SENT) { 15982 tea->ERROR_prim = T_CONN_REQ; 15983 } else { 15984 tea->ERROR_prim = O_T_BIND_REQ; 15985 } 15986 break; 15987 15988 case T_ERROR_ACK: 15989 if (tcp->tcp_state >= TCPS_SYN_SENT) 15990 tea->ERROR_prim = T_CONN_REQ; 15991 break; 15992 default: 15993 panic("tcp_bind_failed: unexpected TPI type"); 15994 /*NOTREACHED*/ 15995 } 15996 15997 tcp->tcp_state = TCPS_IDLE; 15998 if (tcp->tcp_ipversion == IPV4_VERSION) 15999 tcp->tcp_ipha->ipha_src = 0; 16000 else 16001 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); 16002 /* 16003 * Copy of the src addr. in tcp_t is needed since 16004 * the lookup funcs. can only look at tcp_t 16005 */ 16006 V6_SET_ZERO(tcp->tcp_ip_src_v6); 16007 16008 tcph = tcp->tcp_tcph; 16009 tcph->th_lport[0] = 0; 16010 tcph->th_lport[1] = 0; 16011 tcp_bind_hash_remove(tcp); 16012 bzero(&connp->u_port, sizeof (connp->u_port)); 16013 /* blow away saved option results if any */ 16014 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 16015 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 16016 16017 conn_delete_ire(tcp->tcp_connp, NULL); 16018 putnext(q, mp); 16019 } 16020 16021 /* 16022 * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA 16023 * messages. 16024 */ 16025 void 16026 tcp_rput_other(tcp_t *tcp, mblk_t *mp) 16027 { 16028 mblk_t *mp1; 16029 uchar_t *rptr = mp->b_rptr; 16030 queue_t *q = tcp->tcp_rq; 16031 struct T_error_ack *tea; 16032 uint32_t mss; 16033 mblk_t *syn_mp; 16034 mblk_t *mdti; 16035 mblk_t *lsoi; 16036 int retval; 16037 mblk_t *ire_mp; 16038 tcp_stack_t *tcps = tcp->tcp_tcps; 16039 16040 switch (mp->b_datap->db_type) { 16041 case M_PROTO: 16042 case M_PCPROTO: 16043 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 16044 if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) 16045 break; 16046 tea = (struct T_error_ack *)rptr; 16047 switch (tea->PRIM_type) { 16048 case T_BIND_ACK: 16049 /* 16050 * Adapt Multidata information, if any. The 16051 * following tcp_mdt_update routine will free 16052 * the message. 16053 */ 16054 if ((mdti = tcp_mdt_info_mp(mp)) != NULL) { 16055 tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti-> 16056 b_rptr)->mdt_capab, B_TRUE); 16057 freemsg(mdti); 16058 } 16059 16060 /* 16061 * Check to update LSO information with tcp, and 16062 * tcp_lso_update routine will free the message. 
16063 */ 16064 if ((lsoi = tcp_lso_info_mp(mp)) != NULL) { 16065 tcp_lso_update(tcp, &((ip_lso_info_t *)lsoi-> 16066 b_rptr)->lso_capab); 16067 freemsg(lsoi); 16068 } 16069 16070 /* Get the IRE, if we had requested for it */ 16071 ire_mp = tcp_ire_mp(mp); 16072 16073 if (tcp->tcp_hard_binding) { 16074 tcp->tcp_hard_binding = B_FALSE; 16075 tcp->tcp_hard_bound = B_TRUE; 16076 CL_INET_CONNECT(tcp); 16077 } else { 16078 if (ire_mp != NULL) 16079 freeb(ire_mp); 16080 goto after_syn_sent; 16081 } 16082 16083 retval = tcp_adapt_ire(tcp, ire_mp); 16084 if (ire_mp != NULL) 16085 freeb(ire_mp); 16086 if (retval == 0) { 16087 tcp_bind_failed(tcp, mp, 16088 (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? 16089 ENETUNREACH : EADDRNOTAVAIL)); 16090 return; 16091 } 16092 /* 16093 * Don't let an endpoint connect to itself. 16094 * Also checked in tcp_connect() but that 16095 * check can't handle the case when the 16096 * local IP address is INADDR_ANY. 16097 */ 16098 if (tcp->tcp_ipversion == IPV4_VERSION) { 16099 if ((tcp->tcp_ipha->ipha_dst == 16100 tcp->tcp_ipha->ipha_src) && 16101 (BE16_EQL(tcp->tcp_tcph->th_lport, 16102 tcp->tcp_tcph->th_fport))) { 16103 tcp_bind_failed(tcp, mp, EADDRNOTAVAIL); 16104 return; 16105 } 16106 } else { 16107 if (IN6_ARE_ADDR_EQUAL( 16108 &tcp->tcp_ip6h->ip6_dst, 16109 &tcp->tcp_ip6h->ip6_src) && 16110 (BE16_EQL(tcp->tcp_tcph->th_lport, 16111 tcp->tcp_tcph->th_fport))) { 16112 tcp_bind_failed(tcp, mp, EADDRNOTAVAIL); 16113 return; 16114 } 16115 } 16116 ASSERT(tcp->tcp_state == TCPS_SYN_SENT); 16117 /* 16118 * This should not be possible! Just for 16119 * defensive coding... 16120 */ 16121 if (tcp->tcp_state != TCPS_SYN_SENT) 16122 goto after_syn_sent; 16123 16124 if (is_system_labeled() && 16125 !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) { 16126 tcp_bind_failed(tcp, mp, EHOSTUNREACH); 16127 return; 16128 } 16129 16130 ASSERT(q == tcp->tcp_rq); 16131 /* 16132 * tcp_adapt_ire() does not adjust 16133 * for TCP/IP header length. 16134 */ 16135 mss = tcp->tcp_mss - tcp->tcp_hdr_len; 16136 16137 /* 16138 * Just make sure our rwnd is at 16139 * least tcp_recv_hiwat_mss * MSS 16140 * large, and round up to the nearest 16141 * MSS. 16142 * 16143 * We do the round up here because 16144 * we need to get the interface 16145 * MTU first before we can do the 16146 * round up. 16147 */ 16148 tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 16149 tcps->tcps_recv_hiwat_minmss * mss); 16150 q->q_hiwat = tcp->tcp_rwnd; 16151 tcp_set_ws_value(tcp); 16152 U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), 16153 tcp->tcp_tcph->th_win); 16154 if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) 16155 tcp->tcp_snd_ws_ok = B_TRUE; 16156 16157 /* 16158 * Set tcp_snd_ts_ok to true 16159 * so that tcp_xmit_mp will 16160 * include the timestamp 16161 * option in the SYN segment. 16162 */ 16163 if (tcps->tcps_tstamp_always || 16164 (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { 16165 tcp->tcp_snd_ts_ok = B_TRUE; 16166 } 16167 16168 /* 16169 * tcp_snd_sack_ok can be set in 16170 * tcp_adapt_ire() if the sack metric 16171 * is set. So check it here also. 16172 */ 16173 if (tcps->tcps_sack_permitted == 2 || 16174 tcp->tcp_snd_sack_ok) { 16175 if (tcp->tcp_sack_info == NULL) { 16176 tcp->tcp_sack_info = 16177 kmem_cache_alloc( 16178 tcp_sack_info_cache, 16179 KM_SLEEP); 16180 } 16181 tcp->tcp_snd_sack_ok = B_TRUE; 16182 } 16183 16184 /* 16185 * Should we use ECN? Note that the current 16186 * default value (SunOS 5.9) of tcp_ecn_permitted 16187 * is 1. 
The reason for doing this is that there 16188 * are equipments out there that will drop ECN 16189 * enabled IP packets. Setting it to 1 avoids 16190 * compatibility problems. 16191 */ 16192 if (tcps->tcps_ecn_permitted == 2) 16193 tcp->tcp_ecn_ok = B_TRUE; 16194 16195 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 16196 syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 16197 tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 16198 if (syn_mp) { 16199 cred_t *cr; 16200 pid_t pid; 16201 16202 /* 16203 * Obtain the credential from the 16204 * thread calling connect(); the credential 16205 * lives on in the second mblk which 16206 * originated from T_CONN_REQ and is echoed 16207 * with the T_BIND_ACK from ip. If none 16208 * can be found, default to the creator 16209 * of the socket. 16210 */ 16211 if (mp->b_cont == NULL || 16212 (cr = DB_CRED(mp->b_cont)) == NULL) { 16213 cr = tcp->tcp_cred; 16214 pid = tcp->tcp_cpid; 16215 } else { 16216 pid = DB_CPID(mp->b_cont); 16217 } 16218 16219 TCP_RECORD_TRACE(tcp, syn_mp, 16220 TCP_TRACE_SEND_PKT); 16221 mblk_setcred(syn_mp, cr); 16222 DB_CPID(syn_mp) = pid; 16223 tcp_send_data(tcp, tcp->tcp_wq, syn_mp); 16224 } 16225 after_syn_sent: 16226 /* 16227 * A trailer mblk indicates a waiting client upstream. 16228 * We complete here the processing begun in 16229 * either tcp_bind() or tcp_connect() by passing 16230 * upstream the reply message they supplied. 16231 */ 16232 mp1 = mp; 16233 mp = mp->b_cont; 16234 freeb(mp1); 16235 if (mp) 16236 break; 16237 return; 16238 case T_ERROR_ACK: 16239 if (tcp->tcp_debug) { 16240 (void) strlog(TCP_MOD_ID, 0, 1, 16241 SL_TRACE|SL_ERROR, 16242 "tcp_rput_other: case T_ERROR_ACK, " 16243 "ERROR_prim == %d", 16244 tea->ERROR_prim); 16245 } 16246 switch (tea->ERROR_prim) { 16247 case O_T_BIND_REQ: 16248 case T_BIND_REQ: 16249 tcp_bind_failed(tcp, mp, 16250 (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? 16251 ENETUNREACH : EADDRNOTAVAIL)); 16252 return; 16253 case T_UNBIND_REQ: 16254 tcp->tcp_hard_binding = B_FALSE; 16255 tcp->tcp_hard_bound = B_FALSE; 16256 if (mp->b_cont) { 16257 freemsg(mp->b_cont); 16258 mp->b_cont = NULL; 16259 } 16260 if (tcp->tcp_unbind_pending) 16261 tcp->tcp_unbind_pending = 0; 16262 else { 16263 /* From tcp_ip_unbind() - free */ 16264 freemsg(mp); 16265 return; 16266 } 16267 break; 16268 case T_SVR4_OPTMGMT_REQ: 16269 if (tcp->tcp_drop_opt_ack_cnt > 0) { 16270 /* T_OPTMGMT_REQ generated by TCP */ 16271 printf("T_SVR4_OPTMGMT_REQ failed " 16272 "%d/%d - dropped (cnt %d)\n", 16273 tea->TLI_error, tea->UNIX_error, 16274 tcp->tcp_drop_opt_ack_cnt); 16275 freemsg(mp); 16276 tcp->tcp_drop_opt_ack_cnt--; 16277 return; 16278 } 16279 break; 16280 } 16281 if (tea->ERROR_prim == T_SVR4_OPTMGMT_REQ && 16282 tcp->tcp_drop_opt_ack_cnt > 0) { 16283 printf("T_SVR4_OPTMGMT_REQ failed %d/%d " 16284 "- dropped (cnt %d)\n", 16285 tea->TLI_error, tea->UNIX_error, 16286 tcp->tcp_drop_opt_ack_cnt); 16287 freemsg(mp); 16288 tcp->tcp_drop_opt_ack_cnt--; 16289 return; 16290 } 16291 break; 16292 case T_OPTMGMT_ACK: 16293 if (tcp->tcp_drop_opt_ack_cnt > 0) { 16294 /* T_OPTMGMT_REQ generated by TCP */ 16295 freemsg(mp); 16296 tcp->tcp_drop_opt_ack_cnt--; 16297 return; 16298 } 16299 break; 16300 default: 16301 break; 16302 } 16303 break; 16304 case M_FLUSH: 16305 if (*rptr & FLUSHR) 16306 flushq(q, FLUSHDATA); 16307 break; 16308 default: 16309 /* M_CTL will be directly sent to tcp_icmp_error() */ 16310 ASSERT(DB_TYPE(mp) != M_CTL); 16311 break; 16312 } 16313 /* 16314 * Make sure we set this bit before sending the ACK for 16315 * bind. 
Otherwise accept could possibly run and free 16316 * this tcp struct. 16317 */ 16318 putnext(q, mp); 16319 } 16320 16321 /* 16322 * Called as the result of a qbufcall or a qtimeout to remedy a failure 16323 * to allocate a T_ordrel_ind in tcp_rsrv(). qenable(q) will make 16324 * tcp_rsrv() try again. 16325 */ 16326 static void 16327 tcp_ordrel_kick(void *arg) 16328 { 16329 conn_t *connp = (conn_t *)arg; 16330 tcp_t *tcp = connp->conn_tcp; 16331 16332 tcp->tcp_ordrelid = 0; 16333 tcp->tcp_timeout = B_FALSE; 16334 if (!TCP_IS_DETACHED(tcp) && tcp->tcp_rq != NULL && 16335 tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 16336 qenable(tcp->tcp_rq); 16337 } 16338 } 16339 16340 /* ARGSUSED */ 16341 static void 16342 tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) 16343 { 16344 conn_t *connp = (conn_t *)arg; 16345 tcp_t *tcp = connp->conn_tcp; 16346 queue_t *q = tcp->tcp_rq; 16347 uint_t thwin; 16348 tcp_stack_t *tcps = tcp->tcp_tcps; 16349 sodirect_t *sodp; 16350 boolean_t fc; 16351 16352 freeb(mp); 16353 16354 TCP_STAT(tcps, tcp_rsrv_calls); 16355 16356 if (TCP_IS_DETACHED(tcp) || q == NULL) { 16357 return; 16358 } 16359 16360 if (tcp->tcp_fused) { 16361 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 16362 16363 ASSERT(tcp->tcp_fused); 16364 ASSERT(peer_tcp != NULL && peer_tcp->tcp_fused); 16365 ASSERT(peer_tcp->tcp_loopback_peer == tcp); 16366 ASSERT(!TCP_IS_DETACHED(tcp)); 16367 ASSERT(tcp->tcp_connp->conn_sqp == 16368 peer_tcp->tcp_connp->conn_sqp); 16369 16370 /* 16371 * Normally we would not get backenabled in synchronous 16372 * streams mode, but in case this happens, we need to plug 16373 * synchronous streams during our drain to prevent a race 16374 * with tcp_fuse_rrw() or tcp_fuse_rinfop(). 16375 */ 16376 TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); 16377 if (tcp->tcp_rcv_list != NULL) 16378 (void) tcp_rcv_drain(tcp->tcp_rq, tcp); 16379 16380 if (peer_tcp > tcp) { 16381 mutex_enter(&peer_tcp->tcp_non_sq_lock); 16382 mutex_enter(&tcp->tcp_non_sq_lock); 16383 } else { 16384 mutex_enter(&tcp->tcp_non_sq_lock); 16385 mutex_enter(&peer_tcp->tcp_non_sq_lock); 16386 } 16387 16388 if (peer_tcp->tcp_flow_stopped && 16389 (TCP_UNSENT_BYTES(peer_tcp) <= 16390 peer_tcp->tcp_xmit_lowater)) { 16391 tcp_clrqfull(peer_tcp); 16392 } 16393 mutex_exit(&peer_tcp->tcp_non_sq_lock); 16394 mutex_exit(&tcp->tcp_non_sq_lock); 16395 16396 TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); 16397 TCP_STAT(tcps, tcp_fusion_backenabled); 16398 return; 16399 } 16400 16401 SOD_PTR_ENTER(tcp, sodp); 16402 if (sodp != NULL) { 16403 /* An sodirect connection */ 16404 if (SOD_QFULL(sodp)) { 16405 /* Flow-controlled, need another back-enable */ 16406 fc = B_TRUE; 16407 SOD_QSETBE(sodp); 16408 } else { 16409 /* Not flow-controlled */ 16410 fc = B_FALSE; 16411 } 16412 mutex_exit(sodp->sod_lock); 16413 } else if (canputnext(q)) { 16414 /* STREAMS, not flow-controlled */ 16415 fc = B_FALSE; 16416 } else { 16417 /* STREAMS, flow-controlled */ 16418 fc = B_TRUE; 16419 } 16420 if (!fc) { 16421 /* Not flow-controlled, open rwnd */ 16422 tcp->tcp_rwnd = q->q_hiwat; 16423 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 16424 << tcp->tcp_rcv_ws; 16425 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 16426 /* 16427 * Send back a window update immediately if TCP is above 16428 * ESTABLISHED state and the increase of the rcv window 16429 * that the other side knows is at least 1 MSS after flow 16430 * control is lifted. 
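 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): with the values computed just above, "thwin" is the unused
 * receive window the peer currently knows about -- the last advertised
 * window scaled back up by the receive window shift, minus the data
 * that has arrived but has not yet been acknowledged.  Using
 * hypothetical local names:
 *
 *	uint_t advertised = BE16_TO_U16(th_win) << rcv_ws;
 *	uint_t unacked = rnxt - rack;
 *	uint_t thwin = advertised - unacked;
 *	boolean_t update_now = (state >= TCPS_ESTABLISHED) &&
 *	    (q_hiwat - thwin >= mss);
 *
 * So if the stream head now allows 48K, the peer currently knows
 * about 40K, and the MSS is 1460, the 8K difference exceeds one MSS
 * and a pure ACK carrying the larger window is sent right away.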
16431 */ 16432 if (tcp->tcp_state >= TCPS_ESTABLISHED && 16433 (q->q_hiwat - thwin >= tcp->tcp_mss)) { 16434 tcp_xmit_ctl(NULL, tcp, 16435 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 16436 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 16437 BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); 16438 } 16439 } 16440 16441 /* Handle a failure to allocate a T_ORDREL_IND here */ 16442 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 16443 ASSERT(tcp->tcp_listener == NULL); 16444 16445 SOD_PTR_ENTER(tcp, sodp); 16446 if (sodp != NULL) { 16447 /* No more sodirect */ 16448 tcp->tcp_sodirect = NULL; 16449 if (!SOD_QEMPTY(sodp)) { 16450 /* Notify mblk(s) to process */ 16451 (void) tcp_rcv_sod_wakeup(tcp, sodp); 16452 /* sod_wakeup() does the mutex_exit() */ 16453 } else { 16454 /* Nothing to process */ 16455 mutex_exit(sodp->sod_lock); 16456 } 16457 } else if (tcp->tcp_rcv_list != NULL) { 16458 /* 16459 * Push any mblk(s) enqueued from co processing. 16460 */ 16461 (void) tcp_rcv_drain(tcp->tcp_rq, tcp); 16462 ASSERT(tcp->tcp_rcv_list == NULL || 16463 tcp->tcp_fused_sigurg); 16464 } 16465 16466 mp = mi_tpi_ordrel_ind(); 16467 if (mp) { 16468 tcp->tcp_ordrel_done = B_TRUE; 16469 putnext(q, mp); 16470 if (tcp->tcp_deferred_clean_death) { 16471 /* 16472 * tcp_clean_death was deferred for 16473 * T_ORDREL_IND - do it now 16474 */ 16475 tcp->tcp_deferred_clean_death = B_FALSE; 16476 (void) tcp_clean_death(tcp, 16477 tcp->tcp_client_errno, 22); 16478 } 16479 } else if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) { 16480 /* 16481 * If there isn't already a timer running 16482 * start one. Use a 4 second 16483 * timer as a fallback since it can't fail. 16484 */ 16485 tcp->tcp_timeout = B_TRUE; 16486 tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick, 16487 MSEC_TO_TICK(4000)); 16488 } 16489 } 16490 } 16491 16492 /* 16493 * The read side service routine is called mostly when we get back-enabled as a 16494 * result of flow control relief. Since we don't actually queue anything in 16495 * TCP, we have no data to send out of here. What we do is clear the receive 16496 * window, and send out a window update. 16497 * This routine is also called to drive an orderly release message upstream 16498 * if the attempt in tcp_rput failed. 16499 */ 16500 static void 16501 tcp_rsrv(queue_t *q) 16502 { 16503 conn_t *connp = Q_TO_CONN(q); 16504 tcp_t *tcp = connp->conn_tcp; 16505 mblk_t *mp; 16506 tcp_stack_t *tcps = tcp->tcp_tcps; 16507 16508 /* No code does a putq on the read side */ 16509 ASSERT(q->q_first == NULL); 16510 16511 /* Nothing to do for the default queue */ 16512 if (q == tcps->tcps_g_q) { 16513 return; 16514 } 16515 16516 mp = allocb(0, BPRI_HI); 16517 if (mp == NULL) { 16518 /* 16519 * We are under memory pressure. Return for now and we 16520 * we will be called again later. 16521 */ 16522 if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) { 16523 /* 16524 * If there isn't already a timer running 16525 * start one. Use a 4 second 16526 * timer as a fallback since it can't fail. 16527 */ 16528 tcp->tcp_timeout = B_TRUE; 16529 tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick, 16530 MSEC_TO_TICK(4000)); 16531 } 16532 return; 16533 } 16534 CONN_INC_REF(connp); 16535 squeue_enter(connp->conn_sqp, mp, tcp_rsrv_input, connp, 16536 SQTAG_TCP_RSRV); 16537 } 16538 16539 /* 16540 * tcp_rwnd_set() is called to adjust the receive window to a desired value. 16541 * We do not allow the receive window to shrink. After setting rwnd, 16542 * set the flow control hiwat of the stream. 
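 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): inside the function below the requested value is also
 * clamped to the largest window that can actually be advertised with
 * the current receive window scale, kept a multiple of the MSS where
 * possible.  In plain arithmetic, with hypothetical short names:
 *
 *	uint32_t limit = TCP_MAXWIN << rcv_ws;
 *	if (rwnd > limit) {
 *		rwnd = limit - (limit % mss);
 *		if (rwnd < mss)
 *			rwnd = limit;
 *	}
 *
 * For example, with rcv_ws = 1 and mss = 1460 the limit is 131070,
 * so a request for 200000 bytes is trimmed to 129940 (89 segments).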
16543 * 16544 * This function is called in 2 cases: 16545 * 16546 * 1) Before data transfer begins, in tcp_accept_comm() for accepting a 16547 * connection (passive open) and in tcp_rput_data() for active connect. 16548 * This is called after tcp_mss_set() when the desired MSS value is known. 16549 * This makes sure that our window size is a mutiple of the other side's 16550 * MSS. 16551 * 2) Handling SO_RCVBUF option. 16552 * 16553 * It is ASSUMED that the requested size is a multiple of the current MSS. 16554 * 16555 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the 16556 * user requests so. 16557 */ 16558 static int 16559 tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) 16560 { 16561 uint32_t mss = tcp->tcp_mss; 16562 uint32_t old_max_rwnd; 16563 uint32_t max_transmittable_rwnd; 16564 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 16565 tcp_stack_t *tcps = tcp->tcp_tcps; 16566 16567 if (tcp->tcp_fused) { 16568 size_t sth_hiwat; 16569 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 16570 16571 ASSERT(peer_tcp != NULL); 16572 /* 16573 * Record the stream head's high water mark for 16574 * this endpoint; this is used for flow-control 16575 * purposes in tcp_fuse_output(). 16576 */ 16577 sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); 16578 if (!tcp_detached) 16579 (void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat); 16580 16581 /* 16582 * In the fusion case, the maxpsz stream head value of 16583 * our peer is set according to its send buffer size 16584 * and our receive buffer size; since the latter may 16585 * have changed we need to update the peer's maxpsz. 16586 */ 16587 (void) tcp_maxpsz_set(peer_tcp, B_TRUE); 16588 return (rwnd); 16589 } 16590 16591 if (tcp_detached) 16592 old_max_rwnd = tcp->tcp_rwnd; 16593 else 16594 old_max_rwnd = tcp->tcp_rq->q_hiwat; 16595 16596 /* 16597 * Insist on a receive window that is at least 16598 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid 16599 * funny TCP interactions of Nagle algorithm, SWS avoidance 16600 * and delayed acknowledgement. 16601 */ 16602 rwnd = MAX(rwnd, tcps->tcps_recv_hiwat_minmss * mss); 16603 16604 /* 16605 * If window size info has already been exchanged, TCP should not 16606 * shrink the window. Shrinking window is doable if done carefully. 16607 * We may add that support later. But so far there is not a real 16608 * need to do that. 16609 */ 16610 if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) { 16611 /* MSS may have changed, do a round up again. */ 16612 rwnd = MSS_ROUNDUP(old_max_rwnd, mss); 16613 } 16614 16615 /* 16616 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check 16617 * can be applied even before the window scale option is decided. 16618 */ 16619 max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws; 16620 if (rwnd > max_transmittable_rwnd) { 16621 rwnd = max_transmittable_rwnd - 16622 (max_transmittable_rwnd % mss); 16623 if (rwnd < mss) 16624 rwnd = max_transmittable_rwnd; 16625 /* 16626 * If we're over the limit we may have to back down tcp_rwnd. 16627 * The increment below won't work for us. So we set all three 16628 * here and the increment below will have no effect. 16629 */ 16630 tcp->tcp_rwnd = old_max_rwnd = rwnd; 16631 } 16632 if (tcp->tcp_localnet) { 16633 tcp->tcp_rack_abs_max = 16634 MIN(tcps->tcps_local_dacks_max, rwnd / mss / 2); 16635 } else { 16636 /* 16637 * For a remote host on a different subnet (through a router), 16638 * we ack every other packet to be conforming to RFC1122. 16639 * tcp_deferred_acks_max is default to 2. 
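 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): in both branches the ceiling on deferred ACKs works out to
 * the smaller of the respective tunable and half the number of
 * full-sized segments that fit in the receive window.  With
 * hypothetical short names:
 *
 *	uint32_t ceiling = rwnd / mss / 2;
 *	if (ceiling > dacks_max)
 *		ceiling = dacks_max;
 *	rack_abs_max = ceiling;
 *
 * With the default tcp_deferred_acks_max of 2, the window-derived
 * value (16 for a 48K window and a 1460-byte MSS) does not matter and
 * at most every second full-sized segment has its ACK deferred, as
 * RFC 1122 suggests for the off-subnet case.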
16640 */ 16641 tcp->tcp_rack_abs_max = 16642 MIN(tcps->tcps_deferred_acks_max, rwnd / mss / 2); 16643 } 16644 if (tcp->tcp_rack_cur_max > tcp->tcp_rack_abs_max) 16645 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 16646 else 16647 tcp->tcp_rack_cur_max = 0; 16648 /* 16649 * Increment the current rwnd by the amount the maximum grew (we 16650 * can not overwrite it since we might be in the middle of a 16651 * connection.) 16652 */ 16653 tcp->tcp_rwnd += rwnd - old_max_rwnd; 16654 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 16655 if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 16656 tcp->tcp_cwnd_max = rwnd; 16657 16658 if (tcp_detached) 16659 return (rwnd); 16660 /* 16661 * We set the maximum receive window into rq->q_hiwat. 16662 * This is not actually used for flow control. 16663 */ 16664 tcp->tcp_rq->q_hiwat = rwnd; 16665 /* 16666 * Set the Stream head high water mark. This doesn't have to be 16667 * here, since we are simply using default values, but we would 16668 * prefer to choose these values algorithmically, with a likely 16669 * relationship to rwnd. 16670 */ 16671 (void) mi_set_sth_hiwat(tcp->tcp_rq, 16672 MAX(rwnd, tcps->tcps_sth_rcv_hiwat)); 16673 return (rwnd); 16674 } 16675 16676 /* 16677 * Return SNMP stuff in buffer in mpdata. 16678 */ 16679 mblk_t * 16680 tcp_snmp_get(queue_t *q, mblk_t *mpctl) 16681 { 16682 mblk_t *mpdata; 16683 mblk_t *mp_conn_ctl = NULL; 16684 mblk_t *mp_conn_tail; 16685 mblk_t *mp_attr_ctl = NULL; 16686 mblk_t *mp_attr_tail; 16687 mblk_t *mp6_conn_ctl = NULL; 16688 mblk_t *mp6_conn_tail; 16689 mblk_t *mp6_attr_ctl = NULL; 16690 mblk_t *mp6_attr_tail; 16691 struct opthdr *optp; 16692 mib2_tcpConnEntry_t tce; 16693 mib2_tcp6ConnEntry_t tce6; 16694 mib2_transportMLPEntry_t mlp; 16695 connf_t *connfp; 16696 int i; 16697 boolean_t ispriv; 16698 zoneid_t zoneid; 16699 int v4_conn_idx; 16700 int v6_conn_idx; 16701 conn_t *connp = Q_TO_CONN(q); 16702 tcp_stack_t *tcps; 16703 ip_stack_t *ipst; 16704 mblk_t *mp2ctl; 16705 16706 /* 16707 * make a copy of the original message 16708 */ 16709 mp2ctl = copymsg(mpctl); 16710 16711 if (mpctl == NULL || 16712 (mpdata = mpctl->b_cont) == NULL || 16713 (mp_conn_ctl = copymsg(mpctl)) == NULL || 16714 (mp_attr_ctl = copymsg(mpctl)) == NULL || 16715 (mp6_conn_ctl = copymsg(mpctl)) == NULL || 16716 (mp6_attr_ctl = copymsg(mpctl)) == NULL) { 16717 freemsg(mp_conn_ctl); 16718 freemsg(mp_attr_ctl); 16719 freemsg(mp6_conn_ctl); 16720 freemsg(mp6_attr_ctl); 16721 freemsg(mpctl); 16722 freemsg(mp2ctl); 16723 return (NULL); 16724 } 16725 16726 ipst = connp->conn_netstack->netstack_ip; 16727 tcps = connp->conn_netstack->netstack_tcp; 16728 16729 /* build table of connections -- need count in fixed part */ 16730 SET_MIB(tcps->tcps_mib.tcpRtoAlgorithm, 4); /* vanj */ 16731 SET_MIB(tcps->tcps_mib.tcpRtoMin, tcps->tcps_rexmit_interval_min); 16732 SET_MIB(tcps->tcps_mib.tcpRtoMax, tcps->tcps_rexmit_interval_max); 16733 SET_MIB(tcps->tcps_mib.tcpMaxConn, -1); 16734 SET_MIB(tcps->tcps_mib.tcpCurrEstab, 0); 16735 16736 ispriv = 16737 secpolicy_ip_config((Q_TO_CONN(q))->conn_cred, B_TRUE) == 0; 16738 zoneid = Q_TO_CONN(q)->conn_zoneid; 16739 16740 v4_conn_idx = v6_conn_idx = 0; 16741 mp_conn_tail = mp_attr_tail = mp6_conn_tail = mp6_attr_tail = NULL; 16742 16743 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 16744 ipst = tcps->tcps_netstack->netstack_ip; 16745 16746 connfp = &ipst->ips_ipcl_globalhash_fanout[i]; 16747 16748 connp = NULL; 16749 16750 while ((connp = 16751 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) 
!= NULL) { 16752 tcp_t *tcp; 16753 boolean_t needattr; 16754 16755 if (connp->conn_zoneid != zoneid) 16756 continue; /* not in this zone */ 16757 16758 tcp = connp->conn_tcp; 16759 UPDATE_MIB(&tcps->tcps_mib, 16760 tcpHCInSegs, tcp->tcp_ibsegs); 16761 tcp->tcp_ibsegs = 0; 16762 UPDATE_MIB(&tcps->tcps_mib, 16763 tcpHCOutSegs, tcp->tcp_obsegs); 16764 tcp->tcp_obsegs = 0; 16765 16766 tce6.tcp6ConnState = tce.tcpConnState = 16767 tcp_snmp_state(tcp); 16768 if (tce.tcpConnState == MIB2_TCP_established || 16769 tce.tcpConnState == MIB2_TCP_closeWait) 16770 BUMP_MIB(&tcps->tcps_mib, tcpCurrEstab); 16771 16772 needattr = B_FALSE; 16773 bzero(&mlp, sizeof (mlp)); 16774 if (connp->conn_mlp_type != mlptSingle) { 16775 if (connp->conn_mlp_type == mlptShared || 16776 connp->conn_mlp_type == mlptBoth) 16777 mlp.tme_flags |= MIB2_TMEF_SHARED; 16778 if (connp->conn_mlp_type == mlptPrivate || 16779 connp->conn_mlp_type == mlptBoth) 16780 mlp.tme_flags |= MIB2_TMEF_PRIVATE; 16781 needattr = B_TRUE; 16782 } 16783 if (connp->conn_peercred != NULL) { 16784 ts_label_t *tsl; 16785 16786 tsl = crgetlabel(connp->conn_peercred); 16787 mlp.tme_doi = label2doi(tsl); 16788 mlp.tme_label = *label2bslabel(tsl); 16789 needattr = B_TRUE; 16790 } 16791 16792 /* Create a message to report on IPv6 entries */ 16793 if (tcp->tcp_ipversion == IPV6_VERSION) { 16794 tce6.tcp6ConnLocalAddress = tcp->tcp_ip_src_v6; 16795 tce6.tcp6ConnRemAddress = tcp->tcp_remote_v6; 16796 tce6.tcp6ConnLocalPort = ntohs(tcp->tcp_lport); 16797 tce6.tcp6ConnRemPort = ntohs(tcp->tcp_fport); 16798 tce6.tcp6ConnIfIndex = tcp->tcp_bound_if; 16799 /* Don't want just anybody seeing these... */ 16800 if (ispriv) { 16801 tce6.tcp6ConnEntryInfo.ce_snxt = 16802 tcp->tcp_snxt; 16803 tce6.tcp6ConnEntryInfo.ce_suna = 16804 tcp->tcp_suna; 16805 tce6.tcp6ConnEntryInfo.ce_rnxt = 16806 tcp->tcp_rnxt; 16807 tce6.tcp6ConnEntryInfo.ce_rack = 16808 tcp->tcp_rack; 16809 } else { 16810 /* 16811 * Netstat, unfortunately, uses this to 16812 * get send/receive queue sizes. How to fix? 16813 * Why not compute the difference only? 16814 */ 16815 tce6.tcp6ConnEntryInfo.ce_snxt = 16816 tcp->tcp_snxt - tcp->tcp_suna; 16817 tce6.tcp6ConnEntryInfo.ce_suna = 0; 16818 tce6.tcp6ConnEntryInfo.ce_rnxt = 16819 tcp->tcp_rnxt - tcp->tcp_rack; 16820 tce6.tcp6ConnEntryInfo.ce_rack = 0; 16821 } 16822 16823 tce6.tcp6ConnEntryInfo.ce_swnd = tcp->tcp_swnd; 16824 tce6.tcp6ConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; 16825 tce6.tcp6ConnEntryInfo.ce_rto = tcp->tcp_rto; 16826 tce6.tcp6ConnEntryInfo.ce_mss = tcp->tcp_mss; 16827 tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; 16828 16829 tce6.tcp6ConnCreationProcess = 16830 (tcp->tcp_cpid < 0) ? MIB2_UNKNOWN_PROCESS : 16831 tcp->tcp_cpid; 16832 tce6.tcp6ConnCreationTime = tcp->tcp_open_time; 16833 16834 (void) snmp_append_data2(mp6_conn_ctl->b_cont, 16835 &mp6_conn_tail, (char *)&tce6, sizeof (tce6)); 16836 16837 mlp.tme_connidx = v6_conn_idx++; 16838 if (needattr) 16839 (void) snmp_append_data2(mp6_attr_ctl->b_cont, 16840 &mp6_attr_tail, (char *)&mlp, sizeof (mlp)); 16841 } 16842 /* 16843 * Create an IPv4 table entry for IPv4 entries and also 16844 * for IPv6 entries which are bound to in6addr_any 16845 * but don't have IPV6_V6ONLY set. 16846 * (i.e. 
anything an IPv4 peer could connect to) 16847 */ 16848 if (tcp->tcp_ipversion == IPV4_VERSION || 16849 (tcp->tcp_state <= TCPS_LISTEN && 16850 !tcp->tcp_connp->conn_ipv6_v6only && 16851 IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6))) { 16852 if (tcp->tcp_ipversion == IPV6_VERSION) { 16853 tce.tcpConnRemAddress = INADDR_ANY; 16854 tce.tcpConnLocalAddress = INADDR_ANY; 16855 } else { 16856 tce.tcpConnRemAddress = 16857 tcp->tcp_remote; 16858 tce.tcpConnLocalAddress = 16859 tcp->tcp_ip_src; 16860 } 16861 tce.tcpConnLocalPort = ntohs(tcp->tcp_lport); 16862 tce.tcpConnRemPort = ntohs(tcp->tcp_fport); 16863 /* Don't want just anybody seeing these... */ 16864 if (ispriv) { 16865 tce.tcpConnEntryInfo.ce_snxt = 16866 tcp->tcp_snxt; 16867 tce.tcpConnEntryInfo.ce_suna = 16868 tcp->tcp_suna; 16869 tce.tcpConnEntryInfo.ce_rnxt = 16870 tcp->tcp_rnxt; 16871 tce.tcpConnEntryInfo.ce_rack = 16872 tcp->tcp_rack; 16873 } else { 16874 /* 16875 * Netstat, unfortunately, uses this to 16876 * get send/receive queue sizes. How 16877 * to fix? 16878 * Why not compute the difference only? 16879 */ 16880 tce.tcpConnEntryInfo.ce_snxt = 16881 tcp->tcp_snxt - tcp->tcp_suna; 16882 tce.tcpConnEntryInfo.ce_suna = 0; 16883 tce.tcpConnEntryInfo.ce_rnxt = 16884 tcp->tcp_rnxt - tcp->tcp_rack; 16885 tce.tcpConnEntryInfo.ce_rack = 0; 16886 } 16887 16888 tce.tcpConnEntryInfo.ce_swnd = tcp->tcp_swnd; 16889 tce.tcpConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; 16890 tce.tcpConnEntryInfo.ce_rto = tcp->tcp_rto; 16891 tce.tcpConnEntryInfo.ce_mss = tcp->tcp_mss; 16892 tce.tcpConnEntryInfo.ce_state = 16893 tcp->tcp_state; 16894 16895 tce.tcpConnCreationProcess = 16896 (tcp->tcp_cpid < 0) ? MIB2_UNKNOWN_PROCESS : 16897 tcp->tcp_cpid; 16898 tce.tcpConnCreationTime = tcp->tcp_open_time; 16899 16900 (void) snmp_append_data2(mp_conn_ctl->b_cont, 16901 &mp_conn_tail, (char *)&tce, sizeof (tce)); 16902 16903 mlp.tme_connidx = v4_conn_idx++; 16904 if (needattr) 16905 (void) snmp_append_data2( 16906 mp_attr_ctl->b_cont, 16907 &mp_attr_tail, (char *)&mlp, 16908 sizeof (mlp)); 16909 } 16910 } 16911 } 16912 16913 /* fixed length structure for IPv4 and IPv6 counters */ 16914 SET_MIB(tcps->tcps_mib.tcpConnTableSize, sizeof (mib2_tcpConnEntry_t)); 16915 SET_MIB(tcps->tcps_mib.tcp6ConnTableSize, 16916 sizeof (mib2_tcp6ConnEntry_t)); 16917 /* synchronize 32- and 64-bit counters */ 16918 SYNC32_MIB(&tcps->tcps_mib, tcpInSegs, tcpHCInSegs); 16919 SYNC32_MIB(&tcps->tcps_mib, tcpOutSegs, tcpHCOutSegs); 16920 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 16921 optp->level = MIB2_TCP; 16922 optp->name = 0; 16923 (void) snmp_append_data(mpdata, (char *)&tcps->tcps_mib, 16924 sizeof (tcps->tcps_mib)); 16925 optp->len = msgdsize(mpdata); 16926 qreply(q, mpctl); 16927 16928 /* table of connections... */ 16929 optp = (struct opthdr *)&mp_conn_ctl->b_rptr[ 16930 sizeof (struct T_optmgmt_ack)]; 16931 optp->level = MIB2_TCP; 16932 optp->name = MIB2_TCP_CONN; 16933 optp->len = msgdsize(mp_conn_ctl->b_cont); 16934 qreply(q, mp_conn_ctl); 16935 16936 /* table of MLP attributes... */ 16937 optp = (struct opthdr *)&mp_attr_ctl->b_rptr[ 16938 sizeof (struct T_optmgmt_ack)]; 16939 optp->level = MIB2_TCP; 16940 optp->name = EXPER_XPORT_MLP; 16941 optp->len = msgdsize(mp_attr_ctl->b_cont); 16942 if (optp->len == 0) 16943 freemsg(mp_attr_ctl); 16944 else 16945 qreply(q, mp_attr_ctl); 16946 16947 /* table of IPv6 connections... 
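 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): each reply above follows the same pattern -- the control
 * mblk begins with a T_optmgmt_ack, the opthdr sits directly behind
 * it, and the payload length is whatever snmp_append_data2() queued
 * on b_cont.  A hypothetical helper capturing that pattern:
 *
 *	static void
 *	tcp_snmp_send_table(queue_t *q, mblk_t *ctl, int level, int name)
 *	{
 *		struct opthdr *optp = (struct opthdr *)
 *		    &ctl->b_rptr[sizeof (struct T_optmgmt_ack)];
 *
 *		optp->level = level;
 *		optp->name = name;
 *		optp->len = msgdsize(ctl->b_cont);
 *		qreply(q, ctl);
 *	}
 *
 * The MLP attribute tables differ only in that an empty table is
 * freed rather than sent upstream.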
*/ 16948 optp = (struct opthdr *)&mp6_conn_ctl->b_rptr[ 16949 sizeof (struct T_optmgmt_ack)]; 16950 optp->level = MIB2_TCP6; 16951 optp->name = MIB2_TCP6_CONN; 16952 optp->len = msgdsize(mp6_conn_ctl->b_cont); 16953 qreply(q, mp6_conn_ctl); 16954 16955 /* table of IPv6 MLP attributes... */ 16956 optp = (struct opthdr *)&mp6_attr_ctl->b_rptr[ 16957 sizeof (struct T_optmgmt_ack)]; 16958 optp->level = MIB2_TCP6; 16959 optp->name = EXPER_XPORT_MLP; 16960 optp->len = msgdsize(mp6_attr_ctl->b_cont); 16961 if (optp->len == 0) 16962 freemsg(mp6_attr_ctl); 16963 else 16964 qreply(q, mp6_attr_ctl); 16965 return (mp2ctl); 16966 } 16967 16968 /* Return 0 if invalid set request, 1 otherwise, including non-tcp requests */ 16969 /* ARGSUSED */ 16970 int 16971 tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) 16972 { 16973 mib2_tcpConnEntry_t *tce = (mib2_tcpConnEntry_t *)ptr; 16974 16975 switch (level) { 16976 case MIB2_TCP: 16977 switch (name) { 16978 case 13: 16979 if (tce->tcpConnState != MIB2_TCP_deleteTCB) 16980 return (0); 16981 /* TODO: delete entry defined by tce */ 16982 return (1); 16983 default: 16984 return (0); 16985 } 16986 default: 16987 return (1); 16988 } 16989 } 16990 16991 /* Translate TCP state to MIB2 TCP state. */ 16992 static int 16993 tcp_snmp_state(tcp_t *tcp) 16994 { 16995 if (tcp == NULL) 16996 return (0); 16997 16998 switch (tcp->tcp_state) { 16999 case TCPS_CLOSED: 17000 case TCPS_IDLE: /* RFC1213 doesn't have analogue for IDLE & BOUND */ 17001 case TCPS_BOUND: 17002 return (MIB2_TCP_closed); 17003 case TCPS_LISTEN: 17004 return (MIB2_TCP_listen); 17005 case TCPS_SYN_SENT: 17006 return (MIB2_TCP_synSent); 17007 case TCPS_SYN_RCVD: 17008 return (MIB2_TCP_synReceived); 17009 case TCPS_ESTABLISHED: 17010 return (MIB2_TCP_established); 17011 case TCPS_CLOSE_WAIT: 17012 return (MIB2_TCP_closeWait); 17013 case TCPS_FIN_WAIT_1: 17014 return (MIB2_TCP_finWait1); 17015 case TCPS_CLOSING: 17016 return (MIB2_TCP_closing); 17017 case TCPS_LAST_ACK: 17018 return (MIB2_TCP_lastAck); 17019 case TCPS_FIN_WAIT_2: 17020 return (MIB2_TCP_finWait2); 17021 case TCPS_TIME_WAIT: 17022 return (MIB2_TCP_timeWait); 17023 default: 17024 return (0); 17025 } 17026 } 17027 17028 static char tcp_report_header[] = 17029 "TCP " MI_COL_HDRPAD_STR 17030 "zone dest snxt suna " 17031 "swnd rnxt rack rwnd rto mss w sw rw t " 17032 "recent [lport,fport] state"; 17033 17034 /* 17035 * TCP status report triggered via the Named Dispatch mechanism. 17036 */ 17037 /* ARGSUSED */ 17038 static void 17039 tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, tcp_t *thisstream, 17040 cred_t *cr) 17041 { 17042 char hash[10], addrbuf[INET6_ADDRSTRLEN]; 17043 boolean_t ispriv = secpolicy_ip_config(cr, B_TRUE) == 0; 17044 char cflag; 17045 in6_addr_t v6dst; 17046 char buf[80]; 17047 uint_t print_len, buf_len; 17048 17049 buf_len = mp->b_datap->db_lim - mp->b_wptr; 17050 if (buf_len <= 0) 17051 return; 17052 17053 if (hashval >= 0) 17054 (void) sprintf(hash, "%03d ", hashval); 17055 else 17056 hash[0] = '\0'; 17057 17058 /* 17059 * Note that we use the remote address in the tcp_b structure. 17060 * This means that it will print out the real destination address, 17061 * not the next hop's address if source routing is used. This 17062 * avoid the confusion on the output because user may not 17063 * know that source routing is used for a connection. 
17064 */ 17065 if (tcp->tcp_ipversion == IPV4_VERSION) { 17066 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &v6dst); 17067 } else { 17068 v6dst = tcp->tcp_remote_v6; 17069 } 17070 (void) inet_ntop(AF_INET6, &v6dst, addrbuf, sizeof (addrbuf)); 17071 /* 17072 * the ispriv checks are so that normal users cannot determine 17073 * sequence number information using NDD. 17074 */ 17075 17076 if (TCP_IS_DETACHED(tcp)) 17077 cflag = '*'; 17078 else 17079 cflag = ' '; 17080 print_len = snprintf((char *)mp->b_wptr, buf_len, 17081 "%s " MI_COL_PTRFMT_STR "%d %s %08x %08x %010d %08x %08x " 17082 "%010d %05ld %05d %1d %02d %02d %1d %08x %s%c\n", 17083 hash, 17084 (void *)tcp, 17085 tcp->tcp_connp->conn_zoneid, 17086 addrbuf, 17087 (ispriv) ? tcp->tcp_snxt : 0, 17088 (ispriv) ? tcp->tcp_suna : 0, 17089 tcp->tcp_swnd, 17090 (ispriv) ? tcp->tcp_rnxt : 0, 17091 (ispriv) ? tcp->tcp_rack : 0, 17092 tcp->tcp_rwnd, 17093 tcp->tcp_rto, 17094 tcp->tcp_mss, 17095 tcp->tcp_snd_ws_ok, 17096 tcp->tcp_snd_ws, 17097 tcp->tcp_rcv_ws, 17098 tcp->tcp_snd_ts_ok, 17099 tcp->tcp_ts_recent, 17100 tcp_display(tcp, buf, DISP_PORT_ONLY), cflag); 17101 if (print_len < buf_len) { 17102 ((mblk_t *)mp)->b_wptr += print_len; 17103 } else { 17104 ((mblk_t *)mp)->b_wptr += buf_len; 17105 } 17106 } 17107 17108 /* 17109 * TCP status report (for listeners only) triggered via the Named Dispatch 17110 * mechanism. 17111 */ 17112 /* ARGSUSED */ 17113 static void 17114 tcp_report_listener(mblk_t *mp, tcp_t *tcp, int hashval) 17115 { 17116 char addrbuf[INET6_ADDRSTRLEN]; 17117 in6_addr_t v6dst; 17118 uint_t print_len, buf_len; 17119 17120 buf_len = mp->b_datap->db_lim - mp->b_wptr; 17121 if (buf_len <= 0) 17122 return; 17123 17124 if (tcp->tcp_ipversion == IPV4_VERSION) { 17125 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6dst); 17126 (void) inet_ntop(AF_INET6, &v6dst, addrbuf, sizeof (addrbuf)); 17127 } else { 17128 (void) inet_ntop(AF_INET6, &tcp->tcp_ip6h->ip6_src, 17129 addrbuf, sizeof (addrbuf)); 17130 } 17131 print_len = snprintf((char *)mp->b_wptr, buf_len, 17132 "%03d " 17133 MI_COL_PTRFMT_STR 17134 "%d %s %05u %08u %d/%d/%d%c\n", 17135 hashval, (void *)tcp, 17136 tcp->tcp_connp->conn_zoneid, 17137 addrbuf, 17138 (uint_t)BE16_TO_U16(tcp->tcp_tcph->th_lport), 17139 tcp->tcp_conn_req_seqnum, 17140 tcp->tcp_conn_req_cnt_q0, tcp->tcp_conn_req_cnt_q, 17141 tcp->tcp_conn_req_max, 17142 tcp->tcp_syn_defense ? '*' : ' '); 17143 if (print_len < buf_len) { 17144 ((mblk_t *)mp)->b_wptr += print_len; 17145 } else { 17146 ((mblk_t *)mp)->b_wptr += buf_len; 17147 } 17148 } 17149 17150 /* TCP status report triggered via the Named Dispatch mechanism. */ 17151 /* ARGSUSED */ 17152 static int 17153 tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 17154 { 17155 tcp_t *tcp; 17156 int i; 17157 conn_t *connp; 17158 connf_t *connfp; 17159 zoneid_t zoneid; 17160 tcp_stack_t *tcps; 17161 ip_stack_t *ipst; 17162 17163 zoneid = Q_TO_CONN(q)->conn_zoneid; 17164 tcps = Q_TO_TCP(q)->tcp_tcps; 17165 17166 /* 17167 * Because of the ndd constraint, at most we can have 64K buffer 17168 * to put in all TCP info. So to be more efficient, just 17169 * allocate a 64K buffer here, assuming we need that large buffer. 17170 * This may be a problem as any user can read tcp_status. Therefore 17171 * we limit the rate of doing this using tcp_ndd_get_info_interval. 17172 * This should be OK as normal users should not do this too often. 
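 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): the throttle below works in clock ticks.  The tunable is
 * kept in milliseconds, so it is scaled to microseconds for
 * drv_usectohz() and compared against the lbolt delta.  With
 * shortened field names:
 *
 *	clock_t min_gap = drv_usectohz(ndd_get_info_interval * 1000);
 *	boolean_t too_soon =
 *	    (ddi_get_lbolt() - last_ndd_get_info_time) < min_gap;
 *
 * Only unprivileged callers (those failing secpolicy_ip_config())
 * are throttled this way; privileged readers always get the report.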
17173 */ 17174 if (cr == NULL || secpolicy_ip_config(cr, B_TRUE) != 0) { 17175 if (ddi_get_lbolt() - tcps->tcps_last_ndd_get_info_time < 17176 drv_usectohz(tcps->tcps_ndd_get_info_interval * 1000)) { 17177 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 17178 return (0); 17179 } 17180 } 17181 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 17182 /* The following may work even if we cannot get a large buf. */ 17183 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 17184 return (0); 17185 } 17186 17187 (void) mi_mpprintf(mp, "%s", tcp_report_header); 17188 17189 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 17190 17191 ipst = tcps->tcps_netstack->netstack_ip; 17192 connfp = &ipst->ips_ipcl_globalhash_fanout[i]; 17193 17194 connp = NULL; 17195 17196 while ((connp = 17197 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 17198 tcp = connp->conn_tcp; 17199 if (zoneid != GLOBAL_ZONEID && 17200 zoneid != connp->conn_zoneid) 17201 continue; 17202 tcp_report_item(mp->b_cont, tcp, -1, tcp, 17203 cr); 17204 } 17205 17206 } 17207 17208 tcps->tcps_last_ndd_get_info_time = ddi_get_lbolt(); 17209 return (0); 17210 } 17211 17212 /* TCP status report triggered via the Named Dispatch mechanism. */ 17213 /* ARGSUSED */ 17214 static int 17215 tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 17216 { 17217 tf_t *tbf; 17218 tcp_t *tcp; 17219 int i; 17220 zoneid_t zoneid; 17221 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 17222 17223 zoneid = Q_TO_CONN(q)->conn_zoneid; 17224 17225 /* Refer to comments in tcp_status_report(). */ 17226 if (cr == NULL || secpolicy_ip_config(cr, B_TRUE) != 0) { 17227 if (ddi_get_lbolt() - tcps->tcps_last_ndd_get_info_time < 17228 drv_usectohz(tcps->tcps_ndd_get_info_interval * 1000)) { 17229 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 17230 return (0); 17231 } 17232 } 17233 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 17234 /* The following may work even if we cannot get a large buf. */ 17235 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 17236 return (0); 17237 } 17238 17239 (void) mi_mpprintf(mp, " %s", tcp_report_header); 17240 17241 for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { 17242 tbf = &tcps->tcps_bind_fanout[i]; 17243 mutex_enter(&tbf->tf_lock); 17244 for (tcp = tbf->tf_tcp; tcp != NULL; 17245 tcp = tcp->tcp_bind_hash) { 17246 if (zoneid != GLOBAL_ZONEID && 17247 zoneid != tcp->tcp_connp->conn_zoneid) 17248 continue; 17249 CONN_INC_REF(tcp->tcp_connp); 17250 tcp_report_item(mp->b_cont, tcp, i, 17251 Q_TO_TCP(q), cr); 17252 CONN_DEC_REF(tcp->tcp_connp); 17253 } 17254 mutex_exit(&tbf->tf_lock); 17255 } 17256 tcps->tcps_last_ndd_get_info_time = ddi_get_lbolt(); 17257 return (0); 17258 } 17259 17260 /* TCP status report triggered via the Named Dispatch mechanism. */ 17261 /* ARGSUSED */ 17262 static int 17263 tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 17264 { 17265 connf_t *connfp; 17266 conn_t *connp; 17267 tcp_t *tcp; 17268 int i; 17269 zoneid_t zoneid; 17270 tcp_stack_t *tcps; 17271 ip_stack_t *ipst; 17272 17273 zoneid = Q_TO_CONN(q)->conn_zoneid; 17274 tcps = Q_TO_TCP(q)->tcp_tcps; 17275 17276 /* Refer to comments in tcp_status_report(). 
*/ 17277 if (cr == NULL || secpolicy_ip_config(cr, B_TRUE) != 0) { 17278 if (ddi_get_lbolt() - tcps->tcps_last_ndd_get_info_time < 17279 drv_usectohz(tcps->tcps_ndd_get_info_interval * 1000)) { 17280 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 17281 return (0); 17282 } 17283 } 17284 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 17285 /* The following may work even if we cannot get a large buf. */ 17286 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 17287 return (0); 17288 } 17289 17290 (void) mi_mpprintf(mp, 17291 " TCP " MI_COL_HDRPAD_STR 17292 "zone IP addr port seqnum backlog (q0/q/max)"); 17293 17294 ipst = tcps->tcps_netstack->netstack_ip; 17295 17296 for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { 17297 connfp = &ipst->ips_ipcl_bind_fanout[i]; 17298 connp = NULL; 17299 while ((connp = 17300 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 17301 tcp = connp->conn_tcp; 17302 if (zoneid != GLOBAL_ZONEID && 17303 zoneid != connp->conn_zoneid) 17304 continue; 17305 tcp_report_listener(mp->b_cont, tcp, i); 17306 } 17307 } 17308 17309 tcps->tcps_last_ndd_get_info_time = ddi_get_lbolt(); 17310 return (0); 17311 } 17312 17313 /* TCP status report triggered via the Named Dispatch mechanism. */ 17314 /* ARGSUSED */ 17315 static int 17316 tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 17317 { 17318 connf_t *connfp; 17319 conn_t *connp; 17320 tcp_t *tcp; 17321 int i; 17322 zoneid_t zoneid; 17323 tcp_stack_t *tcps; 17324 ip_stack_t *ipst; 17325 17326 zoneid = Q_TO_CONN(q)->conn_zoneid; 17327 tcps = Q_TO_TCP(q)->tcp_tcps; 17328 ipst = tcps->tcps_netstack->netstack_ip; 17329 17330 /* Refer to comments in tcp_status_report(). */ 17331 if (cr == NULL || secpolicy_ip_config(cr, B_TRUE) != 0) { 17332 if (ddi_get_lbolt() - tcps->tcps_last_ndd_get_info_time < 17333 drv_usectohz(tcps->tcps_ndd_get_info_interval * 1000)) { 17334 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 17335 return (0); 17336 } 17337 } 17338 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 17339 /* The following may work even if we cannot get a large buf. */ 17340 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 17341 return (0); 17342 } 17343 17344 (void) mi_mpprintf(mp, "tcp_conn_hash_size = %d", 17345 ipst->ips_ipcl_conn_fanout_size); 17346 (void) mi_mpprintf(mp, " %s", tcp_report_header); 17347 17348 for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { 17349 connfp = &ipst->ips_ipcl_conn_fanout[i]; 17350 connp = NULL; 17351 while ((connp = 17352 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 17353 tcp = connp->conn_tcp; 17354 if (zoneid != GLOBAL_ZONEID && 17355 zoneid != connp->conn_zoneid) 17356 continue; 17357 tcp_report_item(mp->b_cont, tcp, i, 17358 Q_TO_TCP(q), cr); 17359 } 17360 } 17361 17362 tcps->tcps_last_ndd_get_info_time = ddi_get_lbolt(); 17363 return (0); 17364 } 17365 17366 /* TCP status report triggered via the Named Dispatch mechanism. */ 17367 /* ARGSUSED */ 17368 static int 17369 tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 17370 { 17371 tf_t *tf; 17372 tcp_t *tcp; 17373 int i; 17374 zoneid_t zoneid; 17375 tcp_stack_t *tcps; 17376 17377 zoneid = Q_TO_CONN(q)->conn_zoneid; 17378 tcps = Q_TO_TCP(q)->tcp_tcps; 17379 17380 /* Refer to comments in tcp_status_report(). 
*/ 17381 if (cr == NULL || secpolicy_ip_config(cr, B_TRUE) != 0) { 17382 if (ddi_get_lbolt() - tcps->tcps_last_ndd_get_info_time < 17383 drv_usectohz(tcps->tcps_ndd_get_info_interval * 1000)) { 17384 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 17385 return (0); 17386 } 17387 } 17388 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 17389 /* The following may work even if we cannot get a large buf. */ 17390 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 17391 return (0); 17392 } 17393 17394 (void) mi_mpprintf(mp, " %s", tcp_report_header); 17395 17396 for (i = 0; i < TCP_FANOUT_SIZE; i++) { 17397 tf = &tcps->tcps_acceptor_fanout[i]; 17398 mutex_enter(&tf->tf_lock); 17399 for (tcp = tf->tf_tcp; tcp != NULL; 17400 tcp = tcp->tcp_acceptor_hash) { 17401 if (zoneid != GLOBAL_ZONEID && 17402 zoneid != tcp->tcp_connp->conn_zoneid) 17403 continue; 17404 tcp_report_item(mp->b_cont, tcp, i, 17405 Q_TO_TCP(q), cr); 17406 } 17407 mutex_exit(&tf->tf_lock); 17408 } 17409 tcps->tcps_last_ndd_get_info_time = ddi_get_lbolt(); 17410 return (0); 17411 } 17412 17413 /* 17414 * tcp_timer is the timer service routine. It handles the retransmission, 17415 * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out 17416 * from the state of the tcp instance what kind of action needs to be done 17417 * at the time it is called. 17418 */ 17419 static void 17420 tcp_timer(void *arg) 17421 { 17422 mblk_t *mp; 17423 clock_t first_threshold; 17424 clock_t second_threshold; 17425 clock_t ms; 17426 uint32_t mss; 17427 conn_t *connp = (conn_t *)arg; 17428 tcp_t *tcp = connp->conn_tcp; 17429 tcp_stack_t *tcps = tcp->tcp_tcps; 17430 17431 tcp->tcp_timer_tid = 0; 17432 17433 if (tcp->tcp_fused) 17434 return; 17435 17436 first_threshold = tcp->tcp_first_timer_threshold; 17437 second_threshold = tcp->tcp_second_timer_threshold; 17438 switch (tcp->tcp_state) { 17439 case TCPS_IDLE: 17440 case TCPS_BOUND: 17441 case TCPS_LISTEN: 17442 return; 17443 case TCPS_SYN_RCVD: { 17444 tcp_t *listener = tcp->tcp_listener; 17445 17446 if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) { 17447 ASSERT(tcp->tcp_rq == listener->tcp_rq); 17448 /* it's our first timeout */ 17449 tcp->tcp_syn_rcvd_timeout = 1; 17450 mutex_enter(&listener->tcp_eager_lock); 17451 listener->tcp_syn_rcvd_timeout++; 17452 if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) { 17453 /* 17454 * Make this eager available for drop if we 17455 * need to drop one to accomodate a new 17456 * incoming SYN request. 17457 */ 17458 MAKE_DROPPABLE(listener, tcp); 17459 } 17460 if (!listener->tcp_syn_defense && 17461 (listener->tcp_syn_rcvd_timeout > 17462 (tcps->tcps_conn_req_max_q0 >> 2)) && 17463 (tcps->tcps_conn_req_max_q0 > 200)) { 17464 /* We may be under attack. Put on a defense. */ 17465 listener->tcp_syn_defense = B_TRUE; 17466 cmn_err(CE_WARN, "High TCP connect timeout " 17467 "rate! System (port %d) may be under a " 17468 "SYN flood attack!", 17469 BE16_TO_U16(listener->tcp_tcph->th_lport)); 17470 17471 listener->tcp_ip_addr_cache = kmem_zalloc( 17472 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t), 17473 KM_NOSLEEP); 17474 } 17475 mutex_exit(&listener->tcp_eager_lock); 17476 } else if (listener != NULL) { 17477 mutex_enter(&listener->tcp_eager_lock); 17478 tcp->tcp_syn_rcvd_timeout++; 17479 if (tcp->tcp_syn_rcvd_timeout > 1 && 17480 !tcp->tcp_closemp_used) { 17481 /* 17482 * This is our second timeout. Put the tcp in 17483 * the list of droppable eagers to allow it to 17484 * be dropped, if needed. 
We don't check 17485 * whether tcp_dontdrop is set or not to 17486 * protect ourselve from a SYN attack where a 17487 * remote host can spoof itself as one of the 17488 * good IP source and continue to hold 17489 * resources too long. 17490 */ 17491 MAKE_DROPPABLE(listener, tcp); 17492 } 17493 mutex_exit(&listener->tcp_eager_lock); 17494 } 17495 } 17496 /* FALLTHRU */ 17497 case TCPS_SYN_SENT: 17498 first_threshold = tcp->tcp_first_ctimer_threshold; 17499 second_threshold = tcp->tcp_second_ctimer_threshold; 17500 break; 17501 case TCPS_ESTABLISHED: 17502 case TCPS_FIN_WAIT_1: 17503 case TCPS_CLOSING: 17504 case TCPS_CLOSE_WAIT: 17505 case TCPS_LAST_ACK: 17506 /* If we have data to rexmit */ 17507 if (tcp->tcp_suna != tcp->tcp_snxt) { 17508 clock_t time_to_wait; 17509 17510 BUMP_MIB(&tcps->tcps_mib, tcpTimRetrans); 17511 if (!tcp->tcp_xmit_head) 17512 break; 17513 time_to_wait = lbolt - 17514 (clock_t)tcp->tcp_xmit_head->b_prev; 17515 time_to_wait = tcp->tcp_rto - 17516 TICK_TO_MSEC(time_to_wait); 17517 /* 17518 * If the timer fires too early, 1 clock tick earlier, 17519 * restart the timer. 17520 */ 17521 if (time_to_wait > msec_per_tick) { 17522 TCP_STAT(tcps, tcp_timer_fire_early); 17523 TCP_TIMER_RESTART(tcp, time_to_wait); 17524 return; 17525 } 17526 /* 17527 * When we probe zero windows, we force the swnd open. 17528 * If our peer acks with a closed window swnd will be 17529 * set to zero by tcp_rput(). As long as we are 17530 * receiving acks tcp_rput will 17531 * reset 'tcp_ms_we_have_waited' so as not to trip the 17532 * first and second interval actions. NOTE: the timer 17533 * interval is allowed to continue its exponential 17534 * backoff. 17535 */ 17536 if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { 17537 if (tcp->tcp_debug) { 17538 (void) strlog(TCP_MOD_ID, 0, 1, 17539 SL_TRACE, "tcp_timer: zero win"); 17540 } 17541 } else { 17542 /* 17543 * After retransmission, we need to do 17544 * slow start. Set the ssthresh to one 17545 * half of current effective window and 17546 * cwnd to one MSS. Also reset 17547 * tcp_cwnd_cnt. 17548 * 17549 * Note that if tcp_ssthresh is reduced because 17550 * of ECN, do not reduce it again unless it is 17551 * already one window of data away (tcp_cwr 17552 * should then be cleared) or this is a 17553 * timeout for a retransmitted segment. 17554 */ 17555 uint32_t npkt; 17556 17557 if (!tcp->tcp_cwr || tcp->tcp_rexmit) { 17558 npkt = ((tcp->tcp_timer_backoff ? 17559 tcp->tcp_cwnd_ssthresh : 17560 tcp->tcp_snxt - 17561 tcp->tcp_suna) >> 1) / tcp->tcp_mss; 17562 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * 17563 tcp->tcp_mss; 17564 } 17565 tcp->tcp_cwnd = tcp->tcp_mss; 17566 tcp->tcp_cwnd_cnt = 0; 17567 if (tcp->tcp_ecn_ok) { 17568 tcp->tcp_cwr = B_TRUE; 17569 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 17570 tcp->tcp_ecn_cwr_sent = B_FALSE; 17571 } 17572 } 17573 break; 17574 } 17575 /* 17576 * We have something to send yet we cannot send. The 17577 * reason can be: 17578 * 17579 * 1. Zero send window: we need to do zero window probe. 17580 * 2. Zero cwnd: because of ECN, we need to "clock out 17581 * segments. 17582 * 3. SWS avoidance: receiver may have shrunk window, 17583 * reset our knowledge. 17584 * 17585 * Note that condition 2 can happen with either 1 or 17586 * 3. But 1 and 3 are exclusive. 17587 */ 17588 if (tcp->tcp_unsent != 0) { 17589 if (tcp->tcp_cwnd == 0) { 17590 /* 17591 * Set tcp_cwnd to 1 MSS so that a 17592 * new segment can be sent out. We 17593 * are "clocking out" new data when 17594 * the network is really congested. 
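 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): the congestion window adjustment made earlier in this
 * timeout path is the classic halving rule.  On the first timeout,
 * with flight = snxt - suna bytes outstanding and hypothetical
 * short names:
 *
 *	uint32_t npkt = (flight >> 1) / mss;
 *	ssthresh = (npkt > 2 ? npkt : 2) * mss;
 *	cwnd = mss;
 *	cwnd_cnt = 0;
 *
 * A connection with 20 full-sized segments in flight therefore
 * restarts slow start from one segment and leaves it again once the
 * window has grown back to 10 segments; later back-to-back timeouts
 * halve the saved ssthresh rather than the (now tiny) flight size.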
17595 */ 17596 ASSERT(tcp->tcp_ecn_ok); 17597 tcp->tcp_cwnd = tcp->tcp_mss; 17598 } 17599 if (tcp->tcp_swnd == 0) { 17600 /* Extend window for zero window probe */ 17601 tcp->tcp_swnd++; 17602 tcp->tcp_zero_win_probe = B_TRUE; 17603 BUMP_MIB(&tcps->tcps_mib, tcpOutWinProbe); 17604 } else { 17605 /* 17606 * Handle timeout from sender SWS avoidance. 17607 * Reset our knowledge of the max send window 17608 * since the receiver might have reduced its 17609 * receive buffer. Avoid setting tcp_max_swnd 17610 * to one since that will essentially disable 17611 * the SWS checks. 17612 * 17613 * Note that since we don't have a SWS 17614 * state variable, if the timeout is set 17615 * for ECN but not for SWS, this 17616 * code will also be executed. This is 17617 * fine as tcp_max_swnd is updated 17618 * constantly and it will not affect 17619 * anything. 17620 */ 17621 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2); 17622 } 17623 tcp_wput_data(tcp, NULL, B_FALSE); 17624 return; 17625 } 17626 /* Is there a FIN that needs to be to re retransmitted? */ 17627 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 17628 !tcp->tcp_fin_acked) 17629 break; 17630 /* Nothing to do, return without restarting timer. */ 17631 TCP_STAT(tcps, tcp_timer_fire_miss); 17632 return; 17633 case TCPS_FIN_WAIT_2: 17634 /* 17635 * User closed the TCP endpoint and peer ACK'ed our FIN. 17636 * We waited some time for for peer's FIN, but it hasn't 17637 * arrived. We flush the connection now to avoid 17638 * case where the peer has rebooted. 17639 */ 17640 if (TCP_IS_DETACHED(tcp)) { 17641 (void) tcp_clean_death(tcp, 0, 23); 17642 } else { 17643 TCP_TIMER_RESTART(tcp, 17644 tcps->tcps_fin_wait_2_flush_interval); 17645 } 17646 return; 17647 case TCPS_TIME_WAIT: 17648 (void) tcp_clean_death(tcp, 0, 24); 17649 return; 17650 default: 17651 if (tcp->tcp_debug) { 17652 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 17653 "tcp_timer: strange state (%d) %s", 17654 tcp->tcp_state, tcp_display(tcp, NULL, 17655 DISP_PORT_ONLY)); 17656 } 17657 return; 17658 } 17659 if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { 17660 /* 17661 * For zero window probe, we need to send indefinitely, 17662 * unless we have not heard from the other side for some 17663 * time... 17664 */ 17665 if ((tcp->tcp_zero_win_probe == 0) || 17666 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) > 17667 second_threshold)) { 17668 BUMP_MIB(&tcps->tcps_mib, tcpTimRetransDrop); 17669 /* 17670 * If TCP is in SYN_RCVD state, send back a 17671 * RST|ACK as BSD does. Note that tcp_zero_win_probe 17672 * should be zero in TCPS_SYN_RCVD state. 17673 */ 17674 if (tcp->tcp_state == TCPS_SYN_RCVD) { 17675 tcp_xmit_ctl("tcp_timer: RST sent on timeout " 17676 "in SYN_RCVD", 17677 tcp, tcp->tcp_snxt, 17678 tcp->tcp_rnxt, TH_RST | TH_ACK); 17679 } 17680 (void) tcp_clean_death(tcp, 17681 tcp->tcp_client_errno ? 17682 tcp->tcp_client_errno : ETIMEDOUT, 25); 17683 return; 17684 } else { 17685 /* 17686 * Set tcp_ms_we_have_waited to second_threshold 17687 * so that in next timeout, we will do the above 17688 * check (lbolt - tcp_last_recv_time). This is 17689 * also to avoid overflow. 17690 * 17691 * We don't need to decrement tcp_timer_backoff 17692 * to avoid overflow because it will be decremented 17693 * later if new timeout value is greater than 17694 * tcp_rexmit_interval_max. In the case when 17695 * tcp_rexmit_interval_max is greater than 17696 * second_threshold, it means that we will wait 17697 * longer than second_threshold to send the next 17698 * window probe. 
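 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): the next timeout interval, computed a few lines below
 * from the scaled smoothed-RTT state plus a tunable slack, is doubled
 * once per consecutive timeout and clamped to the configured bounds.
 * Using hypothetical short names:
 *
 *	clock_t rto = (rtt_sa >> 3) + rtt_sd + extra + (rtt_sa >> 5);
 *	if (rto < interval_min)
 *		rto = interval_min;
 *	rto <<= timer_backoff;
 *	if (rto > interval_max)
 *		rto = interval_max;
 *
 * The original additionally decrements tcp_timer_backoff whenever the
 * maximum is reached, so repeated clamping cannot overflow the shift.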
17699 */ 17700 tcp->tcp_ms_we_have_waited = second_threshold; 17701 } 17702 } else if (ms > first_threshold) { 17703 if (tcp->tcp_snd_zcopy_aware && (!tcp->tcp_xmit_zc_clean) && 17704 tcp->tcp_xmit_head != NULL) { 17705 tcp->tcp_xmit_head = 17706 tcp_zcopy_backoff(tcp, tcp->tcp_xmit_head, 1); 17707 } 17708 /* 17709 * We have been retransmitting for too long... The RTT 17710 * we calculated is probably incorrect. Reinitialize it. 17711 * Need to compensate for 0 tcp_rtt_sa. Reset 17712 * tcp_rtt_update so that we won't accidentally cache a 17713 * bad value. But only do this if this is not a zero 17714 * window probe. 17715 */ 17716 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { 17717 tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + 17718 (tcp->tcp_rtt_sa >> 5); 17719 tcp->tcp_rtt_sa = 0; 17720 tcp_ip_notify(tcp); 17721 tcp->tcp_rtt_update = 0; 17722 } 17723 } 17724 tcp->tcp_timer_backoff++; 17725 if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 17726 tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < 17727 tcps->tcps_rexmit_interval_min) { 17728 /* 17729 * This means the original RTO is tcp_rexmit_interval_min. 17730 * So we will use tcp_rexmit_interval_min as the RTO value 17731 * and do the backoff. 17732 */ 17733 ms = tcps->tcps_rexmit_interval_min << tcp->tcp_timer_backoff; 17734 } else { 17735 ms <<= tcp->tcp_timer_backoff; 17736 } 17737 if (ms > tcps->tcps_rexmit_interval_max) { 17738 ms = tcps->tcps_rexmit_interval_max; 17739 /* 17740 * ms is at max, decrement tcp_timer_backoff to avoid 17741 * overflow. 17742 */ 17743 tcp->tcp_timer_backoff--; 17744 } 17745 tcp->tcp_ms_we_have_waited += ms; 17746 if (tcp->tcp_zero_win_probe == 0) { 17747 tcp->tcp_rto = ms; 17748 } 17749 TCP_TIMER_RESTART(tcp, ms); 17750 /* 17751 * This is after a timeout and tcp_rto is backed off. Set 17752 * tcp_set_timer to 1 so that next time RTO is updated, we will 17753 * restart the timer with a correct value. 17754 */ 17755 tcp->tcp_set_timer = 1; 17756 mss = tcp->tcp_snxt - tcp->tcp_suna; 17757 if (mss > tcp->tcp_mss) 17758 mss = tcp->tcp_mss; 17759 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) 17760 mss = tcp->tcp_swnd; 17761 17762 if ((mp = tcp->tcp_xmit_head) != NULL) 17763 mp->b_prev = (mblk_t *)lbolt; 17764 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, 17765 B_TRUE); 17766 17767 /* 17768 * When slow start after retransmission begins, start with 17769 * this seq no. tcp_rexmit_max marks the end of special slow 17770 * start phase. tcp_snd_burst controls how many segments 17771 * can be sent because of an ack. 17772 */ 17773 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 17774 tcp->tcp_snd_burst = TCP_CWND_SS; 17775 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 17776 (tcp->tcp_unsent == 0)) { 17777 tcp->tcp_rexmit_max = tcp->tcp_fss; 17778 } else { 17779 tcp->tcp_rexmit_max = tcp->tcp_snxt; 17780 } 17781 tcp->tcp_rexmit = B_TRUE; 17782 tcp->tcp_dupack_cnt = 0; 17783 17784 /* 17785 * Remove all rexmit SACK blk to start from fresh. 17786 */ 17787 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 17788 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 17789 tcp->tcp_num_notsack_blk = 0; 17790 tcp->tcp_cnt_notsack_list = 0; 17791 } 17792 if (mp == NULL) { 17793 return; 17794 } 17795 /* Attach credentials to retransmitted initial SYNs. 
*/ 17796 if (tcp->tcp_state == TCPS_SYN_SENT) { 17797 mblk_setcred(mp, tcp->tcp_cred); 17798 DB_CPID(mp) = tcp->tcp_cpid; 17799 } 17800 17801 tcp->tcp_csuna = tcp->tcp_snxt; 17802 BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); 17803 UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, mss); 17804 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 17805 tcp_send_data(tcp, tcp->tcp_wq, mp); 17806 17807 } 17808 17809 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ 17810 static void 17811 tcp_unbind(tcp_t *tcp, mblk_t *mp) 17812 { 17813 conn_t *connp; 17814 17815 switch (tcp->tcp_state) { 17816 case TCPS_BOUND: 17817 case TCPS_LISTEN: 17818 break; 17819 default: 17820 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 17821 return; 17822 } 17823 17824 /* 17825 * Need to clean up all the eagers since after the unbind, segments 17826 * will no longer be delivered to this listener stream. 17827 */ 17828 mutex_enter(&tcp->tcp_eager_lock); 17829 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 17830 tcp_eager_cleanup(tcp, 0); 17831 } 17832 mutex_exit(&tcp->tcp_eager_lock); 17833 17834 if (tcp->tcp_ipversion == IPV4_VERSION) { 17835 tcp->tcp_ipha->ipha_src = 0; 17836 } else { 17837 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); 17838 } 17839 V6_SET_ZERO(tcp->tcp_ip_src_v6); 17840 bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport)); 17841 tcp_bind_hash_remove(tcp); 17842 tcp->tcp_state = TCPS_IDLE; 17843 tcp->tcp_mdt = B_FALSE; 17844 /* Send M_FLUSH according to TPI */ 17845 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 17846 connp = tcp->tcp_connp; 17847 connp->conn_mdt_ok = B_FALSE; 17848 ipcl_hash_remove(connp); 17849 bzero(&connp->conn_ports, sizeof (connp->conn_ports)); 17850 mp = mi_tpi_ok_ack_alloc(mp); 17851 putnext(tcp->tcp_rq, mp); 17852 } 17853 17854 /* 17855 * Don't let port fall into the privileged range. 17856 * Since the extra privileged ports can be arbitrary we also 17857 * ensure that we exclude those from consideration. 17858 * tcp_g_epriv_ports is not sorted thus we loop over it until 17859 * there are no changes. 17860 * 17861 * Note: No locks are held when inspecting tcp_g_*epriv_ports 17862 * but instead the code relies on: 17863 * - the fact that the address of the array and its size never changes 17864 * - the atomic assignment of the elements of the array 17865 * 17866 * Returns 0 if there are no more ports available. 17867 * 17868 * TS note: skip multilevel ports. 17869 */ 17870 static in_port_t 17871 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) 17872 { 17873 int i; 17874 boolean_t restart = B_FALSE; 17875 tcp_stack_t *tcps = tcp->tcp_tcps; 17876 17877 if (random && tcp_random_anon_port != 0) { 17878 (void) random_get_pseudo_bytes((uint8_t *)&port, 17879 sizeof (in_port_t)); 17880 /* 17881 * Unless changed by a sys admin, the smallest anon port 17882 * is 32768 and the largest anon port is 65535. It is 17883 * very likely (50%) for the random port to be smaller 17884 * than the smallest anon port. When that happens, 17885 * add port % (anon port range) to the smallest anon 17886 * port to get the random port. It should fall into the 17887 * valid anon port range. 
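 *
 * Illustrative sketch (editorial addition, not part of the original
 * source): with the default range the remapping below is simply
 *
 *	in_port_t lo = 32768, hi = 65535;
 *	if (port < lo)
 *		port = lo + port % (hi - lo);
 *
 * so a random 16-bit value of 1000 becomes 32768 + (1000 % 32767) =
 * 33768, which lies inside the anonymous range; values that are
 * already at or above the smallest anonymous port are used as-is.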
17888 */ 17889 if (port < tcps->tcps_smallest_anon_port) { 17890 port = tcps->tcps_smallest_anon_port + 17891 port % (tcps->tcps_largest_anon_port - 17892 tcps->tcps_smallest_anon_port); 17893 } 17894 } 17895 17896 retry: 17897 if (port < tcps->tcps_smallest_anon_port) 17898 port = (in_port_t)tcps->tcps_smallest_anon_port; 17899 17900 if (port > tcps->tcps_largest_anon_port) { 17901 if (restart) 17902 return (0); 17903 restart = B_TRUE; 17904 port = (in_port_t)tcps->tcps_smallest_anon_port; 17905 } 17906 17907 if (port < tcps->tcps_smallest_nonpriv_port) 17908 port = (in_port_t)tcps->tcps_smallest_nonpriv_port; 17909 17910 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 17911 if (port == tcps->tcps_g_epriv_ports[i]) { 17912 port++; 17913 /* 17914 * Make sure whether the port is in the 17915 * valid range. 17916 */ 17917 goto retry; 17918 } 17919 } 17920 if (is_system_labeled() && 17921 (i = tsol_next_port(crgetzone(tcp->tcp_cred), port, 17922 IPPROTO_TCP, B_TRUE)) != 0) { 17923 port = i; 17924 goto retry; 17925 } 17926 return (port); 17927 } 17928 17929 /* 17930 * Return the next anonymous port in the privileged port range for 17931 * bind checking. It starts at IPPORT_RESERVED - 1 and goes 17932 * downwards. This is the same behavior as documented in the userland 17933 * library call rresvport(3N). 17934 * 17935 * TS note: skip multilevel ports. 17936 */ 17937 static in_port_t 17938 tcp_get_next_priv_port(const tcp_t *tcp) 17939 { 17940 static in_port_t next_priv_port = IPPORT_RESERVED - 1; 17941 in_port_t nextport; 17942 boolean_t restart = B_FALSE; 17943 tcp_stack_t *tcps = tcp->tcp_tcps; 17944 retry: 17945 if (next_priv_port < tcps->tcps_min_anonpriv_port || 17946 next_priv_port >= IPPORT_RESERVED) { 17947 next_priv_port = IPPORT_RESERVED - 1; 17948 if (restart) 17949 return (0); 17950 restart = B_TRUE; 17951 } 17952 if (is_system_labeled() && 17953 (nextport = tsol_next_port(crgetzone(tcp->tcp_cred), 17954 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { 17955 next_priv_port = nextport; 17956 goto retry; 17957 } 17958 return (next_priv_port--); 17959 } 17960 17961 /* The write side r/w procedure. */ 17962 17963 #if CCS_STATS 17964 struct { 17965 struct { 17966 int64_t count, bytes; 17967 } tot, hit; 17968 } wrw_stats; 17969 #endif 17970 17971 /* 17972 * Call by tcp_wput() to handle all non data, except M_PROTO and M_PCPROTO, 17973 * messages. 17974 */ 17975 /* ARGSUSED */ 17976 static void 17977 tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) 17978 { 17979 conn_t *connp = (conn_t *)arg; 17980 tcp_t *tcp = connp->conn_tcp; 17981 queue_t *q = tcp->tcp_wq; 17982 17983 ASSERT(DB_TYPE(mp) != M_IOCTL); 17984 /* 17985 * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close. 17986 * Once the close starts, streamhead and sockfs will not let any data 17987 * packets come down (close ensures that there are no threads using the 17988 * queue and no new threads will come down) but since qprocsoff() 17989 * hasn't happened yet, a M_FLUSH or some non data message might 17990 * get reflected back (in response to our own FLUSHRW) and get 17991 * processed after tcp_close() is done. The conn would still be valid 17992 * because a ref would have added but we need to check the state 17993 * before actually processing the packet. 
17994 */ 17995 if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) { 17996 freemsg(mp); 17997 return; 17998 } 17999 18000 switch (DB_TYPE(mp)) { 18001 case M_IOCDATA: 18002 tcp_wput_iocdata(tcp, mp); 18003 break; 18004 case M_FLUSH: 18005 tcp_wput_flush(tcp, mp); 18006 break; 18007 default: 18008 CALL_IP_WPUT(connp, q, mp); 18009 break; 18010 } 18011 } 18012 18013 /* 18014 * The TCP fast path write put procedure. 18015 * NOTE: the logic of the fast path is duplicated from tcp_wput_data() 18016 */ 18017 /* ARGSUSED */ 18018 void 18019 tcp_output(void *arg, mblk_t *mp, void *arg2) 18020 { 18021 int len; 18022 int hdrlen; 18023 int plen; 18024 mblk_t *mp1; 18025 uchar_t *rptr; 18026 uint32_t snxt; 18027 tcph_t *tcph; 18028 struct datab *db; 18029 uint32_t suna; 18030 uint32_t mss; 18031 ipaddr_t *dst; 18032 ipaddr_t *src; 18033 uint32_t sum; 18034 int usable; 18035 conn_t *connp = (conn_t *)arg; 18036 tcp_t *tcp = connp->conn_tcp; 18037 uint32_t msize; 18038 tcp_stack_t *tcps = tcp->tcp_tcps; 18039 18040 /* 18041 * Try and ASSERT the minimum possible references on the 18042 * conn early enough. Since we are executing on write side, 18043 * the connection is obviously not detached and that means 18044 * there is a ref each for TCP and IP. Since we are behind 18045 * the squeue, the minimum references needed are 3. If the 18046 * conn is in classifier hash list, there should be an 18047 * extra ref for that (we check both the possibilities). 18048 */ 18049 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 18050 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 18051 18052 ASSERT(DB_TYPE(mp) == M_DATA); 18053 msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 18054 18055 mutex_enter(&tcp->tcp_non_sq_lock); 18056 tcp->tcp_squeue_bytes -= msize; 18057 mutex_exit(&tcp->tcp_non_sq_lock); 18058 18059 /* Bypass tcp protocol for fused tcp loopback */ 18060 if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 18061 return; 18062 18063 mss = tcp->tcp_mss; 18064 if (tcp->tcp_xmit_zc_clean) 18065 mp = tcp_zcopy_backoff(tcp, mp, 0); 18066 18067 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 18068 len = (int)(mp->b_wptr - mp->b_rptr); 18069 18070 /* 18071 * Criteria for fast path: 18072 * 18073 * 1. no unsent data 18074 * 2. single mblk in request 18075 * 3. connection established 18076 * 4. data in mblk 18077 * 5. len <= mss 18078 * 6. no tcp_valid bits 18079 */ 18080 if ((tcp->tcp_unsent != 0) || 18081 (tcp->tcp_cork) || 18082 (mp->b_cont != NULL) || 18083 (tcp->tcp_state != TCPS_ESTABLISHED) || 18084 (len == 0) || 18085 (len > mss) || 18086 (tcp->tcp_valid_bits != 0)) { 18087 tcp_wput_data(tcp, mp, B_FALSE); 18088 return; 18089 } 18090 18091 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 18092 ASSERT(tcp->tcp_fin_sent == 0); 18093 18094 /* queue new packet onto retransmission queue */ 18095 if (tcp->tcp_xmit_head == NULL) { 18096 tcp->tcp_xmit_head = mp; 18097 } else { 18098 tcp->tcp_xmit_last->b_cont = mp; 18099 } 18100 tcp->tcp_xmit_last = mp; 18101 tcp->tcp_xmit_tail = mp; 18102 18103 /* find out how much we can send */ 18104 /* BEGIN CSTYLED */ 18105 /* 18106 * un-acked usable 18107 * |--------------|-----------------| 18108 * tcp_suna tcp_snxt tcp_suna+tcp_swnd 18109 */ 18110 /* END CSTYLED */ 18111 18112 /* start sending from tcp_snxt */ 18113 snxt = tcp->tcp_snxt; 18114 18115 /* 18116 * Check to see if this connection has been idled for some 18117 * time and no ACK is expected. 
If it is, we need to slow 18118 * start again to get back the connection's "self-clock" as 18119 * described in VJ's paper. 18120 * 18121 * Refer to the comment in tcp_mss_set() for the calculation 18122 * of tcp_cwnd after idle. 18123 */ 18124 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 18125 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 18126 SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); 18127 } 18128 18129 usable = tcp->tcp_swnd; /* tcp window size */ 18130 if (usable > tcp->tcp_cwnd) 18131 usable = tcp->tcp_cwnd; /* congestion window smaller */ 18132 usable -= snxt; /* subtract stuff already sent */ 18133 suna = tcp->tcp_suna; 18134 usable += suna; 18135 /* usable can be < 0 if the congestion window is smaller */ 18136 if (len > usable) { 18137 /* Can't send complete M_DATA in one shot */ 18138 goto slow; 18139 } 18140 18141 mutex_enter(&tcp->tcp_non_sq_lock); 18142 if (tcp->tcp_flow_stopped && 18143 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 18144 tcp_clrqfull(tcp); 18145 } 18146 mutex_exit(&tcp->tcp_non_sq_lock); 18147 18148 /* 18149 * determine if anything to send (Nagle). 18150 * 18151 * 1. len < tcp_mss (i.e. small) 18152 * 2. unacknowledged data present 18153 * 3. len < nagle limit 18154 * 4. last packet sent < nagle limit (previous packet sent) 18155 */ 18156 if ((len < mss) && (snxt != suna) && 18157 (len < (int)tcp->tcp_naglim) && 18158 (tcp->tcp_last_sent_len < tcp->tcp_naglim)) { 18159 /* 18160 * This was the first unsent packet and normally 18161 * mss < xmit_hiwater so there is no need to worry 18162 * about flow control. The next packet will go 18163 * through the flow control check in tcp_wput_data(). 18164 */ 18165 /* leftover work from above */ 18166 tcp->tcp_unsent = len; 18167 tcp->tcp_xmit_tail_unsent = len; 18168 18169 return; 18170 } 18171 18172 /* len <= tcp->tcp_mss && len == unsent so no silly window */ 18173 18174 if (snxt == suna) { 18175 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 18176 } 18177 18178 /* we have always sent something */ 18179 tcp->tcp_rack_cnt = 0; 18180 18181 tcp->tcp_snxt = snxt + len; 18182 tcp->tcp_rack = tcp->tcp_rnxt; 18183 18184 if ((mp1 = dupb(mp)) == 0) 18185 goto no_memory; 18186 mp->b_prev = (mblk_t *)(uintptr_t)lbolt; 18187 mp->b_next = (mblk_t *)(uintptr_t)snxt; 18188 18189 /* adjust tcp header information */ 18190 tcph = tcp->tcp_tcph; 18191 tcph->th_flags[0] = (TH_ACK|TH_PUSH); 18192 18193 sum = len + tcp->tcp_tcp_hdr_len + tcp->tcp_sum; 18194 sum = (sum >> 16) + (sum & 0xFFFF); 18195 U16_TO_ABE16(sum, tcph->th_sum); 18196 18197 U32_TO_ABE32(snxt, tcph->th_seq); 18198 18199 BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); 18200 UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); 18201 BUMP_LOCAL(tcp->tcp_obsegs); 18202 18203 /* Update the latest receive window size in TCP header. 
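 * The value placed in th_win is the receive window right-shifted by the
 * receive window scale.  Illustrative example (not from the original
 * source): with tcp_rwnd = 131072 and tcp_rcv_ws = 2, the 16-bit field
 * carries 32768 and the peer scales it back up by a factor of 4.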
*/ 18204 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 18205 tcph->th_win); 18206 18207 tcp->tcp_last_sent_len = (ushort_t)len; 18208 18209 plen = len + tcp->tcp_hdr_len; 18210 18211 if (tcp->tcp_ipversion == IPV4_VERSION) { 18212 tcp->tcp_ipha->ipha_length = htons(plen); 18213 } else { 18214 tcp->tcp_ip6h->ip6_plen = htons(plen - 18215 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 18216 } 18217 18218 /* see if we need to allocate a mblk for the headers */ 18219 hdrlen = tcp->tcp_hdr_len; 18220 rptr = mp1->b_rptr - hdrlen; 18221 db = mp1->b_datap; 18222 if ((db->db_ref != 2) || rptr < db->db_base || 18223 (!OK_32PTR(rptr))) { 18224 /* NOTE: we assume allocb returns an OK_32PTR */ 18225 mp = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 18226 tcps->tcps_wroff_xtra, BPRI_MED); 18227 if (!mp) { 18228 freemsg(mp1); 18229 goto no_memory; 18230 } 18231 mp->b_cont = mp1; 18232 mp1 = mp; 18233 /* Leave room for Link Level header */ 18234 /* hdrlen = tcp->tcp_hdr_len; */ 18235 rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra]; 18236 mp1->b_wptr = &rptr[hdrlen]; 18237 } 18238 mp1->b_rptr = rptr; 18239 18240 /* Fill in the timestamp option. */ 18241 if (tcp->tcp_snd_ts_ok) { 18242 U32_TO_BE32((uint32_t)lbolt, 18243 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 18244 U32_TO_BE32(tcp->tcp_ts_recent, 18245 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 18246 } else { 18247 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 18248 } 18249 18250 /* copy header into outgoing packet */ 18251 dst = (ipaddr_t *)rptr; 18252 src = (ipaddr_t *)tcp->tcp_iphc; 18253 dst[0] = src[0]; 18254 dst[1] = src[1]; 18255 dst[2] = src[2]; 18256 dst[3] = src[3]; 18257 dst[4] = src[4]; 18258 dst[5] = src[5]; 18259 dst[6] = src[6]; 18260 dst[7] = src[7]; 18261 dst[8] = src[8]; 18262 dst[9] = src[9]; 18263 if (hdrlen -= 40) { 18264 hdrlen >>= 2; 18265 dst += 10; 18266 src += 10; 18267 do { 18268 *dst++ = *src++; 18269 } while (--hdrlen); 18270 } 18271 18272 /* 18273 * Set the ECN info in the TCP header. Note that this 18274 * is not the template header. 18275 */ 18276 if (tcp->tcp_ecn_ok) { 18277 SET_ECT(tcp, rptr); 18278 18279 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 18280 if (tcp->tcp_ecn_echo_on) 18281 tcph->th_flags[0] |= TH_ECE; 18282 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 18283 tcph->th_flags[0] |= TH_CWR; 18284 tcp->tcp_ecn_cwr_sent = B_TRUE; 18285 } 18286 } 18287 18288 if (tcp->tcp_ip_forward_progress) { 18289 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 18290 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 18291 tcp->tcp_ip_forward_progress = B_FALSE; 18292 } 18293 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 18294 tcp_send_data(tcp, tcp->tcp_wq, mp1); 18295 return; 18296 18297 /* 18298 * If we ran out of memory, we pretend to have sent the packet 18299 * and that it was lost on the wire. 18300 */ 18301 no_memory: 18302 return; 18303 18304 slow: 18305 /* leftover work from above */ 18306 tcp->tcp_unsent = len; 18307 tcp->tcp_xmit_tail_unsent = len; 18308 tcp_wput_data(tcp, NULL, B_FALSE); 18309 } 18310 18311 /* 18312 * The function called through squeue to get behind eager's perimeter to 18313 * finish the accept processing. 
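 * In this path it is queued by tcp_wput_accept() via
 * squeue_enter_nodrain() with the M_SETOPTS (struct stroptions) mblk as
 * the argument, so by the time it runs we are single-threaded behind
 * the eager's squeue.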
18314 */ 18315 /* ARGSUSED */ 18316 void 18317 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) 18318 { 18319 conn_t *connp = (conn_t *)arg; 18320 tcp_t *tcp = connp->conn_tcp; 18321 queue_t *q = tcp->tcp_rq; 18322 mblk_t *mp1; 18323 mblk_t *stropt_mp = mp; 18324 struct stroptions *stropt; 18325 uint_t thwin; 18326 tcp_stack_t *tcps = tcp->tcp_tcps; 18327 18328 /* 18329 * Drop the eager's ref on the listener, that was placed when 18330 * this eager began life in tcp_conn_request. 18331 */ 18332 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 18333 18334 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { 18335 /* 18336 * Someone blewoff the eager before we could finish 18337 * the accept. 18338 * 18339 * The only reason eager exists it because we put in 18340 * a ref on it when conn ind went up. We need to send 18341 * a disconnect indication up while the last reference 18342 * on the eager will be dropped by the squeue when we 18343 * return. 18344 */ 18345 ASSERT(tcp->tcp_listener == NULL); 18346 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { 18347 struct T_discon_ind *tdi; 18348 18349 (void) putnextctl1(q, M_FLUSH, FLUSHRW); 18350 /* 18351 * Let us reuse the incoming mblk to avoid memory 18352 * allocation failure problems. We know that the 18353 * size of the incoming mblk i.e. stroptions is greater 18354 * than sizeof T_discon_ind. So the reallocb below 18355 * can't fail. 18356 */ 18357 freemsg(mp->b_cont); 18358 mp->b_cont = NULL; 18359 ASSERT(DB_REF(mp) == 1); 18360 mp = reallocb(mp, sizeof (struct T_discon_ind), 18361 B_FALSE); 18362 ASSERT(mp != NULL); 18363 DB_TYPE(mp) = M_PROTO; 18364 ((union T_primitives *)mp->b_rptr)->type = T_DISCON_IND; 18365 tdi = (struct T_discon_ind *)mp->b_rptr; 18366 if (tcp->tcp_issocket) { 18367 tdi->DISCON_reason = ECONNREFUSED; 18368 tdi->SEQ_number = 0; 18369 } else { 18370 tdi->DISCON_reason = ENOPROTOOPT; 18371 tdi->SEQ_number = 18372 tcp->tcp_conn_req_seqnum; 18373 } 18374 mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind); 18375 putnext(q, mp); 18376 } else { 18377 freemsg(mp); 18378 } 18379 if (tcp->tcp_hard_binding) { 18380 tcp->tcp_hard_binding = B_FALSE; 18381 tcp->tcp_hard_bound = B_TRUE; 18382 } 18383 tcp->tcp_detached = B_FALSE; 18384 return; 18385 } 18386 18387 mp1 = stropt_mp->b_cont; 18388 stropt_mp->b_cont = NULL; 18389 ASSERT(DB_TYPE(stropt_mp) == M_SETOPTS); 18390 stropt = (struct stroptions *)stropt_mp->b_rptr; 18391 18392 while (mp1 != NULL) { 18393 mp = mp1; 18394 mp1 = mp1->b_cont; 18395 mp->b_cont = NULL; 18396 tcp->tcp_drop_opt_ack_cnt++; 18397 CALL_IP_WPUT(connp, tcp->tcp_wq, mp); 18398 } 18399 mp = NULL; 18400 18401 /* 18402 * For a loopback connection with tcp_direct_sockfs on, note that 18403 * we don't have to protect tcp_rcv_list yet because synchronous 18404 * streams has not yet been enabled and tcp_fuse_rrw() cannot 18405 * possibly race with us. 18406 */ 18407 18408 /* 18409 * Set the max window size (tcp_rq->q_hiwat) of the acceptor 18410 * properly. This is the first time we know of the acceptor' 18411 * queue. So we do it here. 18412 */ 18413 if (tcp->tcp_rcv_list == NULL) { 18414 /* 18415 * Recv queue is empty, tcp_rwnd should not have changed. 18416 * That means it should be equal to the listener's tcp_rwnd. 
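 * (If data did queue up while the eager was detached, the else branch
 * below adds tcp_rcv_cnt back in so q_hiwat also covers the bytes
 * already sitting on tcp_rcv_list.)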
18417 */ 18418 tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd; 18419 } else { 18420 #ifdef DEBUG 18421 uint_t cnt = 0; 18422 18423 mp1 = tcp->tcp_rcv_list; 18424 while ((mp = mp1) != NULL) { 18425 mp1 = mp->b_next; 18426 cnt += msgdsize(mp); 18427 } 18428 ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt); 18429 #endif 18430 /* There is some data, add them back to get the max. */ 18431 tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; 18432 } 18433 18434 stropt->so_flags = SO_HIWAT; 18435 stropt->so_hiwat = MAX(q->q_hiwat, tcps->tcps_sth_rcv_hiwat); 18436 18437 stropt->so_flags |= SO_MAXBLK; 18438 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 18439 18440 /* 18441 * This is the first time we run on the correct 18442 * queue after tcp_accept. So fix all the q parameters 18443 * here. 18444 */ 18445 /* Allocate room for SACK options if needed. */ 18446 stropt->so_flags |= SO_WROFF; 18447 if (tcp->tcp_fused) { 18448 ASSERT(tcp->tcp_loopback); 18449 ASSERT(tcp->tcp_loopback_peer != NULL); 18450 /* 18451 * For fused tcp loopback, set the stream head's write 18452 * offset value to zero since we won't be needing any room 18453 * for TCP/IP headers. This would also improve performance 18454 * since it would reduce the amount of work done by kmem. 18455 * Non-fused tcp loopback case is handled separately below. 18456 */ 18457 stropt->so_wroff = 0; 18458 /* 18459 * Record the stream head's high water mark for this endpoint; 18460 * this is used for flow-control purposes in tcp_fuse_output(). 18461 */ 18462 stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat); 18463 /* 18464 * Update the peer's transmit parameters according to 18465 * our recently calculated high water mark value. 18466 */ 18467 (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); 18468 } else if (tcp->tcp_snd_sack_ok) { 18469 stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + 18470 (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); 18471 } else { 18472 stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : 18473 tcps->tcps_wroff_xtra); 18474 } 18475 18476 /* 18477 * If this is endpoint is handling SSL, then reserve extra 18478 * offset and space at the end. 18479 * Also have the stream head allocate SSL3_MAX_RECORD_LEN packets, 18480 * overriding the previous setting. The extra cost of signing and 18481 * encrypting multiple MSS-size records (12 of them with Ethernet), 18482 * instead of a single contiguous one by the stream head 18483 * largely outweighs the statistical reduction of ACKs, when 18484 * applicable. The peer will also save on decryption and verification 18485 * costs. 18486 */ 18487 if (tcp->tcp_kssl_ctx != NULL) { 18488 stropt->so_wroff += SSL3_WROFFSET; 18489 18490 stropt->so_flags |= SO_TAIL; 18491 stropt->so_tail = SSL3_MAX_TAIL_LEN; 18492 18493 stropt->so_flags |= SO_COPYOPT; 18494 stropt->so_copyopt = ZCVMUNSAFE; 18495 18496 stropt->so_maxblk = SSL3_MAX_RECORD_LEN; 18497 } 18498 18499 /* Send the options up */ 18500 putnext(q, stropt_mp); 18501 18502 /* 18503 * Pass up any data and/or a fin that has been received. 18504 * 18505 * Adjust receive window in case it had decreased 18506 * (because there is data <=> tcp_rcv_list != NULL) 18507 * while the connection was detached. Note that 18508 * in case the eager was flow-controlled, w/o this 18509 * code, the rwnd may never open up again! 
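 * In the non-fused case below, a window-update ACK is only generated
 * once the connection is at least ESTABLISHED and the newly opened
 * space (q_hiwat minus thwin, the window the peer currently believes
 * it has) is at least one MSS, so we do not advertise tiny increments.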
18510 */ 18511 if (tcp->tcp_rcv_list != NULL) { 18512 /* We drain directly in case of fused tcp loopback */ 18513 sodirect_t *sodp; 18514 18515 if (!tcp->tcp_fused && canputnext(q)) { 18516 tcp->tcp_rwnd = q->q_hiwat; 18517 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 18518 << tcp->tcp_rcv_ws; 18519 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 18520 if (tcp->tcp_state >= TCPS_ESTABLISHED && 18521 (q->q_hiwat - thwin >= tcp->tcp_mss)) { 18522 tcp_xmit_ctl(NULL, 18523 tcp, (tcp->tcp_swnd == 0) ? 18524 tcp->tcp_suna : tcp->tcp_snxt, 18525 tcp->tcp_rnxt, TH_ACK); 18526 BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); 18527 } 18528 18529 } 18530 18531 SOD_PTR_ENTER(tcp, sodp); 18532 if (sodp != NULL) { 18533 /* Sodirect, move from rcv_list */ 18534 ASSERT(!tcp->tcp_fused); 18535 while ((mp = tcp->tcp_rcv_list) != NULL) { 18536 tcp->tcp_rcv_list = mp->b_next; 18537 mp->b_next = NULL; 18538 (void) tcp_rcv_sod_enqueue(tcp, sodp, mp, 18539 msgdsize(mp)); 18540 } 18541 tcp->tcp_rcv_last_head = NULL; 18542 tcp->tcp_rcv_last_tail = NULL; 18543 tcp->tcp_rcv_cnt = 0; 18544 (void) tcp_rcv_sod_wakeup(tcp, sodp); 18545 /* sod_wakeup() did the mutex_exit() */ 18546 } else { 18547 /* Not sodirect, drain */ 18548 (void) tcp_rcv_drain(q, tcp); 18549 } 18550 18551 /* 18552 * For fused tcp loopback, back-enable peer endpoint 18553 * if it's currently flow-controlled. 18554 */ 18555 if (tcp->tcp_fused) { 18556 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 18557 18558 ASSERT(peer_tcp != NULL); 18559 ASSERT(peer_tcp->tcp_fused); 18560 /* 18561 * In order to change the peer's tcp_flow_stopped, 18562 * we need to take locks for both end points. The 18563 * highest address is taken first. 18564 */ 18565 if (peer_tcp > tcp) { 18566 mutex_enter(&peer_tcp->tcp_non_sq_lock); 18567 mutex_enter(&tcp->tcp_non_sq_lock); 18568 } else { 18569 mutex_enter(&tcp->tcp_non_sq_lock); 18570 mutex_enter(&peer_tcp->tcp_non_sq_lock); 18571 } 18572 if (peer_tcp->tcp_flow_stopped) { 18573 tcp_clrqfull(peer_tcp); 18574 TCP_STAT(tcps, tcp_fusion_backenabled); 18575 } 18576 mutex_exit(&peer_tcp->tcp_non_sq_lock); 18577 mutex_exit(&tcp->tcp_non_sq_lock); 18578 } 18579 } 18580 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 18581 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 18582 mp = mi_tpi_ordrel_ind(); 18583 if (mp) { 18584 tcp->tcp_ordrel_done = B_TRUE; 18585 putnext(q, mp); 18586 if (tcp->tcp_deferred_clean_death) { 18587 /* 18588 * tcp_clean_death was deferred 18589 * for T_ORDREL_IND - do it now 18590 */ 18591 (void) tcp_clean_death(tcp, 18592 tcp->tcp_client_errno, 21); 18593 tcp->tcp_deferred_clean_death = B_FALSE; 18594 } 18595 } else { 18596 /* 18597 * Run the orderly release in the 18598 * service routine. 18599 */ 18600 qenable(q); 18601 } 18602 } 18603 if (tcp->tcp_hard_binding) { 18604 tcp->tcp_hard_binding = B_FALSE; 18605 tcp->tcp_hard_bound = B_TRUE; 18606 } 18607 18608 tcp->tcp_detached = B_FALSE; 18609 18610 /* We can enable synchronous streams now */ 18611 if (tcp->tcp_fused) { 18612 tcp_fuse_syncstr_enable_pair(tcp); 18613 } 18614 18615 if (tcp->tcp_ka_enabled) { 18616 tcp->tcp_ka_last_intrvl = 0; 18617 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 18618 MSEC_TO_TICK(tcp->tcp_ka_interval)); 18619 } 18620 18621 /* 18622 * At this point, eager is fully established and will 18623 * have the following references - 18624 * 18625 * 2 references for connection to exist (1 for TCP and 1 for IP). 18626 * 1 reference for the squeue which will be dropped by the squeue as 18627 * soon as this function returns. 
18628 * There will be 1 additional reference for being in the classifier 18629 * hash list provided something bad hasn't happened. 18630 */ 18631 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 18632 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 18633 } 18634 18635 /* 18636 * The function called through squeue to get behind the listener's perimeter to 18637 * send a deferred conn_ind. 18638 */ 18639 /* ARGSUSED */ 18640 void 18641 tcp_send_pending(void *arg, mblk_t *mp, void *arg2) 18642 { 18643 conn_t *connp = (conn_t *)arg; 18644 tcp_t *listener = connp->conn_tcp; 18645 18646 if (listener->tcp_state == TCPS_CLOSED || 18647 TCP_IS_DETACHED(listener)) { 18648 /* 18649 * If the listener has closed, it would have caused 18650 * a cleanup/blowoff to happen for the eager. 18651 */ 18652 tcp_t *tcp; 18653 struct T_conn_ind *conn_ind; 18654 18655 conn_ind = (struct T_conn_ind *)mp->b_rptr; 18656 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 18657 conn_ind->OPT_length); 18658 /* 18659 * We need to drop the ref on the eager that was put in 18660 * tcp_rput_data() before trying to send the conn_ind 18661 * to the listener. The conn_ind was deferred in tcp_send_conn_ind 18662 * and tcp_wput_accept() is sending this deferred conn_ind, but 18663 * the listener is closed, so we drop the ref. 18664 */ 18665 CONN_DEC_REF(tcp->tcp_connp); 18666 freemsg(mp); 18667 return; 18668 } 18669 putnext(listener->tcp_rq, mp); 18670 } 18671 18672 18673 /* 18674 * This is the STREAMS entry point for T_CONN_RES coming down on 18675 * the Acceptor STREAM when the sockfs listener does accept processing. 18676 * Read the block comment on top of tcp_conn_request(). 18677 */ 18678 void 18679 tcp_wput_accept(queue_t *q, mblk_t *mp) 18680 { 18681 queue_t *rq = RD(q); 18682 struct T_conn_res *conn_res; 18683 tcp_t *eager; 18684 tcp_t *listener; 18685 struct T_ok_ack *ok; 18686 t_scalar_t PRIM_type; 18687 mblk_t *opt_mp; 18688 conn_t *econnp; 18689 18690 ASSERT(DB_TYPE(mp) == M_PROTO); 18691 18692 conn_res = (struct T_conn_res *)mp->b_rptr; 18693 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 18694 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { 18695 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 18696 if (mp != NULL) 18697 putnext(rq, mp); 18698 return; 18699 } 18700 switch (conn_res->PRIM_type) { 18701 case O_T_CONN_RES: 18702 case T_CONN_RES: 18703 /* 18704 * We pass up an err ack if allocb fails. This will 18705 * cause sockfs to issue a T_DISCON_REQ which will cause 18706 * tcp_eager_blowoff to be called. sockfs will then call 18707 * rq->q_qinfo->qi_qclose to clean up the acceptor stream. 18708 * We need to do the allocb up here because we have to 18709 * make sure rq->q_qinfo->qi_qclose still points to the 18710 * correct function (tcpclose_accept) in case allocb 18711 * fails.
18712 */ 18713 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI); 18714 if (opt_mp == NULL) { 18715 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 18716 if (mp != NULL) 18717 putnext(rq, mp); 18718 return; 18719 } 18720 18721 bcopy(mp->b_rptr + conn_res->OPT_offset, 18722 &eager, conn_res->OPT_length); 18723 PRIM_type = conn_res->PRIM_type; 18724 mp->b_datap->db_type = M_PCPROTO; 18725 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); 18726 ok = (struct T_ok_ack *)mp->b_rptr; 18727 ok->PRIM_type = T_OK_ACK; 18728 ok->CORRECT_prim = PRIM_type; 18729 econnp = eager->tcp_connp; 18730 econnp->conn_dev = (dev_t)RD(q)->q_ptr; 18731 econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); 18732 eager->tcp_rq = rq; 18733 eager->tcp_wq = q; 18734 rq->q_ptr = econnp; 18735 rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ 18736 q->q_ptr = econnp; 18737 q->q_qinfo = &tcp_winit; 18738 listener = eager->tcp_listener; 18739 eager->tcp_issocket = B_TRUE; 18740 18741 /* 18742 * TCP is _D_SODIRECT and sockfs is directly above so 18743 * save shared sodirect_t pointer (if any). 18744 * 18745 * If tcp_fused and sodirect enabled disable it. 18746 */ 18747 eager->tcp_sodirect = SOD_QTOSODP(eager->tcp_rq); 18748 if (eager->tcp_fused && eager->tcp_sodirect != NULL) { 18749 /* Fused, disable sodirect */ 18750 mutex_enter(eager->tcp_sodirect->sod_lock); 18751 SOD_DISABLE(eager->tcp_sodirect); 18752 mutex_exit(eager->tcp_sodirect->sod_lock); 18753 eager->tcp_sodirect = NULL; 18754 } 18755 18756 econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; 18757 econnp->conn_allzones = listener->tcp_connp->conn_allzones; 18758 ASSERT(econnp->conn_netstack == 18759 listener->tcp_connp->conn_netstack); 18760 ASSERT(eager->tcp_tcps == listener->tcp_tcps); 18761 18762 /* Put the ref for IP */ 18763 CONN_INC_REF(econnp); 18764 18765 /* 18766 * We should have minimum of 3 references on the conn 18767 * at this point. One each for TCP and IP and one for 18768 * the T_conn_ind that was sent up when the 3-way handshake 18769 * completed. In the normal case we would also have another 18770 * reference (making a total of 4) for the conn being in the 18771 * classifier hash list. However the eager could have received 18772 * an RST subsequently and tcp_closei_local could have removed 18773 * the eager from the classifier hash list, hence we can't 18774 * assert that reference. 18775 */ 18776 ASSERT(econnp->conn_ref >= 3); 18777 18778 /* 18779 * Send the new local address also up to sockfs. There 18780 * should already be enough space in the mp that came 18781 * down from soaccept(). 
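 * The address is appended directly after the T_ok_ack: a sin_t (local
 * port plus IPv4 source address) for AF_INET sockets, or a sin6_t for
 * AF_INET6, with the address V4-mapped when the eager is actually an
 * IPv4 connection.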
18782 */ 18783 if (eager->tcp_family == AF_INET) { 18784 sin_t *sin; 18785 18786 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 18787 (sizeof (struct T_ok_ack) + sizeof (sin_t))); 18788 sin = (sin_t *)mp->b_wptr; 18789 mp->b_wptr += sizeof (sin_t); 18790 sin->sin_family = AF_INET; 18791 sin->sin_port = eager->tcp_lport; 18792 sin->sin_addr.s_addr = eager->tcp_ipha->ipha_src; 18793 } else { 18794 sin6_t *sin6; 18795 18796 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 18797 sizeof (struct T_ok_ack) + sizeof (sin6_t)); 18798 sin6 = (sin6_t *)mp->b_wptr; 18799 mp->b_wptr += sizeof (sin6_t); 18800 sin6->sin6_family = AF_INET6; 18801 sin6->sin6_port = eager->tcp_lport; 18802 if (eager->tcp_ipversion == IPV4_VERSION) { 18803 sin6->sin6_flowinfo = 0; 18804 IN6_IPADDR_TO_V4MAPPED( 18805 eager->tcp_ipha->ipha_src, 18806 &sin6->sin6_addr); 18807 } else { 18808 ASSERT(eager->tcp_ip6h != NULL); 18809 sin6->sin6_flowinfo = 18810 eager->tcp_ip6h->ip6_vcf & 18811 ~IPV6_VERS_AND_FLOW_MASK; 18812 sin6->sin6_addr = eager->tcp_ip6h->ip6_src; 18813 } 18814 sin6->sin6_scope_id = 0; 18815 sin6->__sin6_src_id = 0; 18816 } 18817 18818 putnext(rq, mp); 18819 18820 opt_mp->b_datap->db_type = M_SETOPTS; 18821 opt_mp->b_wptr += sizeof (struct stroptions); 18822 18823 /* 18824 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO 18825 * from listener to acceptor. The message is chained on the 18826 * bind_mp which tcp_rput_other will send down to IP. 18827 */ 18828 if (listener->tcp_bound_if != 0) { 18829 /* allocate optmgmt req */ 18830 mp = tcp_setsockopt_mp(IPPROTO_IPV6, 18831 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if, 18832 sizeof (int)); 18833 if (mp != NULL) 18834 linkb(opt_mp, mp); 18835 } 18836 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { 18837 uint_t on = 1; 18838 18839 /* allocate optmgmt req */ 18840 mp = tcp_setsockopt_mp(IPPROTO_IPV6, 18841 IPV6_RECVPKTINFO, (char *)&on, sizeof (on)); 18842 if (mp != NULL) 18843 linkb(opt_mp, mp); 18844 } 18845 18846 18847 mutex_enter(&listener->tcp_eager_lock); 18848 18849 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 18850 18851 tcp_t *tail; 18852 tcp_t *tcp; 18853 mblk_t *mp1; 18854 18855 tcp = listener->tcp_eager_prev_q0; 18856 /* 18857 * listener->tcp_eager_prev_q0 points to the TAIL of the 18858 * deferred T_conn_ind queue. We need to get to the head 18859 * of the queue in order to send up T_conn_ind the same 18860 * order as how the 3WHS is completed. 
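 * The loop below walks tcp_eager_prev_q0 back from the tail and stops
 * at the first eager whose predecessor is no longer deferred (and that
 * has no KSSL work pending); if we walk all the way back to the
 * listener, none of the pending eagers can be sent up yet.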
18861 */ 18862 while (tcp != listener) { 18863 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 && 18864 !tcp->tcp_kssl_pending) 18865 break; 18866 else 18867 tcp = tcp->tcp_eager_prev_q0; 18868 } 18869 /* None of the pending eagers can be sent up now */ 18870 if (tcp == listener) 18871 goto no_more_eagers; 18872 18873 mp1 = tcp->tcp_conn.tcp_eager_conn_ind; 18874 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 18875 /* Move from q0 to q */ 18876 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 18877 listener->tcp_conn_req_cnt_q0--; 18878 listener->tcp_conn_req_cnt_q++; 18879 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 18880 tcp->tcp_eager_prev_q0; 18881 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 18882 tcp->tcp_eager_next_q0; 18883 tcp->tcp_eager_prev_q0 = NULL; 18884 tcp->tcp_eager_next_q0 = NULL; 18885 tcp->tcp_conn_def_q0 = B_FALSE; 18886 18887 /* Make sure the tcp isn't in the list of droppables */ 18888 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && 18889 tcp->tcp_eager_prev_drop_q0 == NULL); 18890 18891 /* 18892 * Insert at end of the queue because sockfs sends 18893 * down T_CONN_RES in chronological order. Leaving 18894 * the older conn indications at front of the queue 18895 * helps reducing search time. 18896 */ 18897 tail = listener->tcp_eager_last_q; 18898 if (tail != NULL) { 18899 tail->tcp_eager_next_q = tcp; 18900 } else { 18901 listener->tcp_eager_next_q = tcp; 18902 } 18903 listener->tcp_eager_last_q = tcp; 18904 tcp->tcp_eager_next_q = NULL; 18905 18906 /* Need to get inside the listener perimeter */ 18907 CONN_INC_REF(listener->tcp_connp); 18908 squeue_fill(listener->tcp_connp->conn_sqp, mp1, 18909 tcp_send_pending, listener->tcp_connp, 18910 SQTAG_TCP_SEND_PENDING); 18911 } 18912 no_more_eagers: 18913 tcp_eager_unlink(eager); 18914 mutex_exit(&listener->tcp_eager_lock); 18915 18916 /* 18917 * At this point, the eager is detached from the listener 18918 * but we still have an extra refs on eager (apart from the 18919 * usual tcp references). The ref was placed in tcp_rput_data 18920 * before sending the conn_ind in tcp_send_conn_ind. 18921 * The ref will be dropped in tcp_accept_finish(). 
18922 */ 18923 squeue_enter_nodrain(econnp->conn_sqp, opt_mp, 18924 tcp_accept_finish, econnp, SQTAG_TCP_ACCEPT_FINISH_Q0); 18925 return; 18926 default: 18927 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); 18928 if (mp != NULL) 18929 putnext(rq, mp); 18930 return; 18931 } 18932 } 18933 18934 static int 18935 tcp_getmyname(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) 18936 { 18937 sin_t *sin = (sin_t *)sa; 18938 sin6_t *sin6 = (sin6_t *)sa; 18939 18940 switch (tcp->tcp_family) { 18941 case AF_INET: 18942 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 18943 18944 if (*salenp < sizeof (sin_t)) 18945 return (EINVAL); 18946 18947 *sin = sin_null; 18948 sin->sin_family = AF_INET; 18949 sin->sin_port = tcp->tcp_lport; 18950 sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; 18951 break; 18952 18953 case AF_INET6: 18954 if (*salenp < sizeof (sin6_t)) 18955 return (EINVAL); 18956 18957 *sin6 = sin6_null; 18958 sin6->sin6_family = AF_INET6; 18959 sin6->sin6_port = tcp->tcp_lport; 18960 if (tcp->tcp_ipversion == IPV4_VERSION) { 18961 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 18962 &sin6->sin6_addr); 18963 } else { 18964 sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; 18965 } 18966 break; 18967 } 18968 18969 return (0); 18970 } 18971 18972 static int 18973 tcp_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) 18974 { 18975 sin_t *sin = (sin_t *)sa; 18976 sin6_t *sin6 = (sin6_t *)sa; 18977 18978 if (tcp->tcp_state < TCPS_SYN_RCVD) 18979 return (ENOTCONN); 18980 18981 switch (tcp->tcp_family) { 18982 case AF_INET: 18983 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 18984 18985 if (*salenp < sizeof (sin_t)) 18986 return (EINVAL); 18987 18988 *sin = sin_null; 18989 sin->sin_family = AF_INET; 18990 sin->sin_port = tcp->tcp_fport; 18991 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6, 18992 sin->sin_addr.s_addr); 18993 break; 18994 18995 case AF_INET6: 18996 if (*salenp < sizeof (sin6_t)) 18997 return (EINVAL); 18998 18999 *sin6 = sin6_null; 19000 sin6->sin6_family = AF_INET6; 19001 sin6->sin6_port = tcp->tcp_fport; 19002 sin6->sin6_addr = tcp->tcp_remote_v6; 19003 if (tcp->tcp_ipversion == IPV6_VERSION) { 19004 sin6->sin6_flowinfo = tcp->tcp_ip6h->ip6_vcf & 19005 ~IPV6_VERS_AND_FLOW_MASK; 19006 } 19007 break; 19008 } 19009 19010 return (0); 19011 } 19012 19013 /* 19014 * Handle special out-of-band ioctl requests (see PSARC/2008/265). 
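 * The cmdblk_t lives in the first mblk and the caller's address buffer
 * in b_cont; TI_GETPEERNAME and TI_GETMYNAME are dispatched to
 * tcp_getpeername()/tcp_getmyname() and the result (or EPROTO/EINVAL)
 * is returned with qreply().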
19015 */ 19016 static void 19017 tcp_wput_cmdblk(queue_t *q, mblk_t *mp) 19018 { 19019 void *data; 19020 mblk_t *datamp = mp->b_cont; 19021 tcp_t *tcp = Q_TO_TCP(q); 19022 cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr; 19023 19024 if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) { 19025 cmdp->cb_error = EPROTO; 19026 qreply(q, mp); 19027 return; 19028 } 19029 19030 data = datamp->b_rptr; 19031 19032 switch (cmdp->cb_cmd) { 19033 case TI_GETPEERNAME: 19034 cmdp->cb_error = tcp_getpeername(tcp, data, &cmdp->cb_len); 19035 break; 19036 case TI_GETMYNAME: 19037 cmdp->cb_error = tcp_getmyname(tcp, data, &cmdp->cb_len); 19038 break; 19039 default: 19040 cmdp->cb_error = EINVAL; 19041 break; 19042 } 19043 19044 qreply(q, mp); 19045 } 19046 19047 void 19048 tcp_wput(queue_t *q, mblk_t *mp) 19049 { 19050 conn_t *connp = Q_TO_CONN(q); 19051 tcp_t *tcp; 19052 void (*output_proc)(); 19053 t_scalar_t type; 19054 uchar_t *rptr; 19055 struct iocblk *iocp; 19056 uint32_t msize; 19057 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 19058 19059 ASSERT(connp->conn_ref >= 2); 19060 19061 switch (DB_TYPE(mp)) { 19062 case M_DATA: 19063 tcp = connp->conn_tcp; 19064 ASSERT(tcp != NULL); 19065 19066 msize = msgdsize(mp); 19067 19068 mutex_enter(&tcp->tcp_non_sq_lock); 19069 tcp->tcp_squeue_bytes += msize; 19070 if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { 19071 tcp_setqfull(tcp); 19072 } 19073 mutex_exit(&tcp->tcp_non_sq_lock); 19074 19075 CONN_INC_REF(connp); 19076 (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, 19077 tcp_output, connp, SQTAG_TCP_OUTPUT); 19078 return; 19079 19080 case M_CMD: 19081 tcp_wput_cmdblk(q, mp); 19082 return; 19083 19084 case M_PROTO: 19085 case M_PCPROTO: 19086 /* 19087 * if it is a snmp message, don't get behind the squeue 19088 */ 19089 tcp = connp->conn_tcp; 19090 rptr = mp->b_rptr; 19091 if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 19092 type = ((union T_primitives *)rptr)->type; 19093 } else { 19094 if (tcp->tcp_debug) { 19095 (void) strlog(TCP_MOD_ID, 0, 1, 19096 SL_ERROR|SL_TRACE, 19097 "tcp_wput_proto, dropping one..."); 19098 } 19099 freemsg(mp); 19100 return; 19101 } 19102 if (type == T_SVR4_OPTMGMT_REQ) { 19103 cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred); 19104 if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get, 19105 cr)) { 19106 /* 19107 * This was a SNMP request 19108 */ 19109 return; 19110 } else { 19111 output_proc = tcp_wput_proto; 19112 } 19113 } else { 19114 output_proc = tcp_wput_proto; 19115 } 19116 break; 19117 case M_IOCTL: 19118 /* 19119 * Most ioctls can be processed right away without going via 19120 * squeues - process them right here. Those that do require 19121 * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK) 19122 * are processed by tcp_wput_ioctl(). 19123 */ 19124 iocp = (struct iocblk *)mp->b_rptr; 19125 tcp = connp->conn_tcp; 19126 19127 switch (iocp->ioc_cmd) { 19128 case TCP_IOC_ABORT_CONN: 19129 tcp_ioctl_abort_conn(q, mp); 19130 return; 19131 case TI_GETPEERNAME: 19132 case TI_GETMYNAME: 19133 mi_copyin(q, mp, NULL, 19134 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 19135 return; 19136 case ND_SET: 19137 /* nd_getset does the necessary checks */ 19138 case ND_GET: 19139 if (!nd_getset(q, tcps->tcps_g_nd, mp)) { 19140 CALL_IP_WPUT(connp, q, mp); 19141 return; 19142 } 19143 qreply(q, mp); 19144 return; 19145 case TCP_IOC_DEFAULT_Q: 19146 /* 19147 * Wants to be the default wq. Check the credentials 19148 * first, the rest is executed via squeue. 
19149 */ 19150 if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { 19151 iocp->ioc_error = EPERM; 19152 iocp->ioc_count = 0; 19153 mp->b_datap->db_type = M_IOCACK; 19154 qreply(q, mp); 19155 return; 19156 } 19157 output_proc = tcp_wput_ioctl; 19158 break; 19159 default: 19160 output_proc = tcp_wput_ioctl; 19161 break; 19162 } 19163 break; 19164 default: 19165 output_proc = tcp_wput_nondata; 19166 break; 19167 } 19168 19169 CONN_INC_REF(connp); 19170 (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, 19171 output_proc, connp, SQTAG_TCP_WPUT_OTHER); 19172 } 19173 19174 /* 19175 * Initial STREAMS write side put() procedure for sockets. It tries to 19176 * handle the T_CAPABILITY_REQ which sockfs sends down while setting 19177 * up the socket without using the squeue. Non T_CAPABILITY_REQ messages 19178 * are handled by tcp_wput() as usual. 19179 * 19180 * All further messages will also be handled by tcp_wput() because we cannot 19181 * be sure that the above short cut is safe later. 19182 */ 19183 static void 19184 tcp_wput_sock(queue_t *wq, mblk_t *mp) 19185 { 19186 conn_t *connp = Q_TO_CONN(wq); 19187 tcp_t *tcp = connp->conn_tcp; 19188 struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr; 19189 19190 ASSERT(wq->q_qinfo == &tcp_sock_winit); 19191 wq->q_qinfo = &tcp_winit; 19192 19193 ASSERT(IPCL_IS_TCP(connp)); 19194 ASSERT(TCP_IS_SOCKET(tcp)); 19195 19196 if (DB_TYPE(mp) == M_PCPROTO && 19197 MBLKL(mp) == sizeof (struct T_capability_req) && 19198 car->PRIM_type == T_CAPABILITY_REQ) { 19199 tcp_capability_req(tcp, mp); 19200 return; 19201 } 19202 19203 tcp_wput(wq, mp); 19204 } 19205 19206 static boolean_t 19207 tcp_zcopy_check(tcp_t *tcp) 19208 { 19209 conn_t *connp = tcp->tcp_connp; 19210 ire_t *ire; 19211 boolean_t zc_enabled = B_FALSE; 19212 tcp_stack_t *tcps = tcp->tcp_tcps; 19213 19214 if (do_tcpzcopy == 2) 19215 zc_enabled = B_TRUE; 19216 else if (tcp->tcp_ipversion == IPV4_VERSION && 19217 IPCL_IS_CONNECTED(connp) && 19218 (connp->conn_flags & IPCL_CHECK_POLICY) == 0 && 19219 connp->conn_dontroute == 0 && 19220 !connp->conn_nexthop_set && 19221 connp->conn_outgoing_ill == NULL && 19222 connp->conn_nofailover_ill == NULL && 19223 do_tcpzcopy == 1) { 19224 /* 19225 * the checks above closely resemble the fast path checks 19226 * in tcp_send_data(). 
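 * Zero-copy is only turned on when the cached IRE (inspected below
 * under conn_lock) leads to an ill that advertises ILL_CAPAB_ZEROCOPY
 * with non-zero ill_zerocopy_flags; for attached endpoints the decision
 * is then mirrored to the stream head via mi_set_sth_copyopt()
 * (ZCVMSAFE vs. ZCVMUNSAFE).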
19227 */ 19228 mutex_enter(&connp->conn_lock); 19229 ire = connp->conn_ire_cache; 19230 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 19231 if (ire != NULL && !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 19232 IRE_REFHOLD(ire); 19233 if (ire->ire_stq != NULL) { 19234 ill_t *ill = (ill_t *)ire->ire_stq->q_ptr; 19235 19236 zc_enabled = ill && (ill->ill_capabilities & 19237 ILL_CAPAB_ZEROCOPY) && 19238 (ill->ill_zerocopy_capab-> 19239 ill_zerocopy_flags != 0); 19240 } 19241 IRE_REFRELE(ire); 19242 } 19243 mutex_exit(&connp->conn_lock); 19244 } 19245 tcp->tcp_snd_zcopy_on = zc_enabled; 19246 if (!TCP_IS_DETACHED(tcp)) { 19247 if (zc_enabled) { 19248 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMSAFE); 19249 TCP_STAT(tcps, tcp_zcopy_on); 19250 } else { 19251 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); 19252 TCP_STAT(tcps, tcp_zcopy_off); 19253 } 19254 } 19255 return (zc_enabled); 19256 } 19257 19258 static mblk_t * 19259 tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp) 19260 { 19261 tcp_stack_t *tcps = tcp->tcp_tcps; 19262 19263 if (do_tcpzcopy == 2) 19264 return (bp); 19265 else if (tcp->tcp_snd_zcopy_on) { 19266 tcp->tcp_snd_zcopy_on = B_FALSE; 19267 if (!TCP_IS_DETACHED(tcp)) { 19268 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); 19269 TCP_STAT(tcps, tcp_zcopy_disable); 19270 } 19271 } 19272 return (tcp_zcopy_backoff(tcp, bp, 0)); 19273 } 19274 19275 /* 19276 * Backoff from a zero-copy mblk by copying data to a new mblk and freeing 19277 * the original desballoca'ed segmapped mblk. 19278 */ 19279 static mblk_t * 19280 tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist) 19281 { 19282 mblk_t *head, *tail, *nbp; 19283 tcp_stack_t *tcps = tcp->tcp_tcps; 19284 19285 if (IS_VMLOANED_MBLK(bp)) { 19286 TCP_STAT(tcps, tcp_zcopy_backoff); 19287 if ((head = copyb(bp)) == NULL) { 19288 /* fail to backoff; leave it for the next backoff */ 19289 tcp->tcp_xmit_zc_clean = B_FALSE; 19290 return (bp); 19291 } 19292 if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 19293 if (fix_xmitlist) 19294 tcp_zcopy_notify(tcp); 19295 else 19296 head->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 19297 } 19298 nbp = bp->b_cont; 19299 if (fix_xmitlist) { 19300 head->b_prev = bp->b_prev; 19301 head->b_next = bp->b_next; 19302 if (tcp->tcp_xmit_tail == bp) 19303 tcp->tcp_xmit_tail = head; 19304 } 19305 bp->b_next = NULL; 19306 bp->b_prev = NULL; 19307 freeb(bp); 19308 } else { 19309 head = bp; 19310 nbp = bp->b_cont; 19311 } 19312 tail = head; 19313 while (nbp) { 19314 if (IS_VMLOANED_MBLK(nbp)) { 19315 TCP_STAT(tcps, tcp_zcopy_backoff); 19316 if ((tail->b_cont = copyb(nbp)) == NULL) { 19317 tcp->tcp_xmit_zc_clean = B_FALSE; 19318 tail->b_cont = nbp; 19319 return (head); 19320 } 19321 tail = tail->b_cont; 19322 if (nbp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 19323 if (fix_xmitlist) 19324 tcp_zcopy_notify(tcp); 19325 else 19326 tail->b_datap->db_struioflag |= 19327 STRUIO_ZCNOTIFY; 19328 } 19329 bp = nbp; 19330 nbp = nbp->b_cont; 19331 if (fix_xmitlist) { 19332 tail->b_prev = bp->b_prev; 19333 tail->b_next = bp->b_next; 19334 if (tcp->tcp_xmit_tail == bp) 19335 tcp->tcp_xmit_tail = tail; 19336 } 19337 bp->b_next = NULL; 19338 bp->b_prev = NULL; 19339 freeb(bp); 19340 } else { 19341 tail->b_cont = nbp; 19342 tail = nbp; 19343 nbp = nbp->b_cont; 19344 } 19345 } 19346 if (fix_xmitlist) { 19347 tcp->tcp_xmit_last = tail; 19348 tcp->tcp_xmit_zc_clean = B_TRUE; 19349 } 19350 return (head); 19351 } 19352 19353 static void 19354 tcp_zcopy_notify(tcp_t *tcp) 19355 { 19356 struct stdata *stp; 19357 19358 if 
(tcp->tcp_detached) 19359 return; 19360 stp = STREAM(tcp->tcp_rq); 19361 mutex_enter(&stp->sd_lock); 19362 stp->sd_flag |= STZCNOTIFY; 19363 cv_broadcast(&stp->sd_zcopy_wait); 19364 mutex_exit(&stp->sd_lock); 19365 } 19366 19367 static boolean_t 19368 tcp_send_find_ire(tcp_t *tcp, ipaddr_t *dst, ire_t **irep) 19369 { 19370 ire_t *ire; 19371 conn_t *connp = tcp->tcp_connp; 19372 tcp_stack_t *tcps = tcp->tcp_tcps; 19373 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 19374 19375 mutex_enter(&connp->conn_lock); 19376 ire = connp->conn_ire_cache; 19377 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 19378 19379 if ((ire != NULL) && 19380 (((dst != NULL) && (ire->ire_addr == *dst)) || ((dst == NULL) && 19381 IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &tcp->tcp_ip6h->ip6_dst))) && 19382 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 19383 IRE_REFHOLD(ire); 19384 mutex_exit(&connp->conn_lock); 19385 } else { 19386 boolean_t cached = B_FALSE; 19387 ts_label_t *tsl; 19388 19389 /* force a recheck later on */ 19390 tcp->tcp_ire_ill_check_done = B_FALSE; 19391 19392 TCP_DBGSTAT(tcps, tcp_ire_null1); 19393 connp->conn_ire_cache = NULL; 19394 mutex_exit(&connp->conn_lock); 19395 19396 if (ire != NULL) 19397 IRE_REFRELE_NOTR(ire); 19398 19399 tsl = crgetlabel(CONN_CRED(connp)); 19400 ire = (dst ? 19401 ire_cache_lookup(*dst, connp->conn_zoneid, tsl, ipst) : 19402 ire_cache_lookup_v6(&tcp->tcp_ip6h->ip6_dst, 19403 connp->conn_zoneid, tsl, ipst)); 19404 19405 if (ire == NULL) { 19406 TCP_STAT(tcps, tcp_ire_null); 19407 return (B_FALSE); 19408 } 19409 19410 IRE_REFHOLD_NOTR(ire); 19411 /* 19412 * Since we are inside the squeue, there cannot be another 19413 * thread in TCP trying to set the conn_ire_cache now. The 19414 * check for IRE_MARK_CONDEMNED ensures that an interface 19415 * unplumb thread has not yet started cleaning up the conns. 19416 * Hence we don't need to grab the conn lock. 19417 */ 19418 if (CONN_CACHE_IRE(connp)) { 19419 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 19420 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 19421 TCP_CHECK_IREINFO(tcp, ire); 19422 connp->conn_ire_cache = ire; 19423 cached = B_TRUE; 19424 } 19425 rw_exit(&ire->ire_bucket->irb_lock); 19426 } 19427 19428 /* 19429 * We can continue to use the ire but since it was 19430 * not cached, we should drop the extra reference. 19431 */ 19432 if (!cached) 19433 IRE_REFRELE_NOTR(ire); 19434 19435 /* 19436 * Rampart note: no need to select a new label here, since 19437 * labels are not allowed to change during the life of a TCP 19438 * connection. 19439 */ 19440 } 19441 19442 *irep = ire; 19443 19444 return (B_TRUE); 19445 } 19446 19447 /* 19448 * Called from tcp_send() or tcp_send_data() to find workable IRE. 19449 * 19450 * 0 = success; 19451 * 1 = failed to find ire and ill. 
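 * (Note that the return type is actually boolean_t: B_TRUE when an
 * ire/ill pair was found, B_FALSE otherwise.)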
19452 */ 19453 static boolean_t 19454 tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) 19455 { 19456 ipha_t *ipha; 19457 ipaddr_t dst; 19458 ire_t *ire; 19459 ill_t *ill; 19460 conn_t *connp = tcp->tcp_connp; 19461 mblk_t *ire_fp_mp; 19462 tcp_stack_t *tcps = tcp->tcp_tcps; 19463 19464 if (mp != NULL) 19465 ipha = (ipha_t *)mp->b_rptr; 19466 else 19467 ipha = tcp->tcp_ipha; 19468 dst = ipha->ipha_dst; 19469 19470 if (!tcp_send_find_ire(tcp, &dst, &ire)) 19471 return (B_FALSE); 19472 19473 if ((ire->ire_flags & RTF_MULTIRT) || 19474 (ire->ire_stq == NULL) || 19475 (ire->ire_nce == NULL) || 19476 ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) || 19477 ((mp != NULL) && (ire->ire_max_frag < ntohs(ipha->ipha_length) || 19478 MBLKL(ire_fp_mp) > MBLKHEAD(mp)))) { 19479 TCP_STAT(tcps, tcp_ip_ire_send); 19480 IRE_REFRELE(ire); 19481 return (B_FALSE); 19482 } 19483 19484 ill = ire_to_ill(ire); 19485 if (connp->conn_outgoing_ill != NULL) { 19486 ill_t *conn_outgoing_ill = NULL; 19487 /* 19488 * Choose a good ill in the group to send the packets on. 19489 */ 19490 ire = conn_set_outgoing_ill(connp, ire, &conn_outgoing_ill); 19491 ill = ire_to_ill(ire); 19492 } 19493 ASSERT(ill != NULL); 19494 19495 if (!tcp->tcp_ire_ill_check_done) { 19496 tcp_ire_ill_check(tcp, ire, ill, B_TRUE); 19497 tcp->tcp_ire_ill_check_done = B_TRUE; 19498 } 19499 19500 *irep = ire; 19501 *illp = ill; 19502 19503 return (B_TRUE); 19504 } 19505 19506 static void 19507 tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) 19508 { 19509 ipha_t *ipha; 19510 ipaddr_t src; 19511 ipaddr_t dst; 19512 uint32_t cksum; 19513 ire_t *ire; 19514 uint16_t *up; 19515 ill_t *ill; 19516 conn_t *connp = tcp->tcp_connp; 19517 uint32_t hcksum_txflags = 0; 19518 mblk_t *ire_fp_mp; 19519 uint_t ire_fp_mp_len; 19520 tcp_stack_t *tcps = tcp->tcp_tcps; 19521 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 19522 19523 ASSERT(DB_TYPE(mp) == M_DATA); 19524 19525 if (DB_CRED(mp) == NULL) 19526 mblk_setcred(mp, CONN_CRED(connp)); 19527 19528 ipha = (ipha_t *)mp->b_rptr; 19529 src = ipha->ipha_src; 19530 dst = ipha->ipha_dst; 19531 19532 /* 19533 * Drop off fast path for IPv6 and also if options are present or 19534 * we need to resolve a TS label. 
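 * The same slow path (CALL_IP_WPUT) is taken when the conn is not
 * fully connected, the LSO/MDT fast path is not allowed, IPsec policy
 * must be checked, the header is not a simple 20-byte IPv4 header, or
 * IPQoS local-out processing is enabled.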
19535 */ 19536 if (tcp->tcp_ipversion != IPV4_VERSION || 19537 !IPCL_IS_CONNECTED(connp) || 19538 !CONN_IS_LSO_MD_FASTPATH(connp) || 19539 (connp->conn_flags & IPCL_CHECK_POLICY) != 0 || 19540 !connp->conn_ulp_labeled || 19541 ipha->ipha_ident == IP_HDR_INCLUDED || 19542 ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || 19543 IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { 19544 if (tcp->tcp_snd_zcopy_aware) 19545 mp = tcp_zcopy_disable(tcp, mp); 19546 TCP_STAT(tcps, tcp_ip_send); 19547 CALL_IP_WPUT(connp, q, mp); 19548 return; 19549 } 19550 19551 if (!tcp_send_find_ire_ill(tcp, mp, &ire, &ill)) { 19552 if (tcp->tcp_snd_zcopy_aware) 19553 mp = tcp_zcopy_backoff(tcp, mp, 0); 19554 CALL_IP_WPUT(connp, q, mp); 19555 return; 19556 } 19557 ire_fp_mp = ire->ire_nce->nce_fp_mp; 19558 ire_fp_mp_len = MBLKL(ire_fp_mp); 19559 19560 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 19561 ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 19562 #ifndef _BIG_ENDIAN 19563 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 19564 #endif 19565 19566 /* 19567 * Check to see if we need to re-enable LSO/MDT for this connection 19568 * because it was previously disabled due to changes in the ill; 19569 * note that by doing it here, this re-enabling only applies when 19570 * the packet is not dispatched through CALL_IP_WPUT(). 19571 * 19572 * That means for IPv4, it is worth re-enabling LSO/MDT for the fastpath 19573 * case, since that's how we ended up here. For IPv6, we do the 19574 * re-enabling work in ip_xmit_v6(), albeit indirectly via squeue. 19575 */ 19576 if (connp->conn_lso_ok && !tcp->tcp_lso && ILL_LSO_TCP_USABLE(ill)) { 19577 /* 19578 * Restore LSO for this connection, so that next time around 19579 * it is eligible to go through tcp_lsosend() path again. 19580 */ 19581 TCP_STAT(tcps, tcp_lso_enabled); 19582 tcp->tcp_lso = B_TRUE; 19583 ip1dbg(("tcp_send_data: reenabling LSO for connp %p on " 19584 "interface %s\n", (void *)connp, ill->ill_name)); 19585 } else if (connp->conn_mdt_ok && !tcp->tcp_mdt && ILL_MDT_USABLE(ill)) { 19586 /* 19587 * Restore MDT for this connection, so that next time around 19588 * it is eligible to go through tcp_multisend() path again. 19589 */ 19590 TCP_STAT(tcps, tcp_mdt_conn_resumed1); 19591 tcp->tcp_mdt = B_TRUE; 19592 ip1dbg(("tcp_send_data: reenabling MDT for connp %p on " 19593 "interface %s\n", (void *)connp, ill->ill_name)); 19594 } 19595 19596 if (tcp->tcp_snd_zcopy_aware) { 19597 if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || 19598 (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) 19599 mp = tcp_zcopy_disable(tcp, mp); 19600 /* 19601 * we shouldn't need to reset ipha as the mp containing 19602 * ipha should never be a zero-copy mp. 19603 */ 19604 } 19605 19606 if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { 19607 ASSERT(ill->ill_hcksum_capab != NULL); 19608 hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; 19609 } 19610 19611 /* pseudo-header checksum (do it in parts for IP header checksum) */ 19612 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 19613 19614 ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); 19615 up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); 19616 19617 IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, 19618 IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); 19619 19620 /* Software checksum? 
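 * If IP_CKSUM_XMIT_FAST left no offload flags on the mblk the TCP
 * checksum was computed in software, so count it and the payload bytes
 * it covered.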
*/ 19621 if (DB_CKSUMFLAGS(mp) == 0) { 19622 TCP_STAT(tcps, tcp_out_sw_cksum); 19623 TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes, 19624 ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); 19625 } 19626 19627 ipha->ipha_fragment_offset_and_flags |= 19628 (uint32_t)htons(ire->ire_frag_flag); 19629 19630 /* Calculate IP header checksum if hardware isn't capable */ 19631 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 19632 IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0], 19633 ((uint16_t *)ipha)[4]); 19634 } 19635 19636 ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); 19637 mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; 19638 bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); 19639 19640 UPDATE_OB_PKT_COUNT(ire); 19641 ire->ire_last_used_time = lbolt; 19642 19643 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 19644 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); 19645 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, 19646 ntohs(ipha->ipha_length)); 19647 19648 if (ILL_DLS_CAPABLE(ill)) { 19649 /* 19650 * Send the packet directly to DLD, where it may be queued 19651 * depending on the availability of transmit resources at 19652 * the media layer. 19653 */ 19654 IP_DLS_ILL_TX(ill, ipha, mp, ipst); 19655 } else { 19656 ill_t *out_ill = (ill_t *)ire->ire_stq->q_ptr; 19657 DTRACE_PROBE4(ip4__physical__out__start, 19658 ill_t *, NULL, ill_t *, out_ill, 19659 ipha_t *, ipha, mblk_t *, mp); 19660 FW_HOOKS(ipst->ips_ip4_physical_out_event, 19661 ipst->ips_ipv4firewall_physical_out, 19662 NULL, out_ill, ipha, mp, mp, 0, ipst); 19663 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); 19664 if (mp != NULL) 19665 putnext(ire->ire_stq, mp); 19666 } 19667 IRE_REFRELE(ire); 19668 } 19669 19670 /* 19671 * This handles the case when the receiver has shrunk its win. Per RFC 1122 19672 * if the receiver shrinks the window, i.e. moves the right window to the 19673 * left, the we should not send new data, but should retransmit normally the 19674 * old unacked data between suna and suna + swnd. We might has sent data 19675 * that is now outside the new window, pretend that we didn't send it. 19676 */ 19677 static void 19678 tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count) 19679 { 19680 uint32_t snxt = tcp->tcp_snxt; 19681 mblk_t *xmit_tail; 19682 int32_t offset; 19683 19684 ASSERT(shrunk_count > 0); 19685 19686 /* Pretend we didn't send the data outside the window */ 19687 snxt -= shrunk_count; 19688 19689 /* Get the mblk and the offset in it per the shrunk window */ 19690 xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset); 19691 19692 ASSERT(xmit_tail != NULL); 19693 19694 /* Reset all the values per the now shrunk window */ 19695 tcp->tcp_snxt = snxt; 19696 tcp->tcp_xmit_tail = xmit_tail; 19697 tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr - xmit_tail->b_rptr - 19698 offset; 19699 tcp->tcp_unsent += shrunk_count; 19700 19701 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0) 19702 /* 19703 * Make sure the timer is running so that we will probe a zero 19704 * window. 19705 */ 19706 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 19707 } 19708 19709 19710 /* 19711 * The TCP normal data output path. 19712 * NOTE: the logic of the fast path is duplicated from this function. 
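 * Roughly: strip zero-length mblks, append the data to the transmit
 * list (possibly pulling a tiny write into the tail dblk), compute the
 * usable window from swnd/cwnd and the unacknowledged data, apply the
 * Nagle test, then hand the actual segmentation to tcp_send() or
 * tcp_multisend().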
19713 */ 19714 static void 19715 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) 19716 { 19717 int len; 19718 mblk_t *local_time; 19719 mblk_t *mp1; 19720 uint32_t snxt; 19721 int tail_unsent; 19722 int tcpstate; 19723 int usable = 0; 19724 mblk_t *xmit_tail; 19725 queue_t *q = tcp->tcp_wq; 19726 int32_t mss; 19727 int32_t num_sack_blk = 0; 19728 int32_t tcp_hdr_len; 19729 int32_t tcp_tcp_hdr_len; 19730 int mdt_thres; 19731 int rc; 19732 tcp_stack_t *tcps = tcp->tcp_tcps; 19733 ip_stack_t *ipst; 19734 19735 tcpstate = tcp->tcp_state; 19736 if (mp == NULL) { 19737 /* 19738 * tcp_wput_data() with NULL mp should only be called when 19739 * there is unsent data. 19740 */ 19741 ASSERT(tcp->tcp_unsent > 0); 19742 /* Really tacky... but we need this for detached closes. */ 19743 len = tcp->tcp_unsent; 19744 goto data_null; 19745 } 19746 19747 #if CCS_STATS 19748 wrw_stats.tot.count++; 19749 wrw_stats.tot.bytes += msgdsize(mp); 19750 #endif 19751 ASSERT(mp->b_datap->db_type == M_DATA); 19752 /* 19753 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 19754 * or before a connection attempt has begun. 19755 */ 19756 if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 19757 (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 19758 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 19759 #ifdef DEBUG 19760 cmn_err(CE_WARN, 19761 "tcp_wput_data: data after ordrel, %s", 19762 tcp_display(tcp, NULL, 19763 DISP_ADDR_AND_PORT)); 19764 #else 19765 if (tcp->tcp_debug) { 19766 (void) strlog(TCP_MOD_ID, 0, 1, 19767 SL_TRACE|SL_ERROR, 19768 "tcp_wput_data: data after ordrel, %s\n", 19769 tcp_display(tcp, NULL, 19770 DISP_ADDR_AND_PORT)); 19771 } 19772 #endif /* DEBUG */ 19773 } 19774 if (tcp->tcp_snd_zcopy_aware && 19775 (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0) 19776 tcp_zcopy_notify(tcp); 19777 freemsg(mp); 19778 mutex_enter(&tcp->tcp_non_sq_lock); 19779 if (tcp->tcp_flow_stopped && 19780 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 19781 tcp_clrqfull(tcp); 19782 } 19783 mutex_exit(&tcp->tcp_non_sq_lock); 19784 return; 19785 } 19786 19787 /* Strip empties */ 19788 for (;;) { 19789 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 19790 (uintptr_t)INT_MAX); 19791 len = (int)(mp->b_wptr - mp->b_rptr); 19792 if (len > 0) 19793 break; 19794 mp1 = mp; 19795 mp = mp->b_cont; 19796 freeb(mp1); 19797 if (!mp) { 19798 return; 19799 } 19800 } 19801 19802 /* If we are the first on the list ... */ 19803 if (tcp->tcp_xmit_head == NULL) { 19804 tcp->tcp_xmit_head = mp; 19805 tcp->tcp_xmit_tail = mp; 19806 tcp->tcp_xmit_tail_unsent = len; 19807 } else { 19808 /* If tiny tx and room in txq tail, pullup to save mblks. 
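 * The copy is only done when the write is smaller than tcp_tx_pull_len,
 * the tail dblk is unshared (db_ref == 1) and there is room between
 * b_wptr and db_lim; otherwise the new mblk is simply linked onto the
 * transmit list.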
*/ 19809 struct datab *dp; 19810 19811 mp1 = tcp->tcp_xmit_last; 19812 if (len < tcp_tx_pull_len && 19813 (dp = mp1->b_datap)->db_ref == 1 && 19814 dp->db_lim - mp1->b_wptr >= len) { 19815 ASSERT(len > 0); 19816 ASSERT(!mp1->b_cont); 19817 if (len == 1) { 19818 *mp1->b_wptr++ = *mp->b_rptr; 19819 } else { 19820 bcopy(mp->b_rptr, mp1->b_wptr, len); 19821 mp1->b_wptr += len; 19822 } 19823 if (mp1 == tcp->tcp_xmit_tail) 19824 tcp->tcp_xmit_tail_unsent += len; 19825 mp1->b_cont = mp->b_cont; 19826 if (tcp->tcp_snd_zcopy_aware && 19827 (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 19828 mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 19829 freeb(mp); 19830 mp = mp1; 19831 } else { 19832 tcp->tcp_xmit_last->b_cont = mp; 19833 } 19834 len += tcp->tcp_unsent; 19835 } 19836 19837 /* Tack on however many more positive length mblks we have */ 19838 if ((mp1 = mp->b_cont) != NULL) { 19839 do { 19840 int tlen; 19841 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 19842 (uintptr_t)INT_MAX); 19843 tlen = (int)(mp1->b_wptr - mp1->b_rptr); 19844 if (tlen <= 0) { 19845 mp->b_cont = mp1->b_cont; 19846 freeb(mp1); 19847 } else { 19848 len += tlen; 19849 mp = mp1; 19850 } 19851 } while ((mp1 = mp->b_cont) != NULL); 19852 } 19853 tcp->tcp_xmit_last = mp; 19854 tcp->tcp_unsent = len; 19855 19856 if (urgent) 19857 usable = 1; 19858 19859 data_null: 19860 snxt = tcp->tcp_snxt; 19861 xmit_tail = tcp->tcp_xmit_tail; 19862 tail_unsent = tcp->tcp_xmit_tail_unsent; 19863 19864 /* 19865 * Note that tcp_mss has been adjusted to take into account the 19866 * timestamp option if applicable. Because SACK options do not 19867 * appear in every TCP segments and they are of variable lengths, 19868 * they cannot be included in tcp_mss. Thus we need to calculate 19869 * the actual segment length when we need to send a segment which 19870 * includes SACK options. 19871 */ 19872 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 19873 int32_t opt_len; 19874 19875 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 19876 tcp->tcp_num_sack_blk); 19877 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 19878 2 + TCPOPT_HEADER_LEN; 19879 mss = tcp->tcp_mss - opt_len; 19880 tcp_hdr_len = tcp->tcp_hdr_len + opt_len; 19881 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + opt_len; 19882 } else { 19883 mss = tcp->tcp_mss; 19884 tcp_hdr_len = tcp->tcp_hdr_len; 19885 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; 19886 } 19887 19888 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 19889 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 19890 SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); 19891 } 19892 if (tcpstate == TCPS_SYN_RCVD) { 19893 /* 19894 * The three-way connection establishment handshake is not 19895 * complete yet. We want to queue the data for transmission 19896 * after entering ESTABLISHED state (RFC793). A jump to 19897 * "done" label effectively leaves data on the queue. 19898 */ 19899 goto done; 19900 } else { 19901 int usable_r; 19902 19903 /* 19904 * In the special case when cwnd is zero, which can only 19905 * happen if the connection is ECN capable, return now. 19906 * New segments is sent using tcp_timer(). The timer 19907 * is set in tcp_rput_data(). 19908 */ 19909 if (tcp->tcp_cwnd == 0) { 19910 /* 19911 * Note that tcp_cwnd is 0 before 3-way handshake is 19912 * finished. 19913 */ 19914 ASSERT(tcp->tcp_ecn_ok || 19915 tcp->tcp_state < TCPS_ESTABLISHED); 19916 return; 19917 } 19918 19919 /* NOTE: trouble if xmitting while SYN not acked? 
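 * usable_r starts as the bytes currently in flight (snxt - suna) and is
 * then subtracted from the send window.  Illustrative example (not from
 * the original source): swnd = 64000 with 24000 bytes unacknowledged
 * leaves usable_r = 40000, before it is further clipped by cwnd and by
 * the amount of unsent data below.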
*/ 19920 usable_r = snxt - tcp->tcp_suna; 19921 usable_r = tcp->tcp_swnd - usable_r; 19922 19923 /* 19924 * Check if the receiver has shrunk the window. If 19925 * tcp_wput_data() with NULL mp is called, tcp_fin_sent 19926 * cannot be set as there is unsent data, so FIN cannot 19927 * be sent out. Otherwise, we need to take into account 19928 * of FIN as it consumes an "invisible" sequence number. 19929 */ 19930 ASSERT(tcp->tcp_fin_sent == 0); 19931 if (usable_r < 0) { 19932 /* 19933 * The receiver has shrunk the window and we have sent 19934 * -usable_r date beyond the window, re-adjust. 19935 * 19936 * If TCP window scaling is enabled, there can be 19937 * round down error as the advertised receive window 19938 * is actually right shifted n bits. This means that 19939 * the lower n bits info is wiped out. It will look 19940 * like the window is shrunk. Do a check here to 19941 * see if the shrunk amount is actually within the 19942 * error in window calculation. If it is, just 19943 * return. Note that this check is inside the 19944 * shrunk window check. This makes sure that even 19945 * though tcp_process_shrunk_swnd() is not called, 19946 * we will stop further processing. 19947 */ 19948 if ((-usable_r >> tcp->tcp_snd_ws) > 0) { 19949 tcp_process_shrunk_swnd(tcp, -usable_r); 19950 } 19951 return; 19952 } 19953 19954 /* usable = MIN(swnd, cwnd) - unacked_bytes */ 19955 if (tcp->tcp_swnd > tcp->tcp_cwnd) 19956 usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd; 19957 19958 /* usable = MIN(usable, unsent) */ 19959 if (usable_r > len) 19960 usable_r = len; 19961 19962 /* usable = MAX(usable, {1 for urgent, 0 for data}) */ 19963 if (usable_r > 0) { 19964 usable = usable_r; 19965 } else { 19966 /* Bypass all other unnecessary processing. */ 19967 goto done; 19968 } 19969 } 19970 19971 local_time = (mblk_t *)lbolt; 19972 19973 /* 19974 * "Our" Nagle Algorithm. This is not the same as in the old 19975 * BSD. This is more in line with the true intent of Nagle. 19976 * 19977 * The conditions are: 19978 * 1. The amount of unsent data (or amount of data which can be 19979 * sent, whichever is smaller) is less than Nagle limit. 19980 * 2. The last sent size is also less than Nagle limit. 19981 * 3. There is unack'ed data. 19982 * 4. Urgent pointer is not set. Send urgent data ignoring the 19983 * Nagle algorithm. This reduces the probability that urgent 19984 * bytes get "merged" together. 19985 * 5. The app has not closed the connection. This eliminates the 19986 * wait time of the receiving side waiting for the last piece of 19987 * (small) data. 19988 * 19989 * If all are satisified, exit without sending anything. Note 19990 * that Nagle limit can be smaller than 1 MSS. Nagle limit is 19991 * the smaller of 1 MSS and global tcp_naglim_def (default to be 19992 * 4095). 19993 */ 19994 if (usable < (int)tcp->tcp_naglim && 19995 tcp->tcp_naglim > tcp->tcp_last_sent_len && 19996 snxt != tcp->tcp_suna && 19997 !(tcp->tcp_valid_bits & TCP_URG_VALID) && 19998 !(tcp->tcp_valid_bits & TCP_FSS_VALID)) { 19999 goto done; 20000 } 20001 20002 if (tcp->tcp_cork) { 20003 /* 20004 * if the tcp->tcp_cork option is set, then we have to force 20005 * TCP not to send partial segment (smaller than MSS bytes). 20006 * We are calculating the usable now based on full mss and 20007 * will save the rest of remaining data for later. 20008 */ 20009 if (usable < mss) 20010 goto done; 20011 usable = (usable / mss) * mss; 20012 } 20013 20014 /* Update the latest receive window size in TCP header. 
*/ 20015 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 20016 tcp->tcp_tcph->th_win); 20017 20018 /* 20019 * Determine if it's worthwhile to attempt LSO or MDT, based on: 20020 * 20021 * 1. Simple TCP/IP{v4,v6} (no options). 20022 * 2. IPSEC/IPQoS processing is not needed for the TCP connection. 20023 * 3. If the TCP connection is in ESTABLISHED state. 20024 * 4. The TCP is not detached. 20025 * 20026 * If any of the above conditions have changed during the 20027 * connection, stop using LSO/MDT and restore the stream head 20028 * parameters accordingly. 20029 */ 20030 ipst = tcps->tcps_netstack->netstack_ip; 20031 20032 if ((tcp->tcp_lso || tcp->tcp_mdt) && 20033 ((tcp->tcp_ipversion == IPV4_VERSION && 20034 tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || 20035 (tcp->tcp_ipversion == IPV6_VERSION && 20036 tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) || 20037 tcp->tcp_state != TCPS_ESTABLISHED || 20038 TCP_IS_DETACHED(tcp) || !CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp) || 20039 CONN_IPSEC_OUT_ENCAPSULATED(tcp->tcp_connp) || 20040 IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { 20041 if (tcp->tcp_lso) { 20042 tcp->tcp_connp->conn_lso_ok = B_FALSE; 20043 tcp->tcp_lso = B_FALSE; 20044 } else { 20045 tcp->tcp_connp->conn_mdt_ok = B_FALSE; 20046 tcp->tcp_mdt = B_FALSE; 20047 } 20048 20049 /* Anything other than detached is considered pathological */ 20050 if (!TCP_IS_DETACHED(tcp)) { 20051 if (tcp->tcp_lso) 20052 TCP_STAT(tcps, tcp_lso_disabled); 20053 else 20054 TCP_STAT(tcps, tcp_mdt_conn_halted1); 20055 (void) tcp_maxpsz_set(tcp, B_TRUE); 20056 } 20057 } 20058 20059 /* Use MDT if sendable amount is greater than the threshold */ 20060 if (tcp->tcp_mdt && 20061 (mdt_thres = mss << tcp_mdt_smss_threshold, usable > mdt_thres) && 20062 (tail_unsent > mdt_thres || (xmit_tail->b_cont != NULL && 20063 MBLKL(xmit_tail->b_cont) > mdt_thres)) && 20064 (tcp->tcp_valid_bits == 0 || 20065 tcp->tcp_valid_bits == TCP_FSS_VALID)) { 20066 ASSERT(tcp->tcp_connp->conn_mdt_ok); 20067 rc = tcp_multisend(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, 20068 num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, 20069 local_time, mdt_thres); 20070 } else { 20071 rc = tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, 20072 num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, 20073 local_time, INT_MAX); 20074 } 20075 20076 /* Pretend that all we were trying to send really got sent */ 20077 if (rc < 0 && tail_unsent < 0) { 20078 do { 20079 xmit_tail = xmit_tail->b_cont; 20080 xmit_tail->b_prev = local_time; 20081 ASSERT((uintptr_t)(xmit_tail->b_wptr - 20082 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 20083 tail_unsent += (int)(xmit_tail->b_wptr - 20084 xmit_tail->b_rptr); 20085 } while (tail_unsent < 0); 20086 } 20087 done:; 20088 tcp->tcp_xmit_tail = xmit_tail; 20089 tcp->tcp_xmit_tail_unsent = tail_unsent; 20090 len = tcp->tcp_snxt - snxt; 20091 if (len) { 20092 /* 20093 * If new data was sent, need to update the notsack 20094 * list, which is, afterall, data blocks that have 20095 * not been sack'ed by the receiver. New data is 20096 * not sack'ed. 20097 */ 20098 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 20099 /* len is a negative value. 
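 *
 * For illustration (hypothetical numbers): if tcp_snxt was 5000
 * before this call and the local snxt advanced to 7000, then
 * len = 5000 - 7000 = -2000, so tcp_pipe -= len below grows the
 * pipe estimate by 2000, and tcp_unsent += len later in this
 * function shrinks the unsent count by the 2000 bytes just sent.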
*/ 20100 tcp->tcp_pipe -= len; 20101 tcp_notsack_update(&(tcp->tcp_notsack_list), 20102 tcp->tcp_snxt, snxt, 20103 &(tcp->tcp_num_notsack_blk), 20104 &(tcp->tcp_cnt_notsack_list)); 20105 } 20106 tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; 20107 tcp->tcp_rack = tcp->tcp_rnxt; 20108 tcp->tcp_rack_cnt = 0; 20109 if ((snxt + len) == tcp->tcp_suna) { 20110 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 20111 } 20112 } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { 20113 /* 20114 * Didn't send anything. Make sure the timer is running 20115 * so that we will probe a zero window. 20116 */ 20117 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 20118 } 20119 /* Note that len is the amount we just sent but with a negative sign */ 20120 tcp->tcp_unsent += len; 20121 mutex_enter(&tcp->tcp_non_sq_lock); 20122 if (tcp->tcp_flow_stopped) { 20123 if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 20124 tcp_clrqfull(tcp); 20125 } 20126 } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) { 20127 tcp_setqfull(tcp); 20128 } 20129 mutex_exit(&tcp->tcp_non_sq_lock); 20130 } 20131 20132 /* 20133 * tcp_fill_header is called by tcp_send() and tcp_multisend() to fill the 20134 * outgoing TCP header with the template header, as well as other 20135 * options such as time-stamp, ECN and/or SACK. 20136 */ 20137 static void 20138 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) 20139 { 20140 tcph_t *tcp_tmpl, *tcp_h; 20141 uint32_t *dst, *src; 20142 int hdrlen; 20143 20144 ASSERT(OK_32PTR(rptr)); 20145 20146 /* Template header */ 20147 tcp_tmpl = tcp->tcp_tcph; 20148 20149 /* Header of outgoing packet */ 20150 tcp_h = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 20151 20152 /* dst and src are opaque 32-bit fields, used for copying */ 20153 dst = (uint32_t *)rptr; 20154 src = (uint32_t *)tcp->tcp_iphc; 20155 hdrlen = tcp->tcp_hdr_len; 20156 20157 /* Fill time-stamp option if needed */ 20158 if (tcp->tcp_snd_ts_ok) { 20159 U32_TO_BE32((uint32_t)now, 20160 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); 20161 U32_TO_BE32(tcp->tcp_ts_recent, 20162 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); 20163 } else { 20164 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 20165 } 20166 20167 /* 20168 * Copy the template header; is this really more efficient than 20169 * calling bcopy()? For simple IPv4/TCP, it may be the case, 20170 * but perhaps not for other scenarios. 20171 */ 20172 dst[0] = src[0]; 20173 dst[1] = src[1]; 20174 dst[2] = src[2]; 20175 dst[3] = src[3]; 20176 dst[4] = src[4]; 20177 dst[5] = src[5]; 20178 dst[6] = src[6]; 20179 dst[7] = src[7]; 20180 dst[8] = src[8]; 20181 dst[9] = src[9]; 20182 if (hdrlen -= 40) { 20183 hdrlen >>= 2; 20184 dst += 10; 20185 src += 10; 20186 do { 20187 *dst++ = *src++; 20188 } while (--hdrlen); 20189 } 20190 20191 /* 20192 * Set the ECN info in the TCP header if it is not a zero 20193 * window probe. Zero window probe is only sent in 20194 * tcp_wput_data() and tcp_timer(). 
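 *
 * Roughly, following RFC 3168 and the code below: ECT marks the IP
 * header of data segments as ECN-capable, ECE keeps echoing a
 * received congestion indication back to the peer while
 * tcp_ecn_echo_on is set, and CWR is sent once per window reduction
 * (tcp_ecn_cwr_sent keeps it from being repeated). Zero window
 * probes are intentionally left unmarked.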
20195 */ 20196 if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 20197 SET_ECT(tcp, rptr); 20198 20199 if (tcp->tcp_ecn_echo_on) 20200 tcp_h->th_flags[0] |= TH_ECE; 20201 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 20202 tcp_h->th_flags[0] |= TH_CWR; 20203 tcp->tcp_ecn_cwr_sent = B_TRUE; 20204 } 20205 } 20206 20207 /* Fill in SACK options */ 20208 if (num_sack_blk > 0) { 20209 uchar_t *wptr = rptr + tcp->tcp_hdr_len; 20210 sack_blk_t *tmp; 20211 int32_t i; 20212 20213 wptr[0] = TCPOPT_NOP; 20214 wptr[1] = TCPOPT_NOP; 20215 wptr[2] = TCPOPT_SACK; 20216 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 20217 sizeof (sack_blk_t); 20218 wptr += TCPOPT_REAL_SACK_LEN; 20219 20220 tmp = tcp->tcp_sack_list; 20221 for (i = 0; i < num_sack_blk; i++) { 20222 U32_TO_BE32(tmp[i].begin, wptr); 20223 wptr += sizeof (tcp_seq); 20224 U32_TO_BE32(tmp[i].end, wptr); 20225 wptr += sizeof (tcp_seq); 20226 } 20227 tcp_h->th_offset_and_rsrvd[0] += 20228 ((num_sack_blk * 2 + 1) << 4); 20229 } 20230 } 20231 20232 /* 20233 * tcp_mdt_add_attrs() is called by tcp_multisend() in order to attach 20234 * the destination address and SAP attribute, and if necessary, the 20235 * hardware checksum offload attribute to a Multidata message. 20236 */ 20237 static int 20238 tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, 20239 const uint32_t start, const uint32_t stuff, const uint32_t end, 20240 const uint32_t flags, tcp_stack_t *tcps) 20241 { 20242 /* Add global destination address & SAP attribute */ 20243 if (dlmp == NULL || !ip_md_addr_attr(mmd, NULL, dlmp)) { 20244 ip1dbg(("tcp_mdt_add_attrs: can't add global physical " 20245 "destination address+SAP\n")); 20246 20247 if (dlmp != NULL) 20248 TCP_STAT(tcps, tcp_mdt_allocfail); 20249 return (-1); 20250 } 20251 20252 /* Add global hwcksum attribute */ 20253 if (hwcksum && 20254 !ip_md_hcksum_attr(mmd, NULL, start, stuff, end, flags)) { 20255 ip1dbg(("tcp_mdt_add_attrs: can't add global hardware " 20256 "checksum attribute\n")); 20257 20258 TCP_STAT(tcps, tcp_mdt_allocfail); 20259 return (-1); 20260 } 20261 20262 return (0); 20263 } 20264 20265 /* 20266 * Smaller and private version of pdescinfo_t used specifically for TCP, 20267 * which allows for only two payload spans per packet. 20268 */ 20269 typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t; 20270 20271 /* 20272 * tcp_multisend() is called by tcp_wput_data() for Multidata Transmit 20273 * scheme, and returns one the following: 20274 * 20275 * -1 = failed allocation. 20276 * 0 = success; burst count reached, or usable send window is too small, 20277 * and that we'd rather wait until later before sending again. 
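 *
 * (By contrast, tcp_send() below can additionally return 1, telling
 * a tcp_multisend() caller to switch back to Multidata once a
 * large-size transmission is possible again; tcp_multisend() itself
 * only returns -1 or 0.)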
20278 */ 20279 static int 20280 tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, 20281 const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, 20282 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 20283 const int mdt_thres) 20284 { 20285 mblk_t *md_mp_head, *md_mp, *md_pbuf, *md_pbuf_nxt, *md_hbuf; 20286 multidata_t *mmd; 20287 uint_t obsegs, obbytes, hdr_frag_sz; 20288 uint_t cur_hdr_off, cur_pld_off, base_pld_off, first_snxt; 20289 int num_burst_seg, max_pld; 20290 pdesc_t *pkt; 20291 tcp_pdescinfo_t tcp_pkt_info; 20292 pdescinfo_t *pkt_info; 20293 int pbuf_idx, pbuf_idx_nxt; 20294 int seg_len, len, spill, af; 20295 boolean_t add_buffer, zcopy, clusterwide; 20296 boolean_t buf_trunked = B_FALSE; 20297 boolean_t rconfirm = B_FALSE; 20298 boolean_t done = B_FALSE; 20299 uint32_t cksum; 20300 uint32_t hwcksum_flags; 20301 ire_t *ire = NULL; 20302 ill_t *ill; 20303 ipha_t *ipha; 20304 ip6_t *ip6h; 20305 ipaddr_t src, dst; 20306 ill_zerocopy_capab_t *zc_cap = NULL; 20307 uint16_t *up; 20308 int err; 20309 conn_t *connp; 20310 mblk_t *mp, *mp1, *fw_mp_head = NULL; 20311 uchar_t *pld_start; 20312 tcp_stack_t *tcps = tcp->tcp_tcps; 20313 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 20314 20315 #ifdef _BIG_ENDIAN 20316 #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) 20317 #else 20318 #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7) 20319 #endif 20320 20321 #define PREP_NEW_MULTIDATA() { \ 20322 mmd = NULL; \ 20323 md_mp = md_hbuf = NULL; \ 20324 cur_hdr_off = 0; \ 20325 max_pld = tcp->tcp_mdt_max_pld; \ 20326 pbuf_idx = pbuf_idx_nxt = -1; \ 20327 add_buffer = B_TRUE; \ 20328 zcopy = B_FALSE; \ 20329 } 20330 20331 #define PREP_NEW_PBUF() { \ 20332 md_pbuf = md_pbuf_nxt = NULL; \ 20333 pbuf_idx = pbuf_idx_nxt = -1; \ 20334 cur_pld_off = 0; \ 20335 first_snxt = *snxt; \ 20336 ASSERT(*tail_unsent > 0); \ 20337 base_pld_off = MBLKL(*xmit_tail) - *tail_unsent; \ 20338 } 20339 20340 ASSERT(mdt_thres >= mss); 20341 ASSERT(*usable > 0 && *usable > mdt_thres); 20342 ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); 20343 ASSERT(!TCP_IS_DETACHED(tcp)); 20344 ASSERT(tcp->tcp_valid_bits == 0 || 20345 tcp->tcp_valid_bits == TCP_FSS_VALID); 20346 ASSERT((tcp->tcp_ipversion == IPV4_VERSION && 20347 tcp->tcp_ip_hdr_len == IP_SIMPLE_HDR_LENGTH) || 20348 (tcp->tcp_ipversion == IPV6_VERSION && 20349 tcp->tcp_ip_hdr_len == IPV6_HDR_LEN)); 20350 20351 connp = tcp->tcp_connp; 20352 ASSERT(connp != NULL); 20353 ASSERT(CONN_IS_LSO_MD_FASTPATH(connp)); 20354 ASSERT(!CONN_IPSEC_OUT_ENCAPSULATED(connp)); 20355 20356 /* 20357 * Note that tcp will only declare at most 2 payload spans per 20358 * packet, which is much lower than the maximum allowable number 20359 * of packet spans per Multidata. For this reason, we use the 20360 * privately declared and smaller descriptor info structure, in 20361 * order to save some stack space. 20362 */ 20363 pkt_info = (pdescinfo_t *)&tcp_pkt_info; 20364 20365 af = (tcp->tcp_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6; 20366 if (af == AF_INET) { 20367 dst = tcp->tcp_ipha->ipha_dst; 20368 src = tcp->tcp_ipha->ipha_src; 20369 ASSERT(!CLASSD(dst)); 20370 } 20371 ASSERT(af == AF_INET || 20372 !IN6_IS_ADDR_MULTICAST(&tcp->tcp_ip6h->ip6_dst)); 20373 20374 obsegs = obbytes = 0; 20375 num_burst_seg = tcp->tcp_snd_burst; 20376 md_mp_head = NULL; 20377 PREP_NEW_MULTIDATA(); 20378 20379 /* 20380 * Before we go on further, make sure there is an IRE that we can 20381 * use, and that the ILL supports MDT. 
Otherwise, there's no point 20382 * in proceeding any further, and we should just hand everything 20383 * off to the legacy path. 20384 */ 20385 if (!tcp_send_find_ire(tcp, (af == AF_INET) ? &dst : NULL, &ire)) 20386 goto legacy_send_no_md; 20387 20388 ASSERT(ire != NULL); 20389 ASSERT(af != AF_INET || ire->ire_ipversion == IPV4_VERSION); 20390 ASSERT(af == AF_INET || !IN6_IS_ADDR_V4MAPPED(&(ire->ire_addr_v6))); 20391 ASSERT(af == AF_INET || ire->ire_nce != NULL); 20392 ASSERT(!(ire->ire_type & IRE_BROADCAST)); 20393 /* 20394 * If we do support loopback for MDT (which requires modifications 20395 * to the receiving paths), the following assertions should go away, 20396 * and we would be sending the Multidata to loopback conn later on. 20397 */ 20398 ASSERT(!IRE_IS_LOCAL(ire)); 20399 ASSERT(ire->ire_stq != NULL); 20400 20401 ill = ire_to_ill(ire); 20402 ASSERT(ill != NULL); 20403 ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL); 20404 20405 if (!tcp->tcp_ire_ill_check_done) { 20406 tcp_ire_ill_check(tcp, ire, ill, B_TRUE); 20407 tcp->tcp_ire_ill_check_done = B_TRUE; 20408 } 20409 20410 /* 20411 * If the underlying interface conditions have changed, or if the 20412 * new interface does not support MDT, go back to legacy path. 20413 */ 20414 if (!ILL_MDT_USABLE(ill) || (ire->ire_flags & RTF_MULTIRT) != 0) { 20415 /* don't go through this path anymore for this connection */ 20416 TCP_STAT(tcps, tcp_mdt_conn_halted2); 20417 tcp->tcp_mdt = B_FALSE; 20418 ip1dbg(("tcp_multisend: disabling MDT for connp %p on " 20419 "interface %s\n", (void *)connp, ill->ill_name)); 20420 /* IRE will be released prior to returning */ 20421 goto legacy_send_no_md; 20422 } 20423 20424 if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) 20425 zc_cap = ill->ill_zerocopy_capab; 20426 20427 /* 20428 * Check if we can take tcp fast-path. Note that "incomplete" 20429 * ire's (where the link-layer for next hop is not resolved 20430 * or where the fast-path header in nce_fp_mp is not available 20431 * yet) are sent down the legacy (slow) path. 20432 * NOTE: We should fix ip_xmit_v4 to handle M_MULTIDATA 20433 */ 20434 if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { 20435 /* IRE will be released prior to returning */ 20436 goto legacy_send_no_md; 20437 } 20438 20439 /* go to legacy path if interface doesn't support zerocopy */ 20440 if (tcp->tcp_snd_zcopy_aware && do_tcpzcopy != 2 && 20441 (zc_cap == NULL || zc_cap->ill_zerocopy_flags == 0)) { 20442 /* IRE will be released prior to returning */ 20443 goto legacy_send_no_md; 20444 } 20445 20446 /* does the interface support hardware checksum offload? */ 20447 hwcksum_flags = 0; 20448 if (ILL_HCKSUM_CAPABLE(ill) && 20449 (ill->ill_hcksum_capab->ill_hcksum_txflags & 20450 (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL | 20451 HCKSUM_IPHDRCKSUM)) && dohwcksum) { 20452 if (ill->ill_hcksum_capab->ill_hcksum_txflags & 20453 HCKSUM_IPHDRCKSUM) 20454 hwcksum_flags = HCK_IPV4_HDRCKSUM; 20455 20456 if (ill->ill_hcksum_capab->ill_hcksum_txflags & 20457 (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) 20458 hwcksum_flags |= HCK_FULLCKSUM; 20459 else if (ill->ill_hcksum_capab->ill_hcksum_txflags & 20460 HCKSUM_INET_PARTIAL) 20461 hwcksum_flags |= HCK_PARTIALCKSUM; 20462 } 20463 20464 /* 20465 * Each header fragment consists of the leading extra space, 20466 * followed by the TCP/IP header, and the trailing extra space. 
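 *
 * For illustration (hypothetical per-ill values): with
 * tcp_mdt_hdr_head = 14, an IPv4+TCP header of 52 bytes (timestamp
 * option included in tcp_hdr_len) and tcp_mdt_hdr_tail = 0, the
 * computation below yields hdr_frag_sz = roundup(14 + 52 + 0, 4)
 * = 68 bytes per packet.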
20467 * We make sure that each header fragment begins on a 32-bit 20468 * aligned memory address (tcp_mdt_hdr_head is already 32-bit 20469 * aligned in tcp_mdt_update). 20470 */ 20471 hdr_frag_sz = roundup((tcp->tcp_mdt_hdr_head + tcp_hdr_len + 20472 tcp->tcp_mdt_hdr_tail), 4); 20473 20474 /* are we starting from the beginning of data block? */ 20475 if (*tail_unsent == 0) { 20476 *xmit_tail = (*xmit_tail)->b_cont; 20477 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= (uintptr_t)INT_MAX); 20478 *tail_unsent = (int)MBLKL(*xmit_tail); 20479 } 20480 20481 /* 20482 * Here we create one or more Multidata messages, each made up of 20483 * one header buffer and up to N payload buffers. This entire 20484 * operation is done within two loops: 20485 * 20486 * The outer loop mostly deals with creating the Multidata message, 20487 * as well as the header buffer that gets added to it. It also 20488 * links the Multidata messages together such that all of them can 20489 * be sent down to the lower layer in a single putnext call; this 20490 * linking behavior depends on the tcp_mdt_chain tunable. 20491 * 20492 * The inner loop takes an existing Multidata message, and adds 20493 * one or more (up to tcp_mdt_max_pld) payload buffers to it. It 20494 * packetizes those buffers by filling up the corresponding header 20495 * buffer fragments with the proper IP and TCP headers, and by 20496 * describing the layout of each packet in the packet descriptors 20497 * that get added to the Multidata. 20498 */ 20499 do { 20500 /* 20501 * If usable send window is too small, or data blocks in 20502 * transmit list are smaller than our threshold (i.e. app 20503 * performs large writes followed by small ones), we hand 20504 * off the control over to the legacy path. Note that we'll 20505 * get back the control once it encounters a large block. 20506 */ 20507 if (*usable < mss || (*tail_unsent <= mdt_thres && 20508 (*xmit_tail)->b_cont != NULL && 20509 MBLKL((*xmit_tail)->b_cont) <= mdt_thres)) { 20510 /* send down what we've got so far */ 20511 if (md_mp_head != NULL) { 20512 tcp_multisend_data(tcp, ire, ill, md_mp_head, 20513 obsegs, obbytes, &rconfirm); 20514 } 20515 /* 20516 * Pass control over to tcp_send(), but tell it to 20517 * return to us once a large-size transmission is 20518 * possible. 20519 */ 20520 TCP_STAT(tcps, tcp_mdt_legacy_small); 20521 if ((err = tcp_send(q, tcp, mss, tcp_hdr_len, 20522 tcp_tcp_hdr_len, num_sack_blk, usable, snxt, 20523 tail_unsent, xmit_tail, local_time, 20524 mdt_thres)) <= 0) { 20525 /* burst count reached, or alloc failed */ 20526 IRE_REFRELE(ire); 20527 return (err); 20528 } 20529 20530 /* tcp_send() may have sent everything, so check */ 20531 if (*usable <= 0) { 20532 IRE_REFRELE(ire); 20533 return (0); 20534 } 20535 20536 TCP_STAT(tcps, tcp_mdt_legacy_ret); 20537 /* 20538 * We may have delivered the Multidata, so make sure 20539 * to re-initialize before the next round. 20540 */ 20541 md_mp_head = NULL; 20542 obsegs = obbytes = 0; 20543 num_burst_seg = tcp->tcp_snd_burst; 20544 PREP_NEW_MULTIDATA(); 20545 20546 /* are we starting from the beginning of data block? */ 20547 if (*tail_unsent == 0) { 20548 *xmit_tail = (*xmit_tail)->b_cont; 20549 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 20550 (uintptr_t)INT_MAX); 20551 *tail_unsent = (int)MBLKL(*xmit_tail); 20552 } 20553 } 20554 20555 /* 20556 * max_pld limits the number of mblks in tcp's transmit 20557 * queue that can be added to a Multidata message. 
Once 20558 * this counter reaches zero, no more additional mblks 20559 * can be added to it. What happens afterwards depends 20560 * on whether or not we are set to chain the Multidata 20561 * messages. If we are to link them together, reset 20562 * max_pld to its original value (tcp_mdt_max_pld) and 20563 * prepare to create a new Multidata message which will 20564 * get linked to md_mp_head. Else, leave it alone and 20565 * let the inner loop break on its own. 20566 */ 20567 if (tcp_mdt_chain && max_pld == 0) 20568 PREP_NEW_MULTIDATA(); 20569 20570 /* adding a payload buffer; re-initialize values */ 20571 if (add_buffer) 20572 PREP_NEW_PBUF(); 20573 20574 /* 20575 * If we don't have a Multidata, either because we just 20576 * (re)entered this outer loop, or after we branched off 20577 * to tcp_send above, setup the Multidata and header 20578 * buffer to be used. 20579 */ 20580 if (md_mp == NULL) { 20581 int md_hbuflen; 20582 uint32_t start, stuff; 20583 20584 /* 20585 * Calculate Multidata header buffer size large enough 20586 * to hold all of the headers that can possibly be 20587 * sent at this moment. We'd rather over-estimate 20588 * the size than running out of space; this is okay 20589 * since this buffer is small anyway. 20590 */ 20591 md_hbuflen = (howmany(*usable, mss) + 1) * hdr_frag_sz; 20592 20593 /* 20594 * Start and stuff offset for partial hardware 20595 * checksum offload; these are currently for IPv4. 20596 * For full checksum offload, they are set to zero. 20597 */ 20598 if ((hwcksum_flags & HCK_PARTIALCKSUM)) { 20599 if (af == AF_INET) { 20600 start = IP_SIMPLE_HDR_LENGTH; 20601 stuff = IP_SIMPLE_HDR_LENGTH + 20602 TCP_CHECKSUM_OFFSET; 20603 } else { 20604 start = IPV6_HDR_LEN; 20605 stuff = IPV6_HDR_LEN + 20606 TCP_CHECKSUM_OFFSET; 20607 } 20608 } else { 20609 start = stuff = 0; 20610 } 20611 20612 /* 20613 * Create the header buffer, Multidata, as well as 20614 * any necessary attributes (destination address, 20615 * SAP and hardware checksum offload) that should 20616 * be associated with the Multidata message. 20617 */ 20618 ASSERT(cur_hdr_off == 0); 20619 if ((md_hbuf = allocb(md_hbuflen, BPRI_HI)) == NULL || 20620 ((md_hbuf->b_wptr += md_hbuflen), 20621 (mmd = mmd_alloc(md_hbuf, &md_mp, 20622 KM_NOSLEEP)) == NULL) || (tcp_mdt_add_attrs(mmd, 20623 /* fastpath mblk */ 20624 ire->ire_nce->nce_res_mp, 20625 /* hardware checksum enabled */ 20626 (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)), 20627 /* hardware checksum offsets */ 20628 start, stuff, 0, 20629 /* hardware checksum flag */ 20630 hwcksum_flags, tcps) != 0)) { 20631 legacy_send: 20632 if (md_mp != NULL) { 20633 /* Unlink message from the chain */ 20634 if (md_mp_head != NULL) { 20635 err = (intptr_t)rmvb(md_mp_head, 20636 md_mp); 20637 /* 20638 * We can't assert that rmvb 20639 * did not return -1, since we 20640 * may get here before linkb 20641 * happens. We do, however, 20642 * check if we just removed the 20643 * only element in the list. 
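 *
 * (rmvb(9F) is documented to return the remaining message, NULL
 * when the removed block was the only one, or (mblk_t *)-1 when
 * the block is not on the chain; hence the err == 0 test below
 * treats an emptied chain as having no head left.)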
20644 */ 20645 if (err == 0) 20646 md_mp_head = NULL; 20647 } 20648 /* md_hbuf gets freed automatically */ 20649 TCP_STAT(tcps, tcp_mdt_discarded); 20650 freeb(md_mp); 20651 } else { 20652 /* Either allocb or mmd_alloc failed */ 20653 TCP_STAT(tcps, tcp_mdt_allocfail); 20654 if (md_hbuf != NULL) 20655 freeb(md_hbuf); 20656 } 20657 20658 /* send down what we've got so far */ 20659 if (md_mp_head != NULL) { 20660 tcp_multisend_data(tcp, ire, ill, 20661 md_mp_head, obsegs, obbytes, 20662 &rconfirm); 20663 } 20664 legacy_send_no_md: 20665 if (ire != NULL) 20666 IRE_REFRELE(ire); 20667 /* 20668 * Too bad; let the legacy path handle this. 20669 * We specify INT_MAX for the threshold, since 20670 * we gave up with the Multidata processings 20671 * and let the old path have it all. 20672 */ 20673 TCP_STAT(tcps, tcp_mdt_legacy_all); 20674 return (tcp_send(q, tcp, mss, tcp_hdr_len, 20675 tcp_tcp_hdr_len, num_sack_blk, usable, 20676 snxt, tail_unsent, xmit_tail, local_time, 20677 INT_MAX)); 20678 } 20679 20680 /* link to any existing ones, if applicable */ 20681 TCP_STAT(tcps, tcp_mdt_allocd); 20682 if (md_mp_head == NULL) { 20683 md_mp_head = md_mp; 20684 } else if (tcp_mdt_chain) { 20685 TCP_STAT(tcps, tcp_mdt_linked); 20686 linkb(md_mp_head, md_mp); 20687 } 20688 } 20689 20690 ASSERT(md_mp_head != NULL); 20691 ASSERT(tcp_mdt_chain || md_mp_head->b_cont == NULL); 20692 ASSERT(md_mp != NULL && mmd != NULL); 20693 ASSERT(md_hbuf != NULL); 20694 20695 /* 20696 * Packetize the transmittable portion of the data block; 20697 * each data block is essentially added to the Multidata 20698 * as a payload buffer. We also deal with adding more 20699 * than one payload buffers, which happens when the remaining 20700 * packetized portion of the current payload buffer is less 20701 * than MSS, while the next data block in transmit queue 20702 * has enough data to make up for one. This "spillover" 20703 * case essentially creates a split-packet, where portions 20704 * of the packet's payload fragments may span across two 20705 * virtually discontiguous address blocks. 20706 */ 20707 seg_len = mss; 20708 do { 20709 len = seg_len; 20710 20711 ASSERT(len > 0); 20712 ASSERT(max_pld >= 0); 20713 ASSERT(!add_buffer || cur_pld_off == 0); 20714 20715 /* 20716 * First time around for this payload buffer; note 20717 * in the case of a spillover, the following has 20718 * been done prior to adding the split-packet 20719 * descriptor to Multidata, and we don't want to 20720 * repeat the process. 20721 */ 20722 if (add_buffer) { 20723 ASSERT(mmd != NULL); 20724 ASSERT(md_pbuf == NULL); 20725 ASSERT(md_pbuf_nxt == NULL); 20726 ASSERT(pbuf_idx == -1 && pbuf_idx_nxt == -1); 20727 20728 /* 20729 * Have we reached the limit? We'd get to 20730 * this case when we're not chaining the 20731 * Multidata messages together, and since 20732 * we're done, terminate this loop. 
20733 */ 20734 if (max_pld == 0) 20735 break; /* done */ 20736 20737 if ((md_pbuf = dupb(*xmit_tail)) == NULL) { 20738 TCP_STAT(tcps, tcp_mdt_allocfail); 20739 goto legacy_send; /* out_of_mem */ 20740 } 20741 20742 if (IS_VMLOANED_MBLK(md_pbuf) && !zcopy && 20743 zc_cap != NULL) { 20744 if (!ip_md_zcopy_attr(mmd, NULL, 20745 zc_cap->ill_zerocopy_flags)) { 20746 freeb(md_pbuf); 20747 TCP_STAT(tcps, 20748 tcp_mdt_allocfail); 20749 /* out_of_mem */ 20750 goto legacy_send; 20751 } 20752 zcopy = B_TRUE; 20753 } 20754 20755 md_pbuf->b_rptr += base_pld_off; 20756 20757 /* 20758 * Add a payload buffer to the Multidata; this 20759 * operation must not fail, or otherwise our 20760 * logic in this routine is broken. There 20761 * is no memory allocation done by the 20762 * routine, so any returned failure simply 20763 * tells us that we've done something wrong. 20764 * 20765 * A failure tells us that either we're adding 20766 * the same payload buffer more than once, or 20767 * we're trying to add more buffers than 20768 * allowed (max_pld calculation is wrong). 20769 * None of the above cases should happen, and 20770 * we panic because either there's horrible 20771 * heap corruption, and/or programming mistake. 20772 */ 20773 pbuf_idx = mmd_addpldbuf(mmd, md_pbuf); 20774 if (pbuf_idx < 0) { 20775 cmn_err(CE_PANIC, "tcp_multisend: " 20776 "payload buffer logic error " 20777 "detected for tcp %p mmd %p " 20778 "pbuf %p (%d)\n", 20779 (void *)tcp, (void *)mmd, 20780 (void *)md_pbuf, pbuf_idx); 20781 } 20782 20783 ASSERT(max_pld > 0); 20784 --max_pld; 20785 add_buffer = B_FALSE; 20786 } 20787 20788 ASSERT(md_mp_head != NULL); 20789 ASSERT(md_pbuf != NULL); 20790 ASSERT(md_pbuf_nxt == NULL); 20791 ASSERT(pbuf_idx != -1); 20792 ASSERT(pbuf_idx_nxt == -1); 20793 ASSERT(*usable > 0); 20794 20795 /* 20796 * We spillover to the next payload buffer only 20797 * if all of the following is true: 20798 * 20799 * 1. There is not enough data on the current 20800 * payload buffer to make up `len', 20801 * 2. We are allowed to send `len', 20802 * 3. The next payload buffer length is large 20803 * enough to accomodate `spill'. 20804 */ 20805 if ((spill = len - *tail_unsent) > 0 && 20806 *usable >= len && 20807 MBLKL((*xmit_tail)->b_cont) >= spill && 20808 max_pld > 0) { 20809 md_pbuf_nxt = dupb((*xmit_tail)->b_cont); 20810 if (md_pbuf_nxt == NULL) { 20811 TCP_STAT(tcps, tcp_mdt_allocfail); 20812 goto legacy_send; /* out_of_mem */ 20813 } 20814 20815 if (IS_VMLOANED_MBLK(md_pbuf_nxt) && !zcopy && 20816 zc_cap != NULL) { 20817 if (!ip_md_zcopy_attr(mmd, NULL, 20818 zc_cap->ill_zerocopy_flags)) { 20819 freeb(md_pbuf_nxt); 20820 TCP_STAT(tcps, 20821 tcp_mdt_allocfail); 20822 /* out_of_mem */ 20823 goto legacy_send; 20824 } 20825 zcopy = B_TRUE; 20826 } 20827 20828 /* 20829 * See comments above on the first call to 20830 * mmd_addpldbuf for explanation on the panic. 20831 */ 20832 pbuf_idx_nxt = mmd_addpldbuf(mmd, md_pbuf_nxt); 20833 if (pbuf_idx_nxt < 0) { 20834 panic("tcp_multisend: " 20835 "next payload buffer logic error " 20836 "detected for tcp %p mmd %p " 20837 "pbuf %p (%d)\n", 20838 (void *)tcp, (void *)mmd, 20839 (void *)md_pbuf_nxt, pbuf_idx_nxt); 20840 } 20841 20842 ASSERT(max_pld > 0); 20843 --max_pld; 20844 } else if (spill > 0) { 20845 /* 20846 * If there's a spillover, but the following 20847 * xmit_tail couldn't give us enough octets 20848 * to reach "len", then stop the current 20849 * Multidata creation and let the legacy 20850 * tcp_send() path take over. 
We don't want 20851 * to send the tiny segment as part of this 20852 * Multidata for performance reasons; instead, 20853 * we let the legacy path deal with grouping 20854 * it with the subsequent small mblks. 20855 */ 20856 if (*usable >= len && 20857 MBLKL((*xmit_tail)->b_cont) < spill) { 20858 max_pld = 0; 20859 break; /* done */ 20860 } 20861 20862 /* 20863 * We can't spillover, and we are near 20864 * the end of the current payload buffer, 20865 * so send what's left. 20866 */ 20867 ASSERT(*tail_unsent > 0); 20868 len = *tail_unsent; 20869 } 20870 20871 /* tail_unsent is negated if there is a spillover */ 20872 *tail_unsent -= len; 20873 *usable -= len; 20874 ASSERT(*usable >= 0); 20875 20876 if (*usable < mss) 20877 seg_len = *usable; 20878 /* 20879 * Sender SWS avoidance; see comments in tcp_send(); 20880 * everything else is the same, except that we only 20881 * do this here if there is no more data to be sent 20882 * following the current xmit_tail. We don't check 20883 * for 1-byte urgent data because we shouldn't get 20884 * here if TCP_URG_VALID is set. 20885 */ 20886 if (*usable > 0 && *usable < mss && 20887 ((md_pbuf_nxt == NULL && 20888 (*xmit_tail)->b_cont == NULL) || 20889 (md_pbuf_nxt != NULL && 20890 (*xmit_tail)->b_cont->b_cont == NULL)) && 20891 seg_len < (tcp->tcp_max_swnd >> 1) && 20892 (tcp->tcp_unsent - 20893 ((*snxt + len) - tcp->tcp_snxt)) > seg_len && 20894 !tcp->tcp_zero_win_probe) { 20895 if ((*snxt + len) == tcp->tcp_snxt && 20896 (*snxt + len) == tcp->tcp_suna) { 20897 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 20898 } 20899 done = B_TRUE; 20900 } 20901 20902 /* 20903 * Prime pump for IP's checksumming on our behalf; 20904 * include the adjustment for a source route if any. 20905 * Do this only for software/partial hardware checksum 20906 * offload, as this field gets zeroed out later for 20907 * the full hardware checksum offload case. 20908 */ 20909 if (!(hwcksum_flags & HCK_FULLCKSUM)) { 20910 cksum = len + tcp_tcp_hdr_len + tcp->tcp_sum; 20911 cksum = (cksum >> 16) + (cksum & 0xFFFF); 20912 U16_TO_ABE16(cksum, tcp->tcp_tcph->th_sum); 20913 } 20914 20915 U32_TO_ABE32(*snxt, tcp->tcp_tcph->th_seq); 20916 *snxt += len; 20917 20918 tcp->tcp_tcph->th_flags[0] = TH_ACK; 20919 /* 20920 * We set the PUSH bit only if TCP has no more buffered 20921 * data to be transmitted (or if sender SWS avoidance 20922 * takes place), as opposed to setting it for every 20923 * last packet in the burst. 20924 */ 20925 if (done || 20926 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) == 0) 20927 tcp->tcp_tcph->th_flags[0] |= TH_PUSH; 20928 20929 /* 20930 * Set FIN bit if this is our last segment; snxt 20931 * already includes its length, and it will not 20932 * be adjusted after this point. 20933 */ 20934 if (tcp->tcp_valid_bits == TCP_FSS_VALID && 20935 *snxt == tcp->tcp_fss) { 20936 if (!tcp->tcp_fin_acked) { 20937 tcp->tcp_tcph->th_flags[0] |= TH_FIN; 20938 BUMP_MIB(&tcps->tcps_mib, 20939 tcpOutControl); 20940 } 20941 if (!tcp->tcp_fin_sent) { 20942 tcp->tcp_fin_sent = B_TRUE; 20943 /* 20944 * tcp state must be ESTABLISHED 20945 * in order for us to get here in 20946 * the first place. 20947 */ 20948 tcp->tcp_state = TCPS_FIN_WAIT_1; 20949 20950 /* 20951 * Upon returning from this routine, 20952 * tcp_wput_data() will set tcp_snxt 20953 * to be equal to snxt + tcp_fin_sent. 20954 * This is essentially the same as 20955 * setting it to tcp_fss + 1. 
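 *
 * For illustration (hypothetical sequence numbers): if tcp_fss is
 * 15000 and this segment ends exactly at 15000, snxt equals 15000
 * here; with tcp_fin_sent now B_TRUE, tcp_wput_data() will store
 * snxt + tcp_fin_sent = 15001 in tcp_snxt, the extra sequence
 * number being the one consumed by the FIN itself.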
20956 */ 20957 } 20958 } 20959 20960 tcp->tcp_last_sent_len = (ushort_t)len; 20961 20962 len += tcp_hdr_len; 20963 if (tcp->tcp_ipversion == IPV4_VERSION) 20964 tcp->tcp_ipha->ipha_length = htons(len); 20965 else 20966 tcp->tcp_ip6h->ip6_plen = htons(len - 20967 ((char *)&tcp->tcp_ip6h[1] - 20968 tcp->tcp_iphc)); 20969 20970 pkt_info->flags = (PDESC_HBUF_REF | PDESC_PBUF_REF); 20971 20972 /* setup header fragment */ 20973 PDESC_HDR_ADD(pkt_info, 20974 md_hbuf->b_rptr + cur_hdr_off, /* base */ 20975 tcp->tcp_mdt_hdr_head, /* head room */ 20976 tcp_hdr_len, /* len */ 20977 tcp->tcp_mdt_hdr_tail); /* tail room */ 20978 20979 ASSERT(pkt_info->hdr_lim - pkt_info->hdr_base == 20980 hdr_frag_sz); 20981 ASSERT(MBLKIN(md_hbuf, 20982 (pkt_info->hdr_base - md_hbuf->b_rptr), 20983 PDESC_HDRSIZE(pkt_info))); 20984 20985 /* setup first payload fragment */ 20986 PDESC_PLD_INIT(pkt_info); 20987 PDESC_PLD_SPAN_ADD(pkt_info, 20988 pbuf_idx, /* index */ 20989 md_pbuf->b_rptr + cur_pld_off, /* start */ 20990 tcp->tcp_last_sent_len); /* len */ 20991 20992 /* create a split-packet in case of a spillover */ 20993 if (md_pbuf_nxt != NULL) { 20994 ASSERT(spill > 0); 20995 ASSERT(pbuf_idx_nxt > pbuf_idx); 20996 ASSERT(!add_buffer); 20997 20998 md_pbuf = md_pbuf_nxt; 20999 md_pbuf_nxt = NULL; 21000 pbuf_idx = pbuf_idx_nxt; 21001 pbuf_idx_nxt = -1; 21002 cur_pld_off = spill; 21003 21004 /* trim out first payload fragment */ 21005 PDESC_PLD_SPAN_TRIM(pkt_info, 0, spill); 21006 21007 /* setup second payload fragment */ 21008 PDESC_PLD_SPAN_ADD(pkt_info, 21009 pbuf_idx, /* index */ 21010 md_pbuf->b_rptr, /* start */ 21011 spill); /* len */ 21012 21013 if ((*xmit_tail)->b_next == NULL) { 21014 /* 21015 * Store the lbolt used for RTT 21016 * estimation. We can only record one 21017 * timestamp per mblk so we do it when 21018 * we reach the end of the payload 21019 * buffer. Also we only take a new 21020 * timestamp sample when the previous 21021 * timed data from the same mblk has 21022 * been ack'ed. 21023 */ 21024 (*xmit_tail)->b_prev = local_time; 21025 (*xmit_tail)->b_next = 21026 (mblk_t *)(uintptr_t)first_snxt; 21027 } 21028 21029 first_snxt = *snxt - spill; 21030 21031 /* 21032 * Advance xmit_tail; usable could be 0 by 21033 * the time we got here, but we made sure 21034 * above that we would only spillover to 21035 * the next data block if usable includes 21036 * the spilled-over amount prior to the 21037 * subtraction. Therefore, we are sure 21038 * that xmit_tail->b_cont can't be NULL. 21039 */ 21040 ASSERT((*xmit_tail)->b_cont != NULL); 21041 *xmit_tail = (*xmit_tail)->b_cont; 21042 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 21043 (uintptr_t)INT_MAX); 21044 *tail_unsent = (int)MBLKL(*xmit_tail) - spill; 21045 } else { 21046 cur_pld_off += tcp->tcp_last_sent_len; 21047 } 21048 21049 /* 21050 * Fill in the header using the template header, and 21051 * add options such as time-stamp, ECN and/or SACK, 21052 * as needed. 21053 */ 21054 tcp_fill_header(tcp, pkt_info->hdr_rptr, 21055 (clock_t)local_time, num_sack_blk); 21056 21057 /* take care of some IP header businesses */ 21058 if (af == AF_INET) { 21059 ipha = (ipha_t *)pkt_info->hdr_rptr; 21060 21061 ASSERT(OK_32PTR((uchar_t *)ipha)); 21062 ASSERT(PDESC_HDRL(pkt_info) >= 21063 IP_SIMPLE_HDR_LENGTH); 21064 ASSERT(ipha->ipha_version_and_hdr_length == 21065 IP_SIMPLE_HDR_VERSION); 21066 21067 /* 21068 * Assign ident value for current packet; see 21069 * related comments in ip_wput_ire() about the 21070 * contract private interface with clustering 21071 * group. 
21072 */ 21073 clusterwide = B_FALSE; 21074 if (cl_inet_ipident != NULL) { 21075 ASSERT(cl_inet_isclusterwide != NULL); 21076 if ((*cl_inet_isclusterwide)(IPPROTO_IP, 21077 AF_INET, 21078 (uint8_t *)(uintptr_t)src)) { 21079 ipha->ipha_ident = 21080 (*cl_inet_ipident) 21081 (IPPROTO_IP, AF_INET, 21082 (uint8_t *)(uintptr_t)src, 21083 (uint8_t *)(uintptr_t)dst); 21084 clusterwide = B_TRUE; 21085 } 21086 } 21087 21088 if (!clusterwide) { 21089 ipha->ipha_ident = (uint16_t) 21090 atomic_add_32_nv( 21091 &ire->ire_ident, 1); 21092 } 21093 #ifndef _BIG_ENDIAN 21094 ipha->ipha_ident = (ipha->ipha_ident << 8) | 21095 (ipha->ipha_ident >> 8); 21096 #endif 21097 } else { 21098 ip6h = (ip6_t *)pkt_info->hdr_rptr; 21099 21100 ASSERT(OK_32PTR((uchar_t *)ip6h)); 21101 ASSERT(IPVER(ip6h) == IPV6_VERSION); 21102 ASSERT(ip6h->ip6_nxt == IPPROTO_TCP); 21103 ASSERT(PDESC_HDRL(pkt_info) >= 21104 (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET + 21105 TCP_CHECKSUM_SIZE)); 21106 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 21107 21108 if (tcp->tcp_ip_forward_progress) { 21109 rconfirm = B_TRUE; 21110 tcp->tcp_ip_forward_progress = B_FALSE; 21111 } 21112 } 21113 21114 /* at least one payload span, and at most two */ 21115 ASSERT(pkt_info->pld_cnt > 0 && pkt_info->pld_cnt < 3); 21116 21117 /* add the packet descriptor to Multidata */ 21118 if ((pkt = mmd_addpdesc(mmd, pkt_info, &err, 21119 KM_NOSLEEP)) == NULL) { 21120 /* 21121 * Any failure other than ENOMEM indicates 21122 * that we have passed in invalid pkt_info 21123 * or parameters to mmd_addpdesc, which must 21124 * not happen. 21125 * 21126 * EINVAL is a result of failure on boundary 21127 * checks against the pkt_info contents. It 21128 * should not happen, and we panic because 21129 * either there's horrible heap corruption, 21130 * and/or programming mistake. 
21131 */ 21132 if (err != ENOMEM) { 21133 cmn_err(CE_PANIC, "tcp_multisend: " 21134 "pdesc logic error detected for " 21135 "tcp %p mmd %p pinfo %p (%d)\n", 21136 (void *)tcp, (void *)mmd, 21137 (void *)pkt_info, err); 21138 } 21139 TCP_STAT(tcps, tcp_mdt_addpdescfail); 21140 goto legacy_send; /* out_of_mem */ 21141 } 21142 ASSERT(pkt != NULL); 21143 21144 /* calculate IP header and TCP checksums */ 21145 if (af == AF_INET) { 21146 /* calculate pseudo-header checksum */ 21147 cksum = (dst >> 16) + (dst & 0xFFFF) + 21148 (src >> 16) + (src & 0xFFFF); 21149 21150 /* offset for TCP header checksum */ 21151 up = IPH_TCPH_CHECKSUMP(ipha, 21152 IP_SIMPLE_HDR_LENGTH); 21153 } else { 21154 up = (uint16_t *)&ip6h->ip6_src; 21155 21156 /* calculate pseudo-header checksum */ 21157 cksum = up[0] + up[1] + up[2] + up[3] + 21158 up[4] + up[5] + up[6] + up[7] + 21159 up[8] + up[9] + up[10] + up[11] + 21160 up[12] + up[13] + up[14] + up[15]; 21161 21162 /* Fold the initial sum */ 21163 cksum = (cksum & 0xffff) + (cksum >> 16); 21164 21165 up = (uint16_t *)(((uchar_t *)ip6h) + 21166 IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET); 21167 } 21168 21169 if (hwcksum_flags & HCK_FULLCKSUM) { 21170 /* clear checksum field for hardware */ 21171 *up = 0; 21172 } else if (hwcksum_flags & HCK_PARTIALCKSUM) { 21173 uint32_t sum; 21174 21175 /* pseudo-header checksumming */ 21176 sum = *up + cksum + IP_TCP_CSUM_COMP; 21177 sum = (sum & 0xFFFF) + (sum >> 16); 21178 *up = (sum & 0xFFFF) + (sum >> 16); 21179 } else { 21180 /* software checksumming */ 21181 TCP_STAT(tcps, tcp_out_sw_cksum); 21182 TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes, 21183 tcp->tcp_hdr_len + tcp->tcp_last_sent_len); 21184 *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len, 21185 cksum + IP_TCP_CSUM_COMP); 21186 if (*up == 0) 21187 *up = 0xFFFF; 21188 } 21189 21190 /* IPv4 header checksum */ 21191 if (af == AF_INET) { 21192 ipha->ipha_fragment_offset_and_flags |= 21193 (uint32_t)htons(ire->ire_frag_flag); 21194 21195 if (hwcksum_flags & HCK_IPV4_HDRCKSUM) { 21196 ipha->ipha_hdr_checksum = 0; 21197 } else { 21198 IP_HDR_CKSUM(ipha, cksum, 21199 ((uint32_t *)ipha)[0], 21200 ((uint16_t *)ipha)[4]); 21201 } 21202 } 21203 21204 if (af == AF_INET && 21205 HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) || 21206 af == AF_INET6 && 21207 HOOKS6_INTERESTED_PHYSICAL_OUT(ipst)) { 21208 /* build header(IP/TCP) mblk for this segment */ 21209 if ((mp = dupb(md_hbuf)) == NULL) 21210 goto legacy_send; 21211 21212 mp->b_rptr = pkt_info->hdr_rptr; 21213 mp->b_wptr = pkt_info->hdr_wptr; 21214 21215 /* build payload mblk for this segment */ 21216 if ((mp1 = dupb(*xmit_tail)) == NULL) { 21217 freemsg(mp); 21218 goto legacy_send; 21219 } 21220 mp1->b_wptr = md_pbuf->b_rptr + cur_pld_off; 21221 mp1->b_rptr = mp1->b_wptr - 21222 tcp->tcp_last_sent_len; 21223 linkb(mp, mp1); 21224 21225 pld_start = mp1->b_rptr; 21226 21227 if (af == AF_INET) { 21228 DTRACE_PROBE4( 21229 ip4__physical__out__start, 21230 ill_t *, NULL, 21231 ill_t *, ill, 21232 ipha_t *, ipha, 21233 mblk_t *, mp); 21234 FW_HOOKS( 21235 ipst->ips_ip4_physical_out_event, 21236 ipst->ips_ipv4firewall_physical_out, 21237 NULL, ill, ipha, mp, mp, 0, ipst); 21238 DTRACE_PROBE1( 21239 ip4__physical__out__end, 21240 mblk_t *, mp); 21241 } else { 21242 DTRACE_PROBE4( 21243 ip6__physical__out_start, 21244 ill_t *, NULL, 21245 ill_t *, ill, 21246 ip6_t *, ip6h, 21247 mblk_t *, mp); 21248 FW_HOOKS6( 21249 ipst->ips_ip6_physical_out_event, 21250 ipst->ips_ipv6firewall_physical_out, 21251 NULL, ill, ip6h, mp, mp, 0, ipst); 21252 DTRACE_PROBE1( 21253 
ip6__physical__out__end, 21254 mblk_t *, mp); 21255 } 21256 21257 if (buf_trunked && mp != NULL) { 21258 /* 21259 * Need to pass it to normal path. 21260 */ 21261 CALL_IP_WPUT(tcp->tcp_connp, q, mp); 21262 } else if (mp == NULL || 21263 mp->b_rptr != pkt_info->hdr_rptr || 21264 mp->b_wptr != pkt_info->hdr_wptr || 21265 (mp1 = mp->b_cont) == NULL || 21266 mp1->b_rptr != pld_start || 21267 mp1->b_wptr != pld_start + 21268 tcp->tcp_last_sent_len || 21269 mp1->b_cont != NULL) { 21270 /* 21271 * Need to pass all packets of this 21272 * buffer to normal path, either when 21273 * packet is blocked, or when boundary 21274 * of header buffer or payload buffer 21275 * has been changed by FW_HOOKS[6]. 21276 */ 21277 buf_trunked = B_TRUE; 21278 if (md_mp_head != NULL) { 21279 err = (intptr_t)rmvb(md_mp_head, 21280 md_mp); 21281 if (err == 0) 21282 md_mp_head = NULL; 21283 } 21284 21285 /* send down what we've got so far */ 21286 if (md_mp_head != NULL) { 21287 tcp_multisend_data(tcp, ire, 21288 ill, md_mp_head, obsegs, 21289 obbytes, &rconfirm); 21290 } 21291 md_mp_head = NULL; 21292 21293 if (mp != NULL) 21294 CALL_IP_WPUT(tcp->tcp_connp, 21295 q, mp); 21296 21297 mp1 = fw_mp_head; 21298 do { 21299 mp = mp1; 21300 mp1 = mp1->b_next; 21301 mp->b_next = NULL; 21302 mp->b_prev = NULL; 21303 CALL_IP_WPUT(tcp->tcp_connp, 21304 q, mp); 21305 } while (mp1 != NULL); 21306 21307 fw_mp_head = NULL; 21308 } else { 21309 if (fw_mp_head == NULL) 21310 fw_mp_head = mp; 21311 else 21312 fw_mp_head->b_prev->b_next = mp; 21313 fw_mp_head->b_prev = mp; 21314 } 21315 } 21316 21317 /* advance header offset */ 21318 cur_hdr_off += hdr_frag_sz; 21319 21320 obbytes += tcp->tcp_last_sent_len; 21321 ++obsegs; 21322 } while (!done && *usable > 0 && --num_burst_seg > 0 && 21323 *tail_unsent > 0); 21324 21325 if ((*xmit_tail)->b_next == NULL) { 21326 /* 21327 * Store the lbolt used for RTT estimation. We can only 21328 * record one timestamp per mblk so we do it when we 21329 * reach the end of the payload buffer. Also we only 21330 * take a new timestamp sample when the previous timed 21331 * data from the same mblk has been ack'ed. 21332 */ 21333 (*xmit_tail)->b_prev = local_time; 21334 (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)first_snxt; 21335 } 21336 21337 ASSERT(*tail_unsent >= 0); 21338 if (*tail_unsent > 0) { 21339 /* 21340 * We got here because we broke out of the above 21341 * loop due to of one of the following cases: 21342 * 21343 * 1. len < adjusted MSS (i.e. small), 21344 * 2. Sender SWS avoidance, 21345 * 3. max_pld is zero. 21346 * 21347 * We are done for this Multidata, so trim our 21348 * last payload buffer (if any) accordingly. 
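 *
 * For illustration (hypothetical numbers): if 300 bytes of the
 * current dup'ed payload buffer were never packetized, *tail_unsent
 * is 300 and b_wptr is pulled back by 300 below, so the payload
 * buffer spans only the bytes that packet descriptors reference.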
21349 */ 21350 if (md_pbuf != NULL) 21351 md_pbuf->b_wptr -= *tail_unsent; 21352 } else if (*usable > 0) { 21353 *xmit_tail = (*xmit_tail)->b_cont; 21354 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 21355 (uintptr_t)INT_MAX); 21356 *tail_unsent = (int)MBLKL(*xmit_tail); 21357 add_buffer = B_TRUE; 21358 } 21359 21360 while (fw_mp_head) { 21361 mp = fw_mp_head; 21362 fw_mp_head = fw_mp_head->b_next; 21363 mp->b_prev = mp->b_next = NULL; 21364 freemsg(mp); 21365 } 21366 if (buf_trunked) { 21367 TCP_STAT(tcps, tcp_mdt_discarded); 21368 freeb(md_mp); 21369 buf_trunked = B_FALSE; 21370 } 21371 } while (!done && *usable > 0 && num_burst_seg > 0 && 21372 (tcp_mdt_chain || max_pld > 0)); 21373 21374 if (md_mp_head != NULL) { 21375 /* send everything down */ 21376 tcp_multisend_data(tcp, ire, ill, md_mp_head, obsegs, obbytes, 21377 &rconfirm); 21378 } 21379 21380 #undef PREP_NEW_MULTIDATA 21381 #undef PREP_NEW_PBUF 21382 #undef IPVER 21383 21384 IRE_REFRELE(ire); 21385 return (0); 21386 } 21387 21388 /* 21389 * A wrapper function for sending one or more Multidata messages down to 21390 * the module below ip; this routine does not release the reference of the 21391 * IRE (caller does that). This routine is analogous to tcp_send_data(). 21392 */ 21393 static void 21394 tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, 21395 const uint_t obsegs, const uint_t obbytes, boolean_t *rconfirm) 21396 { 21397 uint64_t delta; 21398 nce_t *nce; 21399 tcp_stack_t *tcps = tcp->tcp_tcps; 21400 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 21401 21402 ASSERT(ire != NULL && ill != NULL); 21403 ASSERT(ire->ire_stq != NULL); 21404 ASSERT(md_mp_head != NULL); 21405 ASSERT(rconfirm != NULL); 21406 21407 /* adjust MIBs and IRE timestamp */ 21408 TCP_RECORD_TRACE(tcp, md_mp_head, TCP_TRACE_SEND_PKT); 21409 tcp->tcp_obsegs += obsegs; 21410 UPDATE_MIB(&tcps->tcps_mib, tcpOutDataSegs, obsegs); 21411 UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, obbytes); 21412 TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out, obsegs); 21413 21414 if (tcp->tcp_ipversion == IPV4_VERSION) { 21415 TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v4, obsegs); 21416 } else { 21417 TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v6, obsegs); 21418 } 21419 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests, obsegs); 21420 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, obsegs); 21421 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, obbytes); 21422 21423 ire->ire_ob_pkt_count += obsegs; 21424 if (ire->ire_ipif != NULL) 21425 atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, obsegs); 21426 ire->ire_last_used_time = lbolt; 21427 21428 /* send it down */ 21429 if (ILL_DLS_CAPABLE(ill)) { 21430 ill_dls_capab_t *ill_dls = ill->ill_dls_capab; 21431 ill_dls->ill_tx(ill_dls->ill_tx_handle, md_mp_head); 21432 } else { 21433 putnext(ire->ire_stq, md_mp_head); 21434 } 21435 21436 /* we're done for TCP/IPv4 */ 21437 if (tcp->tcp_ipversion == IPV4_VERSION) 21438 return; 21439 21440 nce = ire->ire_nce; 21441 21442 ASSERT(nce != NULL); 21443 ASSERT(!(nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT))); 21444 ASSERT(nce->nce_state != ND_INCOMPLETE); 21445 21446 /* reachability confirmation? 
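 *
 * Roughly, per the neighbor discovery rules (RFC 4861): forward
 * progress reported by TCP (tcp_ip_forward_progress, latched into
 * *rconfirm by tcp_multisend()) counts as an upper-layer positive
 * confirmation, so the nce can be moved to ND_REACHABLE below
 * without unicast probing; otherwise the entry simply ages toward
 * ND_DELAY/ND_PROBE as handled further down.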
*/ 21447 if (*rconfirm) { 21448 nce->nce_last = TICK_TO_MSEC(lbolt64); 21449 if (nce->nce_state != ND_REACHABLE) { 21450 mutex_enter(&nce->nce_lock); 21451 nce->nce_state = ND_REACHABLE; 21452 nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; 21453 mutex_exit(&nce->nce_lock); 21454 (void) untimeout(nce->nce_timeout_id); 21455 if (ip_debug > 2) { 21456 /* ip1dbg */ 21457 pr_addr_dbg("tcp_multisend_data: state " 21458 "for %s changed to REACHABLE\n", 21459 AF_INET6, &ire->ire_addr_v6); 21460 } 21461 } 21462 /* reset transport reachability confirmation */ 21463 *rconfirm = B_FALSE; 21464 } 21465 21466 delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; 21467 ip1dbg(("tcp_multisend_data: delta = %" PRId64 21468 " ill_reachable_time = %d \n", delta, ill->ill_reachable_time)); 21469 21470 if (delta > (uint64_t)ill->ill_reachable_time) { 21471 mutex_enter(&nce->nce_lock); 21472 switch (nce->nce_state) { 21473 case ND_REACHABLE: 21474 case ND_STALE: 21475 /* 21476 * ND_REACHABLE is identical to ND_STALE in this 21477 * specific case. If reachable time has expired for 21478 * this neighbor (delta is greater than reachable 21479 * time), conceptually, the neighbor cache is no 21480 * longer in REACHABLE state, but already in STALE 21481 * state. So the correct transition here is to 21482 * ND_DELAY. 21483 */ 21484 nce->nce_state = ND_DELAY; 21485 mutex_exit(&nce->nce_lock); 21486 NDP_RESTART_TIMER(nce, 21487 ipst->ips_delay_first_probe_time); 21488 if (ip_debug > 3) { 21489 /* ip2dbg */ 21490 pr_addr_dbg("tcp_multisend_data: state " 21491 "for %s changed to DELAY\n", 21492 AF_INET6, &ire->ire_addr_v6); 21493 } 21494 break; 21495 case ND_DELAY: 21496 case ND_PROBE: 21497 mutex_exit(&nce->nce_lock); 21498 /* Timers have already started */ 21499 break; 21500 case ND_UNREACHABLE: 21501 /* 21502 * ndp timer has detected that this nce is 21503 * unreachable and initiated deleting this nce 21504 * and all its associated IREs. This is a race 21505 * where we found the ire before it was deleted 21506 * and have just sent out a packet using this 21507 * unreachable nce. 21508 */ 21509 mutex_exit(&nce->nce_lock); 21510 break; 21511 default: 21512 ASSERT(0); 21513 } 21514 } 21515 } 21516 21517 /* 21518 * Derived from tcp_send_data(). 
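 *
 * As a sketch of what follows: the caller hands tcp_lsosend_data()
 * one large IPv4/TCP mblk covering num_lso_seg segments; the routine
 * reserves an IP ident range for those segments, arranges checksum
 * offload, tags the mblk with HW_LSO and the per-segment MSS via
 * DB_LSOFLAGS/DB_LSOMSS, prepends the cached link-layer header from
 * nce_fp_mp, and hands the result to the driver, which performs the
 * actual segmentation.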
21519 */ 21520 static void 21521 tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, 21522 int num_lso_seg) 21523 { 21524 ipha_t *ipha; 21525 mblk_t *ire_fp_mp; 21526 uint_t ire_fp_mp_len; 21527 uint32_t hcksum_txflags = 0; 21528 ipaddr_t src; 21529 ipaddr_t dst; 21530 uint32_t cksum; 21531 uint16_t *up; 21532 tcp_stack_t *tcps = tcp->tcp_tcps; 21533 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 21534 21535 ASSERT(DB_TYPE(mp) == M_DATA); 21536 ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); 21537 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 21538 ASSERT(tcp->tcp_connp != NULL); 21539 ASSERT(CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp)); 21540 21541 ipha = (ipha_t *)mp->b_rptr; 21542 src = ipha->ipha_src; 21543 dst = ipha->ipha_dst; 21544 21545 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 21546 ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 21547 num_lso_seg); 21548 #ifndef _BIG_ENDIAN 21549 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 21550 #endif 21551 if (tcp->tcp_snd_zcopy_aware) { 21552 if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || 21553 (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) 21554 mp = tcp_zcopy_disable(tcp, mp); 21555 } 21556 21557 if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { 21558 ASSERT(ill->ill_hcksum_capab != NULL); 21559 hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; 21560 } 21561 21562 /* 21563 * Since the TCP checksum should be recalculated by h/w, we can just 21564 * zero the checksum field for HCK_FULLCKSUM, or calculate partial 21565 * pseudo-header checksum for HCK_PARTIALCKSUM. 21566 * The partial pseudo-header excludes TCP length, that was calculated 21567 * in tcp_send(), so to zero *up before further processing. 21568 */ 21569 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 21570 21571 up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); 21572 *up = 0; 21573 21574 IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, 21575 IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); 21576 21577 /* 21578 * Append LSO flag to DB_LSOFLAGS(mp) and set the mss to DB_LSOMSS(mp). 21579 */ 21580 DB_LSOFLAGS(mp) |= HW_LSO; 21581 DB_LSOMSS(mp) = mss; 21582 21583 ipha->ipha_fragment_offset_and_flags |= 21584 (uint32_t)htons(ire->ire_frag_flag); 21585 21586 ire_fp_mp = ire->ire_nce->nce_fp_mp; 21587 ire_fp_mp_len = MBLKL(ire_fp_mp); 21588 ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); 21589 mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; 21590 bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); 21591 21592 UPDATE_OB_PKT_COUNT(ire); 21593 ire->ire_last_used_time = lbolt; 21594 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); 21595 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); 21596 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, 21597 ntohs(ipha->ipha_length)); 21598 21599 if (ILL_DLS_CAPABLE(ill)) { 21600 /* 21601 * Send the packet directly to DLD, where it may be queued 21602 * depending on the availability of transmit resources at 21603 * the media layer. 
21604 */
21605 IP_DLS_ILL_TX(ill, ipha, mp, ipst);
21606 } else {
21607 ill_t *out_ill = (ill_t *)ire->ire_stq->q_ptr;
21608 DTRACE_PROBE4(ip4__physical__out__start,
21609 ill_t *, NULL, ill_t *, out_ill,
21610 ipha_t *, ipha, mblk_t *, mp);
21611 FW_HOOKS(ipst->ips_ip4_physical_out_event,
21612 ipst->ips_ipv4firewall_physical_out,
21613 NULL, out_ill, ipha, mp, mp, 0, ipst);
21614 DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp);
21615 if (mp != NULL)
21616 putnext(ire->ire_stq, mp);
21617 }
21618 }
21619 
21620 /*
21621 * tcp_send() is called by tcp_wput_data() for non-Multidata transmission
21622 * scheme, and returns one of the following:
21623 *
21624 * -1 = failed allocation.
21625 * 0 = success; burst count reached, or usable send window is too small,
21626 * and that we'd rather wait until later before sending again.
21627 * 1 = success; we are called from tcp_multisend(), and both usable send
21628 * window and tail_unsent are greater than the MDT threshold, and thus
21629 * Multidata Transmit should be used instead.
21630 */
21631 static int
21632 tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len,
21633 const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable,
21634 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
21635 const int mdt_thres)
21636 {
21637 int num_burst_seg = tcp->tcp_snd_burst;
21638 ire_t *ire = NULL;
21639 ill_t *ill = NULL;
21640 mblk_t *ire_fp_mp = NULL;
21641 uint_t ire_fp_mp_len = 0;
21642 int num_lso_seg = 1;
21643 uint_t lso_usable;
21644 boolean_t do_lso_send = B_FALSE;
21645 tcp_stack_t *tcps = tcp->tcp_tcps;
21646 
21647 /*
21648 * Check LSO capability before any further work, and a similar
21649 * check needs to be done in the for(;;) loop.
21650 * LSO will be deployed when there is more than one mss of available
21651 * data and a burst transmission is allowed.
21652 */
21653 if (tcp->tcp_lso &&
21654 (tcp->tcp_valid_bits == 0 ||
21655 tcp->tcp_valid_bits == TCP_FSS_VALID) &&
21656 num_burst_seg >= 2 && (*usable - 1) / mss >= 1) {
21657 /*
21658 * Try to find a usable IRE/ILL and do a basic check on the ILL.
21659 */
21660 if (tcp_send_find_ire_ill(tcp, NULL, &ire, &ill)) {
21661 /*
21662 * Enable LSO with this transmission.
21663 * Since the IRE has been held in
21664 * tcp_send_find_ire_ill(), IRE_REFRELE(ire)
21665 * should be called before returning.
21666 */
21667 do_lso_send = B_TRUE;
21668 ire_fp_mp = ire->ire_nce->nce_fp_mp;
21669 ire_fp_mp_len = MBLKL(ire_fp_mp);
21670 /* Round up to multiple of 4 */
21671 ire_fp_mp_len = ((ire_fp_mp_len + 3) / 4) * 4;
21672 } else {
21673 do_lso_send = B_FALSE;
21674 ill = NULL;
21675 }
21676 }
21677 
21678 for (;;) {
21679 struct datab *db;
21680 tcph_t *tcph;
21681 uint32_t sum;
21682 mblk_t *mp, *mp1;
21683 uchar_t *rptr;
21684 int len;
21685 
21686 /*
21687 * If we're called by tcp_multisend(), and the amount of
21688 * sendable data as well as the size of the current xmit_tail
21689 * is beyond the MDT threshold, return to the caller and
21690 * let the large data transmit be done using MDT.
21691 */
21692 if (*usable > 0 && *usable > mdt_thres &&
21693 (*tail_unsent > mdt_thres || (*tail_unsent == 0 &&
21694 MBLKL((*xmit_tail)->b_cont) > mdt_thres))) {
21695 ASSERT(tcp->tcp_mdt);
21696 return (1); /* success; do large send */
21697 }
21698 
21699 if (num_burst_seg == 0)
21700 break; /* success; burst count reached */
21701 
21702 /*
21703 * Calculate the maximum payload length we can send in *one*
21704 * time.
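 *
 * For illustration (hypothetical values): with mss = 1460,
 * *usable = 10000, num_burst_seg = 10 and tcp_lso_max = 65535,
 * lso_usable = MIN(65535, 10000, 10 * 1460) = 10000;
 * num_lso_seg = 10000 / 1460 = 6 with a remainder of 1240, so
 * num_lso_seg is bumped to 7 and tcp_last_sent_len is set to 1240,
 * the size of the final short segment.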
21705 */ 21706 if (do_lso_send) { 21707 /* 21708 * Check whether need to do LSO any more. 21709 */ 21710 if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) { 21711 lso_usable = MIN(tcp->tcp_lso_max, *usable); 21712 lso_usable = MIN(lso_usable, 21713 num_burst_seg * mss); 21714 21715 num_lso_seg = lso_usable / mss; 21716 if (lso_usable % mss) { 21717 num_lso_seg++; 21718 tcp->tcp_last_sent_len = (ushort_t) 21719 (lso_usable % mss); 21720 } else { 21721 tcp->tcp_last_sent_len = (ushort_t)mss; 21722 } 21723 } else { 21724 do_lso_send = B_FALSE; 21725 num_lso_seg = 1; 21726 lso_usable = mss; 21727 } 21728 } 21729 21730 ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1); 21731 21732 /* 21733 * Adjust num_burst_seg here. 21734 */ 21735 num_burst_seg -= num_lso_seg; 21736 21737 len = mss; 21738 if (len > *usable) { 21739 ASSERT(do_lso_send == B_FALSE); 21740 21741 len = *usable; 21742 if (len <= 0) { 21743 /* Terminate the loop */ 21744 break; /* success; too small */ 21745 } 21746 /* 21747 * Sender silly-window avoidance. 21748 * Ignore this if we are going to send a 21749 * zero window probe out. 21750 * 21751 * TODO: force data into microscopic window? 21752 * ==> (!pushed || (unsent > usable)) 21753 */ 21754 if (len < (tcp->tcp_max_swnd >> 1) && 21755 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len && 21756 !((tcp->tcp_valid_bits & TCP_URG_VALID) && 21757 len == 1) && (! tcp->tcp_zero_win_probe)) { 21758 /* 21759 * If the retransmit timer is not running 21760 * we start it so that we will retransmit 21761 * in the case when the the receiver has 21762 * decremented the window. 21763 */ 21764 if (*snxt == tcp->tcp_snxt && 21765 *snxt == tcp->tcp_suna) { 21766 /* 21767 * We are not supposed to send 21768 * anything. So let's wait a little 21769 * bit longer before breaking SWS 21770 * avoidance. 21771 * 21772 * What should the value be? 21773 * Suggestion: MAX(init rexmit time, 21774 * tcp->tcp_rto) 21775 */ 21776 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 21777 } 21778 break; /* success; too small */ 21779 } 21780 } 21781 21782 tcph = tcp->tcp_tcph; 21783 21784 /* 21785 * The reason to adjust len here is that we need to set flags 21786 * and calculate checksum. 21787 */ 21788 if (do_lso_send) 21789 len = lso_usable; 21790 21791 *usable -= len; /* Approximate - can be adjusted later */ 21792 if (*usable > 0) 21793 tcph->th_flags[0] = TH_ACK; 21794 else 21795 tcph->th_flags[0] = (TH_ACK | TH_PUSH); 21796 21797 /* 21798 * Prime pump for IP's checksumming on our behalf 21799 * Include the adjustment for a source route if any. 21800 */ 21801 sum = len + tcp_tcp_hdr_len + tcp->tcp_sum; 21802 sum = (sum >> 16) + (sum & 0xFFFF); 21803 U16_TO_ABE16(sum, tcph->th_sum); 21804 21805 U32_TO_ABE32(*snxt, tcph->th_seq); 21806 21807 /* 21808 * Branch off to tcp_xmit_mp() if any of the VALID bits is 21809 * set. For the case when TCP_FSS_VALID is the only valid 21810 * bit (normal active close), branch off only when we think 21811 * that the FIN flag needs to be set. Note for this case, 21812 * that (snxt + len) may not reflect the actual seg_len, 21813 * as len may be further reduced in tcp_xmit_mp(). If len 21814 * gets modified, we will end up here again. 
21815 */ 21816 if (tcp->tcp_valid_bits != 0 && 21817 (tcp->tcp_valid_bits != TCP_FSS_VALID || 21818 ((*snxt + len) == tcp->tcp_fss))) { 21819 uchar_t *prev_rptr; 21820 uint32_t prev_snxt = tcp->tcp_snxt; 21821 21822 if (*tail_unsent == 0) { 21823 ASSERT((*xmit_tail)->b_cont != NULL); 21824 *xmit_tail = (*xmit_tail)->b_cont; 21825 prev_rptr = (*xmit_tail)->b_rptr; 21826 *tail_unsent = (int)((*xmit_tail)->b_wptr - 21827 (*xmit_tail)->b_rptr); 21828 } else { 21829 prev_rptr = (*xmit_tail)->b_rptr; 21830 (*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr - 21831 *tail_unsent; 21832 } 21833 mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL, 21834 *snxt, B_FALSE, (uint32_t *)&len, B_FALSE); 21835 /* Restore tcp_snxt so we get amount sent right. */ 21836 tcp->tcp_snxt = prev_snxt; 21837 if (prev_rptr == (*xmit_tail)->b_rptr) { 21838 /* 21839 * If the previous timestamp is still in use, 21840 * don't stomp on it. 21841 */ 21842 if ((*xmit_tail)->b_next == NULL) { 21843 (*xmit_tail)->b_prev = local_time; 21844 (*xmit_tail)->b_next = 21845 (mblk_t *)(uintptr_t)(*snxt); 21846 } 21847 } else 21848 (*xmit_tail)->b_rptr = prev_rptr; 21849 21850 if (mp == NULL) { 21851 if (ire != NULL) 21852 IRE_REFRELE(ire); 21853 return (-1); 21854 } 21855 mp1 = mp->b_cont; 21856 21857 if (len <= mss) /* LSO is unusable (!do_lso_send) */ 21858 tcp->tcp_last_sent_len = (ushort_t)len; 21859 while (mp1->b_cont) { 21860 *xmit_tail = (*xmit_tail)->b_cont; 21861 (*xmit_tail)->b_prev = local_time; 21862 (*xmit_tail)->b_next = 21863 (mblk_t *)(uintptr_t)(*snxt); 21864 mp1 = mp1->b_cont; 21865 } 21866 *snxt += len; 21867 *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; 21868 BUMP_LOCAL(tcp->tcp_obsegs); 21869 BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); 21870 UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); 21871 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 21872 tcp_send_data(tcp, q, mp); 21873 continue; 21874 } 21875 21876 *snxt += len; /* Adjust later if we don't send all of len */ 21877 BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); 21878 UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); 21879 21880 if (*tail_unsent) { 21881 /* Are the bytes above us in flight? */ 21882 rptr = (*xmit_tail)->b_wptr - *tail_unsent; 21883 if (rptr != (*xmit_tail)->b_rptr) { 21884 *tail_unsent -= len; 21885 if (len <= mss) /* LSO is unusable */ 21886 tcp->tcp_last_sent_len = (ushort_t)len; 21887 len += tcp_hdr_len; 21888 if (tcp->tcp_ipversion == IPV4_VERSION) 21889 tcp->tcp_ipha->ipha_length = htons(len); 21890 else 21891 tcp->tcp_ip6h->ip6_plen = 21892 htons(len - 21893 ((char *)&tcp->tcp_ip6h[1] - 21894 tcp->tcp_iphc)); 21895 mp = dupb(*xmit_tail); 21896 if (mp == NULL) { 21897 if (ire != NULL) 21898 IRE_REFRELE(ire); 21899 return (-1); /* out_of_mem */ 21900 } 21901 mp->b_rptr = rptr; 21902 /* 21903 * If the old timestamp is no longer in use, 21904 * sample a new timestamp now. 
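 *
 * The mblk list fields are reused as scratch space here: b_prev
 * remembers the transmit time (local_time) and b_next the starting
 * sequence number of the data in that mblk, i.e. roughly
 *	(*xmit_tail)->b_prev = local_time;		(send time)
 *	(*xmit_tail)->b_next = (mblk_t *)(uintptr_t)snxt;	(first seq)
 * so that the ACK processing path can later take an RTT sample.
 * A non-NULL b_next means an earlier timestamp is still outstanding
 * and should not be overwritten.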
21905 */ 21906 if ((*xmit_tail)->b_next == NULL) { 21907 (*xmit_tail)->b_prev = local_time; 21908 (*xmit_tail)->b_next = 21909 (mblk_t *)(uintptr_t)(*snxt-len); 21910 } 21911 goto must_alloc; 21912 } 21913 } else { 21914 *xmit_tail = (*xmit_tail)->b_cont; 21915 ASSERT((uintptr_t)((*xmit_tail)->b_wptr - 21916 (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX); 21917 *tail_unsent = (int)((*xmit_tail)->b_wptr - 21918 (*xmit_tail)->b_rptr); 21919 } 21920 21921 (*xmit_tail)->b_prev = local_time; 21922 (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len); 21923 21924 *tail_unsent -= len; 21925 if (len <= mss) /* LSO is unusable (!do_lso_send) */ 21926 tcp->tcp_last_sent_len = (ushort_t)len; 21927 21928 len += tcp_hdr_len; 21929 if (tcp->tcp_ipversion == IPV4_VERSION) 21930 tcp->tcp_ipha->ipha_length = htons(len); 21931 else 21932 tcp->tcp_ip6h->ip6_plen = htons(len - 21933 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 21934 21935 mp = dupb(*xmit_tail); 21936 if (mp == NULL) { 21937 if (ire != NULL) 21938 IRE_REFRELE(ire); 21939 return (-1); /* out_of_mem */ 21940 } 21941 21942 len = tcp_hdr_len; 21943 /* 21944 * There are four reasons to allocate a new hdr mblk: 21945 * 1) The bytes above us are in use by another packet 21946 * 2) We don't have good alignment 21947 * 3) The mblk is being shared 21948 * 4) We don't have enough room for a header 21949 */ 21950 rptr = mp->b_rptr - len; 21951 if (!OK_32PTR(rptr) || 21952 ((db = mp->b_datap), db->db_ref != 2) || 21953 rptr < db->db_base + ire_fp_mp_len) { 21954 /* NOTE: we assume allocb returns an OK_32PTR */ 21955 21956 must_alloc:; 21957 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 21958 tcps->tcps_wroff_xtra + ire_fp_mp_len, BPRI_MED); 21959 if (mp1 == NULL) { 21960 freemsg(mp); 21961 if (ire != NULL) 21962 IRE_REFRELE(ire); 21963 return (-1); /* out_of_mem */ 21964 } 21965 mp1->b_cont = mp; 21966 mp = mp1; 21967 /* Leave room for Link Level header */ 21968 len = tcp_hdr_len; 21969 rptr = 21970 &mp->b_rptr[tcps->tcps_wroff_xtra + ire_fp_mp_len]; 21971 mp->b_wptr = &rptr[len]; 21972 } 21973 21974 /* 21975 * Fill in the header using the template header, and add 21976 * options such as time-stamp, ECN and/or SACK, as needed. 21977 */ 21978 tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); 21979 21980 mp->b_rptr = rptr; 21981 21982 if (*tail_unsent) { 21983 int spill = *tail_unsent; 21984 21985 mp1 = mp->b_cont; 21986 if (mp1 == NULL) 21987 mp1 = mp; 21988 21989 /* 21990 * If we're a little short, tack on more mblks until 21991 * there is no more spillover. 21992 */ 21993 while (spill < 0) { 21994 mblk_t *nmp; 21995 int nmpsz; 21996 21997 nmp = (*xmit_tail)->b_cont; 21998 nmpsz = MBLKL(nmp); 21999 22000 /* 22001 * Excess data in mblk; can we split it? 22002 * If MDT is enabled for the connection, 22003 * keep on splitting as this is a transient 22004 * send path. 22005 */ 22006 if (!do_lso_send && !tcp->tcp_mdt && 22007 (spill + nmpsz > 0)) { 22008 /* 22009 * Don't split if stream head was 22010 * told to break up larger writes 22011 * into smaller ones. 22012 */ 22013 if (tcp->tcp_maxpsz > 0) 22014 break; 22015 22016 /* 22017 * Next mblk is less than SMSS/2 22018 * rounded up to nearest 64-byte; 22019 * let it get sent as part of the 22020 * next segment. 
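 *
 * For a rough example (hypothetical mss): with an mss of 1460,
 * mss >> 1 is 730, which roundup() brings to 768; so on a local
 * subnet, with tcp_cork not set, a trailing mblk of fewer than
 * 768 bytes is left to ride in the next segment rather than being
 * split off here.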
22021 */ 22022 if (tcp->tcp_localnet && 22023 !tcp->tcp_cork && 22024 (nmpsz < roundup((mss >> 1), 64))) 22025 break; 22026 } 22027 22028 *xmit_tail = nmp; 22029 ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX); 22030 /* Stash for rtt use later */ 22031 (*xmit_tail)->b_prev = local_time; 22032 (*xmit_tail)->b_next = 22033 (mblk_t *)(uintptr_t)(*snxt - len); 22034 mp1->b_cont = dupb(*xmit_tail); 22035 mp1 = mp1->b_cont; 22036 22037 spill += nmpsz; 22038 if (mp1 == NULL) { 22039 *tail_unsent = spill; 22040 freemsg(mp); 22041 if (ire != NULL) 22042 IRE_REFRELE(ire); 22043 return (-1); /* out_of_mem */ 22044 } 22045 } 22046 22047 /* Trim back any surplus on the last mblk */ 22048 if (spill >= 0) { 22049 mp1->b_wptr -= spill; 22050 *tail_unsent = spill; 22051 } else { 22052 /* 22053 * We did not send everything we could in 22054 * order to remain within the b_cont limit. 22055 */ 22056 *usable -= spill; 22057 *snxt += spill; 22058 tcp->tcp_last_sent_len += spill; 22059 UPDATE_MIB(&tcps->tcps_mib, 22060 tcpOutDataBytes, spill); 22061 /* 22062 * Adjust the checksum 22063 */ 22064 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 22065 sum += spill; 22066 sum = (sum >> 16) + (sum & 0xFFFF); 22067 U16_TO_ABE16(sum, tcph->th_sum); 22068 if (tcp->tcp_ipversion == IPV4_VERSION) { 22069 sum = ntohs( 22070 ((ipha_t *)rptr)->ipha_length) + 22071 spill; 22072 ((ipha_t *)rptr)->ipha_length = 22073 htons(sum); 22074 } else { 22075 sum = ntohs( 22076 ((ip6_t *)rptr)->ip6_plen) + 22077 spill; 22078 ((ip6_t *)rptr)->ip6_plen = 22079 htons(sum); 22080 } 22081 *tail_unsent = 0; 22082 } 22083 } 22084 if (tcp->tcp_ip_forward_progress) { 22085 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 22086 *(uint32_t *)mp->b_rptr |= IP_FORWARD_PROG; 22087 tcp->tcp_ip_forward_progress = B_FALSE; 22088 } 22089 22090 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 22091 if (do_lso_send) { 22092 tcp_lsosend_data(tcp, mp, ire, ill, mss, 22093 num_lso_seg); 22094 tcp->tcp_obsegs += num_lso_seg; 22095 22096 TCP_STAT(tcps, tcp_lso_times); 22097 TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); 22098 } else { 22099 tcp_send_data(tcp, q, mp); 22100 BUMP_LOCAL(tcp->tcp_obsegs); 22101 } 22102 } 22103 22104 if (ire != NULL) 22105 IRE_REFRELE(ire); 22106 return (0); 22107 } 22108 22109 /* Unlink and return any mblk that looks like it contains a MDT info */ 22110 static mblk_t * 22111 tcp_mdt_info_mp(mblk_t *mp) 22112 { 22113 mblk_t *prev_mp; 22114 22115 for (;;) { 22116 prev_mp = mp; 22117 /* no more to process? */ 22118 if ((mp = mp->b_cont) == NULL) 22119 break; 22120 22121 switch (DB_TYPE(mp)) { 22122 case M_CTL: 22123 if (*(uint32_t *)mp->b_rptr != MDT_IOC_INFO_UPDATE) 22124 continue; 22125 ASSERT(prev_mp != NULL); 22126 prev_mp->b_cont = mp->b_cont; 22127 mp->b_cont = NULL; 22128 return (mp); 22129 default: 22130 break; 22131 } 22132 } 22133 return (mp); 22134 } 22135 22136 /* MDT info update routine, called when IP notifies us about MDT */ 22137 static void 22138 tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first) 22139 { 22140 boolean_t prev_state; 22141 tcp_stack_t *tcps = tcp->tcp_tcps; 22142 22143 /* 22144 * IP is telling us to abort MDT on this connection? We know 22145 * this because the capability is only turned off when IP 22146 * encounters some pathological cases, e.g. link-layer change 22147 * where the new driver doesn't support MDT, or in situation 22148 * where MDT usage on the link-layer has been switched off. 
22149 * IP would not have sent us the initial MDT_IOC_INFO_UPDATE 22150 * if the link-layer doesn't support MDT, and if it does, it 22151 * will indicate that the feature is to be turned on. 22152 */ 22153 prev_state = tcp->tcp_mdt; 22154 tcp->tcp_mdt = (mdt_capab->ill_mdt_on != 0); 22155 if (!tcp->tcp_mdt && !first) { 22156 TCP_STAT(tcps, tcp_mdt_conn_halted3); 22157 ip1dbg(("tcp_mdt_update: disabling MDT for connp %p\n", 22158 (void *)tcp->tcp_connp)); 22159 } 22160 22161 /* 22162 * We currently only support MDT on simple TCP/{IPv4,IPv6}, 22163 * so disable MDT otherwise. The checks are done here 22164 * and in tcp_wput_data(). 22165 */ 22166 if (tcp->tcp_mdt && 22167 (tcp->tcp_ipversion == IPV4_VERSION && 22168 tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || 22169 (tcp->tcp_ipversion == IPV6_VERSION && 22170 tcp->tcp_ip_hdr_len != IPV6_HDR_LEN)) 22171 tcp->tcp_mdt = B_FALSE; 22172 22173 if (tcp->tcp_mdt) { 22174 if (mdt_capab->ill_mdt_version != MDT_VERSION_2) { 22175 cmn_err(CE_NOTE, "tcp_mdt_update: unknown MDT " 22176 "version (%d), expected version is %d", 22177 mdt_capab->ill_mdt_version, MDT_VERSION_2); 22178 tcp->tcp_mdt = B_FALSE; 22179 return; 22180 } 22181 22182 /* 22183 * We need the driver to be able to handle at least three 22184 * spans per packet in order for tcp MDT to be utilized. 22185 * The first is for the header portion, while the rest are 22186 * needed to handle a packet that straddles across two 22187 * virtually non-contiguous buffers; a typical tcp packet 22188 * therefore consists of only two spans. Note that we take 22189 * a zero as "don't care". 22190 */ 22191 if (mdt_capab->ill_mdt_span_limit > 0 && 22192 mdt_capab->ill_mdt_span_limit < 3) { 22193 tcp->tcp_mdt = B_FALSE; 22194 return; 22195 } 22196 22197 /* a zero means driver wants default value */ 22198 tcp->tcp_mdt_max_pld = MIN(mdt_capab->ill_mdt_max_pld, 22199 tcps->tcps_mdt_max_pbufs); 22200 if (tcp->tcp_mdt_max_pld == 0) 22201 tcp->tcp_mdt_max_pld = tcps->tcps_mdt_max_pbufs; 22202 22203 /* ensure 32-bit alignment */ 22204 tcp->tcp_mdt_hdr_head = roundup(MAX(tcps->tcps_mdt_hdr_head_min, 22205 mdt_capab->ill_mdt_hdr_head), 4); 22206 tcp->tcp_mdt_hdr_tail = roundup(MAX(tcps->tcps_mdt_hdr_tail_min, 22207 mdt_capab->ill_mdt_hdr_tail), 4); 22208 22209 if (!first && !prev_state) { 22210 TCP_STAT(tcps, tcp_mdt_conn_resumed2); 22211 ip1dbg(("tcp_mdt_update: reenabling MDT for connp %p\n", 22212 (void *)tcp->tcp_connp)); 22213 } 22214 } 22215 } 22216 22217 /* Unlink and return any mblk that looks like it contains a LSO info */ 22218 static mblk_t * 22219 tcp_lso_info_mp(mblk_t *mp) 22220 { 22221 mblk_t *prev_mp; 22222 22223 for (;;) { 22224 prev_mp = mp; 22225 /* no more to process? */ 22226 if ((mp = mp->b_cont) == NULL) 22227 break; 22228 22229 switch (DB_TYPE(mp)) { 22230 case M_CTL: 22231 if (*(uint32_t *)mp->b_rptr != LSO_IOC_INFO_UPDATE) 22232 continue; 22233 ASSERT(prev_mp != NULL); 22234 prev_mp->b_cont = mp->b_cont; 22235 mp->b_cont = NULL; 22236 return (mp); 22237 default: 22238 break; 22239 } 22240 } 22241 22242 return (mp); 22243 } 22244 22245 /* LSO info update routine, called when IP notifies us about LSO */ 22246 static void 22247 tcp_lso_update(tcp_t *tcp, ill_lso_capab_t *lso_capab) 22248 { 22249 tcp_stack_t *tcps = tcp->tcp_tcps; 22250 22251 /* 22252 * IP is telling us to abort LSO on this connection? We know 22253 * this because the capability is only turned off when IP 22254 * encounters some pathological cases, e.g. 
link-layer change 22255 * where the new NIC/driver doesn't support LSO, or in situations 22256 * where LSO usage on the link-layer has been switched off. 22257 * IP would not have sent us the initial LSO_IOC_INFO_UPDATE 22258 * if the link-layer doesn't support LSO, and if it does, it 22259 * will indicate that the feature is to be turned on. 22260 */ 22261 tcp->tcp_lso = (lso_capab->ill_lso_on != 0); 22262 TCP_STAT(tcps, tcp_lso_enabled); 22263 22264 /* 22265 * We currently only support LSO on simple TCP/IPv4, 22266 * so disable LSO otherwise. The checks are done here 22267 * and in tcp_wput_data(). 22268 */ 22269 if (tcp->tcp_lso && 22270 (tcp->tcp_ipversion == IPV4_VERSION && 22271 tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || 22272 (tcp->tcp_ipversion == IPV6_VERSION)) { 22273 tcp->tcp_lso = B_FALSE; 22274 TCP_STAT(tcps, tcp_lso_disabled); 22275 } else { 22276 tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, 22277 lso_capab->ill_lso_max); 22278 } 22279 } 22280 22281 static void 22282 tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_lso_mdt) 22283 { 22284 conn_t *connp = tcp->tcp_connp; 22285 tcp_stack_t *tcps = tcp->tcp_tcps; 22286 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 22287 22288 ASSERT(ire != NULL); 22289 22290 /* 22291 * We may be in the fastpath here, and although we essentially do 22292 * similar checks as in ip_bind_connected{_v6}/ip_xxinfo_return, 22293 * we try to keep things as brief as possible. After all, these 22294 * are only best-effort checks, and we do more thorough ones prior 22295 * to calling tcp_send()/tcp_multisend(). 22296 */ 22297 if ((ipst->ips_ip_lso_outbound || ipst->ips_ip_multidata_outbound) && 22298 check_lso_mdt && !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 22299 ill != NULL && !CONN_IPSEC_OUT_ENCAPSULATED(connp) && 22300 !(ire->ire_flags & RTF_MULTIRT) && 22301 !IPP_ENABLED(IPP_LOCAL_OUT, ipst) && 22302 CONN_IS_LSO_MD_FASTPATH(connp)) { 22303 if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) { 22304 /* Cache the result */ 22305 connp->conn_lso_ok = B_TRUE; 22306 22307 ASSERT(ill->ill_lso_capab != NULL); 22308 if (!ill->ill_lso_capab->ill_lso_on) { 22309 ill->ill_lso_capab->ill_lso_on = 1; 22310 ip1dbg(("tcp_ire_ill_check: connp %p enables " 22311 "LSO for interface %s\n", (void *)connp, 22312 ill->ill_name)); 22313 } 22314 tcp_lso_update(tcp, ill->ill_lso_capab); 22315 } else if (ipst->ips_ip_multidata_outbound && 22316 ILL_MDT_CAPABLE(ill)) { 22317 /* Cache the result */ 22318 connp->conn_mdt_ok = B_TRUE; 22319 22320 ASSERT(ill->ill_mdt_capab != NULL); 22321 if (!ill->ill_mdt_capab->ill_mdt_on) { 22322 ill->ill_mdt_capab->ill_mdt_on = 1; 22323 ip1dbg(("tcp_ire_ill_check: connp %p enables " 22324 "MDT for interface %s\n", (void *)connp, 22325 ill->ill_name)); 22326 } 22327 tcp_mdt_update(tcp, ill->ill_mdt_capab, B_TRUE); 22328 } 22329 } 22330 22331 /* 22332 * The goal is to reduce the number of generated tcp segments by 22333 * setting the maxpsz multiplier to 0; this will have an effect on 22334 * tcp_maxpsz_set(). With this behavior, tcp will pack more data 22335 * into each packet, up to SMSS bytes. Doing this reduces the number 22336 * of outbound segments and incoming ACKs, thus allowing for better 22337 * network and system performance. In contrast, the legacy behavior 22338 * may result in sending less than SMSS size, because the last mblk 22339 * for some packets may have more data than needed to make up SMSS, 22340 * and the legacy code refused to "split" it.
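 *
 * As a rough, hypothetical illustration: with an SMSS of 1460 and an
 * application writing in 1024 byte chunks, the legacy path could end
 * up emitting 1024 byte segments at mblk boundaries, whereas a maxpsz
 * multiplier of 0 lets tcp_wput_data() pack each segment to the full
 * 1460 bytes.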
22341 * 22342 * We apply the new behavior on following situations: 22343 * 22344 * 1) Loopback connections, 22345 * 2) Connections in which the remote peer is not on local subnet, 22346 * 3) Local subnet connections over the bge interface (see below). 22347 * 22348 * Ideally, we would like this behavior to apply for interfaces other 22349 * than bge. However, doing so would negatively impact drivers which 22350 * perform dynamic mapping and unmapping of DMA resources, which are 22351 * increased by setting the maxpsz multiplier to 0 (more mblks per 22352 * packet will be generated by tcp). The bge driver does not suffer 22353 * from this, as it copies the mblks into pre-mapped buffers, and 22354 * therefore does not require more I/O resources than before. 22355 * 22356 * Otherwise, this behavior is present on all network interfaces when 22357 * the destination endpoint is non-local, since reducing the number 22358 * of packets in general is good for the network. 22359 * 22360 * TODO We need to remove this hard-coded conditional for bge once 22361 * a better "self-tuning" mechanism, or a way to comprehend 22362 * the driver transmit strategy is devised. Until the solution 22363 * is found and well understood, we live with this hack. 22364 */ 22365 if (!tcp_static_maxpsz && 22366 (tcp->tcp_loopback || !tcp->tcp_localnet || 22367 (ill->ill_name_length > 3 && bcmp(ill->ill_name, "bge", 3) == 0))) { 22368 /* override the default value */ 22369 tcp->tcp_maxpsz = 0; 22370 22371 ip3dbg(("tcp_ire_ill_check: connp %p tcp_maxpsz %d on " 22372 "interface %s\n", (void *)connp, tcp->tcp_maxpsz, 22373 ill != NULL ? ill->ill_name : ipif_loopback_name)); 22374 } 22375 22376 /* set the stream head parameters accordingly */ 22377 (void) tcp_maxpsz_set(tcp, B_TRUE); 22378 } 22379 22380 /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */ 22381 static void 22382 tcp_wput_flush(tcp_t *tcp, mblk_t *mp) 22383 { 22384 uchar_t fval = *mp->b_rptr; 22385 mblk_t *tail; 22386 queue_t *q = tcp->tcp_wq; 22387 22388 /* TODO: How should flush interact with urgent data? */ 22389 if ((fval & FLUSHW) && tcp->tcp_xmit_head && 22390 !(tcp->tcp_valid_bits & TCP_URG_VALID)) { 22391 /* 22392 * Flush only data that has not yet been put on the wire. If 22393 * we flush data that we have already transmitted, life, as we 22394 * know it, may come to an end. 22395 */ 22396 tail = tcp->tcp_xmit_tail; 22397 tail->b_wptr -= tcp->tcp_xmit_tail_unsent; 22398 tcp->tcp_xmit_tail_unsent = 0; 22399 tcp->tcp_unsent = 0; 22400 if (tail->b_wptr != tail->b_rptr) 22401 tail = tail->b_cont; 22402 if (tail) { 22403 mblk_t **excess = &tcp->tcp_xmit_head; 22404 for (;;) { 22405 mblk_t *mp1 = *excess; 22406 if (mp1 == tail) 22407 break; 22408 tcp->tcp_xmit_tail = mp1; 22409 tcp->tcp_xmit_last = mp1; 22410 excess = &mp1->b_cont; 22411 } 22412 *excess = NULL; 22413 tcp_close_mpp(&tail); 22414 if (tcp->tcp_snd_zcopy_aware) 22415 tcp_zcopy_notify(tcp); 22416 } 22417 /* 22418 * We have no unsent data, so unsent must be less than 22419 * tcp_xmit_lowater, so re-enable flow. 22420 */ 22421 mutex_enter(&tcp->tcp_non_sq_lock); 22422 if (tcp->tcp_flow_stopped) { 22423 tcp_clrqfull(tcp); 22424 } 22425 mutex_exit(&tcp->tcp_non_sq_lock); 22426 } 22427 /* 22428 * TODO: you can't just flush these, you have to increase rwnd for one 22429 * thing. For another, how should urgent data interact? 
22430 */ 22431 if (fval & FLUSHR) { 22432 *mp->b_rptr = fval & ~FLUSHW; 22433 /* XXX */ 22434 qreply(q, mp); 22435 return; 22436 } 22437 freemsg(mp); 22438 } 22439 22440 /* 22441 * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA 22442 * messages. 22443 */ 22444 static void 22445 tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) 22446 { 22447 mblk_t *mp1; 22448 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 22449 STRUCT_HANDLE(strbuf, sb); 22450 queue_t *q = tcp->tcp_wq; 22451 int error; 22452 uint_t addrlen; 22453 22454 /* Make sure it is one of ours. */ 22455 switch (iocp->ioc_cmd) { 22456 case TI_GETMYNAME: 22457 case TI_GETPEERNAME: 22458 break; 22459 default: 22460 CALL_IP_WPUT(tcp->tcp_connp, q, mp); 22461 return; 22462 } 22463 switch (mi_copy_state(q, mp, &mp1)) { 22464 case -1: 22465 return; 22466 case MI_COPY_CASE(MI_COPY_IN, 1): 22467 break; 22468 case MI_COPY_CASE(MI_COPY_OUT, 1): 22469 /* Copy out the strbuf. */ 22470 mi_copyout(q, mp); 22471 return; 22472 case MI_COPY_CASE(MI_COPY_OUT, 2): 22473 /* All done. */ 22474 mi_copy_done(q, mp, 0); 22475 return; 22476 default: 22477 mi_copy_done(q, mp, EPROTO); 22478 return; 22479 } 22480 /* Check alignment of the strbuf */ 22481 if (!OK_32PTR(mp1->b_rptr)) { 22482 mi_copy_done(q, mp, EINVAL); 22483 return; 22484 } 22485 22486 STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); 22487 addrlen = tcp->tcp_family == AF_INET ? sizeof (sin_t) : sizeof (sin6_t); 22488 if (STRUCT_FGET(sb, maxlen) < addrlen) { 22489 mi_copy_done(q, mp, EINVAL); 22490 return; 22491 } 22492 22493 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 22494 if (mp1 == NULL) 22495 return; 22496 22497 switch (iocp->ioc_cmd) { 22498 case TI_GETMYNAME: 22499 error = tcp_getmyname(tcp, (void *)mp1->b_rptr, &addrlen); 22500 break; 22501 case TI_GETPEERNAME: 22502 error = tcp_getpeername(tcp, (void *)mp1->b_rptr, &addrlen); 22503 break; 22504 } 22505 22506 if (error != 0) { 22507 mi_copy_done(q, mp, error); 22508 } else { 22509 mp1->b_wptr += addrlen; 22510 STRUCT_FSET(sb, len, addrlen); 22511 22512 /* Copy out the address */ 22513 mi_copyout(q, mp); 22514 } 22515 } 22516 22517 /* 22518 * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL 22519 * messages. 22520 */ 22521 /* ARGSUSED */ 22522 static void 22523 tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) 22524 { 22525 conn_t *connp = (conn_t *)arg; 22526 tcp_t *tcp = connp->conn_tcp; 22527 queue_t *q = tcp->tcp_wq; 22528 struct iocblk *iocp; 22529 tcp_stack_t *tcps = tcp->tcp_tcps; 22530 22531 ASSERT(DB_TYPE(mp) == M_IOCTL); 22532 /* 22533 * Try and ASSERT the minimum possible references on the 22534 * conn early enough. Since we are executing on write side, 22535 * the connection is obviously not detached and that means 22536 * there is a ref each for TCP and IP. Since we are behind 22537 * the squeue, the minimum references needed are 3. If the 22538 * conn is in classifier hash list, there should be an 22539 * extra ref for that (we check both the possibilities). 22540 */ 22541 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 22542 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 22543 22544 iocp = (struct iocblk *)mp->b_rptr; 22545 switch (iocp->ioc_cmd) { 22546 case TCP_IOC_DEFAULT_Q: 22547 /* Wants to be the default wq. 
*/ 22548 if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { 22549 iocp->ioc_error = EPERM; 22550 iocp->ioc_count = 0; 22551 mp->b_datap->db_type = M_IOCACK; 22552 qreply(q, mp); 22553 return; 22554 } 22555 tcp_def_q_set(tcp, mp); 22556 return; 22557 case _SIOCSOCKFALLBACK: 22558 /* 22559 * Either sockmod is about to be popped and the socket 22560 * would now be treated as a plain stream, or a module 22561 * is about to be pushed so we could no longer use read- 22562 * side synchronous streams for fused loopback tcp. 22563 * Drain any queued data and disable direct sockfs 22564 * interface from now on. 22565 */ 22566 if (!tcp->tcp_issocket) { 22567 DB_TYPE(mp) = M_IOCNAK; 22568 iocp->ioc_error = EINVAL; 22569 } else { 22570 #ifdef _ILP32 22571 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); 22572 #else 22573 tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; 22574 #endif 22575 /* 22576 * Insert this socket into the acceptor hash. 22577 * We might need it for T_CONN_RES message 22578 */ 22579 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 22580 22581 if (tcp->tcp_fused) { 22582 /* 22583 * This is a fused loopback tcp; disable 22584 * read-side synchronous streams interface 22585 * and drain any queued data. It is okay 22586 * to do this for non-synchronous streams 22587 * fused tcp as well. 22588 */ 22589 tcp_fuse_disable_pair(tcp, B_FALSE); 22590 } 22591 tcp->tcp_issocket = B_FALSE; 22592 tcp->tcp_sodirect = NULL; 22593 TCP_STAT(tcps, tcp_sock_fallback); 22594 22595 DB_TYPE(mp) = M_IOCACK; 22596 iocp->ioc_error = 0; 22597 } 22598 iocp->ioc_count = 0; 22599 iocp->ioc_rval = 0; 22600 qreply(q, mp); 22601 return; 22602 } 22603 CALL_IP_WPUT(connp, q, mp); 22604 } 22605 22606 /* 22607 * This routine is called by tcp_wput() to handle all TPI requests. 22608 */ 22609 /* ARGSUSED */ 22610 static void 22611 tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) 22612 { 22613 conn_t *connp = (conn_t *)arg; 22614 tcp_t *tcp = connp->conn_tcp; 22615 union T_primitives *tprim = (union T_primitives *)mp->b_rptr; 22616 uchar_t *rptr; 22617 t_scalar_t type; 22618 int len; 22619 cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred); 22620 22621 /* 22622 * Try and ASSERT the minimum possible references on the 22623 * conn early enough. Since we are executing on write side, 22624 * the connection is obviously not detached and that means 22625 * there is a ref each for TCP and IP. Since we are behind 22626 * the squeue, the minimum references needed are 3. If the 22627 * conn is in classifier hash list, there should be an 22628 * extra ref for that (we check both the possibilities). 22629 */ 22630 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 22631 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 22632 22633 rptr = mp->b_rptr; 22634 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 22635 if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 22636 type = ((union T_primitives *)rptr)->type; 22637 if (type == T_EXDATA_REQ) { 22638 uint32_t msize = msgdsize(mp->b_cont); 22639 22640 len = msize - 1; 22641 if (len < 0) { 22642 freemsg(mp); 22643 return; 22644 } 22645 /* 22646 * Try to force urgent data out on the wire. 22647 * Even if we have unsent data this will 22648 * at least send the urgent flag. 22649 * XXX does not handle more flag correctly. 
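 *
 * For a hypothetical example of the arithmetic below: with tcp_snxt
 * at 1000, 100 bytes already queued unsent and a 4 byte T_EXDATA_REQ
 * payload, tcp_urg becomes 1000 + 100 + (4 - 1) = 1103, i.e. the
 * sequence number of the last byte of the urgent data.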
22650 */ 22651 len += tcp->tcp_unsent; 22652 len += tcp->tcp_snxt; 22653 tcp->tcp_urg = len; 22654 tcp->tcp_valid_bits |= TCP_URG_VALID; 22655 22656 /* Bypass tcp protocol for fused tcp loopback */ 22657 if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 22658 return; 22659 } else if (type != T_DATA_REQ) { 22660 goto non_urgent_data; 22661 } 22662 /* TODO: options, flags, ... from user */ 22663 /* Set length to zero for reclamation below */ 22664 tcp_wput_data(tcp, mp->b_cont, B_TRUE); 22665 freeb(mp); 22666 return; 22667 } else { 22668 if (tcp->tcp_debug) { 22669 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 22670 "tcp_wput_proto, dropping one..."); 22671 } 22672 freemsg(mp); 22673 return; 22674 } 22675 22676 non_urgent_data: 22677 22678 switch ((int)tprim->type) { 22679 case T_SSL_PROXY_BIND_REQ: /* an SSL proxy endpoint bind request */ 22680 /* 22681 * save the kssl_ent_t from the next block, and convert this 22682 * back to a normal bind_req. 22683 */ 22684 if (mp->b_cont != NULL) { 22685 ASSERT(MBLKL(mp->b_cont) >= sizeof (kssl_ent_t)); 22686 22687 if (tcp->tcp_kssl_ent != NULL) { 22688 kssl_release_ent(tcp->tcp_kssl_ent, NULL, 22689 KSSL_NO_PROXY); 22690 tcp->tcp_kssl_ent = NULL; 22691 } 22692 bcopy(mp->b_cont->b_rptr, &tcp->tcp_kssl_ent, 22693 sizeof (kssl_ent_t)); 22694 kssl_hold_ent(tcp->tcp_kssl_ent); 22695 freemsg(mp->b_cont); 22696 mp->b_cont = NULL; 22697 } 22698 tprim->type = T_BIND_REQ; 22699 22700 /* FALLTHROUGH */ 22701 case O_T_BIND_REQ: /* bind request */ 22702 case T_BIND_REQ: /* new semantics bind request */ 22703 tcp_bind(tcp, mp); 22704 break; 22705 case T_UNBIND_REQ: /* unbind request */ 22706 tcp_unbind(tcp, mp); 22707 break; 22708 case O_T_CONN_RES: /* old connection response XXX */ 22709 case T_CONN_RES: /* connection response */ 22710 tcp_accept(tcp, mp); 22711 break; 22712 case T_CONN_REQ: /* connection request */ 22713 tcp_connect(tcp, mp); 22714 break; 22715 case T_DISCON_REQ: /* disconnect request */ 22716 tcp_disconnect(tcp, mp); 22717 break; 22718 case T_CAPABILITY_REQ: 22719 tcp_capability_req(tcp, mp); /* capability request */ 22720 break; 22721 case T_INFO_REQ: /* information request */ 22722 tcp_info_req(tcp, mp); 22723 break; 22724 case T_SVR4_OPTMGMT_REQ: /* manage options req */ 22725 (void) svr4_optcom_req(tcp->tcp_wq, mp, cr, 22726 &tcp_opt_obj, B_TRUE); 22727 break; 22728 case T_OPTMGMT_REQ: 22729 /* 22730 * Note: no support for snmpcom_req() through new 22731 * T_OPTMGMT_REQ. See comments in ip.c 22732 */ 22733 /* Only IP is allowed to return meaningful value */ 22734 (void) tpi_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj, 22735 B_TRUE); 22736 break; 22737 22738 case T_UNITDATA_REQ: /* unitdata request */ 22739 tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 22740 break; 22741 case T_ORDREL_REQ: /* orderly release req */ 22742 freemsg(mp); 22743 22744 if (tcp->tcp_fused) 22745 tcp_unfuse(tcp); 22746 22747 if (tcp_xmit_end(tcp) != 0) { 22748 /* 22749 * We were crossing FINs and got a reset from 22750 * the other side. Just ignore it. 
22751 */ 22752 if (tcp->tcp_debug) { 22753 (void) strlog(TCP_MOD_ID, 0, 1, 22754 SL_ERROR|SL_TRACE, 22755 "tcp_wput_proto, T_ORDREL_REQ out of " 22756 "state %s", 22757 tcp_display(tcp, NULL, 22758 DISP_ADDR_AND_PORT)); 22759 } 22760 } 22761 break; 22762 case T_ADDR_REQ: 22763 tcp_addr_req(tcp, mp); 22764 break; 22765 default: 22766 if (tcp->tcp_debug) { 22767 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 22768 "tcp_wput_proto, bogus TPI msg, type %d", 22769 tprim->type); 22770 } 22771 /* 22772 * We used to M_ERROR. Sending TNOTSUPPORT gives the user 22773 * to recover. 22774 */ 22775 tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 22776 break; 22777 } 22778 } 22779 22780 /* 22781 * The TCP write service routine should never be called... 22782 */ 22783 /* ARGSUSED */ 22784 static void 22785 tcp_wsrv(queue_t *q) 22786 { 22787 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 22788 22789 TCP_STAT(tcps, tcp_wsrv_called); 22790 } 22791 22792 /* Non overlapping byte exchanger */ 22793 static void 22794 tcp_xchg(uchar_t *a, uchar_t *b, int len) 22795 { 22796 uchar_t uch; 22797 22798 while (len-- > 0) { 22799 uch = a[len]; 22800 a[len] = b[len]; 22801 b[len] = uch; 22802 } 22803 } 22804 22805 /* 22806 * Send out a control packet on the tcp connection specified. This routine 22807 * is typically called where we need a simple ACK or RST generated. 22808 */ 22809 static void 22810 tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) 22811 { 22812 uchar_t *rptr; 22813 tcph_t *tcph; 22814 ipha_t *ipha = NULL; 22815 ip6_t *ip6h = NULL; 22816 uint32_t sum; 22817 int tcp_hdr_len; 22818 int tcp_ip_hdr_len; 22819 mblk_t *mp; 22820 tcp_stack_t *tcps = tcp->tcp_tcps; 22821 22822 /* 22823 * Save sum for use in source route later. 22824 */ 22825 ASSERT(tcp != NULL); 22826 sum = tcp->tcp_tcp_hdr_len + tcp->tcp_sum; 22827 tcp_hdr_len = tcp->tcp_hdr_len; 22828 tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; 22829 22830 /* If a text string is passed in with the request, pass it to strlog. */ 22831 if (str != NULL && tcp->tcp_debug) { 22832 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 22833 "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", 22834 str, seq, ack, ctl); 22835 } 22836 mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcps->tcps_wroff_xtra, 22837 BPRI_MED); 22838 if (mp == NULL) { 22839 return; 22840 } 22841 rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; 22842 mp->b_rptr = rptr; 22843 mp->b_wptr = &rptr[tcp_hdr_len]; 22844 bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); 22845 22846 if (tcp->tcp_ipversion == IPV4_VERSION) { 22847 ipha = (ipha_t *)rptr; 22848 ipha->ipha_length = htons(tcp_hdr_len); 22849 } else { 22850 ip6h = (ip6_t *)rptr; 22851 ASSERT(tcp != NULL); 22852 ip6h->ip6_plen = htons(tcp->tcp_hdr_len - 22853 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 22854 } 22855 tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; 22856 tcph->th_flags[0] = (uint8_t)ctl; 22857 if (ctl & TH_RST) { 22858 BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); 22859 BUMP_MIB(&tcps->tcps_mib, tcpOutControl); 22860 /* 22861 * Don't send TSopt w/ TH_RST packets per RFC 1323. 
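 *
 * The template header copied from tcp_iphc ends with the timestamp
 * option (two NOPs plus the 10 byte TSopt, TCPOPT_REAL_TS_LEN bytes
 * in all) when timestamps are in use; stripping it below shortens
 * the segment by 12 bytes, i.e. three 32-bit words, which is why
 * th_offset_and_rsrvd is dropped by (3 << 4) and the pseudo-header
 * sum is reduced by the same amount.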
22862 */ 22863 if (tcp->tcp_snd_ts_ok && 22864 tcp->tcp_state > TCPS_SYN_SENT) { 22865 mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; 22866 *(mp->b_wptr) = TCPOPT_EOL; 22867 if (tcp->tcp_ipversion == IPV4_VERSION) { 22868 ipha->ipha_length = htons(tcp_hdr_len - 22869 TCPOPT_REAL_TS_LEN); 22870 } else { 22871 ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - 22872 TCPOPT_REAL_TS_LEN); 22873 } 22874 tcph->th_offset_and_rsrvd[0] -= (3 << 4); 22875 sum -= TCPOPT_REAL_TS_LEN; 22876 } 22877 } 22878 if (ctl & TH_ACK) { 22879 if (tcp->tcp_snd_ts_ok) { 22880 U32_TO_BE32(lbolt, 22881 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 22882 U32_TO_BE32(tcp->tcp_ts_recent, 22883 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 22884 } 22885 22886 /* Update the latest receive window size in TCP header. */ 22887 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 22888 tcph->th_win); 22889 tcp->tcp_rack = ack; 22890 tcp->tcp_rack_cnt = 0; 22891 BUMP_MIB(&tcps->tcps_mib, tcpOutAck); 22892 } 22893 BUMP_LOCAL(tcp->tcp_obsegs); 22894 U32_TO_BE32(seq, tcph->th_seq); 22895 U32_TO_BE32(ack, tcph->th_ack); 22896 /* 22897 * Include the adjustment for a source route if any. 22898 */ 22899 sum = (sum >> 16) + (sum & 0xFFFF); 22900 U16_TO_BE16(sum, tcph->th_sum); 22901 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 22902 tcp_send_data(tcp, tcp->tcp_wq, mp); 22903 } 22904 22905 /* 22906 * If this routine returns B_TRUE, TCP can generate a RST in response 22907 * to a segment. If it returns B_FALSE, TCP should not respond. 22908 */ 22909 static boolean_t 22910 tcp_send_rst_chk(tcp_stack_t *tcps) 22911 { 22912 clock_t now; 22913 22914 /* 22915 * TCP needs to protect itself from generating too many RSTs. 22916 * This can be a DoS attack by sending us random segments 22917 * soliciting RSTs. 22918 * 22919 * What we do here is to have a limit of tcp_rst_sent_rate RSTs 22920 * in each 1 second interval. In this way, TCP still generate 22921 * RSTs in normal cases but when under attack, the impact is 22922 * limited. 22923 */ 22924 if (tcps->tcps_rst_sent_rate_enabled != 0) { 22925 now = lbolt; 22926 /* lbolt can wrap around. */ 22927 if ((tcps->tcps_last_rst_intrvl > now) || 22928 (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) > 22929 1*SECONDS)) { 22930 tcps->tcps_last_rst_intrvl = now; 22931 tcps->tcps_rst_cnt = 1; 22932 } else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) { 22933 return (B_FALSE); 22934 } 22935 } 22936 return (B_TRUE); 22937 } 22938 22939 /* 22940 * Send down the advice IP ioctl to tell IP to mark an IRE temporary. 22941 */ 22942 static void 22943 tcp_ip_ire_mark_advice(tcp_t *tcp) 22944 { 22945 mblk_t *mp; 22946 ipic_t *ipic; 22947 22948 if (tcp->tcp_ipversion == IPV4_VERSION) { 22949 mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, 22950 &ipic); 22951 } else { 22952 mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, 22953 &ipic); 22954 } 22955 if (mp == NULL) 22956 return; 22957 ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; 22958 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 22959 } 22960 22961 /* 22962 * Return an IP advice ioctl mblk and set ipic to be the pointer 22963 * to the advice structure. 
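 *
 * The chain built below is, roughly:
 *	M_IOCTL (iocblk, ioc_cmd = IP_IOCTL,
 *	    ioc_count = sizeof (ipic_t) + addr_len)
 *	    b_cont -> M_DATA (an ipic_t with ipic_cmd =
 *		IP_IOC_IRE_ADVISE_NO_REPLY, immediately followed by
 *		the address at ipic_addr_offset)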
22964 */ 22965 static mblk_t * 22966 tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic) 22967 { 22968 struct iocblk *ioc; 22969 mblk_t *mp, *mp1; 22970 22971 mp = allocb(sizeof (ipic_t) + addr_len, BPRI_HI); 22972 if (mp == NULL) 22973 return (NULL); 22974 bzero(mp->b_rptr, sizeof (ipic_t) + addr_len); 22975 *ipic = (ipic_t *)mp->b_rptr; 22976 (*ipic)->ipic_cmd = IP_IOC_IRE_ADVISE_NO_REPLY; 22977 (*ipic)->ipic_addr_offset = sizeof (ipic_t); 22978 22979 bcopy(addr, *ipic + 1, addr_len); 22980 22981 (*ipic)->ipic_addr_length = addr_len; 22982 mp->b_wptr = &mp->b_rptr[sizeof (ipic_t) + addr_len]; 22983 22984 mp1 = mkiocb(IP_IOCTL); 22985 if (mp1 == NULL) { 22986 freemsg(mp); 22987 return (NULL); 22988 } 22989 mp1->b_cont = mp; 22990 ioc = (struct iocblk *)mp1->b_rptr; 22991 ioc->ioc_count = sizeof (ipic_t) + addr_len; 22992 22993 return (mp1); 22994 } 22995 22996 /* 22997 * Generate a reset based on an inbound packet, connp is set by caller 22998 * when RST is in response to an unexpected inbound packet for which 22999 * there is active tcp state in the system. 23000 * 23001 * IPSEC NOTE : Try to send the reply with the same protection as it came 23002 * in. We still have the ipsec_mp that the packet was attached to. Thus 23003 * the packet will go out at the same level of protection as it came in by 23004 * converting the IPSEC_IN to IPSEC_OUT. 23005 */ 23006 static void 23007 tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, 23008 uint32_t ack, int ctl, uint_t ip_hdr_len, zoneid_t zoneid, 23009 tcp_stack_t *tcps, conn_t *connp) 23010 { 23011 ipha_t *ipha = NULL; 23012 ip6_t *ip6h = NULL; 23013 ushort_t len; 23014 tcph_t *tcph; 23015 int i; 23016 mblk_t *ipsec_mp; 23017 boolean_t mctl_present; 23018 ipic_t *ipic; 23019 ipaddr_t v4addr; 23020 in6_addr_t v6addr; 23021 int addr_len; 23022 void *addr; 23023 queue_t *q = tcps->tcps_g_q; 23024 tcp_t *tcp; 23025 cred_t *cr; 23026 mblk_t *nmp; 23027 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 23028 23029 if (tcps->tcps_g_q == NULL) { 23030 /* 23031 * For non-zero stackids the default queue isn't created 23032 * until the first open, thus there can be a need to send 23033 * a reset before then. But we can't do that, hence we just 23034 * drop the packet. Later during boot, when the default queue 23035 * has been setup, a retransmitted packet from the peer 23036 * will result in a reset. 
23037 */ 23038 ASSERT(tcps->tcps_netstack->netstack_stackid != 23039 GLOBAL_NETSTACKID); 23040 freemsg(mp); 23041 return; 23042 } 23043 23044 if (connp != NULL) 23045 tcp = connp->conn_tcp; 23046 else 23047 tcp = Q_TO_TCP(q); 23048 23049 if (!tcp_send_rst_chk(tcps)) { 23050 tcps->tcps_rst_unsent++; 23051 freemsg(mp); 23052 return; 23053 } 23054 23055 if (mp->b_datap->db_type == M_CTL) { 23056 ipsec_mp = mp; 23057 mp = mp->b_cont; 23058 mctl_present = B_TRUE; 23059 } else { 23060 ipsec_mp = mp; 23061 mctl_present = B_FALSE; 23062 } 23063 23064 if (str && q && tcps->tcps_dbg) { 23065 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 23066 "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 23067 "flags 0x%x", 23068 str, seq, ack, ctl); 23069 } 23070 if (mp->b_datap->db_ref != 1) { 23071 mblk_t *mp1 = copyb(mp); 23072 freemsg(mp); 23073 mp = mp1; 23074 if (!mp) { 23075 if (mctl_present) 23076 freeb(ipsec_mp); 23077 return; 23078 } else { 23079 if (mctl_present) { 23080 ipsec_mp->b_cont = mp; 23081 } else { 23082 ipsec_mp = mp; 23083 } 23084 } 23085 } else if (mp->b_cont) { 23086 freemsg(mp->b_cont); 23087 mp->b_cont = NULL; 23088 } 23089 /* 23090 * We skip reversing source route here. 23091 * (for now we replace all IP options with EOL) 23092 */ 23093 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 23094 ipha = (ipha_t *)mp->b_rptr; 23095 for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 23096 mp->b_rptr[i] = IPOPT_EOL; 23097 /* 23098 * Make sure that src address isn't flagrantly invalid. 23099 * Not all broadcast address checking for the src address 23100 * is possible, since we don't know the netmask of the src 23101 * addr. No check for destination address is done, since 23102 * IP will not pass up a packet with a broadcast dest 23103 * address to TCP. Similar checks are done below for IPv6. 
23104 */ 23105 if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || 23106 CLASSD(ipha->ipha_src)) { 23107 freemsg(ipsec_mp); 23108 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); 23109 return; 23110 } 23111 } else { 23112 ip6h = (ip6_t *)mp->b_rptr; 23113 23114 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || 23115 IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { 23116 freemsg(ipsec_mp); 23117 BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); 23118 return; 23119 } 23120 23121 /* Remove any extension headers assuming partial overlay */ 23122 if (ip_hdr_len > IPV6_HDR_LEN) { 23123 uint8_t *to; 23124 23125 to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; 23126 ovbcopy(ip6h, to, IPV6_HDR_LEN); 23127 mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; 23128 ip_hdr_len = IPV6_HDR_LEN; 23129 ip6h = (ip6_t *)mp->b_rptr; 23130 ip6h->ip6_nxt = IPPROTO_TCP; 23131 } 23132 } 23133 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 23134 if (tcph->th_flags[0] & TH_RST) { 23135 freemsg(ipsec_mp); 23136 return; 23137 } 23138 tcph->th_offset_and_rsrvd[0] = (5 << 4); 23139 len = ip_hdr_len + sizeof (tcph_t); 23140 mp->b_wptr = &mp->b_rptr[len]; 23141 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 23142 ipha->ipha_length = htons(len); 23143 /* Swap addresses */ 23144 v4addr = ipha->ipha_src; 23145 ipha->ipha_src = ipha->ipha_dst; 23146 ipha->ipha_dst = v4addr; 23147 ipha->ipha_ident = 0; 23148 ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; 23149 addr_len = IP_ADDR_LEN; 23150 addr = &v4addr; 23151 } else { 23152 /* No ip6i_t in this case */ 23153 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 23154 /* Swap addresses */ 23155 v6addr = ip6h->ip6_src; 23156 ip6h->ip6_src = ip6h->ip6_dst; 23157 ip6h->ip6_dst = v6addr; 23158 ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit; 23159 addr_len = IPV6_ADDR_LEN; 23160 addr = &v6addr; 23161 } 23162 tcp_xchg(tcph->th_fport, tcph->th_lport, 2); 23163 U32_TO_BE32(ack, tcph->th_ack); 23164 U32_TO_BE32(seq, tcph->th_seq); 23165 U16_TO_BE16(0, tcph->th_win); 23166 U16_TO_BE16(sizeof (tcph_t), tcph->th_sum); 23167 tcph->th_flags[0] = (uint8_t)ctl; 23168 if (ctl & TH_RST) { 23169 BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); 23170 BUMP_MIB(&tcps->tcps_mib, tcpOutControl); 23171 } 23172 23173 /* IP trusts us to set up labels when required. 
*/ 23174 if (is_system_labeled() && (cr = DB_CRED(mp)) != NULL && 23175 crgetlabel(cr) != NULL) { 23176 int err; 23177 23178 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) 23179 err = tsol_check_label(cr, &mp, 23180 tcp->tcp_connp->conn_mac_exempt, 23181 tcps->tcps_netstack->netstack_ip); 23182 else 23183 err = tsol_check_label_v6(cr, &mp, 23184 tcp->tcp_connp->conn_mac_exempt, 23185 tcps->tcps_netstack->netstack_ip); 23186 if (mctl_present) 23187 ipsec_mp->b_cont = mp; 23188 else 23189 ipsec_mp = mp; 23190 if (err != 0) { 23191 freemsg(ipsec_mp); 23192 return; 23193 } 23194 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 23195 ipha = (ipha_t *)mp->b_rptr; 23196 } else { 23197 ip6h = (ip6_t *)mp->b_rptr; 23198 } 23199 } 23200 23201 if (mctl_present) { 23202 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 23203 23204 ASSERT(ii->ipsec_in_type == IPSEC_IN); 23205 if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h)) { 23206 return; 23207 } 23208 } 23209 if (zoneid == ALL_ZONES) 23210 zoneid = GLOBAL_ZONEID; 23211 23212 /* Add the zoneid so ip_output routes it properly */ 23213 if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid, ipst)) == NULL) { 23214 freemsg(ipsec_mp); 23215 return; 23216 } 23217 ipsec_mp = nmp; 23218 23219 /* 23220 * NOTE: one might consider tracing a TCP packet here, but 23221 * this function has no active TCP state and no tcp structure 23222 * that has a trace buffer. If we traced here, we would have 23223 * to keep a local trace buffer in tcp_record_trace(). 23224 * 23225 * TSol note: The mblk that contains the incoming packet was 23226 * reused by tcp_xmit_listener_reset, so it already contains 23227 * the right credentials and we don't need to call mblk_setcred. 23228 * Also the conn's cred is not right since it is associated 23229 * with tcps_g_q. 23230 */ 23231 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp); 23232 23233 /* 23234 * Tell IP to mark the IRE used for this destination temporary. 23235 * This way, we can limit our exposure to DoS attack because IP 23236 * creates an IRE for each destination. If there are too many, 23237 * the time to do any routing lookup will be extremely long. And 23238 * the lookup can be in interrupt context. 23239 * 23240 * Note that in normal circumstances, this marking should not 23241 * affect anything. It would be nice if only 1 message is 23242 * needed to inform IP that the IRE created for this RST should 23243 * not be added to the cache table. But there is currently 23244 * not such communication mechanism between TCP and IP. So 23245 * the best we can do now is to send the advice ioctl to IP 23246 * to mark the IRE temporary. 23247 */ 23248 if ((mp = tcp_ip_advise_mblk(addr, addr_len, &ipic)) != NULL) { 23249 ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; 23250 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 23251 } 23252 } 23253 23254 /* 23255 * Initiate closedown sequence on an active connection. (May be called as 23256 * writer.) Return value zero for OK return, non-zero for error return. 
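 *
 * For a hypothetical example: with tcp_snxt at 5000 and 300 bytes
 * still unsent, tcp_fss is set to 5300 below and TCP_FSS_VALID is
 * marked, so the FIN ends up on the segment that exhausts the unsent
 * data (seq + len == 5300 in tcp_xmit_mp()); with nothing unsent, a
 * FIN-only segment is built and transmitted immediately.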
23257 */ 23258 static int 23259 tcp_xmit_end(tcp_t *tcp) 23260 { 23261 ipic_t *ipic; 23262 mblk_t *mp; 23263 tcp_stack_t *tcps = tcp->tcp_tcps; 23264 23265 if (tcp->tcp_state < TCPS_SYN_RCVD || 23266 tcp->tcp_state > TCPS_CLOSE_WAIT) { 23267 /* 23268 * Invalid state, only states TCPS_SYN_RCVD, 23269 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid 23270 */ 23271 return (-1); 23272 } 23273 23274 tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 23275 tcp->tcp_valid_bits |= TCP_FSS_VALID; 23276 /* 23277 * If there is nothing more unsent, send the FIN now. 23278 * Otherwise, it will go out with the last segment. 23279 */ 23280 if (tcp->tcp_unsent == 0) { 23281 mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 23282 tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 23283 23284 if (mp) { 23285 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 23286 tcp_send_data(tcp, tcp->tcp_wq, mp); 23287 } else { 23288 /* 23289 * Couldn't allocate msg. Pretend we got it out. 23290 * Wait for rexmit timeout. 23291 */ 23292 tcp->tcp_snxt = tcp->tcp_fss + 1; 23293 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 23294 } 23295 23296 /* 23297 * If needed, update tcp_rexmit_snxt as tcp_snxt is 23298 * changed. 23299 */ 23300 if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 23301 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 23302 } 23303 } else { 23304 /* 23305 * If tcp->tcp_cork is set, then the data will not get sent, 23306 * so we have to check that and unset it first. 23307 */ 23308 if (tcp->tcp_cork) 23309 tcp->tcp_cork = B_FALSE; 23310 tcp_wput_data(tcp, NULL, B_FALSE); 23311 } 23312 23313 /* 23314 * If TCP does not get enough samples of RTT or tcp_rtt_updates 23315 * is 0, don't update the cache. 23316 */ 23317 if (tcps->tcps_rtt_updates == 0 || 23318 tcp->tcp_rtt_update < tcps->tcps_rtt_updates) 23319 return (0); 23320 23321 /* 23322 * NOTE: should not update if source routes i.e. if tcp_remote if 23323 * different from the destination. 23324 */ 23325 if (tcp->tcp_ipversion == IPV4_VERSION) { 23326 if (tcp->tcp_remote != tcp->tcp_ipha->ipha_dst) { 23327 return (0); 23328 } 23329 mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, 23330 &ipic); 23331 } else { 23332 if (!(IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, 23333 &tcp->tcp_ip6h->ip6_dst))) { 23334 return (0); 23335 } 23336 mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, 23337 &ipic); 23338 } 23339 23340 /* Record route attributes in the IRE for use by future connections. */ 23341 if (mp == NULL) 23342 return (0); 23343 23344 /* 23345 * We do not have a good algorithm to update ssthresh at this time. 23346 * So don't do any update. 23347 */ 23348 ipic->ipic_rtt = tcp->tcp_rtt_sa; 23349 ipic->ipic_rtt_sd = tcp->tcp_rtt_sd; 23350 23351 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 23352 return (0); 23353 } 23354 23355 /* 23356 * Generate a "no listener here" RST in response to an "unknown" segment. 23357 * connp is set by caller when RST is in response to an unexpected 23358 * inbound packet for which there is active tcp state in the system. 23359 * Note that we are reusing the incoming mp to construct the outgoing RST. 
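 *
 * In rough outline, mirroring the code below (and RFC 793 style
 * reset generation):
 *	incoming RST			-> no reply, just dropped
 *	incoming segment with ACK	-> RST with seq = seg.ack
 *	incoming SYN without ACK	-> RST|ACK with ack = seg.seq +
 *					   seg.len + 1 (the SYN itself
 *					   occupies one sequence number)
 *	anything else without ACK	-> dropped and counted in
 *					   tcps_rst_unsent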
23360 */ 23361 void 23362 tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, 23363 tcp_stack_t *tcps, conn_t *connp) 23364 { 23365 uchar_t *rptr; 23366 uint32_t seg_len; 23367 tcph_t *tcph; 23368 uint32_t seg_seq; 23369 uint32_t seg_ack; 23370 uint_t flags; 23371 mblk_t *ipsec_mp; 23372 ipha_t *ipha; 23373 ip6_t *ip6h; 23374 boolean_t mctl_present = B_FALSE; 23375 boolean_t check = B_TRUE; 23376 boolean_t policy_present; 23377 ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec; 23378 23379 TCP_STAT(tcps, tcp_no_listener); 23380 23381 ipsec_mp = mp; 23382 23383 if (mp->b_datap->db_type == M_CTL) { 23384 ipsec_in_t *ii; 23385 23386 mctl_present = B_TRUE; 23387 mp = mp->b_cont; 23388 23389 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 23390 ASSERT(ii->ipsec_in_type == IPSEC_IN); 23391 if (ii->ipsec_in_dont_check) { 23392 check = B_FALSE; 23393 if (!ii->ipsec_in_secure) { 23394 freeb(ipsec_mp); 23395 mctl_present = B_FALSE; 23396 ipsec_mp = mp; 23397 } 23398 } 23399 } 23400 23401 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 23402 policy_present = ipss->ipsec_inbound_v4_policy_present; 23403 ipha = (ipha_t *)mp->b_rptr; 23404 ip6h = NULL; 23405 } else { 23406 policy_present = ipss->ipsec_inbound_v6_policy_present; 23407 ipha = NULL; 23408 ip6h = (ip6_t *)mp->b_rptr; 23409 } 23410 23411 if (check && policy_present) { 23412 /* 23413 * The conn_t parameter is NULL because we already know 23414 * nobody's home. 23415 */ 23416 ipsec_mp = ipsec_check_global_policy( 23417 ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present, 23418 tcps->tcps_netstack); 23419 if (ipsec_mp == NULL) 23420 return; 23421 } 23422 if (is_system_labeled() && !tsol_can_reply_error(mp)) { 23423 DTRACE_PROBE2( 23424 tx__ip__log__error__nolistener__tcp, 23425 char *, "Could not reply with RST to mp(1)", 23426 mblk_t *, mp); 23427 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); 23428 freemsg(ipsec_mp); 23429 return; 23430 } 23431 23432 rptr = mp->b_rptr; 23433 23434 tcph = (tcph_t *)&rptr[ip_hdr_len]; 23435 seg_seq = BE32_TO_U32(tcph->th_seq); 23436 seg_ack = BE32_TO_U32(tcph->th_ack); 23437 flags = tcph->th_flags[0]; 23438 23439 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); 23440 if (flags & TH_RST) { 23441 freemsg(ipsec_mp); 23442 } else if (flags & TH_ACK) { 23443 tcp_xmit_early_reset("no tcp, reset", 23444 ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid, tcps, 23445 connp); 23446 } else { 23447 if (flags & TH_SYN) { 23448 seg_len++; 23449 } else { 23450 /* 23451 * Here we violate the RFC. Note that a normal 23452 * TCP will never send a segment without the ACK 23453 * flag, except for RST or SYN segment. This 23454 * segment is neither. Just drop it on the 23455 * floor. 23456 */ 23457 freemsg(ipsec_mp); 23458 tcps->tcps_rst_unsent++; 23459 return; 23460 } 23461 23462 tcp_xmit_early_reset("no tcp, reset/ack", 23463 ipsec_mp, 0, seg_seq + seg_len, 23464 TH_RST | TH_ACK, ip_hdr_len, zoneid, tcps, connp); 23465 } 23466 } 23467 23468 /* 23469 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 23470 * ip and tcp header ready to pass down to IP. If the mp passed in is 23471 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 23472 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 23473 * otherwise it will dup partial mblks.) 23474 * Otherwise, an appropriate ACK packet will be generated. This 23475 * routine is not usually called to send new data for the first time. 
It 23476 * is mostly called out of the timer for retransmits, and to generate ACKs. 23477 * 23478 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 23479 * be adjusted by *offset. And after dupb(), the offset and the ending mblk 23480 * of the original mblk chain will be returned in *offset and *end_mp. 23481 */ 23482 mblk_t * 23483 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 23484 mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 23485 boolean_t rexmit) 23486 { 23487 int data_length; 23488 int32_t off = 0; 23489 uint_t flags; 23490 mblk_t *mp1; 23491 mblk_t *mp2; 23492 uchar_t *rptr; 23493 tcph_t *tcph; 23494 int32_t num_sack_blk = 0; 23495 int32_t sack_opt_len = 0; 23496 tcp_stack_t *tcps = tcp->tcp_tcps; 23497 23498 /* Allocate for our maximum TCP header + link-level */ 23499 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 23500 tcps->tcps_wroff_xtra, BPRI_MED); 23501 if (!mp1) 23502 return (NULL); 23503 data_length = 0; 23504 23505 /* 23506 * Note that tcp_mss has been adjusted to take into account the 23507 * timestamp option if applicable. Because SACK options do not 23508 * appear in every TCP segments and they are of variable lengths, 23509 * they cannot be included in tcp_mss. Thus we need to calculate 23510 * the actual segment length when we need to send a segment which 23511 * includes SACK options. 23512 */ 23513 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 23514 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 23515 tcp->tcp_num_sack_blk); 23516 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 23517 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 23518 if (max_to_send + sack_opt_len > tcp->tcp_mss) 23519 max_to_send -= sack_opt_len; 23520 } 23521 23522 if (offset != NULL) { 23523 off = *offset; 23524 /* We use offset as an indicator that end_mp is not NULL. */ 23525 *end_mp = NULL; 23526 } 23527 for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { 23528 /* This could be faster with cooperation from downstream */ 23529 if (mp2 != mp1 && !sendall && 23530 data_length + (int)(mp->b_wptr - mp->b_rptr) > 23531 max_to_send) 23532 /* 23533 * Don't send the next mblk since the whole mblk 23534 * does not fit. 23535 */ 23536 break; 23537 mp2->b_cont = dupb(mp); 23538 mp2 = mp2->b_cont; 23539 if (!mp2) { 23540 freemsg(mp1); 23541 return (NULL); 23542 } 23543 mp2->b_rptr += off; 23544 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 23545 (uintptr_t)INT_MAX); 23546 23547 data_length += (int)(mp2->b_wptr - mp2->b_rptr); 23548 if (data_length > max_to_send) { 23549 mp2->b_wptr -= data_length - max_to_send; 23550 data_length = max_to_send; 23551 off = mp2->b_wptr - mp->b_rptr; 23552 break; 23553 } else { 23554 off = 0; 23555 } 23556 } 23557 if (offset != NULL) { 23558 *offset = off; 23559 *end_mp = mp; 23560 } 23561 if (seg_len != NULL) { 23562 *seg_len = data_length; 23563 } 23564 23565 /* Update the latest receive window size in TCP header. */ 23566 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 23567 tcp->tcp_tcph->th_win); 23568 23569 rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; 23570 mp1->b_rptr = rptr; 23571 mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; 23572 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 23573 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 23574 U32_TO_ABE32(seq, tcph->th_seq); 23575 23576 /* 23577 * Use tcp_unsent to determine if the PUSH bit should be used assumes 23578 * that this function was called from tcp_wput_data. 
Thus, when called 23579 * to retransmit data the setting of the PUSH bit may appear somewhat 23580 * random in that it might get set when it should not. This 23581 * should not pose any performance issues. 23582 */ 23583 if (data_length != 0 && (tcp->tcp_unsent == 0 || 23584 tcp->tcp_unsent == data_length)) { 23585 flags = TH_ACK | TH_PUSH; 23586 } else { 23587 flags = TH_ACK; 23588 } 23589 23590 if (tcp->tcp_ecn_ok) { 23591 if (tcp->tcp_ecn_echo_on) 23592 flags |= TH_ECE; 23593 23594 /* 23595 * Only set ECT bit and ECN_CWR if a segment contains new data. 23596 * There is no TCP flow control for non-data segments, and 23597 * only data segments are transmitted reliably. 23598 */ 23599 if (data_length > 0 && !rexmit) { 23600 SET_ECT(tcp, rptr); 23601 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 23602 flags |= TH_CWR; 23603 tcp->tcp_ecn_cwr_sent = B_TRUE; 23604 } 23605 } 23606 } 23607 23608 if (tcp->tcp_valid_bits) { 23609 uint32_t u1; 23610 23611 if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && 23612 seq == tcp->tcp_iss) { 23613 uchar_t *wptr; 23614 23615 /* 23616 * If TCP_ISS_VALID and the seq number is tcp_iss, 23617 * TCP can only be in SYN-SENT, SYN-RCVD or 23618 * FIN-WAIT-1 state. It can be FIN-WAIT-1 if 23619 * our SYN is not ack'ed but the app closes this 23620 * TCP connection. 23621 */ 23622 ASSERT(tcp->tcp_state == TCPS_SYN_SENT || 23623 tcp->tcp_state == TCPS_SYN_RCVD || 23624 tcp->tcp_state == TCPS_FIN_WAIT_1); 23625 23626 /* 23627 * Tack on the MSS option. It is always needed 23628 * for both active and passive open. 23629 * 23630 * The MSS option value should be interface MTU - MIN 23631 * TCP/IP header according to RFC 793 as it means 23632 * the maximum segment size TCP can receive. But 23633 * to get around some broken middle boxes/end hosts 23634 * out there, we allow the option value to be the 23635 * same as the MSS option size on the peer side. 23636 * In this way, the other side will not send 23637 * anything larger than they can receive. 23638 * 23639 * Note that for SYN_SENT state, the ndd param 23640 * tcp_use_smss_as_mss_opt has no effect as we 23641 * don't know the peer's MSS option value. So 23642 * the only case we need to take care of is in 23643 * SYN_RCVD state, which is done later. 23644 */ 23645 wptr = mp1->b_wptr; 23646 wptr[0] = TCPOPT_MAXSEG; 23647 wptr[1] = TCPOPT_MAXSEG_LEN; 23648 wptr += 2; 23649 u1 = tcp->tcp_if_mtu - 23650 (tcp->tcp_ipversion == IPV4_VERSION ? 23651 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - 23652 TCP_MIN_HEADER_LENGTH; 23653 U16_TO_BE16(u1, wptr); 23654 mp1->b_wptr = wptr + 2; 23655 /* Update the offset to cover the additional word */ 23656 tcph->th_offset_and_rsrvd[0] += (1 << 4); 23657 23658 /* 23659 * Note that the following way of filling in 23660 * TCP options is not optimal. Some NOPs can 23661 * be saved. But there is no need at this time 23662 * to optimize it. When it is needed, we will 23663 * do it.
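 *
 * For instance, a SYN carrying every option below is laid out as
 *	MSS (4 bytes, e.g. 1500 MTU - 20 - 20 = 1460 for IPv4),
 *	NOP, NOP, TSTAMP (12 bytes in all),
 *	NOP, WSCALE (4 bytes in all),
 *	NOP, NOP, SACK_PERMITTED (4 bytes in all),
 * 24 option bytes in total; a tighter encoding could drop some of
 * the padding NOPs.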
23664 */ 23665 switch (tcp->tcp_state) { 23666 case TCPS_SYN_SENT: 23667 flags = TH_SYN; 23668 23669 if (tcp->tcp_snd_ts_ok) { 23670 uint32_t llbolt = (uint32_t)lbolt; 23671 23672 wptr = mp1->b_wptr; 23673 wptr[0] = TCPOPT_NOP; 23674 wptr[1] = TCPOPT_NOP; 23675 wptr[2] = TCPOPT_TSTAMP; 23676 wptr[3] = TCPOPT_TSTAMP_LEN; 23677 wptr += 4; 23678 U32_TO_BE32(llbolt, wptr); 23679 wptr += 4; 23680 ASSERT(tcp->tcp_ts_recent == 0); 23681 U32_TO_BE32(0L, wptr); 23682 mp1->b_wptr += TCPOPT_REAL_TS_LEN; 23683 tcph->th_offset_and_rsrvd[0] += 23684 (3 << 4); 23685 } 23686 23687 /* 23688 * Set up all the bits to tell other side 23689 * we are ECN capable. 23690 */ 23691 if (tcp->tcp_ecn_ok) { 23692 flags |= (TH_ECE | TH_CWR); 23693 } 23694 break; 23695 case TCPS_SYN_RCVD: 23696 flags |= TH_SYN; 23697 23698 /* 23699 * Reset the MSS option value to be SMSS 23700 * We should probably add back the bytes 23701 * for timestamp option and IPsec. We 23702 * don't do that as this is a workaround 23703 * for broken middle boxes/end hosts, it 23704 * is better for us to be more cautious. 23705 * They may not take these things into 23706 * account in their SMSS calculation. Thus 23707 * the peer's calculated SMSS may be smaller 23708 * than what it can be. This should be OK. 23709 */ 23710 if (tcps->tcps_use_smss_as_mss_opt) { 23711 u1 = tcp->tcp_mss; 23712 U16_TO_BE16(u1, wptr); 23713 } 23714 23715 /* 23716 * If the other side is ECN capable, reply 23717 * that we are also ECN capable. 23718 */ 23719 if (tcp->tcp_ecn_ok) 23720 flags |= TH_ECE; 23721 break; 23722 default: 23723 /* 23724 * The above ASSERT() makes sure that this 23725 * must be FIN-WAIT-1 state. Our SYN has 23726 * not been ack'ed so retransmit it. 23727 */ 23728 flags |= TH_SYN; 23729 break; 23730 } 23731 23732 if (tcp->tcp_snd_ws_ok) { 23733 wptr = mp1->b_wptr; 23734 wptr[0] = TCPOPT_NOP; 23735 wptr[1] = TCPOPT_WSCALE; 23736 wptr[2] = TCPOPT_WS_LEN; 23737 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 23738 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 23739 tcph->th_offset_and_rsrvd[0] += (1 << 4); 23740 } 23741 23742 if (tcp->tcp_snd_sack_ok) { 23743 wptr = mp1->b_wptr; 23744 wptr[0] = TCPOPT_NOP; 23745 wptr[1] = TCPOPT_NOP; 23746 wptr[2] = TCPOPT_SACK_PERMITTED; 23747 wptr[3] = TCPOPT_SACK_OK_LEN; 23748 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 23749 tcph->th_offset_and_rsrvd[0] += (1 << 4); 23750 } 23751 23752 /* allocb() of adequate mblk assures space */ 23753 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 23754 (uintptr_t)INT_MAX); 23755 u1 = (int)(mp1->b_wptr - mp1->b_rptr); 23756 /* 23757 * Get IP set to checksum on our behalf 23758 * Include the adjustment for a source route if any. 23759 */ 23760 u1 += tcp->tcp_sum; 23761 u1 = (u1 >> 16) + (u1 & 0xFFFF); 23762 U16_TO_BE16(u1, tcph->th_sum); 23763 BUMP_MIB(&tcps->tcps_mib, tcpOutControl); 23764 } 23765 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 23766 (seq + data_length) == tcp->tcp_fss) { 23767 if (!tcp->tcp_fin_acked) { 23768 flags |= TH_FIN; 23769 BUMP_MIB(&tcps->tcps_mib, tcpOutControl); 23770 } 23771 if (!tcp->tcp_fin_sent) { 23772 tcp->tcp_fin_sent = B_TRUE; 23773 switch (tcp->tcp_state) { 23774 case TCPS_SYN_RCVD: 23775 case TCPS_ESTABLISHED: 23776 tcp->tcp_state = TCPS_FIN_WAIT_1; 23777 break; 23778 case TCPS_CLOSE_WAIT: 23779 tcp->tcp_state = TCPS_LAST_ACK; 23780 break; 23781 } 23782 if (tcp->tcp_suna == tcp->tcp_snxt) 23783 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 23784 tcp->tcp_snxt = tcp->tcp_fss + 1; 23785 } 23786 } 23787 /* 23788 * Note the trick here. u1 is unsigned. 
When tcp_urg 23789 * is smaller than seq, u1 will become a very huge value. 23790 * So the comparison will fail. Also note that tcp_urp 23791 * should be positive, see RFC 793 page 17. 23792 */ 23793 u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION; 23794 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 && 23795 u1 < (uint32_t)(64 * 1024)) { 23796 flags |= TH_URG; 23797 BUMP_MIB(&tcps->tcps_mib, tcpOutUrg); 23798 U32_TO_ABE16(u1, tcph->th_urp); 23799 } 23800 } 23801 tcph->th_flags[0] = (uchar_t)flags; 23802 tcp->tcp_rack = tcp->tcp_rnxt; 23803 tcp->tcp_rack_cnt = 0; 23804 23805 if (tcp->tcp_snd_ts_ok) { 23806 if (tcp->tcp_state != TCPS_SYN_SENT) { 23807 uint32_t llbolt = (uint32_t)lbolt; 23808 23809 U32_TO_BE32(llbolt, 23810 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 23811 U32_TO_BE32(tcp->tcp_ts_recent, 23812 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 23813 } 23814 } 23815 23816 if (num_sack_blk > 0) { 23817 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 23818 sack_blk_t *tmp; 23819 int32_t i; 23820 23821 wptr[0] = TCPOPT_NOP; 23822 wptr[1] = TCPOPT_NOP; 23823 wptr[2] = TCPOPT_SACK; 23824 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 23825 sizeof (sack_blk_t); 23826 wptr += TCPOPT_REAL_SACK_LEN; 23827 23828 tmp = tcp->tcp_sack_list; 23829 for (i = 0; i < num_sack_blk; i++) { 23830 U32_TO_BE32(tmp[i].begin, wptr); 23831 wptr += sizeof (tcp_seq); 23832 U32_TO_BE32(tmp[i].end, wptr); 23833 wptr += sizeof (tcp_seq); 23834 } 23835 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); 23836 } 23837 ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 23838 data_length += (int)(mp1->b_wptr - rptr); 23839 if (tcp->tcp_ipversion == IPV4_VERSION) { 23840 ((ipha_t *)rptr)->ipha_length = htons(data_length); 23841 } else { 23842 ip6_t *ip6 = (ip6_t *)(rptr + 23843 (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 23844 sizeof (ip6i_t) : 0)); 23845 23846 ip6->ip6_plen = htons(data_length - 23847 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 23848 } 23849 23850 /* 23851 * Prime pump for IP 23852 * Include the adjustment for a source route if any. 23853 */ 23854 data_length -= tcp->tcp_ip_hdr_len; 23855 data_length += tcp->tcp_sum; 23856 data_length = (data_length >> 16) + (data_length & 0xFFFF); 23857 U16_TO_ABE16(data_length, tcph->th_sum); 23858 if (tcp->tcp_ip_forward_progress) { 23859 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 23860 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 23861 tcp->tcp_ip_forward_progress = B_FALSE; 23862 } 23863 return (mp1); 23864 } 23865 23866 /* This function handles the push timeout. */ 23867 void 23868 tcp_push_timer(void *arg) 23869 { 23870 conn_t *connp = (conn_t *)arg; 23871 tcp_t *tcp = connp->conn_tcp; 23872 tcp_stack_t *tcps = tcp->tcp_tcps; 23873 uint_t flags; 23874 sodirect_t *sodp; 23875 23876 TCP_DBGSTAT(tcps, tcp_push_timer_cnt); 23877 23878 ASSERT(tcp->tcp_listener == NULL); 23879 23880 /* 23881 * We need to plug synchronous streams during our drain to prevent 23882 * a race with tcp_fuse_rrw() or tcp_fusion_rinfop(). 
23883 */ 23884 TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); 23885 tcp->tcp_push_tid = 0; 23886 23887 SOD_PTR_ENTER(tcp, sodp); 23888 if (sodp != NULL) { 23889 flags = tcp_rcv_sod_wakeup(tcp, sodp); 23890 /* sod_wakeup() does the mutex_exit() */ 23891 } else if (tcp->tcp_rcv_list != NULL) { 23892 flags = tcp_rcv_drain(tcp->tcp_rq, tcp); 23893 } 23894 if (flags == TH_ACK_NEEDED) 23895 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 23896 23897 TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); 23898 } 23899 23900 /* 23901 * This function handles delayed ACK timeout. 23902 */ 23903 static void 23904 tcp_ack_timer(void *arg) 23905 { 23906 conn_t *connp = (conn_t *)arg; 23907 tcp_t *tcp = connp->conn_tcp; 23908 mblk_t *mp; 23909 tcp_stack_t *tcps = tcp->tcp_tcps; 23910 23911 TCP_DBGSTAT(tcps, tcp_ack_timer_cnt); 23912 23913 tcp->tcp_ack_tid = 0; 23914 23915 if (tcp->tcp_fused) 23916 return; 23917 23918 /* 23919 * Do not send ACK if there is no outstanding unack'ed data. 23920 */ 23921 if (tcp->tcp_rnxt == tcp->tcp_rack) { 23922 return; 23923 } 23924 23925 if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) { 23926 /* 23927 * Make sure we don't allow deferred ACKs to result in 23928 * timer-based ACKing. If we have held off an ACK 23929 * when there was more than an mss here, and the timer 23930 * goes off, we have to worry about the possibility 23931 * that the sender isn't doing slow-start, or is out 23932 * of step with us for some other reason. We fall 23933 * permanently back in the direction of 23934 * ACK-every-other-packet as suggested in RFC 1122. 23935 */ 23936 if (tcp->tcp_rack_abs_max > 2) 23937 tcp->tcp_rack_abs_max--; 23938 tcp->tcp_rack_cur_max = 2; 23939 } 23940 mp = tcp_ack_mp(tcp); 23941 23942 if (mp != NULL) { 23943 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 23944 BUMP_LOCAL(tcp->tcp_obsegs); 23945 BUMP_MIB(&tcps->tcps_mib, tcpOutAck); 23946 BUMP_MIB(&tcps->tcps_mib, tcpOutAckDelayed); 23947 tcp_send_data(tcp, tcp->tcp_wq, mp); 23948 } 23949 } 23950 23951 23952 /* Generate an ACK-only (no data) segment for a TCP endpoint */ 23953 static mblk_t * 23954 tcp_ack_mp(tcp_t *tcp) 23955 { 23956 uint32_t seq_no; 23957 tcp_stack_t *tcps = tcp->tcp_tcps; 23958 23959 /* 23960 * There are a few cases to be considered while setting the sequence no. 23961 * Essentially, we can come here while processing an unacceptable pkt 23962 * in the TCPS_SYN_RCVD state, in which case we set the sequence number 23963 * to snxt (per RFC 793), note the swnd wouldn't have been set yet. 23964 * If we are here for a zero window probe, stick with suna. In all 23965 * other cases, we check if suna + swnd encompasses snxt and set 23966 * the sequence number to snxt, if so. If snxt falls outside the 23967 * window (the receiver probably shrunk its window), we will go with 23968 * suna + swnd, otherwise the sequence no will be unacceptable to the 23969 * receiver. 23970 */ 23971 if (tcp->tcp_zero_win_probe) { 23972 seq_no = tcp->tcp_suna; 23973 } else if (tcp->tcp_state == TCPS_SYN_RCVD) { 23974 ASSERT(tcp->tcp_swnd == 0); 23975 seq_no = tcp->tcp_snxt; 23976 } else { 23977 seq_no = SEQ_GT(tcp->tcp_snxt, 23978 (tcp->tcp_suna + tcp->tcp_swnd)) ? 23979 (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt; 23980 } 23981 23982 if (tcp->tcp_valid_bits) { 23983 /* 23984 * For the complex case where we have to send some 23985 * controls (FIN or SYN), let tcp_xmit_mp do it. 
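		 * Passing a NULL data mblk and a zero length below means
		 * tcp_xmit_mp() only builds the header, adding whatever
		 * SYN/FIN/URG handling tcp_valid_bits still calls for.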
23986 */ 23987 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE, 23988 NULL, B_FALSE)); 23989 } else { 23990 /* Generate a simple ACK */ 23991 int data_length; 23992 uchar_t *rptr; 23993 tcph_t *tcph; 23994 mblk_t *mp1; 23995 int32_t tcp_hdr_len; 23996 int32_t tcp_tcp_hdr_len; 23997 int32_t num_sack_blk = 0; 23998 int32_t sack_opt_len; 23999 24000 /* 24001 * Allocate space for TCP + IP headers 24002 * and link-level header 24003 */ 24004 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 24005 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 24006 tcp->tcp_num_sack_blk); 24007 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 24008 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 24009 tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; 24010 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + sack_opt_len; 24011 } else { 24012 tcp_hdr_len = tcp->tcp_hdr_len; 24013 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; 24014 } 24015 mp1 = allocb(tcp_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); 24016 if (!mp1) 24017 return (NULL); 24018 24019 /* Update the latest receive window size in TCP header. */ 24020 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 24021 tcp->tcp_tcph->th_win); 24022 /* copy in prototype TCP + IP header */ 24023 rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; 24024 mp1->b_rptr = rptr; 24025 mp1->b_wptr = rptr + tcp_hdr_len; 24026 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 24027 24028 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 24029 24030 /* Set the TCP sequence number. */ 24031 U32_TO_ABE32(seq_no, tcph->th_seq); 24032 24033 /* Set up the TCP flag field. */ 24034 tcph->th_flags[0] = (uchar_t)TH_ACK; 24035 if (tcp->tcp_ecn_echo_on) 24036 tcph->th_flags[0] |= TH_ECE; 24037 24038 tcp->tcp_rack = tcp->tcp_rnxt; 24039 tcp->tcp_rack_cnt = 0; 24040 24041 /* fill in timestamp option if in use */ 24042 if (tcp->tcp_snd_ts_ok) { 24043 uint32_t llbolt = (uint32_t)lbolt; 24044 24045 U32_TO_BE32(llbolt, 24046 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 24047 U32_TO_BE32(tcp->tcp_ts_recent, 24048 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 24049 } 24050 24051 /* Fill in SACK options */ 24052 if (num_sack_blk > 0) { 24053 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 24054 sack_blk_t *tmp; 24055 int32_t i; 24056 24057 wptr[0] = TCPOPT_NOP; 24058 wptr[1] = TCPOPT_NOP; 24059 wptr[2] = TCPOPT_SACK; 24060 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 24061 sizeof (sack_blk_t); 24062 wptr += TCPOPT_REAL_SACK_LEN; 24063 24064 tmp = tcp->tcp_sack_list; 24065 for (i = 0; i < num_sack_blk; i++) { 24066 U32_TO_BE32(tmp[i].begin, wptr); 24067 wptr += sizeof (tcp_seq); 24068 U32_TO_BE32(tmp[i].end, wptr); 24069 wptr += sizeof (tcp_seq); 24070 } 24071 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 24072 << 4); 24073 } 24074 24075 if (tcp->tcp_ipversion == IPV4_VERSION) { 24076 ((ipha_t *)rptr)->ipha_length = htons(tcp_hdr_len); 24077 } else { 24078 /* Check for ip6i_t header in sticky hdrs */ 24079 ip6_t *ip6 = (ip6_t *)(rptr + 24080 (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 24081 sizeof (ip6i_t) : 0)); 24082 24083 ip6->ip6_plen = htons(tcp_hdr_len - 24084 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 24085 } 24086 24087 /* 24088 * Prime pump for checksum calculation in IP. Include the 24089 * adjustment for a source route if any. 
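		 * As a sketch of the arithmetic below: the partial sum is
		 * tcp_tcp_hdr_len plus the pre-computed tcp_sum, folded
		 * into 16 bits (e.g. 0x1FFFE -> 0x1 + 0xFFFE = 0xFFFF) and
		 * stored in th_sum for IP to finish.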
24090 */ 24091 data_length = tcp_tcp_hdr_len + tcp->tcp_sum; 24092 data_length = (data_length >> 16) + (data_length & 0xFFFF); 24093 U16_TO_ABE16(data_length, tcph->th_sum); 24094 24095 if (tcp->tcp_ip_forward_progress) { 24096 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 24097 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 24098 tcp->tcp_ip_forward_progress = B_FALSE; 24099 } 24100 return (mp1); 24101 } 24102 } 24103 24104 /* 24105 * To create a temporary tcp structure for inserting into bind hash list. 24106 * The parameter is assumed to be in network byte order, ready for use. 24107 */ 24108 /* ARGSUSED */ 24109 static tcp_t * 24110 tcp_alloc_temp_tcp(in_port_t port, tcp_stack_t *tcps) 24111 { 24112 conn_t *connp; 24113 tcp_t *tcp; 24114 24115 connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP, tcps->tcps_netstack); 24116 if (connp == NULL) 24117 return (NULL); 24118 24119 tcp = connp->conn_tcp; 24120 tcp->tcp_tcps = tcps; 24121 TCPS_REFHOLD(tcps); 24122 24123 /* 24124 * Only initialize the necessary info in those structures. Note 24125 * that since INADDR_ANY is all 0, we do not need to set 24126 * tcp_bound_source to INADDR_ANY here. 24127 */ 24128 tcp->tcp_state = TCPS_BOUND; 24129 tcp->tcp_lport = port; 24130 tcp->tcp_exclbind = 1; 24131 tcp->tcp_reserved_port = 1; 24132 24133 /* Just for place holding... */ 24134 tcp->tcp_ipversion = IPV4_VERSION; 24135 24136 return (tcp); 24137 } 24138 24139 /* 24140 * To remove a port range specified by lo_port and hi_port from the 24141 * reserved port ranges. This is one of the three public functions of 24142 * the reserved port interface. Note that a port range has to be removed 24143 * as a whole. Ports in a range cannot be removed individually. 24144 * 24145 * Params: 24146 * in_port_t lo_port: the beginning port of the reserved port range to 24147 * be deleted. 24148 * in_port_t hi_port: the ending port of the reserved port range to 24149 * be deleted. 24150 * 24151 * Return: 24152 * B_TRUE if the deletion is successful, B_FALSE otherwise. 24153 * 24154 * Assumes that nca is only for zoneid=0 24155 */ 24156 boolean_t 24157 tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port) 24158 { 24159 int i, j; 24160 int size; 24161 tcp_t **temp_tcp_array; 24162 tcp_t *tcp; 24163 tcp_stack_t *tcps; 24164 24165 tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_tcp; 24166 ASSERT(tcps != NULL); 24167 24168 rw_enter(&tcps->tcps_reserved_port_lock, RW_WRITER); 24169 24170 /* First make sure that the port ranage is indeed reserved. */ 24171 for (i = 0; i < tcps->tcps_reserved_port_array_size; i++) { 24172 if (tcps->tcps_reserved_port[i].lo_port == lo_port) { 24173 hi_port = tcps->tcps_reserved_port[i].hi_port; 24174 temp_tcp_array = 24175 tcps->tcps_reserved_port[i].temp_tcp_array; 24176 break; 24177 } 24178 } 24179 if (i == tcps->tcps_reserved_port_array_size) { 24180 rw_exit(&tcps->tcps_reserved_port_lock); 24181 netstack_rele(tcps->tcps_netstack); 24182 return (B_FALSE); 24183 } 24184 24185 /* 24186 * Remove the range from the array. This simple loop is possible 24187 * because port ranges are inserted in ascending order. 24188 */ 24189 for (j = i; j < tcps->tcps_reserved_port_array_size - 1; j++) { 24190 tcps->tcps_reserved_port[j].lo_port = 24191 tcps->tcps_reserved_port[j+1].lo_port; 24192 tcps->tcps_reserved_port[j].hi_port = 24193 tcps->tcps_reserved_port[j+1].hi_port; 24194 tcps->tcps_reserved_port[j].temp_tcp_array = 24195 tcps->tcps_reserved_port[j+1].temp_tcp_array; 24196 } 24197 24198 /* Remove all the temporary tcp structures. 
*/ 24199 size = hi_port - lo_port + 1; 24200 while (size > 0) { 24201 tcp = temp_tcp_array[size - 1]; 24202 ASSERT(tcp != NULL); 24203 tcp_bind_hash_remove(tcp); 24204 CONN_DEC_REF(tcp->tcp_connp); 24205 size--; 24206 } 24207 kmem_free(temp_tcp_array, (hi_port - lo_port + 1) * sizeof (tcp_t *)); 24208 tcps->tcps_reserved_port_array_size--; 24209 rw_exit(&tcps->tcps_reserved_port_lock); 24210 netstack_rele(tcps->tcps_netstack); 24211 return (B_TRUE); 24212 } 24213 24214 /* 24215 * Macro to remove temporary tcp structure from the bind hash list. The 24216 * first parameter is the list of tcp to be removed. The second parameter 24217 * is the number of tcps in the array. 24218 */ 24219 #define TCP_TMP_TCP_REMOVE(tcp_array, num, tcps) \ 24220 { \ 24221 while ((num) > 0) { \ 24222 tcp_t *tcp = (tcp_array)[(num) - 1]; \ 24223 tf_t *tbf; \ 24224 tcp_t *tcpnext; \ 24225 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)]; \ 24226 mutex_enter(&tbf->tf_lock); \ 24227 tcpnext = tcp->tcp_bind_hash; \ 24228 if (tcpnext) { \ 24229 tcpnext->tcp_ptpbhn = \ 24230 tcp->tcp_ptpbhn; \ 24231 } \ 24232 *tcp->tcp_ptpbhn = tcpnext; \ 24233 mutex_exit(&tbf->tf_lock); \ 24234 kmem_free(tcp, sizeof (tcp_t)); \ 24235 (tcp_array)[(num) - 1] = NULL; \ 24236 (num)--; \ 24237 } \ 24238 } 24239 24240 /* 24241 * The public interface for other modules to call to reserve a port range 24242 * in TCP. The caller passes in how large a port range it wants. TCP 24243 * will try to find a range and return it via lo_port and hi_port. This is 24244 * used by NCA's nca_conn_init. 24245 * NCA can only be used in the global zone so this only affects the global 24246 * zone's ports. 24247 * 24248 * Params: 24249 * int size: the size of the port range to be reserved. 24250 * in_port_t *lo_port (referenced): returns the beginning port of the 24251 * reserved port range added. 24252 * in_port_t *hi_port (referenced): returns the ending port of the 24253 * reserved port range added. 24254 * 24255 * Return: 24256 * B_TRUE if the port reservation is successful, B_FALSE otherwise. 24257 * 24258 * Assumes that nca is only for zoneid=0 24259 */ 24260 boolean_t 24261 tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) 24262 { 24263 tcp_t *tcp; 24264 tcp_t *tmp_tcp; 24265 tcp_t **temp_tcp_array; 24266 tf_t *tbf; 24267 in_port_t net_port; 24268 in_port_t port; 24269 int32_t cur_size; 24270 int i, j; 24271 boolean_t used; 24272 tcp_rport_t tmp_ports[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE]; 24273 zoneid_t zoneid = GLOBAL_ZONEID; 24274 tcp_stack_t *tcps; 24275 24276 /* Sanity check. */ 24277 if (size <= 0 || size > TCP_RESERVED_PORTS_RANGE_MAX) { 24278 return (B_FALSE); 24279 } 24280 24281 tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_tcp; 24282 ASSERT(tcps != NULL); 24283 24284 rw_enter(&tcps->tcps_reserved_port_lock, RW_WRITER); 24285 if (tcps->tcps_reserved_port_array_size == 24286 TCP_RESERVED_PORTS_ARRAY_MAX_SIZE) { 24287 rw_exit(&tcps->tcps_reserved_port_lock); 24288 netstack_rele(tcps->tcps_netstack); 24289 return (B_FALSE); 24290 } 24291 24292 /* 24293 * Find the starting port to try. Since the port ranges are ordered 24294 * in the reserved port array, we can do a simple search here. 
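	 * For illustration: if the array already holds ranges [A-B] and
	 * [C-D] with A < C, the candidate start is first
	 * TCP_SMALLEST_RESERVED_PORT, then B + 1, and so on; the first
	 * candidate that leaves room for 'size' ports before the next
	 * reserved lo_port (or before TCP_LARGEST_RESERVED_PORT) is used,
	 * with *hi_port provisionally set just below that boundary.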
24295 */ 24296 *lo_port = TCP_SMALLEST_RESERVED_PORT; 24297 *hi_port = TCP_LARGEST_RESERVED_PORT; 24298 for (i = 0; i < tcps->tcps_reserved_port_array_size; 24299 *lo_port = tcps->tcps_reserved_port[i].hi_port + 1, i++) { 24300 if (tcps->tcps_reserved_port[i].lo_port - *lo_port >= size) { 24301 *hi_port = tcps->tcps_reserved_port[i].lo_port - 1; 24302 break; 24303 } 24304 } 24305 /* No available port range. */ 24306 if (i == tcps->tcps_reserved_port_array_size && 24307 *hi_port - *lo_port < size) { 24308 rw_exit(&tcps->tcps_reserved_port_lock); 24309 netstack_rele(tcps->tcps_netstack); 24310 return (B_FALSE); 24311 } 24312 24313 temp_tcp_array = kmem_zalloc(size * sizeof (tcp_t *), KM_NOSLEEP); 24314 if (temp_tcp_array == NULL) { 24315 rw_exit(&tcps->tcps_reserved_port_lock); 24316 netstack_rele(tcps->tcps_netstack); 24317 return (B_FALSE); 24318 } 24319 24320 /* Go thru the port range to see if some ports are already bound. */ 24321 for (port = *lo_port, cur_size = 0; 24322 cur_size < size && port <= *hi_port; 24323 cur_size++, port++) { 24324 used = B_FALSE; 24325 net_port = htons(port); 24326 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(net_port)]; 24327 mutex_enter(&tbf->tf_lock); 24328 for (tcp = tbf->tf_tcp; tcp != NULL; 24329 tcp = tcp->tcp_bind_hash) { 24330 if (IPCL_ZONE_MATCH(tcp->tcp_connp, zoneid) && 24331 net_port == tcp->tcp_lport) { 24332 /* 24333 * A port is already bound. Search again 24334 * starting from port + 1. Release all 24335 * temporary tcps. 24336 */ 24337 mutex_exit(&tbf->tf_lock); 24338 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size, 24339 tcps); 24340 *lo_port = port + 1; 24341 cur_size = -1; 24342 used = B_TRUE; 24343 break; 24344 } 24345 } 24346 if (!used) { 24347 if ((tmp_tcp = tcp_alloc_temp_tcp(net_port, tcps)) == 24348 NULL) { 24349 /* 24350 * Allocation failure. Just fail the request. 24351 * Need to remove all those temporary tcp 24352 * structures. 24353 */ 24354 mutex_exit(&tbf->tf_lock); 24355 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size, 24356 tcps); 24357 rw_exit(&tcps->tcps_reserved_port_lock); 24358 kmem_free(temp_tcp_array, 24359 (hi_port - lo_port + 1) * 24360 sizeof (tcp_t *)); 24361 netstack_rele(tcps->tcps_netstack); 24362 return (B_FALSE); 24363 } 24364 temp_tcp_array[cur_size] = tmp_tcp; 24365 tcp_bind_hash_insert(tbf, tmp_tcp, B_TRUE); 24366 mutex_exit(&tbf->tf_lock); 24367 } 24368 } 24369 24370 /* 24371 * The current range is not large enough. We can actually do another 24372 * search if this search is done between 2 reserved port ranges. But 24373 * for first release, we just stop here and return saying that no port 24374 * range is available. 24375 */ 24376 if (cur_size < size) { 24377 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size, tcps); 24378 rw_exit(&tcps->tcps_reserved_port_lock); 24379 kmem_free(temp_tcp_array, size * sizeof (tcp_t *)); 24380 netstack_rele(tcps->tcps_netstack); 24381 return (B_FALSE); 24382 } 24383 *hi_port = port - 1; 24384 24385 /* 24386 * Insert range into array in ascending order. Since this function 24387 * must not be called often, we choose to use the simplest method. 24388 * The above array should not consume excessive stack space as 24389 * the size must be very small. If in future releases, we find 24390 * that we should provide more reserved port ranges, this function 24391 * has to be modified to be more efficient. 
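	 * For example (hypothetical ranges), inserting [40200-40299] into an
	 * array holding [40000-40099] and [40500-40599] yields
	 * [40000-40099], [40200-40299], [40500-40599]: entries are copied
	 * into tmp_ports with the new range placed before the first existing
	 * range that starts above it, then copied back.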
 */
	if (tcps->tcps_reserved_port_array_size == 0) {
		tcps->tcps_reserved_port[0].lo_port = *lo_port;
		tcps->tcps_reserved_port[0].hi_port = *hi_port;
		tcps->tcps_reserved_port[0].temp_tcp_array = temp_tcp_array;
	} else {
		for (i = 0, j = 0; i < tcps->tcps_reserved_port_array_size;
		    i++, j++) {
			if (*lo_port < tcps->tcps_reserved_port[i].lo_port &&
			    i == j) {
				tmp_ports[j].lo_port = *lo_port;
				tmp_ports[j].hi_port = *hi_port;
				tmp_ports[j].temp_tcp_array = temp_tcp_array;
				j++;
			}
			tmp_ports[j].lo_port =
			    tcps->tcps_reserved_port[i].lo_port;
			tmp_ports[j].hi_port =
			    tcps->tcps_reserved_port[i].hi_port;
			tmp_ports[j].temp_tcp_array =
			    tcps->tcps_reserved_port[i].temp_tcp_array;
		}
		if (j == i) {
			tmp_ports[j].lo_port = *lo_port;
			tmp_ports[j].hi_port = *hi_port;
			tmp_ports[j].temp_tcp_array = temp_tcp_array;
		}
		bcopy(tmp_ports, tcps->tcps_reserved_port, sizeof (tmp_ports));
	}
	tcps->tcps_reserved_port_array_size++;
	rw_exit(&tcps->tcps_reserved_port_lock);
	netstack_rele(tcps->tcps_netstack);
	return (B_TRUE);
}

/*
 * Check to see if a port is in any reserved port range.
 *
 * Params:
 *	in_port_t port: the port to be verified.
 *
 * Return:
 *	B_TRUE if the port is inside a reserved port range, B_FALSE otherwise.
 */
boolean_t
tcp_reserved_port_check(in_port_t port, tcp_stack_t *tcps)
{
	int i;

	rw_enter(&tcps->tcps_reserved_port_lock, RW_READER);
	for (i = 0; i < tcps->tcps_reserved_port_array_size; i++) {
		if (port >= tcps->tcps_reserved_port[i].lo_port &&
		    port <= tcps->tcps_reserved_port[i].hi_port) {
			rw_exit(&tcps->tcps_reserved_port_lock);
			return (B_TRUE);
		}
	}
	rw_exit(&tcps->tcps_reserved_port_lock);
	return (B_FALSE);
}

/*
 * To list all reserved port ranges.  This is the function to handle
 * ndd tcp_reserved_port_list.
 */
/* ARGSUSED */
static int
tcp_reserved_port_list(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
{
	int i;
	tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;

	rw_enter(&tcps->tcps_reserved_port_lock, RW_READER);
	if (tcps->tcps_reserved_port_array_size > 0)
		(void) mi_mpprintf(mp, "The following ports are reserved:");
	else
		(void) mi_mpprintf(mp, "No port is reserved.");
	for (i = 0; i < tcps->tcps_reserved_port_array_size; i++) {
		(void) mi_mpprintf(mp, "%d-%d",
		    tcps->tcps_reserved_port[i].lo_port,
		    tcps->tcps_reserved_port[i].hi_port);
	}
	rw_exit(&tcps->tcps_reserved_port_lock);
	return (0);
}

/*
 * Hash list insertion routine for tcp_t structures.
 * Inserts entries with the ones bound to a specific IP address first
 * followed by those bound to INADDR_ANY.
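 * For example, a bind-hash chain for port 8080 could end up ordered as
 * 10.1.2.3:8080 -> 192.168.5.9:8080 -> *:8080 -> *:8080 (illustrative
 * addresses), so a walk down the chain sees specific bindings before
 * wildcard ones.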
24482 */ 24483 static void 24484 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) 24485 { 24486 tcp_t **tcpp; 24487 tcp_t *tcpnext; 24488 24489 if (tcp->tcp_ptpbhn != NULL) { 24490 ASSERT(!caller_holds_lock); 24491 tcp_bind_hash_remove(tcp); 24492 } 24493 tcpp = &tbf->tf_tcp; 24494 if (!caller_holds_lock) { 24495 mutex_enter(&tbf->tf_lock); 24496 } else { 24497 ASSERT(MUTEX_HELD(&tbf->tf_lock)); 24498 } 24499 tcpnext = tcpp[0]; 24500 if (tcpnext) { 24501 /* 24502 * If the new tcp bound to the INADDR_ANY address 24503 * and the first one in the list is not bound to 24504 * INADDR_ANY we skip all entries until we find the 24505 * first one bound to INADDR_ANY. 24506 * This makes sure that applications binding to a 24507 * specific address get preference over those binding to 24508 * INADDR_ANY. 24509 */ 24510 if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) && 24511 !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) { 24512 while ((tcpnext = tcpp[0]) != NULL && 24513 !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) 24514 tcpp = &(tcpnext->tcp_bind_hash); 24515 if (tcpnext) 24516 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash; 24517 } else 24518 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash; 24519 } 24520 tcp->tcp_bind_hash = tcpnext; 24521 tcp->tcp_ptpbhn = tcpp; 24522 tcpp[0] = tcp; 24523 if (!caller_holds_lock) 24524 mutex_exit(&tbf->tf_lock); 24525 } 24526 24527 /* 24528 * Hash list removal routine for tcp_t structures. 24529 */ 24530 static void 24531 tcp_bind_hash_remove(tcp_t *tcp) 24532 { 24533 tcp_t *tcpnext; 24534 kmutex_t *lockp; 24535 tcp_stack_t *tcps = tcp->tcp_tcps; 24536 24537 if (tcp->tcp_ptpbhn == NULL) 24538 return; 24539 24540 /* 24541 * Extract the lock pointer in case there are concurrent 24542 * hash_remove's for this instance. 24543 */ 24544 ASSERT(tcp->tcp_lport != 0); 24545 lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock; 24546 24547 ASSERT(lockp != NULL); 24548 mutex_enter(lockp); 24549 if (tcp->tcp_ptpbhn) { 24550 tcpnext = tcp->tcp_bind_hash; 24551 if (tcpnext) { 24552 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 24553 tcp->tcp_bind_hash = NULL; 24554 } 24555 *tcp->tcp_ptpbhn = tcpnext; 24556 tcp->tcp_ptpbhn = NULL; 24557 } 24558 mutex_exit(lockp); 24559 } 24560 24561 24562 /* 24563 * Hash list lookup routine for tcp_t structures. 24564 * Returns with a CONN_INC_REF tcp structure. Caller must do a CONN_DEC_REF. 24565 */ 24566 static tcp_t * 24567 tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *tcps) 24568 { 24569 tf_t *tf; 24570 tcp_t *tcp; 24571 24572 tf = &tcps->tcps_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 24573 mutex_enter(&tf->tf_lock); 24574 for (tcp = tf->tf_tcp; tcp != NULL; 24575 tcp = tcp->tcp_acceptor_hash) { 24576 if (tcp->tcp_acceptor_id == id) { 24577 CONN_INC_REF(tcp->tcp_connp); 24578 mutex_exit(&tf->tf_lock); 24579 return (tcp); 24580 } 24581 } 24582 mutex_exit(&tf->tf_lock); 24583 return (NULL); 24584 } 24585 24586 24587 /* 24588 * Hash list insertion routine for tcp_t structures. 
24589 */ 24590 void 24591 tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp) 24592 { 24593 tf_t *tf; 24594 tcp_t **tcpp; 24595 tcp_t *tcpnext; 24596 tcp_stack_t *tcps = tcp->tcp_tcps; 24597 24598 tf = &tcps->tcps_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 24599 24600 if (tcp->tcp_ptpahn != NULL) 24601 tcp_acceptor_hash_remove(tcp); 24602 tcpp = &tf->tf_tcp; 24603 mutex_enter(&tf->tf_lock); 24604 tcpnext = tcpp[0]; 24605 if (tcpnext) 24606 tcpnext->tcp_ptpahn = &tcp->tcp_acceptor_hash; 24607 tcp->tcp_acceptor_hash = tcpnext; 24608 tcp->tcp_ptpahn = tcpp; 24609 tcpp[0] = tcp; 24610 tcp->tcp_acceptor_lockp = &tf->tf_lock; /* For tcp_*_hash_remove */ 24611 mutex_exit(&tf->tf_lock); 24612 } 24613 24614 /* 24615 * Hash list removal routine for tcp_t structures. 24616 */ 24617 static void 24618 tcp_acceptor_hash_remove(tcp_t *tcp) 24619 { 24620 tcp_t *tcpnext; 24621 kmutex_t *lockp; 24622 24623 /* 24624 * Extract the lock pointer in case there are concurrent 24625 * hash_remove's for this instance. 24626 */ 24627 lockp = tcp->tcp_acceptor_lockp; 24628 24629 if (tcp->tcp_ptpahn == NULL) 24630 return; 24631 24632 ASSERT(lockp != NULL); 24633 mutex_enter(lockp); 24634 if (tcp->tcp_ptpahn) { 24635 tcpnext = tcp->tcp_acceptor_hash; 24636 if (tcpnext) { 24637 tcpnext->tcp_ptpahn = tcp->tcp_ptpahn; 24638 tcp->tcp_acceptor_hash = NULL; 24639 } 24640 *tcp->tcp_ptpahn = tcpnext; 24641 tcp->tcp_ptpahn = NULL; 24642 } 24643 mutex_exit(lockp); 24644 tcp->tcp_acceptor_lockp = NULL; 24645 } 24646 24647 /* ARGSUSED */ 24648 static int 24649 tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) 24650 { 24651 int error = 0; 24652 int retval; 24653 char *end; 24654 tcp_hsp_t *hsp; 24655 tcp_hsp_t *hspprev; 24656 ipaddr_t addr = 0; /* Address we're looking for */ 24657 in6_addr_t v6addr; /* Address we're looking for */ 24658 uint32_t hash; /* Hash of that address */ 24659 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 24660 24661 /* 24662 * If the following variables are still zero after parsing the input 24663 * string, the user didn't specify them and we don't change them in 24664 * the HSP. 
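	 * For illustration, the parser below accepts strings such as
	 * "192.168.10.7 mask 255.255.255.0 sendspace 1048576 recvspace
	 * 1048576 timestamp 1" or "192.168.10.7 delete" (addresses and
	 * sizes here are made-up examples, not defaults).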
 */

	ipaddr_t mask = 0;		/* Subnet mask */
	in6_addr_t v6mask;
	long sendspace = 0;		/* Send buffer size */
	long recvspace = 0;		/* Receive buffer size */
	long timestamp = 0;	/* Originate TCP TSTAMP option, 1 = yes */
	boolean_t delete = B_FALSE;	/* User asked to delete this HSP */

	rw_enter(&tcps->tcps_hsp_lock, RW_WRITER);

	/* Parse and validate address */
	if (af == AF_INET) {
		retval = inet_pton(af, value, &addr);
		if (retval == 1)
			IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
	} else if (af == AF_INET6) {
		retval = inet_pton(af, value, &v6addr);
	} else {
		error = EINVAL;
		goto done;
	}
	if (retval == 0) {
		error = EINVAL;
		goto done;
	}

	while ((*value) && *value != ' ')
		value++;

	/* Parse individual keywords, set variables if found */
	while (*value) {
		/* Skip leading blanks */

		while (*value == ' ' || *value == '\t')
			value++;

		/* If at end of string, we're done */

		if (!*value)
			break;

		/* We have a word, figure out what it is */

		if (strncmp("mask", value, 4) == 0) {
			value += 4;
			while (*value == ' ' || *value == '\t')
				value++;
			/* Parse subnet mask */
			if (af == AF_INET) {
				retval = inet_pton(af, value, &mask);
				if (retval == 1) {
					V4MASK_TO_V6(mask, v6mask);
				}
			} else if (af == AF_INET6) {
				retval = inet_pton(af, value, &v6mask);
			}
			if (retval != 1) {
				error = EINVAL;
				goto done;
			}
			while ((*value) && *value != ' ')
				value++;
		} else if (strncmp("sendspace", value, 9) == 0) {
			value += 9;

			if (ddi_strtol(value, &end, 0, &sendspace) != 0 ||
			    sendspace < TCP_XMIT_HIWATER ||
			    sendspace >= (1L<<30)) {
				error = EINVAL;
				goto done;
			}
			value = end;
		} else if (strncmp("recvspace", value, 9) == 0) {
			value += 9;

			if (ddi_strtol(value, &end, 0, &recvspace) != 0 ||
			    recvspace < TCP_RECV_HIWATER ||
			    recvspace >= (1L<<30)) {
				error = EINVAL;
				goto done;
			}
			value = end;
		} else if (strncmp("timestamp", value, 9) == 0) {
			value += 9;

			if (ddi_strtol(value, &end, 0, &timestamp) != 0 ||
			    timestamp < 0 || timestamp > 1) {
				error = EINVAL;
				goto done;
			}

			/*
			 * We increment timestamp so we know it's been set;
			 * this is undone when we put it in the HSP
			 */
			timestamp++;
			value = end;
		} else if (strncmp("delete", value, 6) == 0) {
			value += 6;
			delete = B_TRUE;
		} else {
			error = EINVAL;
			goto done;
		}
	}

	/* Hash address for lookup */

	hash = TCP_HSP_HASH(addr);

	if (delete) {
		/*
		 * Note that deletes don't return an error if the thing
		 * we're trying to delete isn't there.
24780 */ 24781 if (tcps->tcps_hsp_hash == NULL) 24782 goto done; 24783 hsp = tcps->tcps_hsp_hash[hash]; 24784 24785 if (hsp) { 24786 if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, 24787 &v6addr)) { 24788 tcps->tcps_hsp_hash[hash] = hsp->tcp_hsp_next; 24789 mi_free((char *)hsp); 24790 } else { 24791 hspprev = hsp; 24792 while ((hsp = hsp->tcp_hsp_next) != NULL) { 24793 if (IN6_ARE_ADDR_EQUAL( 24794 &hsp->tcp_hsp_addr_v6, &v6addr)) { 24795 hspprev->tcp_hsp_next = 24796 hsp->tcp_hsp_next; 24797 mi_free((char *)hsp); 24798 break; 24799 } 24800 hspprev = hsp; 24801 } 24802 } 24803 } 24804 } else { 24805 /* 24806 * We're adding/modifying an HSP. If we haven't already done 24807 * so, allocate the hash table. 24808 */ 24809 24810 if (!tcps->tcps_hsp_hash) { 24811 tcps->tcps_hsp_hash = (tcp_hsp_t **) 24812 mi_zalloc(sizeof (tcp_hsp_t *) * TCP_HSP_HASH_SIZE); 24813 if (!tcps->tcps_hsp_hash) { 24814 error = EINVAL; 24815 goto done; 24816 } 24817 } 24818 24819 /* Get head of hash chain */ 24820 24821 hsp = tcps->tcps_hsp_hash[hash]; 24822 24823 /* Try to find pre-existing hsp on hash chain */ 24824 /* Doesn't handle CIDR prefixes. */ 24825 while (hsp) { 24826 if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, &v6addr)) 24827 break; 24828 hsp = hsp->tcp_hsp_next; 24829 } 24830 24831 /* 24832 * If we didn't, create one with default values and put it 24833 * at head of hash chain 24834 */ 24835 24836 if (!hsp) { 24837 hsp = (tcp_hsp_t *)mi_zalloc(sizeof (tcp_hsp_t)); 24838 if (!hsp) { 24839 error = EINVAL; 24840 goto done; 24841 } 24842 hsp->tcp_hsp_next = tcps->tcps_hsp_hash[hash]; 24843 tcps->tcps_hsp_hash[hash] = hsp; 24844 } 24845 24846 /* Set values that the user asked us to change */ 24847 24848 hsp->tcp_hsp_addr_v6 = v6addr; 24849 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) 24850 hsp->tcp_hsp_vers = IPV4_VERSION; 24851 else 24852 hsp->tcp_hsp_vers = IPV6_VERSION; 24853 hsp->tcp_hsp_subnet_v6 = v6mask; 24854 if (sendspace > 0) 24855 hsp->tcp_hsp_sendspace = sendspace; 24856 if (recvspace > 0) 24857 hsp->tcp_hsp_recvspace = recvspace; 24858 if (timestamp > 0) 24859 hsp->tcp_hsp_tstamp = timestamp - 1; 24860 } 24861 24862 done: 24863 rw_exit(&tcps->tcps_hsp_lock); 24864 return (error); 24865 } 24866 24867 /* Set callback routine passed to nd_load by tcp_param_register. */ 24868 /* ARGSUSED */ 24869 static int 24870 tcp_host_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 24871 { 24872 return (tcp_host_param_setvalue(q, mp, value, cp, AF_INET)); 24873 } 24874 /* ARGSUSED */ 24875 static int 24876 tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 24877 cred_t *cr) 24878 { 24879 return (tcp_host_param_setvalue(q, mp, value, cp, AF_INET6)); 24880 } 24881 24882 /* TCP host parameters report triggered via the Named Dispatch mechanism. 
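 * Each entry is printed as one line: hash index, HSP structure address,
 * address, subnet mask, send buffer size, receive buffer size and the
 * timestamp flag.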
*/ 24883 /* ARGSUSED */ 24884 static int 24885 tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 24886 { 24887 tcp_hsp_t *hsp; 24888 int i; 24889 char addrbuf[INET6_ADDRSTRLEN], subnetbuf[INET6_ADDRSTRLEN]; 24890 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 24891 24892 rw_enter(&tcps->tcps_hsp_lock, RW_READER); 24893 (void) mi_mpprintf(mp, 24894 "Hash HSP " MI_COL_HDRPAD_STR 24895 "Address Subnet Mask Send Receive TStamp"); 24896 if (tcps->tcps_hsp_hash) { 24897 for (i = 0; i < TCP_HSP_HASH_SIZE; i++) { 24898 hsp = tcps->tcps_hsp_hash[i]; 24899 while (hsp) { 24900 if (hsp->tcp_hsp_vers == IPV4_VERSION) { 24901 (void) inet_ntop(AF_INET, 24902 &hsp->tcp_hsp_addr, 24903 addrbuf, sizeof (addrbuf)); 24904 (void) inet_ntop(AF_INET, 24905 &hsp->tcp_hsp_subnet, 24906 subnetbuf, sizeof (subnetbuf)); 24907 } else { 24908 (void) inet_ntop(AF_INET6, 24909 &hsp->tcp_hsp_addr_v6, 24910 addrbuf, sizeof (addrbuf)); 24911 (void) inet_ntop(AF_INET6, 24912 &hsp->tcp_hsp_subnet_v6, 24913 subnetbuf, sizeof (subnetbuf)); 24914 } 24915 (void) mi_mpprintf(mp, 24916 " %03d " MI_COL_PTRFMT_STR 24917 "%s %s %010d %010d %d", 24918 i, 24919 (void *)hsp, 24920 addrbuf, 24921 subnetbuf, 24922 hsp->tcp_hsp_sendspace, 24923 hsp->tcp_hsp_recvspace, 24924 hsp->tcp_hsp_tstamp); 24925 24926 hsp = hsp->tcp_hsp_next; 24927 } 24928 } 24929 } 24930 rw_exit(&tcps->tcps_hsp_lock); 24931 return (0); 24932 } 24933 24934 24935 /* Data for fast netmask macro used by tcp_hsp_lookup */ 24936 24937 static ipaddr_t netmasks[] = { 24938 IN_CLASSA_NET, IN_CLASSA_NET, IN_CLASSB_NET, 24939 IN_CLASSC_NET | IN_CLASSD_NET /* Class C,D,E */ 24940 }; 24941 24942 #define netmask(addr) (netmasks[(ipaddr_t)(addr) >> 30]) 24943 24944 /* 24945 * XXX This routine should go away and instead we should use the metrics 24946 * associated with the routes to determine the default sndspace and rcvspace. 24947 */ 24948 static tcp_hsp_t * 24949 tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *tcps) 24950 { 24951 tcp_hsp_t *hsp = NULL; 24952 24953 /* Quick check without acquiring the lock. */ 24954 if (tcps->tcps_hsp_hash == NULL) 24955 return (NULL); 24956 24957 rw_enter(&tcps->tcps_hsp_lock, RW_READER); 24958 24959 /* This routine finds the best-matching HSP for address addr. */ 24960 24961 if (tcps->tcps_hsp_hash) { 24962 int i; 24963 ipaddr_t srchaddr; 24964 tcp_hsp_t *hsp_net; 24965 24966 /* We do three passes: host, network, and subnet. */ 24967 24968 srchaddr = addr; 24969 24970 for (i = 1; i <= 3; i++) { 24971 /* Look for exact match on srchaddr */ 24972 24973 hsp = tcps->tcps_hsp_hash[TCP_HSP_HASH(srchaddr)]; 24974 while (hsp) { 24975 if (hsp->tcp_hsp_vers == IPV4_VERSION && 24976 hsp->tcp_hsp_addr == srchaddr) 24977 break; 24978 hsp = hsp->tcp_hsp_next; 24979 } 24980 ASSERT(hsp == NULL || 24981 hsp->tcp_hsp_vers == IPV4_VERSION); 24982 24983 /* 24984 * If this is the first pass: 24985 * If we found a match, great, return it. 24986 * If not, search for the network on the second pass. 24987 */ 24988 24989 if (i == 1) 24990 if (hsp) 24991 break; 24992 else 24993 { 24994 srchaddr = addr & netmask(addr); 24995 continue; 24996 } 24997 24998 /* 24999 * If this is the second pass: 25000 * If we found a match, but there's a subnet mask, 25001 * save the match but try again using the subnet 25002 * mask on the third pass. 25003 * Otherwise, return whatever we found. 
25004 */ 25005 25006 if (i == 2) { 25007 if (hsp && hsp->tcp_hsp_subnet) { 25008 hsp_net = hsp; 25009 srchaddr = addr & hsp->tcp_hsp_subnet; 25010 continue; 25011 } else { 25012 break; 25013 } 25014 } 25015 25016 /* 25017 * This must be the third pass. If we didn't find 25018 * anything, return the saved network HSP instead. 25019 */ 25020 25021 if (!hsp) 25022 hsp = hsp_net; 25023 } 25024 } 25025 25026 rw_exit(&tcps->tcps_hsp_lock); 25027 return (hsp); 25028 } 25029 25030 /* 25031 * XXX Equally broken as the IPv4 routine. Doesn't handle longest 25032 * match lookup. 25033 */ 25034 static tcp_hsp_t * 25035 tcp_hsp_lookup_ipv6(in6_addr_t *v6addr, tcp_stack_t *tcps) 25036 { 25037 tcp_hsp_t *hsp = NULL; 25038 25039 /* Quick check without acquiring the lock. */ 25040 if (tcps->tcps_hsp_hash == NULL) 25041 return (NULL); 25042 25043 rw_enter(&tcps->tcps_hsp_lock, RW_READER); 25044 25045 /* This routine finds the best-matching HSP for address addr. */ 25046 25047 if (tcps->tcps_hsp_hash) { 25048 int i; 25049 in6_addr_t v6srchaddr; 25050 tcp_hsp_t *hsp_net; 25051 25052 /* We do three passes: host, network, and subnet. */ 25053 25054 v6srchaddr = *v6addr; 25055 25056 for (i = 1; i <= 3; i++) { 25057 /* Look for exact match on srchaddr */ 25058 25059 hsp = tcps->tcps_hsp_hash[TCP_HSP_HASH( 25060 V4_PART_OF_V6(v6srchaddr))]; 25061 while (hsp) { 25062 if (hsp->tcp_hsp_vers == IPV6_VERSION && 25063 IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, 25064 &v6srchaddr)) 25065 break; 25066 hsp = hsp->tcp_hsp_next; 25067 } 25068 25069 /* 25070 * If this is the first pass: 25071 * If we found a match, great, return it. 25072 * If not, search for the network on the second pass. 25073 */ 25074 25075 if (i == 1) 25076 if (hsp) 25077 break; 25078 else { 25079 /* Assume a 64 bit mask */ 25080 v6srchaddr.s6_addr32[0] = 25081 v6addr->s6_addr32[0]; 25082 v6srchaddr.s6_addr32[1] = 25083 v6addr->s6_addr32[1]; 25084 v6srchaddr.s6_addr32[2] = 0; 25085 v6srchaddr.s6_addr32[3] = 0; 25086 continue; 25087 } 25088 25089 /* 25090 * If this is the second pass: 25091 * If we found a match, but there's a subnet mask, 25092 * save the match but try again using the subnet 25093 * mask on the third pass. 25094 * Otherwise, return whatever we found. 25095 */ 25096 25097 if (i == 2) { 25098 ASSERT(hsp == NULL || 25099 hsp->tcp_hsp_vers == IPV6_VERSION); 25100 if (hsp && 25101 !IN6_IS_ADDR_UNSPECIFIED( 25102 &hsp->tcp_hsp_subnet_v6)) { 25103 hsp_net = hsp; 25104 V6_MASK_COPY(*v6addr, 25105 hsp->tcp_hsp_subnet_v6, v6srchaddr); 25106 continue; 25107 } else { 25108 break; 25109 } 25110 } 25111 25112 /* 25113 * This must be the third pass. If we didn't find 25114 * anything, return the saved network HSP instead. 25115 */ 25116 25117 if (!hsp) 25118 hsp = hsp_net; 25119 } 25120 } 25121 25122 rw_exit(&tcps->tcps_hsp_lock); 25123 return (hsp); 25124 } 25125 25126 /* 25127 * Type three generator adapted from the random() function in 4.4 BSD: 25128 */ 25129 25130 /* 25131 * Copyright (c) 1983, 1993 25132 * The Regents of the University of California. All rights reserved. 25133 * 25134 * Redistribution and use in source and binary forms, with or without 25135 * modification, are permitted provided that the following conditions 25136 * are met: 25137 * 1. Redistributions of source code must retain the above copyright 25138 * notice, this list of conditions and the following disclaimer. 25139 * 2. 
Redistributions in binary form must reproduce the above copyright 25140 * notice, this list of conditions and the following disclaimer in the 25141 * documentation and/or other materials provided with the distribution. 25142 * 3. All advertising materials mentioning features or use of this software 25143 * must display the following acknowledgement: 25144 * This product includes software developed by the University of 25145 * California, Berkeley and its contributors. 25146 * 4. Neither the name of the University nor the names of its contributors 25147 * may be used to endorse or promote products derived from this software 25148 * without specific prior written permission. 25149 * 25150 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25151 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 25152 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25153 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25154 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25155 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25156 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25157 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25158 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25159 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25160 * SUCH DAMAGE. 25161 */ 25162 25163 /* Type 3 -- x**31 + x**3 + 1 */ 25164 #define DEG_3 31 25165 #define SEP_3 3 25166 25167 25168 /* Protected by tcp_random_lock */ 25169 static int tcp_randtbl[DEG_3 + 1]; 25170 25171 static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 25172 static int *tcp_random_rptr = &tcp_randtbl[1]; 25173 25174 static int *tcp_random_state = &tcp_randtbl[1]; 25175 static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 25176 25177 kmutex_t tcp_random_lock; 25178 25179 void 25180 tcp_random_init(void) 25181 { 25182 int i; 25183 hrtime_t hrt; 25184 time_t wallclock; 25185 uint64_t result; 25186 25187 /* 25188 * Use high-res timer and current time for seed. Gethrtime() returns 25189 * a longlong, which may contain resolution down to nanoseconds. 25190 * The current time will either be a 32-bit or a 64-bit quantity. 25191 * XOR the two together in a 64-bit result variable. 25192 * Convert the result to a 32-bit value by multiplying the high-order 25193 * 32-bits by the low-order 32-bits. 25194 */ 25195 25196 hrt = gethrtime(); 25197 (void) drv_getparm(TIME, &wallclock); 25198 result = (uint64_t)wallclock ^ (uint64_t)hrt; 25199 mutex_enter(&tcp_random_lock); 25200 tcp_random_state[0] = ((result >> 32) & 0xffffffff) * 25201 (result & 0xffffffff); 25202 25203 for (i = 1; i < DEG_3; i++) 25204 tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 25205 + 12345; 25206 tcp_random_fptr = &tcp_random_state[SEP_3]; 25207 tcp_random_rptr = &tcp_random_state[0]; 25208 mutex_exit(&tcp_random_lock); 25209 for (i = 0; i < 10 * DEG_3; i++) 25210 (void) tcp_random(); 25211 } 25212 25213 /* 25214 * tcp_random: Return a random number in the range [1 - (128K + 1)]. 25215 * This range is selected to be approximately centered on TCP_ISS / 2, 25216 * and easy to compute. We get this value by generating a 32-bit random 25217 * number, selecting out the high-order 17 bits, and then adding one so 25218 * that we never return zero. 
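 * (Concretely: the additive generator below produces a 32-bit word x;
 * ((x >> 15) & 0x1ffff) lies in [0, 131071], so the returned value is in
 * [1, 131072].)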
25219 */ 25220 int 25221 tcp_random(void) 25222 { 25223 int i; 25224 25225 mutex_enter(&tcp_random_lock); 25226 *tcp_random_fptr += *tcp_random_rptr; 25227 25228 /* 25229 * The high-order bits are more random than the low-order bits, 25230 * so we select out the high-order 17 bits and add one so that 25231 * we never return zero. 25232 */ 25233 i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 25234 if (++tcp_random_fptr >= tcp_random_end_ptr) { 25235 tcp_random_fptr = tcp_random_state; 25236 ++tcp_random_rptr; 25237 } else if (++tcp_random_rptr >= tcp_random_end_ptr) 25238 tcp_random_rptr = tcp_random_state; 25239 25240 mutex_exit(&tcp_random_lock); 25241 return (i); 25242 } 25243 25244 /* 25245 * XXX This will go away when TPI is extended to send 25246 * info reqs to sockfs/timod ..... 25247 * Given a queue, set the max packet size for the write 25248 * side of the queue below stream head. This value is 25249 * cached on the stream head. 25250 * Returns 1 on success, 0 otherwise. 25251 */ 25252 static int 25253 setmaxps(queue_t *q, int maxpsz) 25254 { 25255 struct stdata *stp; 25256 queue_t *wq; 25257 stp = STREAM(q); 25258 25259 /* 25260 * At this point change of a queue parameter is not allowed 25261 * when a multiplexor is sitting on top. 25262 */ 25263 if (stp->sd_flag & STPLEX) 25264 return (0); 25265 25266 claimstr(stp->sd_wrq); 25267 wq = stp->sd_wrq->q_next; 25268 ASSERT(wq != NULL); 25269 (void) strqset(wq, QMAXPSZ, 0, maxpsz); 25270 releasestr(stp->sd_wrq); 25271 return (1); 25272 } 25273 25274 static int 25275 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, 25276 int *t_errorp, int *sys_errorp) 25277 { 25278 int error; 25279 int is_absreq_failure; 25280 t_scalar_t *opt_lenp; 25281 t_scalar_t opt_offset; 25282 int prim_type; 25283 struct T_conn_req *tcreqp; 25284 struct T_conn_res *tcresp; 25285 cred_t *cr; 25286 25287 cr = DB_CREDDEF(mp, tcp->tcp_cred); 25288 25289 prim_type = ((union T_primitives *)mp->b_rptr)->type; 25290 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || 25291 prim_type == T_CONN_RES); 25292 25293 switch (prim_type) { 25294 case T_CONN_REQ: 25295 tcreqp = (struct T_conn_req *)mp->b_rptr; 25296 opt_offset = tcreqp->OPT_offset; 25297 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; 25298 break; 25299 case O_T_CONN_RES: 25300 case T_CONN_RES: 25301 tcresp = (struct T_conn_res *)mp->b_rptr; 25302 opt_offset = tcresp->OPT_offset; 25303 opt_lenp = (t_scalar_t *)&tcresp->OPT_length; 25304 break; 25305 } 25306 25307 *t_errorp = 0; 25308 *sys_errorp = 0; 25309 *do_disconnectp = 0; 25310 25311 error = tpi_optcom_buf(tcp->tcp_wq, mp, opt_lenp, 25312 opt_offset, cr, &tcp_opt_obj, 25313 NULL, &is_absreq_failure); 25314 25315 switch (error) { 25316 case 0: /* no error */ 25317 ASSERT(is_absreq_failure == 0); 25318 return (0); 25319 case ENOPROTOOPT: 25320 *t_errorp = TBADOPT; 25321 break; 25322 case EACCES: 25323 *t_errorp = TACCES; 25324 break; 25325 default: 25326 *t_errorp = TSYSERR; *sys_errorp = error; 25327 break; 25328 } 25329 if (is_absreq_failure != 0) { 25330 /* 25331 * The connection request should get the local ack 25332 * T_OK_ACK and then a T_DISCON_IND. 25333 */ 25334 *do_disconnectp = 1; 25335 } 25336 return (-1); 25337 } 25338 25339 /* 25340 * Split this function out so that if the secret changes, I'm okay. 25341 * 25342 * Initialize the tcp_iss_cookie and tcp_iss_key. 
25343 */ 25344 25345 #define PASSWD_SIZE 16 /* MUST be multiple of 4 */ 25346 25347 static void 25348 tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *tcps) 25349 { 25350 struct { 25351 int32_t current_time; 25352 uint32_t randnum; 25353 uint16_t pad; 25354 uint8_t ether[6]; 25355 uint8_t passwd[PASSWD_SIZE]; 25356 } tcp_iss_cookie; 25357 time_t t; 25358 25359 /* 25360 * Start with the current absolute time. 25361 */ 25362 (void) drv_getparm(TIME, &t); 25363 tcp_iss_cookie.current_time = t; 25364 25365 /* 25366 * XXX - Need a more random number per RFC 1750, not this crap. 25367 * OTOH, if what follows is pretty random, then I'm in better shape. 25368 */ 25369 tcp_iss_cookie.randnum = (uint32_t)(gethrtime() + tcp_random()); 25370 tcp_iss_cookie.pad = 0x365c; /* Picked from HMAC pad values. */ 25371 25372 /* 25373 * The cpu_type_info is pretty non-random. Ugggh. It does serve 25374 * as a good template. 25375 */ 25376 bcopy(&cpu_list->cpu_type_info, &tcp_iss_cookie.passwd, 25377 min(PASSWD_SIZE, sizeof (cpu_list->cpu_type_info))); 25378 25379 /* 25380 * The pass-phrase. Normally this is supplied by user-called NDD. 25381 */ 25382 bcopy(phrase, &tcp_iss_cookie.passwd, min(PASSWD_SIZE, len)); 25383 25384 /* 25385 * See 4010593 if this section becomes a problem again, 25386 * but the local ethernet address is useful here. 25387 */ 25388 (void) localetheraddr(NULL, 25389 (struct ether_addr *)&tcp_iss_cookie.ether); 25390 25391 /* 25392 * Hash 'em all together. The MD5Final is called per-connection. 25393 */ 25394 mutex_enter(&tcps->tcps_iss_key_lock); 25395 MD5Init(&tcps->tcps_iss_key); 25396 MD5Update(&tcps->tcps_iss_key, (uchar_t *)&tcp_iss_cookie, 25397 sizeof (tcp_iss_cookie)); 25398 mutex_exit(&tcps->tcps_iss_key_lock); 25399 } 25400 25401 /* 25402 * Set the RFC 1948 pass phrase 25403 */ 25404 /* ARGSUSED */ 25405 static int 25406 tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 25407 cred_t *cr) 25408 { 25409 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; 25410 25411 /* 25412 * Basically, value contains a new pass phrase. Pass it along! 25413 */ 25414 tcp_iss_key_init((uint8_t *)value, strlen(value), tcps); 25415 return (0); 25416 } 25417 25418 /* ARGSUSED */ 25419 static int 25420 tcp_sack_info_constructor(void *buf, void *cdrarg, int kmflags) 25421 { 25422 bzero(buf, sizeof (tcp_sack_info_t)); 25423 return (0); 25424 } 25425 25426 /* ARGSUSED */ 25427 static int 25428 tcp_iphc_constructor(void *buf, void *cdrarg, int kmflags) 25429 { 25430 bzero(buf, TCP_MAX_COMBINED_HEADER_LENGTH); 25431 return (0); 25432 } 25433 25434 /* 25435 * Make sure we wait until the default queue is setup, yet allow 25436 * tcp_g_q_create() to open a TCP stream. 25437 * We need to allow tcp_g_q_create() do do an open 25438 * of tcp, hence we compare curhread. 25439 * All others have to wait until the tcps_g_q has been 25440 * setup. 
25441 */ 25442 void 25443 tcp_g_q_setup(tcp_stack_t *tcps) 25444 { 25445 mutex_enter(&tcps->tcps_g_q_lock); 25446 if (tcps->tcps_g_q != NULL) { 25447 mutex_exit(&tcps->tcps_g_q_lock); 25448 return; 25449 } 25450 if (tcps->tcps_g_q_creator == NULL) { 25451 /* This thread will set it up */ 25452 tcps->tcps_g_q_creator = curthread; 25453 mutex_exit(&tcps->tcps_g_q_lock); 25454 tcp_g_q_create(tcps); 25455 mutex_enter(&tcps->tcps_g_q_lock); 25456 ASSERT(tcps->tcps_g_q_creator == curthread); 25457 tcps->tcps_g_q_creator = NULL; 25458 cv_signal(&tcps->tcps_g_q_cv); 25459 ASSERT(tcps->tcps_g_q != NULL); 25460 mutex_exit(&tcps->tcps_g_q_lock); 25461 return; 25462 } 25463 /* Everybody but the creator has to wait */ 25464 if (tcps->tcps_g_q_creator != curthread) { 25465 while (tcps->tcps_g_q == NULL) 25466 cv_wait(&tcps->tcps_g_q_cv, &tcps->tcps_g_q_lock); 25467 } 25468 mutex_exit(&tcps->tcps_g_q_lock); 25469 } 25470 25471 #define IP "ip" 25472 25473 #define TCP6DEV "/devices/pseudo/tcp6@0:tcp6" 25474 25475 /* 25476 * Create a default tcp queue here instead of in strplumb 25477 */ 25478 void 25479 tcp_g_q_create(tcp_stack_t *tcps) 25480 { 25481 int error; 25482 ldi_handle_t lh = NULL; 25483 ldi_ident_t li = NULL; 25484 int rval; 25485 cred_t *cr; 25486 major_t IP_MAJ; 25487 25488 #ifdef NS_DEBUG 25489 (void) printf("tcp_g_q_create()\n"); 25490 #endif 25491 25492 IP_MAJ = ddi_name_to_major(IP); 25493 25494 ASSERT(tcps->tcps_g_q_creator == curthread); 25495 25496 error = ldi_ident_from_major(IP_MAJ, &li); 25497 if (error) { 25498 #ifdef DEBUG 25499 printf("tcp_g_q_create: lyr ident get failed error %d\n", 25500 error); 25501 #endif 25502 return; 25503 } 25504 25505 cr = zone_get_kcred(netstackid_to_zoneid( 25506 tcps->tcps_netstack->netstack_stackid)); 25507 ASSERT(cr != NULL); 25508 /* 25509 * We set the tcp default queue to IPv6 because IPv4 falls 25510 * back to IPv6 when it can't find a client, but 25511 * IPv6 does not fall back to IPv4. 25512 */ 25513 error = ldi_open_by_name(TCP6DEV, FREAD|FWRITE, cr, &lh, li); 25514 if (error) { 25515 #ifdef DEBUG 25516 printf("tcp_g_q_create: open of TCP6DEV failed error %d\n", 25517 error); 25518 #endif 25519 goto out; 25520 } 25521 25522 /* 25523 * This ioctl causes the tcp framework to cache a pointer to 25524 * this stream, so we don't want to close the stream after 25525 * this operation. 25526 * Use the kernel credentials that are for the zone we're in. 25527 */ 25528 error = ldi_ioctl(lh, TCP_IOC_DEFAULT_Q, 25529 (intptr_t)0, FKIOCTL, cr, &rval); 25530 if (error) { 25531 #ifdef DEBUG 25532 printf("tcp_g_q_create: ioctl TCP_IOC_DEFAULT_Q failed " 25533 "error %d\n", error); 25534 #endif 25535 goto out; 25536 } 25537 tcps->tcps_g_q_lh = lh; /* For tcp_g_q_close */ 25538 lh = NULL; 25539 out: 25540 /* Close layered handles */ 25541 if (li) 25542 ldi_ident_release(li); 25543 /* Keep cred around until _inactive needs it */ 25544 tcps->tcps_g_q_cr = cr; 25545 } 25546 25547 /* 25548 * We keep tcp_g_q set until all other tcp_t's in the zone 25549 * has gone away, and then when tcp_g_q_inactive() is called 25550 * we clear it. 25551 */ 25552 void 25553 tcp_g_q_destroy(tcp_stack_t *tcps) 25554 { 25555 #ifdef NS_DEBUG 25556 (void) printf("tcp_g_q_destroy()for stack %d\n", 25557 tcps->tcps_netstack->netstack_stackid); 25558 #endif 25559 25560 if (tcps->tcps_g_q == NULL) { 25561 return; /* Nothing to cleanup */ 25562 } 25563 /* 25564 * Drop reference corresponding to the default queue. 
25565 * This reference was added from tcp_open when the default queue 25566 * was created, hence we compensate for this extra drop in 25567 * tcp_g_q_close. If the refcnt drops to zero here it means 25568 * the default queue was the last one to be open, in which 25569 * case, then tcp_g_q_inactive will be 25570 * called as a result of the refrele. 25571 */ 25572 TCPS_REFRELE(tcps); 25573 } 25574 25575 /* 25576 * Called when last tcp_t drops reference count using TCPS_REFRELE. 25577 * Run by tcp_q_q_inactive using a taskq. 25578 */ 25579 static void 25580 tcp_g_q_close(void *arg) 25581 { 25582 tcp_stack_t *tcps = arg; 25583 int error; 25584 ldi_handle_t lh = NULL; 25585 ldi_ident_t li = NULL; 25586 cred_t *cr; 25587 major_t IP_MAJ; 25588 25589 IP_MAJ = ddi_name_to_major(IP); 25590 25591 #ifdef NS_DEBUG 25592 (void) printf("tcp_g_q_inactive() for stack %d refcnt %d\n", 25593 tcps->tcps_netstack->netstack_stackid, 25594 tcps->tcps_netstack->netstack_refcnt); 25595 #endif 25596 lh = tcps->tcps_g_q_lh; 25597 if (lh == NULL) 25598 return; /* Nothing to cleanup */ 25599 25600 ASSERT(tcps->tcps_refcnt == 1); 25601 ASSERT(tcps->tcps_g_q != NULL); 25602 25603 error = ldi_ident_from_major(IP_MAJ, &li); 25604 if (error) { 25605 #ifdef DEBUG 25606 printf("tcp_g_q_inactive: lyr ident get failed error %d\n", 25607 error); 25608 #endif 25609 return; 25610 } 25611 25612 cr = tcps->tcps_g_q_cr; 25613 tcps->tcps_g_q_cr = NULL; 25614 ASSERT(cr != NULL); 25615 25616 /* 25617 * Make sure we can break the recursion when tcp_close decrements 25618 * the reference count causing g_q_inactive to be called again. 25619 */ 25620 tcps->tcps_g_q_lh = NULL; 25621 25622 /* close the default queue */ 25623 (void) ldi_close(lh, FREAD|FWRITE, cr); 25624 /* 25625 * At this point in time tcps and the rest of netstack_t might 25626 * have been deleted. 25627 */ 25628 tcps = NULL; 25629 25630 /* Close layered handles */ 25631 ldi_ident_release(li); 25632 crfree(cr); 25633 } 25634 25635 /* 25636 * Called when last tcp_t drops reference count using TCPS_REFRELE. 25637 * 25638 * Have to ensure that the ldi routines are not used by an 25639 * interrupt thread by using a taskq. 
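 *
 * (tcp_taskq is the single-threaded taskq created in tcp_ddi_g_init();
 * when this path is reached from interrupt context the close is simply
 * deferred to that taskq rather than done inline.)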
25640 */ 25641 void 25642 tcp_g_q_inactive(tcp_stack_t *tcps) 25643 { 25644 if (tcps->tcps_g_q_lh == NULL) 25645 return; /* Nothing to cleanup */ 25646 25647 ASSERT(tcps->tcps_refcnt == 0); 25648 TCPS_REFHOLD(tcps); /* Compensate for what g_q_destroy did */ 25649 25650 if (servicing_interrupt()) { 25651 (void) taskq_dispatch(tcp_taskq, tcp_g_q_close, 25652 (void *) tcps, TQ_SLEEP); 25653 } else { 25654 tcp_g_q_close(tcps); 25655 } 25656 } 25657 25658 /* 25659 * Called by IP when IP is loaded into the kernel 25660 */ 25661 void 25662 tcp_ddi_g_init(void) 25663 { 25664 tcp_timercache = kmem_cache_create("tcp_timercache", 25665 sizeof (tcp_timer_t) + sizeof (mblk_t), 0, 25666 NULL, NULL, NULL, NULL, NULL, 0); 25667 25668 tcp_sack_info_cache = kmem_cache_create("tcp_sack_info_cache", 25669 sizeof (tcp_sack_info_t), 0, 25670 tcp_sack_info_constructor, NULL, NULL, NULL, NULL, 0); 25671 25672 tcp_iphc_cache = kmem_cache_create("tcp_iphc_cache", 25673 TCP_MAX_COMBINED_HEADER_LENGTH, 0, 25674 tcp_iphc_constructor, NULL, NULL, NULL, NULL, 0); 25675 25676 mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); 25677 25678 /* Initialize the random number generator */ 25679 tcp_random_init(); 25680 25681 tcp_squeue_wput_proc = tcp_squeue_switch(tcp_squeue_wput); 25682 tcp_squeue_close_proc = tcp_squeue_switch(tcp_squeue_close); 25683 25684 /* A single callback independently of how many netstacks we have */ 25685 ip_squeue_init(tcp_squeue_add); 25686 25687 tcp_g_kstat = tcp_g_kstat_init(&tcp_g_statistics); 25688 25689 tcp_taskq = taskq_create("tcp_taskq", 1, minclsyspri, 1, 1, 25690 TASKQ_PREPOPULATE); 25691 25692 /* 25693 * We want to be informed each time a stack is created or 25694 * destroyed in the kernel, so we can maintain the 25695 * set of tcp_stack_t's. 25696 */ 25697 netstack_register(NS_TCP, tcp_stack_init, tcp_stack_shutdown, 25698 tcp_stack_fini); 25699 } 25700 25701 25702 /* 25703 * Initialize the TCP stack instance. 25704 */ 25705 static void * 25706 tcp_stack_init(netstackid_t stackid, netstack_t *ns) 25707 { 25708 tcp_stack_t *tcps; 25709 tcpparam_t *pa; 25710 int i; 25711 25712 tcps = (tcp_stack_t *)kmem_zalloc(sizeof (*tcps), KM_SLEEP); 25713 tcps->tcps_netstack = ns; 25714 25715 /* Initialize locks */ 25716 rw_init(&tcps->tcps_hsp_lock, NULL, RW_DEFAULT, NULL); 25717 mutex_init(&tcps->tcps_g_q_lock, NULL, MUTEX_DEFAULT, NULL); 25718 cv_init(&tcps->tcps_g_q_cv, NULL, CV_DEFAULT, NULL); 25719 mutex_init(&tcps->tcps_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); 25720 mutex_init(&tcps->tcps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); 25721 rw_init(&tcps->tcps_reserved_port_lock, NULL, RW_DEFAULT, NULL); 25722 25723 tcps->tcps_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS; 25724 tcps->tcps_g_epriv_ports[0] = 2049; 25725 tcps->tcps_g_epriv_ports[1] = 4045; 25726 tcps->tcps_min_anonpriv_port = 512; 25727 25728 tcps->tcps_bind_fanout = kmem_zalloc(sizeof (tf_t) * 25729 TCP_BIND_FANOUT_SIZE, KM_SLEEP); 25730 tcps->tcps_acceptor_fanout = kmem_zalloc(sizeof (tf_t) * 25731 TCP_FANOUT_SIZE, KM_SLEEP); 25732 tcps->tcps_reserved_port = kmem_zalloc(sizeof (tcp_rport_t) * 25733 TCP_RESERVED_PORTS_ARRAY_MAX_SIZE, KM_SLEEP); 25734 25735 for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { 25736 mutex_init(&tcps->tcps_bind_fanout[i].tf_lock, NULL, 25737 MUTEX_DEFAULT, NULL); 25738 } 25739 25740 for (i = 0; i < TCP_FANOUT_SIZE; i++) { 25741 mutex_init(&tcps->tcps_acceptor_fanout[i].tf_lock, NULL, 25742 MUTEX_DEFAULT, NULL); 25743 } 25744 25745 /* TCP's IPsec code calls the packet dropper. 
*/ 25746 ip_drop_register(&tcps->tcps_dropper, "TCP IPsec policy enforcement"); 25747 25748 pa = (tcpparam_t *)kmem_alloc(sizeof (lcl_tcp_param_arr), KM_SLEEP); 25749 tcps->tcps_params = pa; 25750 bcopy(lcl_tcp_param_arr, tcps->tcps_params, sizeof (lcl_tcp_param_arr)); 25751 25752 (void) tcp_param_register(&tcps->tcps_g_nd, tcps->tcps_params, 25753 A_CNT(lcl_tcp_param_arr), tcps); 25754 25755 /* 25756 * Note: To really walk the device tree you need the devinfo 25757 * pointer to your device which is only available after probe/attach. 25758 * The following is safe only because it uses ddi_root_node() 25759 */ 25760 tcp_max_optsize = optcom_max_optsize(tcp_opt_obj.odb_opt_des_arr, 25761 tcp_opt_obj.odb_opt_arr_cnt); 25762 25763 /* 25764 * Initialize RFC 1948 secret values. This will probably be reset once 25765 * by the boot scripts. 25766 * 25767 * Use NULL name, as the name is caught by the new lockstats. 25768 * 25769 * Initialize with some random, non-guessable string, like the global 25770 * T_INFO_ACK. 25771 */ 25772 25773 tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack, 25774 sizeof (tcp_g_t_info_ack), tcps); 25775 25776 tcps->tcps_kstat = tcp_kstat2_init(stackid, &tcps->tcps_statistics); 25777 tcps->tcps_mibkp = tcp_kstat_init(stackid, tcps); 25778 25779 return (tcps); 25780 } 25781 25782 /* 25783 * Called when the IP module is about to be unloaded. 25784 */ 25785 void 25786 tcp_ddi_g_destroy(void) 25787 { 25788 tcp_g_kstat_fini(tcp_g_kstat); 25789 tcp_g_kstat = NULL; 25790 bzero(&tcp_g_statistics, sizeof (tcp_g_statistics)); 25791 25792 mutex_destroy(&tcp_random_lock); 25793 25794 kmem_cache_destroy(tcp_timercache); 25795 kmem_cache_destroy(tcp_sack_info_cache); 25796 kmem_cache_destroy(tcp_iphc_cache); 25797 25798 netstack_unregister(NS_TCP); 25799 taskq_destroy(tcp_taskq); 25800 } 25801 25802 /* 25803 * Shut down the TCP stack instance. 25804 */ 25805 /* ARGSUSED */ 25806 static void 25807 tcp_stack_shutdown(netstackid_t stackid, void *arg) 25808 { 25809 tcp_stack_t *tcps = (tcp_stack_t *)arg; 25810 25811 tcp_g_q_destroy(tcps); 25812 } 25813 25814 /* 25815 * Free the TCP stack instance. 
25816 */ 25817 static void 25818 tcp_stack_fini(netstackid_t stackid, void *arg) 25819 { 25820 tcp_stack_t *tcps = (tcp_stack_t *)arg; 25821 int i; 25822 25823 nd_free(&tcps->tcps_g_nd); 25824 kmem_free(tcps->tcps_params, sizeof (lcl_tcp_param_arr)); 25825 tcps->tcps_params = NULL; 25826 kmem_free(tcps->tcps_wroff_xtra_param, sizeof (tcpparam_t)); 25827 tcps->tcps_wroff_xtra_param = NULL; 25828 kmem_free(tcps->tcps_mdt_head_param, sizeof (tcpparam_t)); 25829 tcps->tcps_mdt_head_param = NULL; 25830 kmem_free(tcps->tcps_mdt_tail_param, sizeof (tcpparam_t)); 25831 tcps->tcps_mdt_tail_param = NULL; 25832 kmem_free(tcps->tcps_mdt_max_pbufs_param, sizeof (tcpparam_t)); 25833 tcps->tcps_mdt_max_pbufs_param = NULL; 25834 25835 for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { 25836 ASSERT(tcps->tcps_bind_fanout[i].tf_tcp == NULL); 25837 mutex_destroy(&tcps->tcps_bind_fanout[i].tf_lock); 25838 } 25839 25840 for (i = 0; i < TCP_FANOUT_SIZE; i++) { 25841 ASSERT(tcps->tcps_acceptor_fanout[i].tf_tcp == NULL); 25842 mutex_destroy(&tcps->tcps_acceptor_fanout[i].tf_lock); 25843 } 25844 25845 kmem_free(tcps->tcps_bind_fanout, sizeof (tf_t) * TCP_BIND_FANOUT_SIZE); 25846 tcps->tcps_bind_fanout = NULL; 25847 25848 kmem_free(tcps->tcps_acceptor_fanout, sizeof (tf_t) * TCP_FANOUT_SIZE); 25849 tcps->tcps_acceptor_fanout = NULL; 25850 25851 kmem_free(tcps->tcps_reserved_port, sizeof (tcp_rport_t) * 25852 TCP_RESERVED_PORTS_ARRAY_MAX_SIZE); 25853 tcps->tcps_reserved_port = NULL; 25854 25855 mutex_destroy(&tcps->tcps_iss_key_lock); 25856 rw_destroy(&tcps->tcps_hsp_lock); 25857 mutex_destroy(&tcps->tcps_g_q_lock); 25858 cv_destroy(&tcps->tcps_g_q_cv); 25859 mutex_destroy(&tcps->tcps_epriv_port_lock); 25860 rw_destroy(&tcps->tcps_reserved_port_lock); 25861 25862 ip_drop_unregister(&tcps->tcps_dropper); 25863 25864 tcp_kstat2_fini(stackid, tcps->tcps_kstat); 25865 tcps->tcps_kstat = NULL; 25866 bzero(&tcps->tcps_statistics, sizeof (tcps->tcps_statistics)); 25867 25868 tcp_kstat_fini(stackid, tcps->tcps_mibkp); 25869 tcps->tcps_mibkp = NULL; 25870 25871 kmem_free(tcps, sizeof (*tcps)); 25872 } 25873 25874 /* 25875 * Generate ISS, taking into account NDD changes may happen halfway through. 25876 * (If the iss is not zero, set it.) 25877 */ 25878 25879 static void 25880 tcp_iss_init(tcp_t *tcp) 25881 { 25882 MD5_CTX context; 25883 struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; 25884 uint32_t answer[4]; 25885 tcp_stack_t *tcps = tcp->tcp_tcps; 25886 25887 tcps->tcps_iss_incr_extra += (ISS_INCR >> 1); 25888 tcp->tcp_iss = tcps->tcps_iss_incr_extra; 25889 switch (tcps->tcps_strong_iss) { 25890 case 2: 25891 mutex_enter(&tcps->tcps_iss_key_lock); 25892 context = tcps->tcps_iss_key; 25893 mutex_exit(&tcps->tcps_iss_key_lock); 25894 arg.ports = tcp->tcp_ports; 25895 if (tcp->tcp_ipversion == IPV4_VERSION) { 25896 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 25897 &arg.src); 25898 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_dst, 25899 &arg.dst); 25900 } else { 25901 arg.src = tcp->tcp_ip6h->ip6_src; 25902 arg.dst = tcp->tcp_ip6h->ip6_dst; 25903 } 25904 MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); 25905 MD5Final((uchar_t *)answer, &context); 25906 tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3]; 25907 /* 25908 * Now that we've hashed into a unique per-connection sequence 25909 * space, add a random increment per strong_iss == 1. So I 25910 * guess we'll have to... 
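 *
 * In RFC 1948 terms the case 2 value works out to roughly (a sketch,
 * not a normative statement of the algorithm):
 *
 *	ISS = M(t) + F(localaddr, localport, remoteaddr, remoteport, key)
 *
 * where M(t) is the monotonically bumped tcps_iss_incr_extra plus the
 * time/random increment picked up by falling through to case 1, and
 * F() is the MD5 digest of the connection 4-tuple and the stack secret,
 * folded to 32 bits by XORing answer[0..3] together.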
25911 */ 25912 /* FALLTHRU */ 25913 case 1: 25914 tcp->tcp_iss += (gethrtime() >> ISS_NSEC_SHT) + tcp_random(); 25915 break; 25916 default: 25917 tcp->tcp_iss += (uint32_t)gethrestime_sec() * ISS_INCR; 25918 break; 25919 } 25920 tcp->tcp_valid_bits = TCP_ISS_VALID; 25921 tcp->tcp_fss = tcp->tcp_iss - 1; 25922 tcp->tcp_suna = tcp->tcp_iss; 25923 tcp->tcp_snxt = tcp->tcp_iss + 1; 25924 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 25925 tcp->tcp_csuna = tcp->tcp_snxt; 25926 } 25927 25928 /* 25929 * Exported routine for extracting active tcp connection status. 25930 * 25931 * This is used by the Solaris Cluster Networking software to 25932 * gather a list of connections that need to be forwarded to 25933 * specific nodes in the cluster when configuration changes occur. 25934 * 25935 * The callback is invoked for each tcp_t structure. Returning 25936 * non-zero from the callback routine terminates the search. 25937 */ 25938 int 25939 cl_tcp_walk_list(int (*cl_callback)(cl_tcp_info_t *, void *), 25940 void *arg) 25941 { 25942 netstack_handle_t nh; 25943 netstack_t *ns; 25944 int ret = 0; 25945 25946 netstack_next_init(&nh); 25947 while ((ns = netstack_next(&nh)) != NULL) { 25948 ret = cl_tcp_walk_list_stack(cl_callback, arg, 25949 ns->netstack_tcp); 25950 netstack_rele(ns); 25951 } 25952 netstack_next_fini(&nh); 25953 return (ret); 25954 } 25955 25956 static int 25957 cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, 25958 tcp_stack_t *tcps) 25959 { 25960 tcp_t *tcp; 25961 cl_tcp_info_t cl_tcpi; 25962 connf_t *connfp; 25963 conn_t *connp; 25964 int i; 25965 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 25966 25967 ASSERT(callback != NULL); 25968 25969 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 25970 connfp = &ipst->ips_ipcl_globalhash_fanout[i]; 25971 connp = NULL; 25972 25973 while ((connp = 25974 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 25975 25976 tcp = connp->conn_tcp; 25977 cl_tcpi.cl_tcpi_version = CL_TCPI_V1; 25978 cl_tcpi.cl_tcpi_ipversion = tcp->tcp_ipversion; 25979 cl_tcpi.cl_tcpi_state = tcp->tcp_state; 25980 cl_tcpi.cl_tcpi_lport = tcp->tcp_lport; 25981 cl_tcpi.cl_tcpi_fport = tcp->tcp_fport; 25982 /* 25983 * The macros tcp_laddr and tcp_faddr give the IPv4 25984 * addresses. They are copied implicitly below as 25985 * mapped addresses. 25986 */ 25987 cl_tcpi.cl_tcpi_laddr_v6 = tcp->tcp_ip_src_v6; 25988 if (tcp->tcp_ipversion == IPV4_VERSION) { 25989 cl_tcpi.cl_tcpi_faddr = 25990 tcp->tcp_ipha->ipha_dst; 25991 } else { 25992 cl_tcpi.cl_tcpi_faddr_v6 = 25993 tcp->tcp_ip6h->ip6_dst; 25994 } 25995 25996 /* 25997 * If the callback returns non-zero 25998 * we terminate the traversal. 25999 */ 26000 if ((*callback)(&cl_tcpi, arg) != 0) { 26001 CONN_DEC_REF(tcp->tcp_connp); 26002 return (1); 26003 } 26004 } 26005 } 26006 26007 return (0); 26008 } 26009 26010 /* 26011 * Macros used for accessing the different types of sockaddr 26012 * structures inside a tcp_ioc_abort_conn_t. 
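 *
 * For illustration only (hypothetical caller; the real users are the
 * match, dump and build routines below), an AF_INET request is read
 * back as:
 *
 *	ipaddr_t  laddr = TCP_AC_V4LOCAL(acp);		(network byte order)
 *	in_port_t lport = ntohs(TCP_AC_V4LPORT(acp));	(host byte order)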
26013 */ 26014 #define TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local) 26015 #define TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote) 26016 #define TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr) 26017 #define TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr) 26018 #define TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port) 26019 #define TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port) 26020 #define TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local) 26021 #define TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote) 26022 #define TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr) 26023 #define TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr) 26024 #define TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port) 26025 #define TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port) 26026 26027 /* 26028 * Return the correct error code to mimic the behavior 26029 * of a connection reset. 26030 */ 26031 #define TCP_AC_GET_ERRCODE(state, err) { \ 26032 switch ((state)) { \ 26033 case TCPS_SYN_SENT: \ 26034 case TCPS_SYN_RCVD: \ 26035 (err) = ECONNREFUSED; \ 26036 break; \ 26037 case TCPS_ESTABLISHED: \ 26038 case TCPS_FIN_WAIT_1: \ 26039 case TCPS_FIN_WAIT_2: \ 26040 case TCPS_CLOSE_WAIT: \ 26041 (err) = ECONNRESET; \ 26042 break; \ 26043 case TCPS_CLOSING: \ 26044 case TCPS_LAST_ACK: \ 26045 case TCPS_TIME_WAIT: \ 26046 (err) = 0; \ 26047 break; \ 26048 default: \ 26049 (err) = ENXIO; \ 26050 } \ 26051 } 26052 26053 /* 26054 * Check if a tcp structure matches the info in acp. 26055 */ 26056 #define TCP_AC_ADDR_MATCH(acp, tcp) \ 26057 (((acp)->ac_local.ss_family == AF_INET) ? \ 26058 ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ 26059 TCP_AC_V4LOCAL((acp)) == (tcp)->tcp_ip_src) && \ 26060 (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ 26061 TCP_AC_V4REMOTE((acp)) == (tcp)->tcp_remote) && \ 26062 (TCP_AC_V4LPORT((acp)) == 0 || \ 26063 TCP_AC_V4LPORT((acp)) == (tcp)->tcp_lport) && \ 26064 (TCP_AC_V4RPORT((acp)) == 0 || \ 26065 TCP_AC_V4RPORT((acp)) == (tcp)->tcp_fport) && \ 26066 (acp)->ac_start <= (tcp)->tcp_state && \ 26067 (acp)->ac_end >= (tcp)->tcp_state) : \ 26068 ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ 26069 IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ 26070 &(tcp)->tcp_ip_src_v6)) && \ 26071 (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ 26072 IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ 26073 &(tcp)->tcp_remote_v6)) && \ 26074 (TCP_AC_V6LPORT((acp)) == 0 || \ 26075 TCP_AC_V6LPORT((acp)) == (tcp)->tcp_lport) && \ 26076 (TCP_AC_V6RPORT((acp)) == 0 || \ 26077 TCP_AC_V6RPORT((acp)) == (tcp)->tcp_fport) && \ 26078 (acp)->ac_start <= (tcp)->tcp_state && \ 26079 (acp)->ac_end >= (tcp)->tcp_state)) 26080 26081 #define TCP_AC_MATCH(acp, tcp) \ 26082 (((acp)->ac_zoneid == ALL_ZONES || \ 26083 (acp)->ac_zoneid == tcp->tcp_connp->conn_zoneid) ? \ 26084 TCP_AC_ADDR_MATCH(acp, tcp) : 0) 26085 26086 /* 26087 * Build a message containing a tcp_ioc_abort_conn_t structure 26088 * which is filled in with information from acp and tp. 
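 *
 * The resulting M_CTL mblk is laid out as follows (a sketch of what the
 * code below produces and what tcp_ioctl_abort_handler() expects):
 *
 *	b_rptr -> uint32_t		TCP_IOC_ABORT_CONN
 *		  tcp_ioc_abort_conn_t	endpoints copied from tp, plus
 *					ac_start/ac_end/ac_zoneid from acp
 *	b_wptr -> b_rptr + sizeof (uint32_t) + sizeof (tcp_ioc_abort_conn_t)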
26089 */ 26090 static mblk_t * 26091 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) 26092 { 26093 mblk_t *mp; 26094 tcp_ioc_abort_conn_t *tacp; 26095 26096 mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO); 26097 if (mp == NULL) 26098 return (NULL); 26099 26100 mp->b_datap->db_type = M_CTL; 26101 26102 *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; 26103 tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + 26104 sizeof (uint32_t)); 26105 26106 tacp->ac_start = acp->ac_start; 26107 tacp->ac_end = acp->ac_end; 26108 tacp->ac_zoneid = acp->ac_zoneid; 26109 26110 if (acp->ac_local.ss_family == AF_INET) { 26111 tacp->ac_local.ss_family = AF_INET; 26112 tacp->ac_remote.ss_family = AF_INET; 26113 TCP_AC_V4LOCAL(tacp) = tp->tcp_ip_src; 26114 TCP_AC_V4REMOTE(tacp) = tp->tcp_remote; 26115 TCP_AC_V4LPORT(tacp) = tp->tcp_lport; 26116 TCP_AC_V4RPORT(tacp) = tp->tcp_fport; 26117 } else { 26118 tacp->ac_local.ss_family = AF_INET6; 26119 tacp->ac_remote.ss_family = AF_INET6; 26120 TCP_AC_V6LOCAL(tacp) = tp->tcp_ip_src_v6; 26121 TCP_AC_V6REMOTE(tacp) = tp->tcp_remote_v6; 26122 TCP_AC_V6LPORT(tacp) = tp->tcp_lport; 26123 TCP_AC_V6RPORT(tacp) = tp->tcp_fport; 26124 } 26125 mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); 26126 return (mp); 26127 } 26128 26129 /* 26130 * Print a tcp_ioc_abort_conn_t structure. 26131 */ 26132 static void 26133 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) 26134 { 26135 char lbuf[128]; 26136 char rbuf[128]; 26137 sa_family_t af; 26138 in_port_t lport, rport; 26139 ushort_t logflags; 26140 26141 af = acp->ac_local.ss_family; 26142 26143 if (af == AF_INET) { 26144 (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp), 26145 lbuf, 128); 26146 (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp), 26147 rbuf, 128); 26148 lport = ntohs(TCP_AC_V4LPORT(acp)); 26149 rport = ntohs(TCP_AC_V4RPORT(acp)); 26150 } else { 26151 (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp), 26152 lbuf, 128); 26153 (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp), 26154 rbuf, 128); 26155 lport = ntohs(TCP_AC_V6LPORT(acp)); 26156 rport = ntohs(TCP_AC_V6RPORT(acp)); 26157 } 26158 26159 logflags = SL_TRACE | SL_NOTE; 26160 /* 26161 * Don't print this message to the console if the operation was done 26162 * to a non-global zone. 26163 */ 26164 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 26165 logflags |= SL_CONSOLE; 26166 (void) strlog(TCP_MOD_ID, 0, 1, logflags, 26167 "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " 26168 "start = %d, end = %d\n", lbuf, lport, rbuf, rport, 26169 acp->ac_start, acp->ac_end); 26170 } 26171 26172 /* 26173 * Called inside tcp_rput when a message built using 26174 * tcp_ioctl_abort_build_msg is put into a queue. 26175 * Note that when we get here there is no wildcard in acp any more. 26176 */ 26177 static void 26178 tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp) 26179 { 26180 tcp_ioc_abort_conn_t *acp; 26181 26182 acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); 26183 if (tcp->tcp_state <= acp->ac_end) { 26184 /* 26185 * If we get here, we are already on the correct 26186 * squeue. 
This ioctl follows the following path 26187 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn 26188 * ->tcp_ioctl_abort->squeue_fill (if on a 26189 * different squeue) 26190 */ 26191 int errcode; 26192 26193 TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode); 26194 (void) tcp_clean_death(tcp, errcode, 26); 26195 } 26196 freemsg(mp); 26197 } 26198 26199 /* 26200 * Abort all matching connections on a hash chain. 26201 */ 26202 static int 26203 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, 26204 boolean_t exact, tcp_stack_t *tcps) 26205 { 26206 int nmatch, err = 0; 26207 tcp_t *tcp; 26208 MBLKP mp, last, listhead = NULL; 26209 conn_t *tconnp; 26210 connf_t *connfp; 26211 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 26212 26213 connfp = &ipst->ips_ipcl_conn_fanout[index]; 26214 26215 startover: 26216 nmatch = 0; 26217 26218 mutex_enter(&connfp->connf_lock); 26219 for (tconnp = connfp->connf_head; tconnp != NULL; 26220 tconnp = tconnp->conn_next) { 26221 tcp = tconnp->conn_tcp; 26222 if (TCP_AC_MATCH(acp, tcp)) { 26223 CONN_INC_REF(tcp->tcp_connp); 26224 mp = tcp_ioctl_abort_build_msg(acp, tcp); 26225 if (mp == NULL) { 26226 err = ENOMEM; 26227 CONN_DEC_REF(tcp->tcp_connp); 26228 break; 26229 } 26230 mp->b_prev = (mblk_t *)tcp; 26231 26232 if (listhead == NULL) { 26233 listhead = mp; 26234 last = mp; 26235 } else { 26236 last->b_next = mp; 26237 last = mp; 26238 } 26239 nmatch++; 26240 if (exact) 26241 break; 26242 } 26243 26244 /* Avoid holding lock for too long. */ 26245 if (nmatch >= 500) 26246 break; 26247 } 26248 mutex_exit(&connfp->connf_lock); 26249 26250 /* Pass mp into the correct tcp */ 26251 while ((mp = listhead) != NULL) { 26252 listhead = listhead->b_next; 26253 tcp = (tcp_t *)mp->b_prev; 26254 mp->b_next = mp->b_prev = NULL; 26255 squeue_fill(tcp->tcp_connp->conn_sqp, mp, 26256 tcp_input, tcp->tcp_connp, SQTAG_TCP_ABORT_BUCKET); 26257 } 26258 26259 *count += nmatch; 26260 if (nmatch >= 500 && err == 0) 26261 goto startover; 26262 return (err); 26263 } 26264 26265 /* 26266 * Abort all connections that matches the attributes specified in acp. 26267 */ 26268 static int 26269 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps) 26270 { 26271 sa_family_t af; 26272 uint32_t ports; 26273 uint16_t *pports; 26274 int err = 0, count = 0; 26275 boolean_t exact = B_FALSE; /* set when there is no wildcard */ 26276 int index = -1; 26277 ushort_t logflags; 26278 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 26279 26280 af = acp->ac_local.ss_family; 26281 26282 if (af == AF_INET) { 26283 if (TCP_AC_V4REMOTE(acp) != INADDR_ANY && 26284 TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) { 26285 pports = (uint16_t *)&ports; 26286 pports[1] = TCP_AC_V4LPORT(acp); 26287 pports[0] = TCP_AC_V4RPORT(acp); 26288 exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY); 26289 } 26290 } else { 26291 if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) && 26292 TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) { 26293 pports = (uint16_t *)&ports; 26294 pports[1] = TCP_AC_V6LPORT(acp); 26295 pports[0] = TCP_AC_V6RPORT(acp); 26296 exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp)); 26297 } 26298 } 26299 26300 /* 26301 * For cases where remote addr, local port, and remote port are non- 26302 * wildcards, tcp_ioctl_abort_bucket will only be called once. 
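 *
 * (With any wildcard present, every bucket of ips_ipcl_conn_fanout is
 * walked instead, and tcp_ioctl_abort_bucket() batches at most ~500
 * matches per pass so the fanout lock is not held for too long.)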
26303 */ 26304 if (index != -1) { 26305 err = tcp_ioctl_abort_bucket(acp, index, 26306 &count, exact, tcps); 26307 } else { 26308 /* 26309 * loop through all entries for wildcard case 26310 */ 26311 for (index = 0; 26312 index < ipst->ips_ipcl_conn_fanout_size; 26313 index++) { 26314 err = tcp_ioctl_abort_bucket(acp, index, 26315 &count, exact, tcps); 26316 if (err != 0) 26317 break; 26318 } 26319 } 26320 26321 logflags = SL_TRACE | SL_NOTE; 26322 /* 26323 * Don't print this message to the console if the operation was done 26324 * to a non-global zone. 26325 */ 26326 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 26327 logflags |= SL_CONSOLE; 26328 (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " 26329 "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); 26330 if (err == 0 && count == 0) 26331 err = ENOENT; 26332 return (err); 26333 } 26334 26335 /* 26336 * Process the TCP_IOC_ABORT_CONN ioctl request. 26337 */ 26338 static void 26339 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) 26340 { 26341 int err; 26342 IOCP iocp; 26343 MBLKP mp1; 26344 sa_family_t laf, raf; 26345 tcp_ioc_abort_conn_t *acp; 26346 zone_t *zptr; 26347 conn_t *connp = Q_TO_CONN(q); 26348 zoneid_t zoneid = connp->conn_zoneid; 26349 tcp_t *tcp = connp->conn_tcp; 26350 tcp_stack_t *tcps = tcp->tcp_tcps; 26351 26352 iocp = (IOCP)mp->b_rptr; 26353 26354 if ((mp1 = mp->b_cont) == NULL || 26355 iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) { 26356 err = EINVAL; 26357 goto out; 26358 } 26359 26360 /* check permissions */ 26361 if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { 26362 err = EPERM; 26363 goto out; 26364 } 26365 26366 if (mp1->b_cont != NULL) { 26367 freemsg(mp1->b_cont); 26368 mp1->b_cont = NULL; 26369 } 26370 26371 acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr; 26372 laf = acp->ac_local.ss_family; 26373 raf = acp->ac_remote.ss_family; 26374 26375 /* check that a zone with the supplied zoneid exists */ 26376 if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) { 26377 zptr = zone_find_by_id(zoneid); 26378 if (zptr != NULL) { 26379 zone_rele(zptr); 26380 } else { 26381 err = EINVAL; 26382 goto out; 26383 } 26384 } 26385 26386 /* 26387 * For exclusive stacks we set the zoneid to zero 26388 * to make TCP operate as if in the global zone. 26389 */ 26390 if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID) 26391 acp->ac_zoneid = GLOBAL_ZONEID; 26392 26393 if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || 26394 acp->ac_start > acp->ac_end || laf != raf || 26395 (laf != AF_INET && laf != AF_INET6)) { 26396 err = EINVAL; 26397 goto out; 26398 } 26399 26400 tcp_ioctl_abort_dump(acp); 26401 err = tcp_ioctl_abort(acp, tcps); 26402 26403 out: 26404 if (mp1 != NULL) { 26405 freemsg(mp1); 26406 mp->b_cont = NULL; 26407 } 26408 26409 if (err != 0) 26410 miocnak(q, mp, 0, err); 26411 else 26412 miocack(q, mp, 0, 0); 26413 } 26414 26415 /* 26416 * tcp_time_wait_processing() handles processing of incoming packets when 26417 * the tcp is in the TIME_WAIT state. 26418 * A TIME_WAIT tcp that has an associated open TCP stream is never put 26419 * on the time wait list. 
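 *
 * In outline (a summary of the checks below, not an exhaustive list):
 *
 *  - a segment failing the PAWS timestamp check is ACKed and dropped;
 *  - a duplicate FIN restarts the 2 MSL timer (RFC 793, page 73);
 *  - a SYN that looks like a new incarnation of the connection reaps
 *    this tcp via tcp_clean_death() and reclassifies the segment so a
 *    fresh eager can be created with an ISS comfortably above the old
 *    tcp_snxt;
 *  - an in-window RST tears the connection down, while any other SYN is
 *    answered with RST|ACK and the TIME_WAIT state is kept
 *    (RFC 1122, 4.2.2.13).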
26420 */ 26421 void 26422 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, 26423 uint32_t seg_ack, int seg_len, tcph_t *tcph) 26424 { 26425 int32_t bytes_acked; 26426 int32_t gap; 26427 int32_t rgap; 26428 tcp_opt_t tcpopt; 26429 uint_t flags; 26430 uint32_t new_swnd = 0; 26431 conn_t *connp; 26432 tcp_stack_t *tcps = tcp->tcp_tcps; 26433 26434 BUMP_LOCAL(tcp->tcp_ibsegs); 26435 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT); 26436 26437 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 26438 new_swnd = BE16_TO_U16(tcph->th_win) << 26439 ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); 26440 if (tcp->tcp_snd_ts_ok) { 26441 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 26442 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 26443 tcp->tcp_rnxt, TH_ACK); 26444 goto done; 26445 } 26446 } 26447 gap = seg_seq - tcp->tcp_rnxt; 26448 rgap = tcp->tcp_rwnd - (gap + seg_len); 26449 if (gap < 0) { 26450 BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); 26451 UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, 26452 (seg_len > -gap ? -gap : seg_len)); 26453 seg_len += gap; 26454 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 26455 if (flags & TH_RST) { 26456 goto done; 26457 } 26458 if ((flags & TH_FIN) && seg_len == -1) { 26459 /* 26460 * When TCP receives a duplicate FIN in 26461 * TIME_WAIT state, restart the 2 MSL timer. 26462 * See page 73 in RFC 793. Make sure this TCP 26463 * is already on the TIME_WAIT list. If not, 26464 * just restart the timer. 26465 */ 26466 if (TCP_IS_DETACHED(tcp)) { 26467 if (tcp_time_wait_remove(tcp, NULL) == 26468 B_TRUE) { 26469 tcp_time_wait_append(tcp); 26470 TCP_DBGSTAT(tcps, 26471 tcp_rput_time_wait); 26472 } 26473 } else { 26474 ASSERT(tcp != NULL); 26475 TCP_TIMER_RESTART(tcp, 26476 tcps->tcps_time_wait_interval); 26477 } 26478 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 26479 tcp->tcp_rnxt, TH_ACK); 26480 goto done; 26481 } 26482 flags |= TH_ACK_NEEDED; 26483 seg_len = 0; 26484 goto process_ack; 26485 } 26486 26487 /* Fix seg_seq, and chew the gap off the front. */ 26488 seg_seq = tcp->tcp_rnxt; 26489 } 26490 26491 if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 26492 /* 26493 * Make sure that when we accept the connection, pick 26494 * an ISS greater than (tcp_snxt + ISS_INCR/2) for the 26495 * old connection. 26496 * 26497 * The next ISS generated is equal to tcp_iss_incr_extra 26498 * + ISS_INCR/2 + other components depending on the 26499 * value of tcp_strong_iss. We pre-calculate the new 26500 * ISS here and compare with tcp_snxt to determine if 26501 * we need to make adjustment to tcp_iss_incr_extra. 26502 * 26503 * The above calculation is ugly and is a 26504 * waste of CPU cycles... 26505 */ 26506 uint32_t new_iss = tcps->tcps_iss_incr_extra; 26507 int32_t adj; 26508 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 26509 26510 switch (tcps->tcps_strong_iss) { 26511 case 2: { 26512 /* Add time and MD5 components. 
*/ 26513 uint32_t answer[4]; 26514 struct { 26515 uint32_t ports; 26516 in6_addr_t src; 26517 in6_addr_t dst; 26518 } arg; 26519 MD5_CTX context; 26520 26521 mutex_enter(&tcps->tcps_iss_key_lock); 26522 context = tcps->tcps_iss_key; 26523 mutex_exit(&tcps->tcps_iss_key_lock); 26524 arg.ports = tcp->tcp_ports; 26525 /* We use MAPPED addresses in tcp_iss_init */ 26526 arg.src = tcp->tcp_ip_src_v6; 26527 if (tcp->tcp_ipversion == IPV4_VERSION) { 26528 IN6_IPADDR_TO_V4MAPPED( 26529 tcp->tcp_ipha->ipha_dst, 26530 &arg.dst); 26531 } else { 26532 arg.dst = 26533 tcp->tcp_ip6h->ip6_dst; 26534 } 26535 MD5Update(&context, (uchar_t *)&arg, 26536 sizeof (arg)); 26537 MD5Final((uchar_t *)answer, &context); 26538 answer[0] ^= answer[1] ^ answer[2] ^ answer[3]; 26539 new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0]; 26540 break; 26541 } 26542 case 1: 26543 /* Add time component and min random (i.e. 1). */ 26544 new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1; 26545 break; 26546 default: 26547 /* Add only time component. */ 26548 new_iss += (uint32_t)gethrestime_sec() * ISS_INCR; 26549 break; 26550 } 26551 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 26552 /* 26553 * New ISS not guaranteed to be ISS_INCR/2 26554 * ahead of the current tcp_snxt, so add the 26555 * difference to tcp_iss_incr_extra. 26556 */ 26557 tcps->tcps_iss_incr_extra += adj; 26558 } 26559 /* 26560 * If tcp_clean_death() can not perform the task now, 26561 * drop the SYN packet and let the other side re-xmit. 26562 * Otherwise pass the SYN packet back in, since the 26563 * old tcp state has been cleaned up or freed. 26564 */ 26565 if (tcp_clean_death(tcp, 0, 27) == -1) 26566 goto done; 26567 /* 26568 * We will come back to tcp_rput_data 26569 * on the global queue. Packets destined 26570 * for the global queue will be checked 26571 * with global policy. But the policy for 26572 * this packet has already been checked as 26573 * this was destined for the detached 26574 * connection. We need to bypass policy 26575 * check this time by attaching a dummy 26576 * ipsec_in with ipsec_in_dont_check set. 26577 */ 26578 connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid, ipst); 26579 if (connp != NULL) { 26580 TCP_STAT(tcps, tcp_time_wait_syn_success); 26581 tcp_reinput(connp, mp, tcp->tcp_connp->conn_sqp); 26582 return; 26583 } 26584 goto done; 26585 } 26586 26587 /* 26588 * rgap is the amount of stuff received out of window. A negative 26589 * value is the amount out of window. 26590 */ 26591 if (rgap < 0) { 26592 BUMP_MIB(&tcps->tcps_mib, tcpInDataPastWinSegs); 26593 UPDATE_MIB(&tcps->tcps_mib, tcpInDataPastWinBytes, -rgap); 26594 /* Fix seg_len and make sure there is something left. */ 26595 seg_len += rgap; 26596 if (seg_len <= 0) { 26597 if (flags & TH_RST) { 26598 goto done; 26599 } 26600 flags |= TH_ACK_NEEDED; 26601 seg_len = 0; 26602 goto process_ack; 26603 } 26604 } 26605 /* 26606 * Check whether we can update tcp_ts_recent. This test is 26607 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 26608 * Extensions for High Performance: An Update", Internet Draft. 
26609 */ 26610 if (tcp->tcp_snd_ts_ok && 26611 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 26612 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 26613 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 26614 tcp->tcp_last_rcv_lbolt = lbolt64; 26615 } 26616 26617 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 26618 /* Always ack out of order packets */ 26619 flags |= TH_ACK_NEEDED; 26620 seg_len = 0; 26621 } else if (seg_len > 0) { 26622 BUMP_MIB(&tcps->tcps_mib, tcpInClosed); 26623 BUMP_MIB(&tcps->tcps_mib, tcpInDataInorderSegs); 26624 UPDATE_MIB(&tcps->tcps_mib, tcpInDataInorderBytes, seg_len); 26625 } 26626 if (flags & TH_RST) { 26627 (void) tcp_clean_death(tcp, 0, 28); 26628 goto done; 26629 } 26630 if (flags & TH_SYN) { 26631 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 26632 TH_RST|TH_ACK); 26633 /* 26634 * Do not delete the TCP structure if it is in 26635 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 26636 */ 26637 goto done; 26638 } 26639 process_ack: 26640 if (flags & TH_ACK) { 26641 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 26642 if (bytes_acked <= 0) { 26643 if (bytes_acked == 0 && seg_len == 0 && 26644 new_swnd == tcp->tcp_swnd) 26645 BUMP_MIB(&tcps->tcps_mib, tcpInDupAck); 26646 } else { 26647 /* Acks something not sent */ 26648 flags |= TH_ACK_NEEDED; 26649 } 26650 } 26651 if (flags & TH_ACK_NEEDED) { 26652 /* 26653 * Time to send an ack for some reason. 26654 */ 26655 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 26656 tcp->tcp_rnxt, TH_ACK); 26657 } 26658 done: 26659 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 26660 DB_CKSUMSTART(mp) = 0; 26661 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 26662 TCP_STAT(tcps, tcp_time_wait_syn_fail); 26663 } 26664 freemsg(mp); 26665 } 26666 26667 /* 26668 * Allocate a T_SVR4_OPTMGMT_REQ. 26669 * The caller needs to increment tcp_drop_opt_ack_cnt when sending these so 26670 * that tcp_rput_other can drop the acks. 26671 */ 26672 static mblk_t * 26673 tcp_setsockopt_mp(int level, int cmd, char *opt, int optlen) 26674 { 26675 mblk_t *mp; 26676 struct T_optmgmt_req *tor; 26677 struct opthdr *oh; 26678 uint_t size; 26679 char *optptr; 26680 26681 size = sizeof (*tor) + sizeof (*oh) + optlen; 26682 mp = allocb(size, BPRI_MED); 26683 if (mp == NULL) 26684 return (NULL); 26685 26686 mp->b_wptr += size; 26687 mp->b_datap->db_type = M_PROTO; 26688 tor = (struct T_optmgmt_req *)mp->b_rptr; 26689 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 26690 tor->MGMT_flags = T_NEGOTIATE; 26691 tor->OPT_length = sizeof (*oh) + optlen; 26692 tor->OPT_offset = (t_scalar_t)sizeof (*tor); 26693 26694 oh = (struct opthdr *)&tor[1]; 26695 oh->level = level; 26696 oh->name = cmd; 26697 oh->len = optlen; 26698 if (optlen != 0) { 26699 optptr = (char *)&oh[1]; 26700 bcopy(opt, optptr, optlen); 26701 } 26702 return (mp); 26703 } 26704 26705 /* 26706 * TCP Timers Implementation. 
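 *
 * A minimal usage sketch (hypothetical caller with rto_msec standing in
 * for a retransmit interval in milliseconds; real callers normally go
 * through the TCP_TIMER_RESTART() style wrappers):
 *
 *	tcp->tcp_timer_tid = tcp_timeout(connp, tcp_timer,
 *	    MSEC_TO_TICK(rto_msec));
 *	...
 *	(void) tcp_timeout_cancel(connp, tcp->tcp_timer_tid);
 *
 * The returned timeout_id_t is really the mblk carrying the tcp_timer_t,
 * and the expiry callback is always delivered through the connection's
 * squeue rather than directly from callout/interrupt context.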
26707 */ 26708 timeout_id_t 26709 tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim) 26710 { 26711 mblk_t *mp; 26712 tcp_timer_t *tcpt; 26713 tcp_t *tcp = connp->conn_tcp; 26714 tcp_stack_t *tcps = tcp->tcp_tcps; 26715 26716 ASSERT(connp->conn_sqp != NULL); 26717 26718 TCP_DBGSTAT(tcps, tcp_timeout_calls); 26719 26720 if (tcp->tcp_timercache == NULL) { 26721 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); 26722 } else { 26723 TCP_DBGSTAT(tcps, tcp_timeout_cached_alloc); 26724 mp = tcp->tcp_timercache; 26725 tcp->tcp_timercache = mp->b_next; 26726 mp->b_next = NULL; 26727 ASSERT(mp->b_wptr == NULL); 26728 } 26729 26730 CONN_INC_REF(connp); 26731 tcpt = (tcp_timer_t *)mp->b_rptr; 26732 tcpt->connp = connp; 26733 tcpt->tcpt_proc = f; 26734 tcpt->tcpt_tid = timeout(tcp_timer_callback, mp, tim); 26735 return ((timeout_id_t)mp); 26736 } 26737 26738 static void 26739 tcp_timer_callback(void *arg) 26740 { 26741 mblk_t *mp = (mblk_t *)arg; 26742 tcp_timer_t *tcpt; 26743 conn_t *connp; 26744 26745 tcpt = (tcp_timer_t *)mp->b_rptr; 26746 connp = tcpt->connp; 26747 squeue_fill(connp->conn_sqp, mp, 26748 tcp_timer_handler, connp, SQTAG_TCP_TIMER); 26749 } 26750 26751 static void 26752 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) 26753 { 26754 tcp_timer_t *tcpt; 26755 conn_t *connp = (conn_t *)arg; 26756 tcp_t *tcp = connp->conn_tcp; 26757 26758 tcpt = (tcp_timer_t *)mp->b_rptr; 26759 ASSERT(connp == tcpt->connp); 26760 ASSERT((squeue_t *)arg2 == connp->conn_sqp); 26761 26762 /* 26763 * If the TCP has reached the closed state, don't proceed any 26764 * further. This TCP logically does not exist on the system. 26765 * tcpt_proc could for example access queues, that have already 26766 * been qprocoff'ed off. Also see comments at the start of tcp_input 26767 */ 26768 if (tcp->tcp_state != TCPS_CLOSED) { 26769 (*tcpt->tcpt_proc)(connp); 26770 } else { 26771 tcp->tcp_timer_tid = 0; 26772 } 26773 tcp_timer_free(connp->conn_tcp, mp); 26774 } 26775 26776 /* 26777 * There is potential race with untimeout and the handler firing at the same 26778 * time. The mblock may be freed by the handler while we are trying to use 26779 * it. But since both should execute on the same squeue, this race should not 26780 * occur. 26781 */ 26782 clock_t 26783 tcp_timeout_cancel(conn_t *connp, timeout_id_t id) 26784 { 26785 mblk_t *mp = (mblk_t *)id; 26786 tcp_timer_t *tcpt; 26787 clock_t delta; 26788 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; 26789 26790 TCP_DBGSTAT(tcps, tcp_timeout_cancel_reqs); 26791 26792 if (mp == NULL) 26793 return (-1); 26794 26795 tcpt = (tcp_timer_t *)mp->b_rptr; 26796 ASSERT(tcpt->connp == connp); 26797 26798 delta = untimeout(tcpt->tcpt_tid); 26799 26800 if (delta >= 0) { 26801 TCP_DBGSTAT(tcps, tcp_timeout_canceled); 26802 tcp_timer_free(connp->conn_tcp, mp); 26803 CONN_DEC_REF(connp); 26804 } 26805 26806 return (delta); 26807 } 26808 26809 /* 26810 * Allocate space for the timer event. The allocation looks like mblk, but it is 26811 * not a proper mblk. To avoid confusion we set b_wptr to NULL. 26812 * 26813 * Dealing with failures: If we can't allocate from the timer cache we try 26814 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr 26815 * points to b_rptr. 26816 * If we can't allocate anything using allocb_tryhard(), we perform a last 26817 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and 26818 * save the actual allocation size in b_datap. 
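 *
 * To summarize how tcp_timer_free() below tells the three flavors apart
 * (derived from the code, not an independent contract):
 *
 *	b_wptr == NULL		allocated from tcp_timercache
 *	b_wptr == b_rptr	from allocb_tryhard(), released with freeb()
 *	b_wptr == (uchar_t *)-1	from kmem_alloc_tryhard(), released with
 *				kmem_free() using the size stashed in b_datap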
26819 */ 26820 mblk_t * 26821 tcp_timermp_alloc(int kmflags) 26822 { 26823 mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache, 26824 kmflags & ~KM_PANIC); 26825 26826 if (mp != NULL) { 26827 mp->b_next = mp->b_prev = NULL; 26828 mp->b_rptr = (uchar_t *)(&mp[1]); 26829 mp->b_wptr = NULL; 26830 mp->b_datap = NULL; 26831 mp->b_queue = NULL; 26832 mp->b_cont = NULL; 26833 } else if (kmflags & KM_PANIC) { 26834 /* 26835 * Failed to allocate memory for the timer. Try allocating from 26836 * dblock caches. 26837 */ 26838 /* ipclassifier calls this from a constructor - hence no tcps */ 26839 TCP_G_STAT(tcp_timermp_allocfail); 26840 mp = allocb_tryhard(sizeof (tcp_timer_t)); 26841 if (mp == NULL) { 26842 size_t size = 0; 26843 /* 26844 * Memory is really low. Try tryhard allocation. 26845 * 26846 * ipclassifier calls this from a constructor - 26847 * hence no tcps 26848 */ 26849 TCP_G_STAT(tcp_timermp_allocdblfail); 26850 mp = kmem_alloc_tryhard(sizeof (mblk_t) + 26851 sizeof (tcp_timer_t), &size, kmflags); 26852 mp->b_rptr = (uchar_t *)(&mp[1]); 26853 mp->b_next = mp->b_prev = NULL; 26854 mp->b_wptr = (uchar_t *)-1; 26855 mp->b_datap = (dblk_t *)size; 26856 mp->b_queue = NULL; 26857 mp->b_cont = NULL; 26858 } 26859 ASSERT(mp->b_wptr != NULL); 26860 } 26861 /* ipclassifier calls this from a constructor - hence no tcps */ 26862 TCP_G_DBGSTAT(tcp_timermp_alloced); 26863 26864 return (mp); 26865 } 26866 26867 /* 26868 * Free per-tcp timer cache. 26869 * It can only contain entries from tcp_timercache. 26870 */ 26871 void 26872 tcp_timermp_free(tcp_t *tcp) 26873 { 26874 mblk_t *mp; 26875 26876 while ((mp = tcp->tcp_timercache) != NULL) { 26877 ASSERT(mp->b_wptr == NULL); 26878 tcp->tcp_timercache = tcp->tcp_timercache->b_next; 26879 kmem_cache_free(tcp_timercache, mp); 26880 } 26881 } 26882 26883 /* 26884 * Free timer event. Put it on the per-tcp timer cache if there is not too many 26885 * events there already (currently at most two events are cached). 26886 * If the event is not allocated from the timer cache, free it right away. 26887 */ 26888 static void 26889 tcp_timer_free(tcp_t *tcp, mblk_t *mp) 26890 { 26891 mblk_t *mp1 = tcp->tcp_timercache; 26892 tcp_stack_t *tcps = tcp->tcp_tcps; 26893 26894 if (mp->b_wptr != NULL) { 26895 /* 26896 * This allocation is not from a timer cache, free it right 26897 * away. 26898 */ 26899 if (mp->b_wptr != (uchar_t *)-1) 26900 freeb(mp); 26901 else 26902 kmem_free(mp, (size_t)mp->b_datap); 26903 } else if (mp1 == NULL || mp1->b_next == NULL) { 26904 /* Cache this timer block for future allocations */ 26905 mp->b_rptr = (uchar_t *)(&mp[1]); 26906 mp->b_next = mp1; 26907 tcp->tcp_timercache = mp; 26908 } else { 26909 kmem_cache_free(tcp_timercache, mp); 26910 TCP_DBGSTAT(tcps, tcp_timermp_freed); 26911 } 26912 } 26913 26914 /* 26915 * End of TCP Timers implementation. 26916 */ 26917 26918 /* 26919 * tcp_{set,clr}qfull() functions are used to either set or clear QFULL 26920 * on the specified backing STREAMS q. Note, the caller may make the 26921 * decision to call based on the tcp_t.tcp_flow_stopped value which 26922 * when check outside the q's lock is only an advisory check ... 
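 *
 * A typical caller pattern (illustrative only; the advisory check is
 * re-validated under QLOCK inside the functions themselves):
 *
 *	if (tcp->tcp_flow_stopped &&
 *	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater)
 *		tcp_clrqfull(tcp);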
26923 */ 26924 26925 void 26926 tcp_setqfull(tcp_t *tcp) 26927 { 26928 queue_t *q = tcp->tcp_wq; 26929 tcp_stack_t *tcps = tcp->tcp_tcps; 26930 26931 if (!(q->q_flag & QFULL)) { 26932 mutex_enter(QLOCK(q)); 26933 if (!(q->q_flag & QFULL)) { 26934 /* still need to set QFULL */ 26935 q->q_flag |= QFULL; 26936 tcp->tcp_flow_stopped = B_TRUE; 26937 mutex_exit(QLOCK(q)); 26938 TCP_STAT(tcps, tcp_flwctl_on); 26939 } else { 26940 mutex_exit(QLOCK(q)); 26941 } 26942 } 26943 } 26944 26945 void 26946 tcp_clrqfull(tcp_t *tcp) 26947 { 26948 queue_t *q = tcp->tcp_wq; 26949 26950 if (q->q_flag & QFULL) { 26951 mutex_enter(QLOCK(q)); 26952 if (q->q_flag & QFULL) { 26953 q->q_flag &= ~QFULL; 26954 tcp->tcp_flow_stopped = B_FALSE; 26955 mutex_exit(QLOCK(q)); 26956 if (q->q_flag & QWANTW) 26957 qbackenable(q, 0); 26958 } else { 26959 mutex_exit(QLOCK(q)); 26960 } 26961 } 26962 } 26963 26964 26965 /* 26966 * kstats related to squeues i.e. not per IP instance 26967 */ 26968 static void * 26969 tcp_g_kstat_init(tcp_g_stat_t *tcp_g_statp) 26970 { 26971 kstat_t *ksp; 26972 26973 tcp_g_stat_t template = { 26974 { "tcp_timermp_alloced", KSTAT_DATA_UINT64 }, 26975 { "tcp_timermp_allocfail", KSTAT_DATA_UINT64 }, 26976 { "tcp_timermp_allocdblfail", KSTAT_DATA_UINT64 }, 26977 { "tcp_freelist_cleanup", KSTAT_DATA_UINT64 }, 26978 }; 26979 26980 ksp = kstat_create(TCP_MOD_NAME, 0, "tcpstat_g", "net", 26981 KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 26982 KSTAT_FLAG_VIRTUAL); 26983 26984 if (ksp == NULL) 26985 return (NULL); 26986 26987 bcopy(&template, tcp_g_statp, sizeof (template)); 26988 ksp->ks_data = (void *)tcp_g_statp; 26989 26990 kstat_install(ksp); 26991 return (ksp); 26992 } 26993 26994 static void 26995 tcp_g_kstat_fini(kstat_t *ksp) 26996 { 26997 if (ksp != NULL) { 26998 kstat_delete(ksp); 26999 } 27000 } 27001 27002 27003 static void * 27004 tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp) 27005 { 27006 kstat_t *ksp; 27007 27008 tcp_stat_t template = { 27009 { "tcp_time_wait", KSTAT_DATA_UINT64 }, 27010 { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, 27011 { "tcp_time_wait_success", KSTAT_DATA_UINT64 }, 27012 { "tcp_time_wait_fail", KSTAT_DATA_UINT64 }, 27013 { "tcp_reinput_syn", KSTAT_DATA_UINT64 }, 27014 { "tcp_ip_output", KSTAT_DATA_UINT64 }, 27015 { "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 }, 27016 { "tcp_detach_time_wait", KSTAT_DATA_UINT64 }, 27017 { "tcp_time_wait_reap", KSTAT_DATA_UINT64 }, 27018 { "tcp_clean_death_nondetached", KSTAT_DATA_UINT64 }, 27019 { "tcp_reinit_calls", KSTAT_DATA_UINT64 }, 27020 { "tcp_eager_err1", KSTAT_DATA_UINT64 }, 27021 { "tcp_eager_err2", KSTAT_DATA_UINT64 }, 27022 { "tcp_eager_blowoff_calls", KSTAT_DATA_UINT64 }, 27023 { "tcp_eager_blowoff_q", KSTAT_DATA_UINT64 }, 27024 { "tcp_eager_blowoff_q0", KSTAT_DATA_UINT64 }, 27025 { "tcp_not_hard_bound", KSTAT_DATA_UINT64 }, 27026 { "tcp_no_listener", KSTAT_DATA_UINT64 }, 27027 { "tcp_found_eager", KSTAT_DATA_UINT64 }, 27028 { "tcp_wrong_queue", KSTAT_DATA_UINT64 }, 27029 { "tcp_found_eager_binding1", KSTAT_DATA_UINT64 }, 27030 { "tcp_found_eager_bound1", KSTAT_DATA_UINT64 }, 27031 { "tcp_eager_has_listener1", KSTAT_DATA_UINT64 }, 27032 { "tcp_open_alloc", KSTAT_DATA_UINT64 }, 27033 { "tcp_open_detached_alloc", KSTAT_DATA_UINT64 }, 27034 { "tcp_rput_time_wait", KSTAT_DATA_UINT64 }, 27035 { "tcp_listendrop", KSTAT_DATA_UINT64 }, 27036 { "tcp_listendropq0", KSTAT_DATA_UINT64 }, 27037 { "tcp_wrong_rq", KSTAT_DATA_UINT64 }, 27038 { "tcp_rsrv_calls", KSTAT_DATA_UINT64 }, 27039 { 
"tcp_eagerfree2", KSTAT_DATA_UINT64 }, 27040 { "tcp_eagerfree3", KSTAT_DATA_UINT64 }, 27041 { "tcp_eagerfree4", KSTAT_DATA_UINT64 }, 27042 { "tcp_eagerfree5", KSTAT_DATA_UINT64 }, 27043 { "tcp_timewait_syn_fail", KSTAT_DATA_UINT64 }, 27044 { "tcp_listen_badflags", KSTAT_DATA_UINT64 }, 27045 { "tcp_timeout_calls", KSTAT_DATA_UINT64 }, 27046 { "tcp_timeout_cached_alloc", KSTAT_DATA_UINT64 }, 27047 { "tcp_timeout_cancel_reqs", KSTAT_DATA_UINT64 }, 27048 { "tcp_timeout_canceled", KSTAT_DATA_UINT64 }, 27049 { "tcp_timermp_freed", KSTAT_DATA_UINT64 }, 27050 { "tcp_push_timer_cnt", KSTAT_DATA_UINT64 }, 27051 { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 }, 27052 { "tcp_ire_null1", KSTAT_DATA_UINT64 }, 27053 { "tcp_ire_null", KSTAT_DATA_UINT64 }, 27054 { "tcp_ip_send", KSTAT_DATA_UINT64 }, 27055 { "tcp_ip_ire_send", KSTAT_DATA_UINT64 }, 27056 { "tcp_wsrv_called", KSTAT_DATA_UINT64 }, 27057 { "tcp_flwctl_on", KSTAT_DATA_UINT64 }, 27058 { "tcp_timer_fire_early", KSTAT_DATA_UINT64 }, 27059 { "tcp_timer_fire_miss", KSTAT_DATA_UINT64 }, 27060 { "tcp_rput_v6_error", KSTAT_DATA_UINT64 }, 27061 { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, 27062 { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 27063 { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, 27064 { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, 27065 { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, 27066 { "tcp_zcopy_disable", KSTAT_DATA_UINT64 }, 27067 { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 }, 27068 { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 }, 27069 { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 }, 27070 { "tcp_mdt_discarded", KSTAT_DATA_UINT64 }, 27071 { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 }, 27072 { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 }, 27073 { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 }, 27074 { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 }, 27075 { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 }, 27076 { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 }, 27077 { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 }, 27078 { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 }, 27079 { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 }, 27080 { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 }, 27081 { "tcp_mdt_allocd", KSTAT_DATA_UINT64 }, 27082 { "tcp_mdt_linked", KSTAT_DATA_UINT64 }, 27083 { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, 27084 { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, 27085 { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, 27086 { "tcp_fusion_putnext", KSTAT_DATA_UINT64 }, 27087 { "tcp_fusion_unfusable", KSTAT_DATA_UINT64 }, 27088 { "tcp_fusion_aborted", KSTAT_DATA_UINT64 }, 27089 { "tcp_fusion_unqualified", KSTAT_DATA_UINT64 }, 27090 { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64 }, 27091 { "tcp_fusion_rrw_msgcnt", KSTAT_DATA_UINT64 }, 27092 { "tcp_fusion_rrw_plugged", KSTAT_DATA_UINT64 }, 27093 { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 }, 27094 { "tcp_sock_fallback", KSTAT_DATA_UINT64 }, 27095 { "tcp_lso_enabled", KSTAT_DATA_UINT64 }, 27096 { "tcp_lso_disabled", KSTAT_DATA_UINT64 }, 27097 { "tcp_lso_times", KSTAT_DATA_UINT64 }, 27098 { "tcp_lso_pkt_out", KSTAT_DATA_UINT64 }, 27099 }; 27100 27101 ksp = kstat_create_netstack(TCP_MOD_NAME, 0, "tcpstat", "net", 27102 KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 27103 KSTAT_FLAG_VIRTUAL, stackid); 27104 27105 if (ksp == NULL) 27106 return (NULL); 27107 27108 bcopy(&template, tcps_statisticsp, sizeof (template)); 27109 ksp->ks_data = (void *)tcps_statisticsp; 27110 ksp->ks_private = (void *)(uintptr_t)stackid; 27111 27112 kstat_install(ksp); 27113 return (ksp); 27114 } 27115 27116 static void 27117 tcp_kstat2_fini(netstackid_t stackid, 
kstat_t *ksp) 27118 { 27119 if (ksp != NULL) { 27120 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 27121 kstat_delete_netstack(ksp, stackid); 27122 } 27123 } 27124 27125 /* 27126 * TCP Kstats implementation 27127 */ 27128 static void * 27129 tcp_kstat_init(netstackid_t stackid, tcp_stack_t *tcps) 27130 { 27131 kstat_t *ksp; 27132 27133 tcp_named_kstat_t template = { 27134 { "rtoAlgorithm", KSTAT_DATA_INT32, 0 }, 27135 { "rtoMin", KSTAT_DATA_INT32, 0 }, 27136 { "rtoMax", KSTAT_DATA_INT32, 0 }, 27137 { "maxConn", KSTAT_DATA_INT32, 0 }, 27138 { "activeOpens", KSTAT_DATA_UINT32, 0 }, 27139 { "passiveOpens", KSTAT_DATA_UINT32, 0 }, 27140 { "attemptFails", KSTAT_DATA_UINT32, 0 }, 27141 { "estabResets", KSTAT_DATA_UINT32, 0 }, 27142 { "currEstab", KSTAT_DATA_UINT32, 0 }, 27143 { "inSegs", KSTAT_DATA_UINT64, 0 }, 27144 { "outSegs", KSTAT_DATA_UINT64, 0 }, 27145 { "retransSegs", KSTAT_DATA_UINT32, 0 }, 27146 { "connTableSize", KSTAT_DATA_INT32, 0 }, 27147 { "outRsts", KSTAT_DATA_UINT32, 0 }, 27148 { "outDataSegs", KSTAT_DATA_UINT32, 0 }, 27149 { "outDataBytes", KSTAT_DATA_UINT32, 0 }, 27150 { "retransBytes", KSTAT_DATA_UINT32, 0 }, 27151 { "outAck", KSTAT_DATA_UINT32, 0 }, 27152 { "outAckDelayed", KSTAT_DATA_UINT32, 0 }, 27153 { "outUrg", KSTAT_DATA_UINT32, 0 }, 27154 { "outWinUpdate", KSTAT_DATA_UINT32, 0 }, 27155 { "outWinProbe", KSTAT_DATA_UINT32, 0 }, 27156 { "outControl", KSTAT_DATA_UINT32, 0 }, 27157 { "outFastRetrans", KSTAT_DATA_UINT32, 0 }, 27158 { "inAckSegs", KSTAT_DATA_UINT32, 0 }, 27159 { "inAckBytes", KSTAT_DATA_UINT32, 0 }, 27160 { "inDupAck", KSTAT_DATA_UINT32, 0 }, 27161 { "inAckUnsent", KSTAT_DATA_UINT32, 0 }, 27162 { "inDataInorderSegs", KSTAT_DATA_UINT32, 0 }, 27163 { "inDataInorderBytes", KSTAT_DATA_UINT32, 0 }, 27164 { "inDataUnorderSegs", KSTAT_DATA_UINT32, 0 }, 27165 { "inDataUnorderBytes", KSTAT_DATA_UINT32, 0 }, 27166 { "inDataDupSegs", KSTAT_DATA_UINT32, 0 }, 27167 { "inDataDupBytes", KSTAT_DATA_UINT32, 0 }, 27168 { "inDataPartDupSegs", KSTAT_DATA_UINT32, 0 }, 27169 { "inDataPartDupBytes", KSTAT_DATA_UINT32, 0 }, 27170 { "inDataPastWinSegs", KSTAT_DATA_UINT32, 0 }, 27171 { "inDataPastWinBytes", KSTAT_DATA_UINT32, 0 }, 27172 { "inWinProbe", KSTAT_DATA_UINT32, 0 }, 27173 { "inWinUpdate", KSTAT_DATA_UINT32, 0 }, 27174 { "inClosed", KSTAT_DATA_UINT32, 0 }, 27175 { "rttUpdate", KSTAT_DATA_UINT32, 0 }, 27176 { "rttNoUpdate", KSTAT_DATA_UINT32, 0 }, 27177 { "timRetrans", KSTAT_DATA_UINT32, 0 }, 27178 { "timRetransDrop", KSTAT_DATA_UINT32, 0 }, 27179 { "timKeepalive", KSTAT_DATA_UINT32, 0 }, 27180 { "timKeepaliveProbe", KSTAT_DATA_UINT32, 0 }, 27181 { "timKeepaliveDrop", KSTAT_DATA_UINT32, 0 }, 27182 { "listenDrop", KSTAT_DATA_UINT32, 0 }, 27183 { "listenDropQ0", KSTAT_DATA_UINT32, 0 }, 27184 { "halfOpenDrop", KSTAT_DATA_UINT32, 0 }, 27185 { "outSackRetransSegs", KSTAT_DATA_UINT32, 0 }, 27186 { "connTableSize6", KSTAT_DATA_INT32, 0 } 27187 }; 27188 27189 ksp = kstat_create_netstack(TCP_MOD_NAME, 0, TCP_MOD_NAME, "mib2", 27190 KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0, stackid); 27191 27192 if (ksp == NULL) 27193 return (NULL); 27194 27195 template.rtoAlgorithm.value.ui32 = 4; 27196 template.rtoMin.value.ui32 = tcps->tcps_rexmit_interval_min; 27197 template.rtoMax.value.ui32 = tcps->tcps_rexmit_interval_max; 27198 template.maxConn.value.i32 = -1; 27199 27200 bcopy(&template, ksp->ks_data, sizeof (template)); 27201 ksp->ks_update = tcp_kstat_update; 27202 ksp->ks_private = (void *)(uintptr_t)stackid; 27203 27204 kstat_install(ksp); 27205 return (ksp); 
27206 } 27207 27208 static void 27209 tcp_kstat_fini(netstackid_t stackid, kstat_t *ksp) 27210 { 27211 if (ksp != NULL) { 27212 ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); 27213 kstat_delete_netstack(ksp, stackid); 27214 } 27215 } 27216 27217 static int 27218 tcp_kstat_update(kstat_t *kp, int rw) 27219 { 27220 tcp_named_kstat_t *tcpkp; 27221 tcp_t *tcp; 27222 connf_t *connfp; 27223 conn_t *connp; 27224 int i; 27225 netstackid_t stackid = (netstackid_t)(uintptr_t)kp->ks_private; 27226 netstack_t *ns; 27227 tcp_stack_t *tcps; 27228 ip_stack_t *ipst; 27229 27230 if ((kp == NULL) || (kp->ks_data == NULL)) 27231 return (EIO); 27232 27233 if (rw == KSTAT_WRITE) 27234 return (EACCES); 27235 27236 ns = netstack_find_by_stackid(stackid); 27237 if (ns == NULL) 27238 return (-1); 27239 tcps = ns->netstack_tcp; 27240 if (tcps == NULL) { 27241 netstack_rele(ns); 27242 return (-1); 27243 } 27244 tcpkp = (tcp_named_kstat_t *)kp->ks_data; 27245 27246 tcpkp->currEstab.value.ui32 = 0; 27247 27248 ipst = ns->netstack_ip; 27249 27250 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 27251 connfp = &ipst->ips_ipcl_globalhash_fanout[i]; 27252 connp = NULL; 27253 while ((connp = 27254 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 27255 tcp = connp->conn_tcp; 27256 switch (tcp_snmp_state(tcp)) { 27257 case MIB2_TCP_established: 27258 case MIB2_TCP_closeWait: 27259 tcpkp->currEstab.value.ui32++; 27260 break; 27261 } 27262 } 27263 } 27264 27265 tcpkp->activeOpens.value.ui32 = tcps->tcps_mib.tcpActiveOpens; 27266 tcpkp->passiveOpens.value.ui32 = tcps->tcps_mib.tcpPassiveOpens; 27267 tcpkp->attemptFails.value.ui32 = tcps->tcps_mib.tcpAttemptFails; 27268 tcpkp->estabResets.value.ui32 = tcps->tcps_mib.tcpEstabResets; 27269 tcpkp->inSegs.value.ui64 = tcps->tcps_mib.tcpHCInSegs; 27270 tcpkp->outSegs.value.ui64 = tcps->tcps_mib.tcpHCOutSegs; 27271 tcpkp->retransSegs.value.ui32 = tcps->tcps_mib.tcpRetransSegs; 27272 tcpkp->connTableSize.value.i32 = tcps->tcps_mib.tcpConnTableSize; 27273 tcpkp->outRsts.value.ui32 = tcps->tcps_mib.tcpOutRsts; 27274 tcpkp->outDataSegs.value.ui32 = tcps->tcps_mib.tcpOutDataSegs; 27275 tcpkp->outDataBytes.value.ui32 = tcps->tcps_mib.tcpOutDataBytes; 27276 tcpkp->retransBytes.value.ui32 = tcps->tcps_mib.tcpRetransBytes; 27277 tcpkp->outAck.value.ui32 = tcps->tcps_mib.tcpOutAck; 27278 tcpkp->outAckDelayed.value.ui32 = tcps->tcps_mib.tcpOutAckDelayed; 27279 tcpkp->outUrg.value.ui32 = tcps->tcps_mib.tcpOutUrg; 27280 tcpkp->outWinUpdate.value.ui32 = tcps->tcps_mib.tcpOutWinUpdate; 27281 tcpkp->outWinProbe.value.ui32 = tcps->tcps_mib.tcpOutWinProbe; 27282 tcpkp->outControl.value.ui32 = tcps->tcps_mib.tcpOutControl; 27283 tcpkp->outFastRetrans.value.ui32 = tcps->tcps_mib.tcpOutFastRetrans; 27284 tcpkp->inAckSegs.value.ui32 = tcps->tcps_mib.tcpInAckSegs; 27285 tcpkp->inAckBytes.value.ui32 = tcps->tcps_mib.tcpInAckBytes; 27286 tcpkp->inDupAck.value.ui32 = tcps->tcps_mib.tcpInDupAck; 27287 tcpkp->inAckUnsent.value.ui32 = tcps->tcps_mib.tcpInAckUnsent; 27288 tcpkp->inDataInorderSegs.value.ui32 = 27289 tcps->tcps_mib.tcpInDataInorderSegs; 27290 tcpkp->inDataInorderBytes.value.ui32 = 27291 tcps->tcps_mib.tcpInDataInorderBytes; 27292 tcpkp->inDataUnorderSegs.value.ui32 = 27293 tcps->tcps_mib.tcpInDataUnorderSegs; 27294 tcpkp->inDataUnorderBytes.value.ui32 = 27295 tcps->tcps_mib.tcpInDataUnorderBytes; 27296 tcpkp->inDataDupSegs.value.ui32 = tcps->tcps_mib.tcpInDataDupSegs; 27297 tcpkp->inDataDupBytes.value.ui32 = tcps->tcps_mib.tcpInDataDupBytes; 27298 tcpkp->inDataPartDupSegs.value.ui32 = 
27299 tcps->tcps_mib.tcpInDataPartDupSegs; 27300 tcpkp->inDataPartDupBytes.value.ui32 = 27301 tcps->tcps_mib.tcpInDataPartDupBytes; 27302 tcpkp->inDataPastWinSegs.value.ui32 = 27303 tcps->tcps_mib.tcpInDataPastWinSegs; 27304 tcpkp->inDataPastWinBytes.value.ui32 = 27305 tcps->tcps_mib.tcpInDataPastWinBytes; 27306 tcpkp->inWinProbe.value.ui32 = tcps->tcps_mib.tcpInWinProbe; 27307 tcpkp->inWinUpdate.value.ui32 = tcps->tcps_mib.tcpInWinUpdate; 27308 tcpkp->inClosed.value.ui32 = tcps->tcps_mib.tcpInClosed; 27309 tcpkp->rttNoUpdate.value.ui32 = tcps->tcps_mib.tcpRttNoUpdate; 27310 tcpkp->rttUpdate.value.ui32 = tcps->tcps_mib.tcpRttUpdate; 27311 tcpkp->timRetrans.value.ui32 = tcps->tcps_mib.tcpTimRetrans; 27312 tcpkp->timRetransDrop.value.ui32 = tcps->tcps_mib.tcpTimRetransDrop; 27313 tcpkp->timKeepalive.value.ui32 = tcps->tcps_mib.tcpTimKeepalive; 27314 tcpkp->timKeepaliveProbe.value.ui32 = 27315 tcps->tcps_mib.tcpTimKeepaliveProbe; 27316 tcpkp->timKeepaliveDrop.value.ui32 = 27317 tcps->tcps_mib.tcpTimKeepaliveDrop; 27318 tcpkp->listenDrop.value.ui32 = tcps->tcps_mib.tcpListenDrop; 27319 tcpkp->listenDropQ0.value.ui32 = tcps->tcps_mib.tcpListenDropQ0; 27320 tcpkp->halfOpenDrop.value.ui32 = tcps->tcps_mib.tcpHalfOpenDrop; 27321 tcpkp->outSackRetransSegs.value.ui32 = 27322 tcps->tcps_mib.tcpOutSackRetransSegs; 27323 tcpkp->connTableSize6.value.i32 = tcps->tcps_mib.tcp6ConnTableSize; 27324 27325 netstack_rele(ns); 27326 return (0); 27327 } 27328 27329 void 27330 tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) 27331 { 27332 uint16_t hdr_len; 27333 ipha_t *ipha; 27334 uint8_t *nexthdrp; 27335 tcph_t *tcph; 27336 tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; 27337 27338 /* Already has an eager */ 27339 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 27340 TCP_STAT(tcps, tcp_reinput_syn); 27341 squeue_enter(connp->conn_sqp, mp, connp->conn_recv, 27342 connp, SQTAG_TCP_REINPUT_EAGER); 27343 return; 27344 } 27345 27346 switch (IPH_HDR_VERSION(mp->b_rptr)) { 27347 case IPV4_VERSION: 27348 ipha = (ipha_t *)mp->b_rptr; 27349 hdr_len = IPH_HDR_LENGTH(ipha); 27350 break; 27351 case IPV6_VERSION: 27352 if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, 27353 &hdr_len, &nexthdrp)) { 27354 CONN_DEC_REF(connp); 27355 freemsg(mp); 27356 return; 27357 } 27358 break; 27359 } 27360 27361 tcph = (tcph_t *)&mp->b_rptr[hdr_len]; 27362 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 27363 mp->b_datap->db_struioflag |= STRUIO_EAGER; 27364 DB_CKSUMSTART(mp) = (intptr_t)sqp; 27365 } 27366 27367 squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp, 27368 SQTAG_TCP_REINPUT); 27369 } 27370 27371 static squeue_func_t 27372 tcp_squeue_switch(int val) 27373 { 27374 squeue_func_t rval = squeue_fill; 27375 27376 switch (val) { 27377 case 1: 27378 rval = squeue_enter_nodrain; 27379 break; 27380 case 2: 27381 rval = squeue_enter; 27382 break; 27383 default: 27384 break; 27385 } 27386 return (rval); 27387 } 27388 27389 /* 27390 * This is called once for each squeue - globally for all stack 27391 * instances. 27392 */ 27393 static void 27394 tcp_squeue_add(squeue_t *sqp) 27395 { 27396 tcp_squeue_priv_t *tcp_time_wait = kmem_zalloc( 27397 sizeof (tcp_squeue_priv_t), KM_SLEEP); 27398 27399 *squeue_getprivate(sqp, SQPRIVATE_TCP) = (intptr_t)tcp_time_wait; 27400 tcp_time_wait->tcp_time_wait_tid = timeout(tcp_time_wait_collector, 27401 sqp, TCP_TIME_WAIT_DELAY); 27402 if (tcp_free_list_max_cnt == 0) { 27403 int tcp_ncpus = ((boot_max_ncpus == -1) ? 
27404 max_ncpus : boot_max_ncpus); 27405 27406 /* 27407 * Limit number of entries to 1% of available memory / tcp_ncpus 27408 */ 27409 tcp_free_list_max_cnt = (freemem * PAGESIZE) / 27410 (tcp_ncpus * sizeof (tcp_t) * 100); 27411 } 27412 tcp_time_wait->tcp_free_list_cnt = 0; 27413 } 27414
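
/*
 * A rough worked example of the sizing above (illustrative figures only;
 * sizeof (tcp_t) and freemem vary by release and machine): with 2 GB of
 * free memory, 4 CPUs and a ~4 KB tcp_t, the cap comes out to
 * 2 GB / (4 * 4 KB * 100), i.e. roughly 1300 cached tcp_t's per squeue,
 * keeping the total free-list footprint near 1% of free memory.
 */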