1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1990 Mentat Inc. */ 27 28 #pragma ident "%Z%%M% %I% %E% SMI" 29 const char tcp_version[] = "%Z%%M% %I% %E% SMI"; 30 31 32 #include <sys/types.h> 33 #include <sys/stream.h> 34 #include <sys/strsun.h> 35 #include <sys/strsubr.h> 36 #include <sys/stropts.h> 37 #include <sys/strlog.h> 38 #include <sys/strsun.h> 39 #define _SUN_TPI_VERSION 2 40 #include <sys/tihdr.h> 41 #include <sys/timod.h> 42 #include <sys/ddi.h> 43 #include <sys/sunddi.h> 44 #include <sys/suntpi.h> 45 #include <sys/xti_inet.h> 46 #include <sys/cmn_err.h> 47 #include <sys/debug.h> 48 #include <sys/vtrace.h> 49 #include <sys/kmem.h> 50 #include <sys/ethernet.h> 51 #include <sys/cpuvar.h> 52 #include <sys/dlpi.h> 53 #include <sys/multidata.h> 54 #include <sys/multidata_impl.h> 55 #include <sys/pattr.h> 56 #include <sys/policy.h> 57 #include <sys/priv.h> 58 #include <sys/zone.h> 59 60 #include <sys/errno.h> 61 #include <sys/signal.h> 62 #include <sys/socket.h> 63 #include <sys/sockio.h> 64 #include <sys/isa_defs.h> 65 #include <sys/md5.h> 66 #include <sys/random.h> 67 #include <netinet/in.h> 68 #include <netinet/tcp.h> 69 #include <netinet/ip6.h> 70 #include <netinet/icmp6.h> 71 #include <net/if.h> 72 #include <net/route.h> 73 #include <inet/ipsec_impl.h> 74 75 #include <inet/common.h> 76 #include <inet/ip.h> 77 #include <inet/ip_impl.h> 78 #include <inet/ip6.h> 79 #include <inet/ip_ndp.h> 80 #include <inet/mi.h> 81 #include <inet/mib2.h> 82 #include <inet/nd.h> 83 #include <inet/optcom.h> 84 #include <inet/snmpcom.h> 85 #include <inet/kstatcom.h> 86 #include <inet/tcp.h> 87 #include <inet/tcp_impl.h> 88 #include <net/pfkeyv2.h> 89 #include <inet/ipsec_info.h> 90 #include <inet/ipdrop.h> 91 #include <inet/tcp_trace.h> 92 93 #include <inet/ipclassifier.h> 94 #include <inet/ip_ire.h> 95 #include <inet/ip_ftable.h> 96 #include <inet/ip_if.h> 97 #include <inet/ipp_common.h> 98 #include <sys/squeue.h> 99 #include <inet/kssl/ksslapi.h> 100 #include <sys/tsol/label.h> 101 #include <sys/tsol/tnet.h> 102 #include <sys/sdt.h> 103 #include <rpc/pmap_prot.h> 104 105 /* 106 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433) 107 * 108 * (Read the detailed design doc in PSARC case directory) 109 * 110 * The entire tcp state is contained in tcp_t and conn_t structure 111 * which are allocated in tandem using ipcl_conn_create() and passing 112 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect 113 * the references on the tcp_t. 
The tcp_t structure is never compressed
 * and packets always land on the correct TCP perimeter from the time the
 * eager is created until the tcp_t dies (as such the old mentat
 * TCP global queue is not used for the detached state and no IPsec checking
 * is required). The global queue is still allocated to send out resets
 * for connections which have no listeners and IP directly calls
 * tcp_xmit_listeners_reset() which does any policy check.
 *
 * Protection and Synchronisation mechanism:
 *
 * The tcp data structure does not use any kind of lock for protecting
 * its state but instead uses 'squeues' for mutual exclusion from various
 * read and write side threads. To access a tcp member, the thread should
 * always be behind the squeue (via squeue_enter, squeue_enter_nodrain, or
 * squeue_fill). Since the squeues allow a direct function call, a caller
 * can pass any tcp function with an edesc_t prototype as the argument
 * (different from the traditional STREAMS model where packets come in only
 * through designated entry points). The list of functions that can be
 * directly called via squeue appears before the usual function prototypes.
 *
 * Referencing:
 *
 * TCP is MT-Hot and we use a reference based scheme to make sure that the
 * tcp structure doesn't disappear when it's needed. When the application
 * creates an outgoing connection or accepts an incoming connection, we
 * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
 * The IP reference is just a symbolic reference since ip_tcpclose()
 * looks at the tcp structure after tcp_close_output() returns, which could
 * have dropped the last TCP reference. So as long as the connection is
 * in the attached state, i.e. !TCP_IS_DETACHED, we have 2 references on the
 * conn_t. The classifier puts its own reference when the connection is
 * inserted in the listen or connected hash. Any time a thread needs to enter
 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
 * on the write side, or by doing a classify on the read side, and then puts
 * a reference on the conn before doing squeue_enter/tryenter/fill (a sketch
 * of this pattern appears below). For the read side, the classifier itself
 * puts the reference under the fanout lock to make sure that the tcp can't
 * disappear before it gets processed. The squeue will drop this reference
 * automatically so the called function doesn't have to do a DEC_REF.
 *
 * Opening a new connection:
 *
 * The outgoing connection open is pretty simple. ip_tcpopen() does the
 * work in creating the conn/tcp structure and initializing it. The
 * squeue assignment is done based on the CPU the application
 * is running on. So for outbound connections, processing is always done
 * on the application CPU, which might be different from the incoming CPU
 * being interrupted by the NIC. An optimal way would be to figure out
 * the NIC <-> CPU binding at listen time, and assign the outgoing
 * connection to the squeue attached to the CPU that will be interrupted
 * for incoming packets (we know the NIC based on the bind IP address).
 * This might seem like a problem if more data is going out, but the
 * fact is that in most cases the transmit is ACK-driven, where
 * the outgoing data normally sits on TCP's xmit queue waiting to be
 * transmitted.
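 *
 * A minimal sketch of that referencing pattern on the write side (purely
 * illustrative; the exact squeue_enter() argument list and tag value are
 * per squeue.h, and tcp_wput_nondata stands in for whatever edesc_t
 * function is being scheduled):
 *
 *	conn_t	*connp = Q_TO_CONN(q);		(conn/tcp from q->q_ptr)
 *	CONN_INC_REF(connp);			(reference dropped by squeue)
 *	squeue_enter(connp->conn_sqp, mp, tcp_wput_nondata, connp, tag);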
 *
 * Accepting a connection:
 *
 * This is a more interesting case because of the various races involved in
 * establishing an eager in its own perimeter. Read the meta comment on
 * top of tcp_conn_request(). But briefly, the squeue is picked by
 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
 *
 * Closing a connection:
 *
 * The close is fairly straightforward. tcp_close() calls tcp_close_output()
 * via the squeue to do the close and mark the tcp as detached if the
 * connection was in state TCPS_ESTABLISHED or greater. In the latter case,
 * TCP keeps its reference but tcp_close() always drops IP's reference. So if
 * the tcp was not killed, it is sitting in the time_wait list with 2
 * references - 1 for TCP and 1 because it is in the classifier's connected
 * hash. This is the condition we use to determine that it's OK to clean up
 * the tcp outside of the squeue when the time wait expires (check the ref
 * under the fanout and conn_lock and, if it is 2, remove it from the fanout
 * hash and kill it); a sketch of this check appears at the end of this
 * comment.
 *
 * Although close just drops the necessary references and marks the
 * tcp_detached state, tcp_close needs to know that tcp_detached has been
 * set (under the squeue) before letting the STREAM go away (because an
 * inbound packet might attempt to go up the STREAM while the close
 * has happened and tcp_detached is not set). So a special lock and
 * flag are used along with a condition variable (tcp_closelock, tcp_closed,
 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
 * tcp_detached.
 *
 * Special provisions and fast paths:
 *
 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
 * can't have 'ipv6_recvpktinfo' set, and for these types of sockets, IP
 * will never send an M_CTL to TCP. As such, ip_tcp_input(), which handles
 * all TCP packets from the wire, makes an IPCL_IS_TCP4_CONNECTED_NO_POLICY
 * check to send packets directly to tcp_rput_data via the squeue. Everyone
 * else comes through tcp_input() on the read side.
 *
 * We also make special provisions for sockfs by marking tcp_issocket
 * whenever we have only sockfs on top of TCP. This allows us to skip
 * putting the tcp in the acceptor hash, since a sockfs listener can never
 * become an acceptor, and also to avoid allocating a tcp_t for the acceptor
 * STREAM, since the eager has already been allocated and the accept now
 * happens on the acceptor STREAM. There is a big block comment on top of
 * tcp_conn_request explaining the new accept. When the socket is POP'd,
 * sockfs sends us an ioctl to mark the fact and we go back to the old
 * behaviour. Once tcp_issocket is unset, it's never set for the
 * life of that connection.
 *
 * IPsec notes:
 *
 * Since a packet is always executed on the correct TCP perimeter,
 * all IPsec processing is deferred to IP, including checking new
 * connections and setting IPsec policies for new connections. The
 * only exception is tcp_xmit_listeners_reset(), which is called
 * directly from IP and needs to do a policy check to see if a TH_RST
 * can be sent out.
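 *
 * For reference, the "is it safe to reap outside the squeue" test described
 * above amounts to roughly the following (an illustrative sketch only; the
 * fanout lock name is per ipclassifier.h):
 *
 *	mutex_enter(&connfp->connf_lock);
 *	mutex_enter(&connp->conn_lock);
 *	if (connp->conn_ref == 2) {
 *		(only TCP and the classifier hold references; the tcp can
 *		be unlinked from the connected hash and destroyed)
 *	}
 *	mutex_exit(&connp->conn_lock);
 *	mutex_exit(&connfp->connf_lock);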
 */

extern major_t TCP6_MAJ;

/*
 * Values for squeue switch:
 * 1: squeue_enter_nodrain
 * 2: squeue_enter
 * 3: squeue_fill
 */
int tcp_squeue_close = 2;
int tcp_squeue_wput = 2;

squeue_func_t tcp_squeue_close_proc;
squeue_func_t tcp_squeue_wput_proc;

/*
 * This controls how tiny a write must be before we try to copy it
 * into the mblk on the tail of the transmit queue.  Not much
 * speedup is observed for values larger than sixteen.  Zero will
 * disable the optimisation.
 */
int tcp_tx_pull_len = 16;

/*
 * TCP Statistics.
 *
 * How TCP statistics work.
 *
 * There are two types of statistics invoked by two macros.
 *
 * TCP_STAT(name) does a non-atomic increment of a named stat counter. It is
 * supposed to be used in non MT-hot paths of the code.
 *
 * TCP_DBGSTAT(name) does an atomic increment of a named stat counter. It is
 * supposed to be used for DEBUG purposes and may be used on a hot path.
 *
 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
 * (use "kstat tcp" to get them).
 *
 * There is also an additional debugging facility that marks tcp_clean_death()
 * instances and saves them in the tcp_t structure. It is triggered by the
 * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
 * tcp_clean_death() calls that counts the number of times each tag was hit. It
 * is triggered by the TCP_CLD_COUNTERS define.
 *
 * How to add new counters.
 *
 * 1) Add a field in the tcp_stat structure describing your counter.
 * 2) Add a line in tcp_statistics with the name of the counter.
 *
 *    IMPORTANT!! - make sure that both are in sync !!
 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
 *
 * A sketch of these steps appears at the end of this comment.
 *
 * Please avoid using private counters which are not kstat-exported.
 *
 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
 * in the tcp_t structure.
 *
 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
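 *
 * As a sketch of steps 1-3 above, using a hypothetical counter name:
 *
 *	1) in the tcp_stat_t structure:	kstat_named_t	tcp_foo_events;
 *	2) in tcp_statistics below:	{ "tcp_foo_events", KSTAT_DATA_UINT64 },
 *	3) at the point of interest:	TCP_DBGSTAT(tcp_foo_events);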
285 */ 286 287 #ifndef TCP_DEBUG_COUNTER 288 #ifdef DEBUG 289 #define TCP_DEBUG_COUNTER 1 290 #else 291 #define TCP_DEBUG_COUNTER 0 292 #endif 293 #endif 294 295 #define TCP_CLD_COUNTERS 0 296 297 #define TCP_TAG_CLEAN_DEATH 1 298 #define TCP_MAX_CLEAN_DEATH_TAG 32 299 300 #ifdef lint 301 static int _lint_dummy_; 302 #endif 303 304 #if TCP_CLD_COUNTERS 305 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; 306 #define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++ 307 #elif defined(lint) 308 #define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0); 309 #else 310 #define TCP_CLD_STAT(x) 311 #endif 312 313 #if TCP_DEBUG_COUNTER 314 #define TCP_DBGSTAT(x) atomic_add_64(&(tcp_statistics.x.value.ui64), 1) 315 #elif defined(lint) 316 #define TCP_DBGSTAT(x) ASSERT(_lint_dummy_ == 0); 317 #else 318 #define TCP_DBGSTAT(x) 319 #endif 320 321 tcp_stat_t tcp_statistics = { 322 { "tcp_time_wait", KSTAT_DATA_UINT64 }, 323 { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, 324 { "tcp_time_wait_success", KSTAT_DATA_UINT64 }, 325 { "tcp_time_wait_fail", KSTAT_DATA_UINT64 }, 326 { "tcp_reinput_syn", KSTAT_DATA_UINT64 }, 327 { "tcp_ip_output", KSTAT_DATA_UINT64 }, 328 { "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 }, 329 { "tcp_detach_time_wait", KSTAT_DATA_UINT64 }, 330 { "tcp_time_wait_reap", KSTAT_DATA_UINT64 }, 331 { "tcp_clean_death_nondetached", KSTAT_DATA_UINT64 }, 332 { "tcp_reinit_calls", KSTAT_DATA_UINT64 }, 333 { "tcp_eager_err1", KSTAT_DATA_UINT64 }, 334 { "tcp_eager_err2", KSTAT_DATA_UINT64 }, 335 { "tcp_eager_blowoff_calls", KSTAT_DATA_UINT64 }, 336 { "tcp_eager_blowoff_q", KSTAT_DATA_UINT64 }, 337 { "tcp_eager_blowoff_q0", KSTAT_DATA_UINT64 }, 338 { "tcp_not_hard_bound", KSTAT_DATA_UINT64 }, 339 { "tcp_no_listener", KSTAT_DATA_UINT64 }, 340 { "tcp_found_eager", KSTAT_DATA_UINT64 }, 341 { "tcp_wrong_queue", KSTAT_DATA_UINT64 }, 342 { "tcp_found_eager_binding1", KSTAT_DATA_UINT64 }, 343 { "tcp_found_eager_bound1", KSTAT_DATA_UINT64 }, 344 { "tcp_eager_has_listener1", KSTAT_DATA_UINT64 }, 345 { "tcp_open_alloc", KSTAT_DATA_UINT64 }, 346 { "tcp_open_detached_alloc", KSTAT_DATA_UINT64 }, 347 { "tcp_rput_time_wait", KSTAT_DATA_UINT64 }, 348 { "tcp_listendrop", KSTAT_DATA_UINT64 }, 349 { "tcp_listendropq0", KSTAT_DATA_UINT64 }, 350 { "tcp_wrong_rq", KSTAT_DATA_UINT64 }, 351 { "tcp_rsrv_calls", KSTAT_DATA_UINT64 }, 352 { "tcp_eagerfree2", KSTAT_DATA_UINT64 }, 353 { "tcp_eagerfree3", KSTAT_DATA_UINT64 }, 354 { "tcp_eagerfree4", KSTAT_DATA_UINT64 }, 355 { "tcp_eagerfree5", KSTAT_DATA_UINT64 }, 356 { "tcp_timewait_syn_fail", KSTAT_DATA_UINT64 }, 357 { "tcp_listen_badflags", KSTAT_DATA_UINT64 }, 358 { "tcp_timeout_calls", KSTAT_DATA_UINT64 }, 359 { "tcp_timeout_cached_alloc", KSTAT_DATA_UINT64 }, 360 { "tcp_timeout_cancel_reqs", KSTAT_DATA_UINT64 }, 361 { "tcp_timeout_canceled", KSTAT_DATA_UINT64 }, 362 { "tcp_timermp_alloced", KSTAT_DATA_UINT64 }, 363 { "tcp_timermp_freed", KSTAT_DATA_UINT64 }, 364 { "tcp_timermp_allocfail", KSTAT_DATA_UINT64 }, 365 { "tcp_timermp_allocdblfail", KSTAT_DATA_UINT64 }, 366 { "tcp_push_timer_cnt", KSTAT_DATA_UINT64 }, 367 { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 }, 368 { "tcp_ire_null1", KSTAT_DATA_UINT64 }, 369 { "tcp_ire_null", KSTAT_DATA_UINT64 }, 370 { "tcp_ip_send", KSTAT_DATA_UINT64 }, 371 { "tcp_ip_ire_send", KSTAT_DATA_UINT64 }, 372 { "tcp_wsrv_called", KSTAT_DATA_UINT64 }, 373 { "tcp_flwctl_on", KSTAT_DATA_UINT64 }, 374 { "tcp_timer_fire_early", KSTAT_DATA_UINT64 }, 375 { "tcp_timer_fire_miss", KSTAT_DATA_UINT64 }, 376 { "tcp_freelist_cleanup", KSTAT_DATA_UINT64 }, 377 { 
"tcp_rput_v6_error", KSTAT_DATA_UINT64 }, 378 { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, 379 { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 380 { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, 381 { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, 382 { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, 383 { "tcp_zcopy_disable", KSTAT_DATA_UINT64 }, 384 { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 }, 385 { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 }, 386 { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 }, 387 { "tcp_mdt_discarded", KSTAT_DATA_UINT64 }, 388 { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 }, 389 { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 }, 390 { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 }, 391 { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 }, 392 { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 }, 393 { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 }, 394 { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 }, 395 { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 }, 396 { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 }, 397 { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 }, 398 { "tcp_mdt_allocd", KSTAT_DATA_UINT64 }, 399 { "tcp_mdt_linked", KSTAT_DATA_UINT64 }, 400 { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, 401 { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, 402 { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, 403 { "tcp_fusion_putnext", KSTAT_DATA_UINT64 }, 404 { "tcp_fusion_unfusable", KSTAT_DATA_UINT64 }, 405 { "tcp_fusion_aborted", KSTAT_DATA_UINT64 }, 406 { "tcp_fusion_unqualified", KSTAT_DATA_UINT64 }, 407 { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64 }, 408 { "tcp_fusion_rrw_msgcnt", KSTAT_DATA_UINT64 }, 409 { "tcp_fusion_rrw_plugged", KSTAT_DATA_UINT64 }, 410 { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 }, 411 { "tcp_sock_fallback", KSTAT_DATA_UINT64 }, 412 }; 413 414 static kstat_t *tcp_kstat; 415 416 /* 417 * Call either ip_output or ip_output_v6. This replaces putnext() calls on the 418 * tcp write side. 419 */ 420 #define CALL_IP_WPUT(connp, q, mp) { \ 421 ASSERT(((q)->q_flag & QREADR) == 0); \ 422 TCP_DBGSTAT(tcp_ip_output); \ 423 connp->conn_send(connp, (mp), (q), IP_WPUT); \ 424 } 425 426 /* Macros for timestamp comparisons */ 427 #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 428 #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 429 430 /* 431 * Parameters for TCP Initial Send Sequence number (ISS) generation. When 432 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated 433 * by adding three components: a time component which grows by 1 every 4096 434 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); 435 * a per-connection component which grows by 125000 for every new connection; 436 * and an "extra" component that grows by a random amount centered 437 * approximately on 64000. This causes the the ISS generator to cycle every 438 * 4.89 hours if no TCP connections are made, and faster if connections are 439 * made. 440 * 441 * When tcp_strong_iss is set to 0, ISS is calculated by adding two 442 * components: a time component which grows by 250000 every second; and 443 * a per-connection component which grows by 125000 for every new connections. 444 * 445 * A third method, when tcp_strong_iss is set to 2, for generating ISS is 446 * prescribed by Steve Bellovin. This involves adding time, the 125000 per 447 * connection, and a one-way hash (MD5) of the connection ID <sport, dport, 448 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered 449 * password. 
450 */ 451 #define ISS_INCR 250000 452 #define ISS_NSEC_SHT 12 453 454 static uint32_t tcp_iss_incr_extra; /* Incremented for each connection */ 455 static kmutex_t tcp_iss_key_lock; 456 static MD5_CTX tcp_iss_key; 457 static sin_t sin_null; /* Zero address for quick clears */ 458 static sin6_t sin6_null; /* Zero address for quick clears */ 459 460 /* Packet dropper for TCP IPsec policy drops. */ 461 static ipdropper_t tcp_dropper; 462 463 /* 464 * This implementation follows the 4.3BSD interpretation of the urgent 465 * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause 466 * incompatible changes in protocols like telnet and rlogin. 467 */ 468 #define TCP_OLD_URP_INTERPRETATION 1 469 470 #define TCP_IS_DETACHED_NONEAGER(tcp) \ 471 (TCP_IS_DETACHED(tcp) && \ 472 (!(tcp)->tcp_hard_binding)) 473 474 /* 475 * TCP reassembly macros. We hide starting and ending sequence numbers in 476 * b_next and b_prev of messages on the reassembly queue. The messages are 477 * chained using b_cont. These macros are used in tcp_reass() so we don't 478 * have to see the ugly casts and assignments. 479 */ 480 #define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next)) 481 #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \ 482 (mblk_t *)(uintptr_t)(u)) 483 #define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev)) 484 #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \ 485 (mblk_t *)(uintptr_t)(u)) 486 487 /* 488 * Implementation of TCP Timers. 489 * ============================= 490 * 491 * INTERFACE: 492 * 493 * There are two basic functions dealing with tcp timers: 494 * 495 * timeout_id_t tcp_timeout(connp, func, time) 496 * clock_t tcp_timeout_cancel(connp, timeout_id) 497 * TCP_TIMER_RESTART(tcp, intvl) 498 * 499 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' 500 * after 'time' ticks passed. The function called by timeout() must adhere to 501 * the same restrictions as a driver soft interrupt handler - it must not sleep 502 * or call other functions that might sleep. The value returned is the opaque 503 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to 504 * cancel the request. The call to tcp_timeout() may fail in which case it 505 * returns zero. This is different from the timeout(9F) function which never 506 * fails. 507 * 508 * The call-back function 'func' always receives 'connp' as its single 509 * argument. It is always executed in the squeue corresponding to the tcp 510 * structure. The tcp structure is guaranteed to be present at the time the 511 * call-back is called. 512 * 513 * NOTE: The call-back function 'func' is never called if tcp is in 514 * the TCPS_CLOSED state. 515 * 516 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() 517 * request. locks acquired by the call-back routine should not be held across 518 * the call to tcp_timeout_cancel() or a deadlock may result. 519 * 520 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request. 521 * Otherwise, it returns an integer value greater than or equal to 0. In 522 * particular, if the call-back function is already placed on the squeue, it can 523 * not be canceled. 524 * 525 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called 526 * within squeue context corresponding to the tcp instance. Since the 527 * call-back is also called via the same squeue, there are no race 528 * conditions described in untimeout(9F) manual page since all calls are 529 * strictly serialized. 
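 *
 * A minimal usage sketch of this interface (illustrative; 'intvl' is some
 * interval in milliseconds, and tcp_timer() is the call-back that
 * TCP_TIMER_RESTART(), described below, itself uses):
 *
 *	tcp->tcp_timer_tid = tcp_timeout(tcp->tcp_connp, tcp_timer,
 *	    MSEC_TO_TICK(intvl));
 *	...
 *	if (tcp->tcp_timer_tid != 0)
 *		(void) tcp_timeout_cancel(tcp->tcp_connp, tcp->tcp_timer_tid);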
 *
 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
 * stored in tcp_timer_tid and starts a new one using
 * MSEC_TO_TICK(intvl). It always uses the tcp_timer() function as the
 * call-back and stores the return value of tcp_timeout() in the
 * tcp->tcp_timer_tid field.
 *
 * NOTE: since the timeout cancellation is not guaranteed, the cancelled
 * call-back may still be called, so it is possible tcp_timer() will be
 * called several times. This should not be a problem since tcp_timer()
 * should always check the tcp instance state.
 *
 *
 * IMPLEMENTATION:
 *
 * TCP timers are implemented using a three-stage process. The call to
 * tcp_timeout() uses the timeout(9F) function to call tcp_timer_callback()
 * when the timer expires. tcp_timer_callback() arranges the call of the
 * tcp_timer_handler() function via the squeue corresponding to the tcp
 * instance. tcp_timer_handler() calls the actual requested timeout
 * call-back and passes the tcp instance as an argument to it. Information is
 * passed between stages using the tcp_timer_t structure, which contains the
 * connp pointer, the tcp call-back to call and the timeout id returned by
 * timeout(9F).
 *
 * The tcp_timer_t structure is not used directly; it is embedded in an
 * mblk_t-like structure that is used to enter a squeue. The mp->b_rptr of
 * this pseudo mblk points to the beginning of the tcp_timer_t structure.
 * tcp_timeout() returns a pointer to this mblk.
 *
 * The pseudo mblk is allocated from the special tcp_timercache kmem cache.
 * It looks like a normal mblk without an actual dblk attached to it.
 *
 * To optimize performance, each tcp instance holds a small cache of timer
 * mblocks. In the current implementation it caches up to two timer mblocks
 * per tcp instance. The cache is preserved over tcp frees and is only freed
 * when the whole tcp structure is destroyed by its kmem destructor. Since all
 * tcp timer processing happens on the corresponding squeue, the cache
 * manipulation does not require any locks. Experiments show that the majority
 * of timer mblock allocations are satisfied from the tcp cache and do not
 * involve kmem calls.
 *
 * tcp_timeout() places a refhold on the connp instance, which guarantees
 * that it will be present at the time the call-back function fires.
 * tcp_timer_handler() drops the reference after calling the call-back, so the
 * call-back function does not need to manipulate the references explicitly.
 */

typedef struct tcp_timer_s {
	conn_t	*connp;
	void	(*tcpt_proc)(void *);
	timeout_id_t	tcpt_tid;
} tcp_timer_t;

static kmem_cache_t *tcp_timercache;
kmem_cache_t	*tcp_sack_info_cache;
kmem_cache_t	*tcp_iphc_cache;

/*
 * For scalability, we must not run a timer for every TCP connection
 * in TIME_WAIT state.  To see why, consider (for a time wait interval of
 * 4 minutes):
 *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
 *
 * This list is ordered by time, so you need only delete from the head
 * until you get to entries which aren't old enough to delete yet.
 * The list consists of only the detached TIME_WAIT connections.
 *
 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
 * becomes detached TIME_WAIT (either by changing the state while already
 * detached, or by becoming detached while already in TIME_WAIT).
This means that the TIME_WAIT 599 * state can be extended (up to doubled) if the connection doesn't become 600 * detached for a long time. 601 * 602 * The list manipulations (including tcp_time_wait_next/prev) 603 * are protected by the tcp_time_wait_lock. The content of the 604 * detached TIME_WAIT connections is protected by the normal perimeters. 605 */ 606 607 typedef struct tcp_squeue_priv_s { 608 kmutex_t tcp_time_wait_lock; 609 /* Protects the next 3 globals */ 610 timeout_id_t tcp_time_wait_tid; 611 tcp_t *tcp_time_wait_head; 612 tcp_t *tcp_time_wait_tail; 613 tcp_t *tcp_free_list; 614 uint_t tcp_free_list_cnt; 615 } tcp_squeue_priv_t; 616 617 /* 618 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. 619 * Running it every 5 seconds seems to give the best results. 620 */ 621 #define TCP_TIME_WAIT_DELAY drv_usectohz(5000000) 622 623 /* 624 * To prevent memory hog, limit the number of entries in tcp_free_list 625 * to 1% of available memory / number of cpus 626 */ 627 uint_t tcp_free_list_max_cnt = 0; 628 629 #define TCP_XMIT_LOWATER 4096 630 #define TCP_XMIT_HIWATER 49152 631 #define TCP_RECV_LOWATER 2048 632 #define TCP_RECV_HIWATER 49152 633 634 /* 635 * PAWS needs a timer for 24 days. This is the number of ticks in 24 days 636 */ 637 #define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) 638 639 #define TIDUSZ 4096 /* transport interface data unit size */ 640 641 /* 642 * Bind hash list size and has function. It has to be a power of 2 for 643 * hashing. 644 */ 645 #define TCP_BIND_FANOUT_SIZE 512 646 #define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1)) 647 /* 648 * Size of listen and acceptor hash list. It has to be a power of 2 for 649 * hashing. 650 */ 651 #define TCP_FANOUT_SIZE 256 652 653 #ifdef _ILP32 654 #define TCP_ACCEPTOR_HASH(accid) \ 655 (((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1)) 656 #else 657 #define TCP_ACCEPTOR_HASH(accid) \ 658 ((uint_t)(accid) & (TCP_FANOUT_SIZE - 1)) 659 #endif /* _ILP32 */ 660 661 #define IP_ADDR_CACHE_SIZE 2048 662 #define IP_ADDR_CACHE_HASH(faddr) \ 663 (ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1)) 664 665 /* Hash for HSPs uses all 32 bits, since both networks and hosts are in table */ 666 #define TCP_HSP_HASH_SIZE 256 667 668 #define TCP_HSP_HASH(addr) \ 669 (((addr>>24) ^ (addr >>16) ^ \ 670 (addr>>8) ^ (addr)) % TCP_HSP_HASH_SIZE) 671 672 /* 673 * TCP options struct returned from tcp_parse_options. 674 */ 675 typedef struct tcp_opt_s { 676 uint32_t tcp_opt_mss; 677 uint32_t tcp_opt_wscale; 678 uint32_t tcp_opt_ts_val; 679 uint32_t tcp_opt_ts_ecr; 680 tcp_t *tcp; 681 } tcp_opt_t; 682 683 /* 684 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing 685 */ 686 687 #ifdef _BIG_ENDIAN 688 #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ 689 (TCPOPT_TSTAMP << 8) | 10) 690 #else 691 #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ 692 (TCPOPT_NOP << 8) | TCPOPT_NOP) 693 #endif 694 695 /* 696 * Flags returned from tcp_parse_options. 
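 *
 * For example, a caller interested only in the timestamp option would do
 * something like (sketch):
 *
 *	tcp_opt_t tcpopt;
 *	int options = tcp_parse_options(tcph, &tcpopt);
 *
 *	if (options & TCP_OPT_TSTAMP_PRESENT) {
 *		ts_val = tcpopt.tcp_opt_ts_val;
 *		ts_ecr = tcpopt.tcp_opt_ts_ecr;
 *	}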
 */
#define	TCP_OPT_MSS_PRESENT	1
#define	TCP_OPT_WSCALE_PRESENT	2
#define	TCP_OPT_TSTAMP_PRESENT	4
#define	TCP_OPT_SACK_OK_PRESENT	8
#define	TCP_OPT_SACK_PRESENT	16

/* TCP option length */
#define	TCPOPT_NOP_LEN		1
#define	TCPOPT_MAXSEG_LEN	4
#define	TCPOPT_WS_LEN		3
#define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
#define	TCPOPT_TSTAMP_LEN	10
#define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
#define	TCPOPT_SACK_OK_LEN	2
#define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
#define	TCPOPT_REAL_SACK_LEN	4
#define	TCPOPT_MAX_SACK_LEN	36
#define	TCPOPT_HEADER_LEN	2

/* TCP cwnd burst factor. */
#define	TCP_CWND_INFINITE	65535
#define	TCP_CWND_SS		3
#define	TCP_CWND_NORMAL		5

/* Maximum TCP initial cwin (start/restart). */
#define	TCP_MAX_INIT_CWND	8

/*
 * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
 * either tcp_slow_start_initial or tcp_slow_start_after_idle
 * depending on the caller.  If the upper layer has not used the
 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
 * If the upper layer has set tcp_init_cwnd, just use it to
 * calculate tcp_cwnd.  (Worked examples of the RFC 3390 formula
 * appear in the comment further below.)
 */
#define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
{									\
	if ((tcp)->tcp_init_cwnd == 0) {				\
		(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss),	\
		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
	} else {							\
		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
	}								\
	(tcp)->tcp_cwnd_cnt = 0;					\
}

/* TCP Timer control structure */
typedef struct tcpt_s {
	pfv_t	tcpt_pfv;	/* The routine we are to call */
	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
} tcpt_t;

/* Host Specific Parameter structure */
typedef struct tcp_hsp {
	struct tcp_hsp	*tcp_hsp_next;
	in6_addr_t	tcp_hsp_addr_v6;
	in6_addr_t	tcp_hsp_subnet_v6;
	uint_t		tcp_hsp_vers;	/* IPV4_VERSION | IPV6_VERSION */
	int32_t		tcp_hsp_sendspace;
	int32_t		tcp_hsp_recvspace;
	int32_t		tcp_hsp_tstamp;
} tcp_hsp_t;
#define	tcp_hsp_addr	V4_PART_OF_V6(tcp_hsp_addr_v6)
#define	tcp_hsp_subnet	V4_PART_OF_V6(tcp_hsp_subnet_v6)

/*
 * Functions called directly via squeue having a prototype of edesc_t.
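 *
 * (Aside, referenced from the SET_TCP_INIT_CWND comment above: with
 * tcp_init_cwnd == 0 the RFC 3390 clamp works out to, for example,
 *	mss = 1460:	MIN(4*1460, MAX(2*1460, 4380/1460*1460)) = 4380 bytes
 *	mss = 536:	MIN(4*536, MAX(2*536, 4380/536*536)) = 2144 bytes
 * i.e. roughly 3-4 segments for common MSS values, further capped by
 * def_max_init_cwnd * mss.)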
766 */ 767 void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); 768 static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2); 769 void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2); 770 static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2); 771 static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2); 772 void tcp_input(void *arg, mblk_t *mp, void *arg2); 773 void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); 774 static void tcp_close_output(void *arg, mblk_t *mp, void *arg2); 775 void tcp_output(void *arg, mblk_t *mp, void *arg2); 776 static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2); 777 static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2); 778 779 780 /* Prototype for TCP functions */ 781 static void tcp_random_init(void); 782 int tcp_random(void); 783 static void tcp_accept(tcp_t *tcp, mblk_t *mp); 784 static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, 785 tcp_t *eager); 786 static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp); 787 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 788 int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only, 789 boolean_t user_specified); 790 static void tcp_closei_local(tcp_t *tcp); 791 static void tcp_close_detached(tcp_t *tcp); 792 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, 793 mblk_t *idmp, mblk_t **defermp); 794 static void tcp_connect(tcp_t *tcp, mblk_t *mp); 795 static void tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, 796 in_port_t dstport, uint_t srcid); 797 static void tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, 798 in_port_t dstport, uint32_t flowinfo, uint_t srcid, 799 uint32_t scope_id); 800 static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); 801 static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp); 802 static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); 803 static char *tcp_display(tcp_t *tcp, char *, char); 804 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum); 805 static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only); 806 static void tcp_eager_unlink(tcp_t *tcp); 807 static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr, 808 int unixerr); 809 static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 810 int tlierr, int unixerr); 811 static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, 812 cred_t *cr); 813 static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, 814 char *value, caddr_t cp, cred_t *cr); 815 static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, 816 char *value, caddr_t cp, cred_t *cr); 817 static int tcp_tpistate(tcp_t *tcp); 818 static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp, 819 int caller_holds_lock); 820 static void tcp_bind_hash_remove(tcp_t *tcp); 821 static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id); 822 void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp); 823 static void tcp_acceptor_hash_remove(tcp_t *tcp); 824 static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); 825 static void tcp_info_req(tcp_t *tcp, mblk_t *mp); 826 static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); 827 static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp); 828 static int tcp_header_init_ipv4(tcp_t *tcp); 829 static int tcp_header_init_ipv6(tcp_t *tcp); 830 int tcp_init(tcp_t *tcp, queue_t *q); 831 static int tcp_init_values(tcp_t *tcp); 832 static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic); 833 static mblk_t *tcp_ip_bind_mp(tcp_t *tcp, 
t_scalar_t bind_prim, 834 t_scalar_t addr_length); 835 static void tcp_ip_ire_mark_advice(tcp_t *tcp); 836 static void tcp_ip_notify(tcp_t *tcp); 837 static mblk_t *tcp_ire_mp(mblk_t *mp); 838 static void tcp_iss_init(tcp_t *tcp); 839 static void tcp_keepalive_killer(void *arg); 840 static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt); 841 static void tcp_mss_set(tcp_t *tcp, uint32_t size); 842 static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, 843 int *do_disconnectp, int *t_errorp, int *sys_errorp); 844 static boolean_t tcp_allow_connopt_set(int level, int name); 845 int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); 846 int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr); 847 int tcp_opt_set(queue_t *q, uint_t optset_context, int level, 848 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, 849 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, 850 mblk_t *mblk); 851 static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha); 852 static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, 853 uchar_t *ptr, uint_t len); 854 static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); 855 static boolean_t tcp_param_register(tcpparam_t *tcppa, int cnt); 856 static int tcp_param_set(queue_t *q, mblk_t *mp, char *value, 857 caddr_t cp, cred_t *cr); 858 static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, 859 caddr_t cp, cred_t *cr); 860 static void tcp_iss_key_init(uint8_t *phrase, int len); 861 static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, 862 caddr_t cp, cred_t *cr); 863 static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt); 864 static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start); 865 static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp); 866 static void tcp_reinit(tcp_t *tcp); 867 static void tcp_reinit_values(tcp_t *tcp); 868 static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, 869 tcp_t *thisstream, cred_t *cr); 870 871 static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp); 872 static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); 873 static boolean_t tcp_send_rst_chk(void); 874 static void tcp_ss_rexmit(tcp_t *tcp); 875 static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp); 876 static void tcp_process_options(tcp_t *, tcph_t *); 877 static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); 878 static void tcp_rsrv(queue_t *q); 879 static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd); 880 static int tcp_snmp_state(tcp_t *tcp); 881 static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, 882 cred_t *cr); 883 static int tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 884 cred_t *cr); 885 static int tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 886 cred_t *cr); 887 static int tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 888 cred_t *cr); 889 static int tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, 890 cred_t *cr); 891 static int tcp_host_param_set(queue_t *q, mblk_t *mp, char *value, 892 caddr_t cp, cred_t *cr); 893 static int tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value, 894 caddr_t cp, cred_t *cr); 895 static int tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, 896 cred_t *cr); 897 static void tcp_timer(void *arg); 898 static void tcp_timer_callback(void *); 899 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp, 900 boolean_t random); 901 static in_port_t tcp_get_next_priv_port(const tcp_t *); 902 static void 
tcp_wput_sock(queue_t *q, mblk_t *mp); 903 void tcp_wput_accept(queue_t *q, mblk_t *mp); 904 static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); 905 static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); 906 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); 907 static int tcp_send(queue_t *q, tcp_t *tcp, const int mss, 908 const int tcp_hdr_len, const int tcp_tcp_hdr_len, 909 const int num_sack_blk, int *usable, uint_t *snxt, 910 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 911 const int mdt_thres); 912 static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, 913 const int tcp_hdr_len, const int tcp_tcp_hdr_len, 914 const int num_sack_blk, int *usable, uint_t *snxt, 915 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 916 const int mdt_thres); 917 static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, 918 int num_sack_blk); 919 static void tcp_wsrv(queue_t *q); 920 static int tcp_xmit_end(tcp_t *tcp); 921 static mblk_t *tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, 922 int32_t *offset, mblk_t **end_mp, uint32_t seq, 923 boolean_t sendall, uint32_t *seg_len, boolean_t rexmit); 924 static void tcp_ack_timer(void *arg); 925 static mblk_t *tcp_ack_mp(tcp_t *tcp); 926 static void tcp_xmit_early_reset(char *str, mblk_t *mp, 927 uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len, 928 zoneid_t zoneid); 929 static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, 930 uint32_t ack, int ctl); 931 static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr); 932 static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr); 933 static int setmaxps(queue_t *q, int maxpsz); 934 static void tcp_set_rto(tcp_t *, time_t); 935 static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, 936 boolean_t, boolean_t); 937 static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, 938 boolean_t ipsec_mctl); 939 static mblk_t *tcp_setsockopt_mp(int level, int cmd, 940 char *opt, int optlen); 941 static int tcp_build_hdrs(queue_t *, tcp_t *); 942 static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, 943 uint32_t seg_seq, uint32_t seg_ack, int seg_len, 944 tcph_t *tcph); 945 boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp); 946 boolean_t tcp_reserved_port_add(int, in_port_t *, in_port_t *); 947 boolean_t tcp_reserved_port_del(in_port_t, in_port_t); 948 boolean_t tcp_reserved_port_check(in_port_t); 949 static tcp_t *tcp_alloc_temp_tcp(in_port_t); 950 static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *); 951 static mblk_t *tcp_mdt_info_mp(mblk_t *); 952 static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); 953 static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *, 954 const boolean_t, const uint32_t, const uint32_t, 955 const uint32_t, const uint32_t); 956 static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *, 957 const uint_t, const uint_t, boolean_t *); 958 static void tcp_send_data(tcp_t *, queue_t *, mblk_t *); 959 extern mblk_t *tcp_timermp_alloc(int); 960 extern void tcp_timermp_free(tcp_t *); 961 static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); 962 static void tcp_stop_lingering(tcp_t *tcp); 963 static void tcp_close_linger_timeout(void *arg); 964 void tcp_ddi_init(void); 965 void tcp_ddi_destroy(void); 966 static void tcp_kstat_init(void); 967 static void tcp_kstat_fini(void); 968 static int tcp_kstat_update(kstat_t *kp, int rw); 969 void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp); 970 static int tcp_conn_create_v6(conn_t *lconnp, conn_t 
*connp, mblk_t *mp, 971 tcph_t *tcph, uint_t ipvers, mblk_t *idmp); 972 static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, 973 tcph_t *tcph, mblk_t *idmp); 974 static squeue_func_t tcp_squeue_switch(int); 975 976 static int tcp_open(queue_t *, dev_t *, int, int, cred_t *); 977 static int tcp_close(queue_t *, int); 978 static int tcpclose_accept(queue_t *); 979 static int tcp_modclose(queue_t *); 980 static void tcp_wput_mod(queue_t *, mblk_t *); 981 982 static void tcp_squeue_add(squeue_t *); 983 static boolean_t tcp_zcopy_check(tcp_t *); 984 static void tcp_zcopy_notify(tcp_t *); 985 static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *); 986 static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int); 987 static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t); 988 989 extern void tcp_kssl_input(tcp_t *, mblk_t *); 990 991 /* 992 * Routines related to the TCP_IOC_ABORT_CONN ioctl command. 993 * 994 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting 995 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure 996 * (defined in tcp.h) needs to be filled in and passed into the kernel 997 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t 998 * structure contains the four-tuple of a TCP connection and a range of TCP 999 * states (specified by ac_start and ac_end). The use of wildcard addresses 1000 * and ports is allowed. Connections with a matching four tuple and a state 1001 * within the specified range will be aborted. The valid states for the 1002 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, 1003 * inclusive. 1004 * 1005 * An application which has its connection aborted by this ioctl will receive 1006 * an error that is dependent on the connection state at the time of the abort. 1007 * If the connection state is < TCPS_TIME_WAIT, an application should behave as 1008 * though a RST packet has been received. If the connection state is equal to 1009 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel 1010 * and all resources associated with the connection will be freed. 1011 */ 1012 static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); 1013 static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); 1014 static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *); 1015 static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *); 1016 static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); 1017 static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, 1018 boolean_t); 1019 1020 static struct module_info tcp_rinfo = { 1021 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER 1022 }; 1023 1024 static struct module_info tcp_winfo = { 1025 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16 1026 }; 1027 1028 /* 1029 * Entry points for TCP as a module. It only allows SNMP requests 1030 * to pass through. 1031 */ 1032 struct qinit tcp_mod_rinit = { 1033 (pfi_t)putnext, NULL, tcp_open, ip_snmpmod_close, NULL, &tcp_rinfo, 1034 }; 1035 1036 struct qinit tcp_mod_winit = { 1037 (pfi_t)ip_snmpmod_wput, NULL, tcp_open, ip_snmpmod_close, NULL, 1038 &tcp_rinfo 1039 }; 1040 1041 /* 1042 * Entry points for TCP as a device. The normal case which supports 1043 * the TCP functionality. 
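 *
 * (Aside, relating to the TCP_IOC_ABORT_CONN description above: a privileged
 * caller would typically issue the ioctl roughly as follows.  This is an
 * illustrative sketch only; the complete tcp_ioc_abort_conn_t layout,
 * including the local/remote address fields, is defined in tcp.h.)
 *
 *	tcp_ioc_abort_conn_t acp;
 *	struct strioctl stri;
 *
 *	(void) memset(&acp, 0, sizeof (acp));
 *	(fill in the local/remote address and port fields; wildcards allowed)
 *	acp.ac_start = TCPS_SYN_SENT;
 *	acp.ac_end = TCPS_TIME_WAIT;
 *
 *	stri.ic_cmd = TCP_IOC_ABORT_CONN;
 *	stri.ic_timout = -1;
 *	stri.ic_len = sizeof (acp);
 *	stri.ic_dp = (char *)&acp;
 *	if (ioctl(fd, I_STR, &stri) == -1)
 *		(handle the error)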
1044 */ 1045 struct qinit tcp_rinit = { 1046 NULL, (pfi_t)tcp_rsrv, tcp_open, tcp_close, NULL, &tcp_rinfo 1047 }; 1048 1049 struct qinit tcp_winit = { 1050 (pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo 1051 }; 1052 1053 /* Initial entry point for TCP in socket mode. */ 1054 struct qinit tcp_sock_winit = { 1055 (pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo 1056 }; 1057 1058 /* 1059 * Entry points for TCP as a acceptor STREAM opened by sockfs when doing 1060 * an accept. Avoid allocating data structures since eager has already 1061 * been created. 1062 */ 1063 struct qinit tcp_acceptor_rinit = { 1064 NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo 1065 }; 1066 1067 struct qinit tcp_acceptor_winit = { 1068 (pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo 1069 }; 1070 1071 /* 1072 * Entry points for TCP loopback (read side only) 1073 */ 1074 struct qinit tcp_loopback_rinit = { 1075 (pfi_t)0, (pfi_t)tcp_rsrv, tcp_open, tcp_close, (pfi_t)0, 1076 &tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD 1077 }; 1078 1079 struct streamtab tcpinfo = { 1080 &tcp_rinit, &tcp_winit 1081 }; 1082 1083 extern squeue_func_t tcp_squeue_wput_proc; 1084 extern squeue_func_t tcp_squeue_timer_proc; 1085 1086 /* Protected by tcp_g_q_lock */ 1087 static queue_t *tcp_g_q; /* Default queue used during detached closes */ 1088 kmutex_t tcp_g_q_lock; 1089 1090 /* Protected by tcp_hsp_lock */ 1091 /* 1092 * XXX The host param mechanism should go away and instead we should use 1093 * the metrics associated with the routes to determine the default sndspace 1094 * and rcvspace. 1095 */ 1096 static tcp_hsp_t **tcp_hsp_hash; /* Hash table for HSPs */ 1097 krwlock_t tcp_hsp_lock; 1098 1099 /* 1100 * Extra privileged ports. In host byte order. 1101 * Protected by tcp_epriv_port_lock. 1102 */ 1103 #define TCP_NUM_EPRIV_PORTS 64 1104 static int tcp_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS; 1105 static uint16_t tcp_g_epriv_ports[TCP_NUM_EPRIV_PORTS] = { 2049, 4045 }; 1106 kmutex_t tcp_epriv_port_lock; 1107 1108 /* 1109 * The smallest anonymous port in the privileged port range which TCP 1110 * looks for free port. Use in the option TCP_ANONPRIVBIND. 1111 */ 1112 static in_port_t tcp_min_anonpriv_port = 512; 1113 1114 /* Only modified during _init and _fini thus no locking is needed. */ 1115 static caddr_t tcp_g_nd; /* Head of 'named dispatch' variable list */ 1116 1117 /* Hint not protected by any lock */ 1118 static uint_t tcp_next_port_to_try; 1119 1120 1121 /* TCP bind hash list - all tcp_t with state >= BOUND. */ 1122 tf_t tcp_bind_fanout[TCP_BIND_FANOUT_SIZE]; 1123 1124 /* TCP queue hash list - all tcp_t in case they will be an acceptor. */ 1125 static tf_t tcp_acceptor_fanout[TCP_FANOUT_SIZE]; 1126 1127 /* 1128 * TCP has a private interface for other kernel modules to reserve a 1129 * port range for them to use. Once reserved, TCP will not use any ports 1130 * in the range. This interface relies on the TCP_EXCLBIND feature. If 1131 * the semantics of TCP_EXCLBIND is changed, implementation of this interface 1132 * has to be verified. 1133 * 1134 * There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges. Each port 1135 * range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports. A port 1136 * range is [port a, port b] inclusive. And each port range is between 1137 * TCP_LOWESET_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive. 1138 * 1139 * Note that the default anonymous port range starts from 32768. 
There is 1140 * no port "collision" between that and the reserved port range. If there 1141 * is port collision (because the default smallest anonymous port is lowered 1142 * or some apps specifically bind to ports in the reserved port range), the 1143 * system may not be able to reserve a port range even there are enough 1144 * unbound ports as a reserved port range contains consecutive ports . 1145 */ 1146 #define TCP_RESERVED_PORTS_ARRAY_MAX_SIZE 5 1147 #define TCP_RESERVED_PORTS_RANGE_MAX 1000 1148 #define TCP_SMALLEST_RESERVED_PORT 10240 1149 #define TCP_LARGEST_RESERVED_PORT 20480 1150 1151 /* Structure to represent those reserved port ranges. */ 1152 typedef struct tcp_rport_s { 1153 in_port_t lo_port; 1154 in_port_t hi_port; 1155 tcp_t **temp_tcp_array; 1156 } tcp_rport_t; 1157 1158 /* The reserved port array. */ 1159 static tcp_rport_t tcp_reserved_port[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE]; 1160 1161 /* Locks to protect the tcp_reserved_ports array. */ 1162 static krwlock_t tcp_reserved_port_lock; 1163 1164 /* The number of ranges in the array. */ 1165 uint32_t tcp_reserved_port_array_size = 0; 1166 1167 /* 1168 * MIB-2 stuff for SNMP 1169 * Note: tcpInErrs {tcp 15} is accumulated in ip.c 1170 */ 1171 mib2_tcp_t tcp_mib; /* SNMP fixed size info */ 1172 kstat_t *tcp_mibkp; /* kstat exporting tcp_mib data */ 1173 1174 boolean_t tcp_icmp_source_quench = B_FALSE; 1175 /* 1176 * Following assumes TPI alignment requirements stay along 32 bit 1177 * boundaries 1178 */ 1179 #define ROUNDUP32(x) \ 1180 (((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1)) 1181 1182 /* Template for response to info request. */ 1183 static struct T_info_ack tcp_g_t_info_ack = { 1184 T_INFO_ACK, /* PRIM_type */ 1185 0, /* TSDU_size */ 1186 T_INFINITE, /* ETSDU_size */ 1187 T_INVALID, /* CDATA_size */ 1188 T_INVALID, /* DDATA_size */ 1189 sizeof (sin_t), /* ADDR_size */ 1190 0, /* OPT_size - not initialized here */ 1191 TIDUSZ, /* TIDU_size */ 1192 T_COTS_ORD, /* SERV_type */ 1193 TCPS_IDLE, /* CURRENT_state */ 1194 (XPG4_1|EXPINLINE) /* PROVIDER_flag */ 1195 }; 1196 1197 static struct T_info_ack tcp_g_t_info_ack_v6 = { 1198 T_INFO_ACK, /* PRIM_type */ 1199 0, /* TSDU_size */ 1200 T_INFINITE, /* ETSDU_size */ 1201 T_INVALID, /* CDATA_size */ 1202 T_INVALID, /* DDATA_size */ 1203 sizeof (sin6_t), /* ADDR_size */ 1204 0, /* OPT_size - not initialized here */ 1205 TIDUSZ, /* TIDU_size */ 1206 T_COTS_ORD, /* SERV_type */ 1207 TCPS_IDLE, /* CURRENT_state */ 1208 (XPG4_1|EXPINLINE) /* PROVIDER_flag */ 1209 }; 1210 1211 #define MS 1L 1212 #define SECONDS (1000 * MS) 1213 #define MINUTES (60 * SECONDS) 1214 #define HOURS (60 * MINUTES) 1215 #define DAYS (24 * HOURS) 1216 1217 #define PARAM_MAX (~(uint32_t)0) 1218 1219 /* Max size IP datagram is 64k - 1 */ 1220 #define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t))) 1221 #define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t))) 1222 /* Max of the above */ 1223 #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 1224 1225 /* Largest TCP port number */ 1226 #define TCP_MAX_PORT (64 * 1024 - 1) 1227 1228 /* 1229 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link 1230 * layer header. It has to be a multiple of 4. 1231 */ 1232 static tcpparam_t tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" }; 1233 #define tcp_wroff_xtra tcp_wroff_xtra_param.tcp_param_val 1234 1235 /* 1236 * All of these are alterable, within the min/max values given, at run time. 
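 *
 * For example, an administrator can inspect or change one of these tunables
 * at run time with ndd(1M); values are in the units used in the table below
 * (milliseconds here):
 *
 *	# ndd -get /dev/tcp tcp_time_wait_interval
 *	# ndd -set /dev/tcp tcp_time_wait_interval 60000
 *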
1237 * Note that the default value of "tcp_time_wait_interval" is four minutes, 1238 * per the TCP spec. 1239 */ 1240 /* BEGIN CSTYLED */ 1241 tcpparam_t tcp_param_arr[] = { 1242 /*min max value name */ 1243 { 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"}, 1244 { 1, PARAM_MAX, 128, "tcp_conn_req_max_q" }, 1245 { 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" }, 1246 { 1, 1024, 1, "tcp_conn_req_min" }, 1247 { 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" }, 1248 { 128, (1<<30), 1024*1024, "tcp_cwnd_max" }, 1249 { 0, 10, 0, "tcp_debug" }, 1250 { 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"}, 1251 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"}, 1252 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"}, 1253 { 500*MS, PARAM_MAX, 8*MINUTES, "tcp_ip_abort_interval"}, 1254 { 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"}, 1255 { 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"}, 1256 { 1, 255, 64, "tcp_ipv4_ttl"}, 1257 { 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"}, 1258 { 0, 100, 10, "tcp_maxpsz_multiplier" }, 1259 { 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"}, 1260 { 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"}, 1261 { 1, TCP_MSS_MAX, 108, "tcp_mss_min"}, 1262 { 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"}, 1263 { 1*MS, 20*SECONDS, 3*SECONDS, "tcp_rexmit_interval_initial"}, 1264 { 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"}, 1265 { 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"}, 1266 { 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" }, 1267 { 0, 16, 0, "tcp_snd_lowat_fraction" }, 1268 { 0, 128000, 0, "tcp_sth_rcv_hiwat" }, 1269 { 0, 128000, 0, "tcp_sth_rcv_lowat" }, 1270 { 1, 10000, 3, "tcp_dupack_fast_retransmit" }, 1271 { 0, 1, 0, "tcp_ignore_path_mtu" }, 1272 { 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"}, 1273 { 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"}, 1274 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"}, 1275 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"}, 1276 { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"}, 1277 { 1, 65536, 4, "tcp_recv_hiwat_minmss"}, 1278 { 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"}, 1279 { 0, TCP_MSS_MAX, 64, "tcp_co_min"}, 1280 { 8192, (1<<30), 1024*1024, "tcp_max_buf"}, 1281 /* 1282 * Question: What default value should I set for tcp_strong_iss? 
1283 */ 1284 { 0, 2, 1, "tcp_strong_iss"}, 1285 { 0, 65536, 20, "tcp_rtt_updates"}, 1286 { 0, 1, 1, "tcp_wscale_always"}, 1287 { 0, 1, 0, "tcp_tstamp_always"}, 1288 { 0, 1, 1, "tcp_tstamp_if_wscale"}, 1289 { 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"}, 1290 { 0, 16, 2, "tcp_deferred_acks_max"}, 1291 { 1, 16384, 4, "tcp_slow_start_after_idle"}, 1292 { 1, 4, 4, "tcp_slow_start_initial"}, 1293 { 10*MS, 50*MS, 20*MS, "tcp_co_timer_interval"}, 1294 { 0, 2, 2, "tcp_sack_permitted"}, 1295 { 0, 1, 0, "tcp_trace"}, 1296 { 0, 1, 1, "tcp_compression_enabled"}, 1297 { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"}, 1298 { 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"}, 1299 { 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"}, 1300 { 0, 1, 0, "tcp_rev_src_routes"}, 1301 { 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"}, 1302 { 100*MS, 60*SECONDS, 1*SECONDS, "tcp_ndd_get_info_interval"}, 1303 { 0, 16, 8, "tcp_local_dacks_max"}, 1304 { 0, 2, 1, "tcp_ecn_permitted"}, 1305 { 0, 1, 1, "tcp_rst_sent_rate_enabled"}, 1306 { 0, PARAM_MAX, 40, "tcp_rst_sent_rate"}, 1307 { 0, 100*MS, 50*MS, "tcp_push_timer_interval"}, 1308 { 0, 1, 0, "tcp_use_smss_as_mss_opt"}, 1309 { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"}, 1310 }; 1311 /* END CSTYLED */ 1312 1313 /* 1314 * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of 1315 * each header fragment in the header buffer. Each parameter value has 1316 * to be a multiple of 4 (32-bit aligned). 1317 */ 1318 static tcpparam_t tcp_mdt_head_param = { 32, 256, 32, "tcp_mdt_hdr_head_min" }; 1319 static tcpparam_t tcp_mdt_tail_param = { 0, 256, 32, "tcp_mdt_hdr_tail_min" }; 1320 #define tcp_mdt_hdr_head_min tcp_mdt_head_param.tcp_param_val 1321 #define tcp_mdt_hdr_tail_min tcp_mdt_tail_param.tcp_param_val 1322 1323 /* 1324 * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out 1325 * the maximum number of payload buffers associated per Multidata. 1326 */ 1327 static tcpparam_t tcp_mdt_max_pbufs_param = 1328 { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" }; 1329 #define tcp_mdt_max_pbufs tcp_mdt_max_pbufs_param.tcp_param_val 1330 1331 /* Round up the value to the nearest mss. */ 1332 #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) 1333 1334 /* 1335 * Set ECN capable transport (ECT) code point in IP header. 1336 * 1337 * Note that there are 2 ECT code points '01' and '10', which are called 1338 * ECT(1) and ECT(0) respectively. Here we follow the original ECT code 1339 * point ECT(0) for TCP as described in RFC 2481. 1340 */ 1341 #define SET_ECT(tcp, iph) \ 1342 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1343 /* We need to clear the code point first. */ \ 1344 ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ 1345 ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ 1346 } else { \ 1347 ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \ 1348 ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ 1349 } 1350 1351 /* 1352 * The format argument to pass to tcp_display(). 1353 * DISP_PORT_ONLY means that the returned string has only port info. 1354 * DISP_ADDR_AND_PORT means that the returned string also contains the 1355 * remote and local IP address. 1356 */ 1357 #define DISP_PORT_ONLY 1 1358 #define DISP_ADDR_AND_PORT 2 1359 1360 /* 1361 * This controls the rate some ndd info report functions can be used 1362 * by non-privileged users. It stores the last time such info is 1363 * requested. 
When those report functions are called again, this 1364 * is compared with the current time, and the difference is checked 1365 * against the ndd param tcp_ndd_get_info_interval. 1366 */ 1367 static clock_t tcp_last_ndd_get_info_time = 0; 1368 #define NDD_TOO_QUICK_MSG \ 1369 "ndd get info rate too high for non-privileged users, try again " \ 1370 "later.\n" 1371 #define NDD_OUT_OF_BUF_MSG "<< Out of buffer >>\n" 1372 1373 #define IS_VMLOANED_MBLK(mp) \ 1374 (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) 1375 1376 /* 1377 * These two variables control the rate at which TCP generates RSTs in 1378 * response to segments not belonging to any connection. We limit 1379 * TCP to sending out at most tcp_rst_sent_rate (ndd param) RSTs in 1380 * each 1 second interval. This is to protect TCP against DoS attacks. 1381 */ 1382 static clock_t tcp_last_rst_intrvl; 1383 static uint32_t tcp_rst_cnt; 1384 1385 /* The number of RSTs not sent because of the rate limit. */ 1386 static uint32_t tcp_rst_unsent; 1387 1388 /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ 1389 boolean_t tcp_mdt_chain = B_TRUE; 1390 1391 /* 1392 * MDT threshold in the form of effective send MSS multiplier; we take 1393 * the MDT path if the amount of unsent data exceeds the threshold value 1394 * (default threshold is 1*SMSS). 1395 */ 1396 uint_t tcp_mdt_smss_threshold = 1; 1397 1398 uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ 1399 1400 /* 1401 * Forces all connections to obey the value of the tcp_maxpsz_multiplier 1402 * tunable settable via NDD. Otherwise, the per-connection behavior is 1403 * determined dynamically during tcp_adapt_ire(), which is the default. 1404 */ 1405 boolean_t tcp_static_maxpsz = B_FALSE; 1406 1407 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ 1408 uint32_t tcp_random_anon_port = 1; 1409 1410 /* 1411 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more 1412 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent 1413 * data, TCP will not respond with an ACK. RFC 793 requires that 1414 * TCP responds with an ACK for such a bogus ACK. By not following 1415 * the RFC, we prevent TCP from getting into an ACK storm if somehow 1416 * an attacker successfully spoofs an acceptable segment to our 1417 * peer; or when our peer is "confused." 1418 */ 1419 uint32_t tcp_drop_ack_unsent_cnt = 10; 1420 1421 /* 1422 * Hook functions to enable cluster networking. 1423 * On non-clustered systems these vectors must always be NULL.
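 *
 * (These pointers are expected to be filled in by the clustering
 * software when it is present; every use below, e.g. the
 * CL_INET_CONNECT and CL_INET_DISCONNECT macros, first tests the
 * vector for NULL, so on a non-clustered system the hooks are simply
 * skipped.)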
1424 */ 1425 1426 void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family, 1427 uint8_t *laddrp, in_port_t lport) = NULL; 1428 void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family, 1429 uint8_t *laddrp, in_port_t lport) = NULL; 1430 void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family, 1431 uint8_t *laddrp, in_port_t lport, 1432 uint8_t *faddrp, in_port_t fport) = NULL; 1433 void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family, 1434 uint8_t *laddrp, in_port_t lport, 1435 uint8_t *faddrp, in_port_t fport) = NULL; 1436 1437 /* 1438 * The following are defined in ip.c 1439 */ 1440 extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family, 1441 uint8_t *laddrp); 1442 extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, 1443 uint8_t *laddrp, uint8_t *faddrp); 1444 1445 #define CL_INET_CONNECT(tcp) { \ 1446 if (cl_inet_connect != NULL) { \ 1447 /* \ 1448 * Running in cluster mode - register active connection \ 1449 * information \ 1450 */ \ 1451 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1452 if ((tcp)->tcp_ipha->ipha_src != 0) { \ 1453 (*cl_inet_connect)(IPPROTO_TCP, AF_INET,\ 1454 (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\ 1455 (in_port_t)(tcp)->tcp_lport, \ 1456 (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ 1457 (in_port_t)(tcp)->tcp_fport); \ 1458 } \ 1459 } else { \ 1460 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1461 &(tcp)->tcp_ip6h->ip6_src)) {\ 1462 (*cl_inet_connect)(IPPROTO_TCP, AF_INET6,\ 1463 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\ 1464 (in_port_t)(tcp)->tcp_lport, \ 1465 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ 1466 (in_port_t)(tcp)->tcp_fport); \ 1467 } \ 1468 } \ 1469 } \ 1470 } 1471 1472 #define CL_INET_DISCONNECT(tcp) { \ 1473 if (cl_inet_disconnect != NULL) { \ 1474 /* \ 1475 * Running in cluster mode - deregister active \ 1476 * connection information \ 1477 */ \ 1478 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1479 if ((tcp)->tcp_ip_src != 0) { \ 1480 (*cl_inet_disconnect)(IPPROTO_TCP, \ 1481 AF_INET, \ 1482 (uint8_t *)(&((tcp)->tcp_ip_src)),\ 1483 (in_port_t)(tcp)->tcp_lport, \ 1484 (uint8_t *) \ 1485 (&((tcp)->tcp_ipha->ipha_dst)),\ 1486 (in_port_t)(tcp)->tcp_fport); \ 1487 } \ 1488 } else { \ 1489 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1490 &(tcp)->tcp_ip_src_v6)) { \ 1491 (*cl_inet_disconnect)(IPPROTO_TCP, AF_INET6,\ 1492 (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\ 1493 (in_port_t)(tcp)->tcp_lport, \ 1494 (uint8_t *) \ 1495 (&((tcp)->tcp_ip6h->ip6_dst)),\ 1496 (in_port_t)(tcp)->tcp_fport); \ 1497 } \ 1498 } \ 1499 } \ 1500 } 1501 1502 /* 1503 * Cluster networking hook for traversing current connection list. 1504 * This routine is used to extract the current list of live connections 1505 * which must continue to to be dispatched to this node. 1506 */ 1507 int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg); 1508 1509 /* 1510 * Figure out the value of window scale opton. Note that the rwnd is 1511 * ASSUMED to be rounded up to the nearest MSS before the calculation. 1512 * We cannot find the scale value and then do a round up of tcp_rwnd 1513 * because the scale value may not be correct after that. 1514 * 1515 * Set the compiler flag to make this function inline. 
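 *
 * A worked example (numbers purely illustrative): with tcp_rwnd at
 * 1 MB (1048576 bytes), the loop below keeps shifting while the
 * window still exceeds TCP_MAXWIN (65535); 1048576 >> 4 = 65536 is
 * still too large, while 1048576 >> 5 = 32768 fits, so tcp_rcv_ws
 * ends up being 5.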
1516 */ 1517 static void 1518 tcp_set_ws_value(tcp_t *tcp) 1519 { 1520 int i; 1521 uint32_t rwnd = tcp->tcp_rwnd; 1522 1523 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; 1524 i++, rwnd >>= 1) 1525 ; 1526 tcp->tcp_rcv_ws = i; 1527 } 1528 1529 /* 1530 * Remove a connection from the list of detached TIME_WAIT connections. 1531 */ 1532 static void 1533 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) 1534 { 1535 boolean_t locked = B_FALSE; 1536 1537 if (tcp_time_wait == NULL) { 1538 tcp_time_wait = *((tcp_squeue_priv_t **) 1539 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); 1540 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1541 locked = B_TRUE; 1542 } 1543 1544 if (tcp->tcp_time_wait_expire == 0) { 1545 ASSERT(tcp->tcp_time_wait_next == NULL); 1546 ASSERT(tcp->tcp_time_wait_prev == NULL); 1547 if (locked) 1548 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1549 return; 1550 } 1551 ASSERT(TCP_IS_DETACHED(tcp)); 1552 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1553 1554 if (tcp == tcp_time_wait->tcp_time_wait_head) { 1555 ASSERT(tcp->tcp_time_wait_prev == NULL); 1556 tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; 1557 if (tcp_time_wait->tcp_time_wait_head != NULL) { 1558 tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = 1559 NULL; 1560 } else { 1561 tcp_time_wait->tcp_time_wait_tail = NULL; 1562 } 1563 } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { 1564 ASSERT(tcp != tcp_time_wait->tcp_time_wait_head); 1565 ASSERT(tcp->tcp_time_wait_next == NULL); 1566 tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; 1567 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1568 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; 1569 } else { 1570 ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 1571 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 1572 tcp->tcp_time_wait_prev->tcp_time_wait_next = 1573 tcp->tcp_time_wait_next; 1574 tcp->tcp_time_wait_next->tcp_time_wait_prev = 1575 tcp->tcp_time_wait_prev; 1576 } 1577 tcp->tcp_time_wait_next = NULL; 1578 tcp->tcp_time_wait_prev = NULL; 1579 tcp->tcp_time_wait_expire = 0; 1580 1581 if (locked) 1582 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1583 } 1584 1585 /* 1586 * Add a connection to the list of detached TIME_WAIT connections 1587 * and set its time to expire. 1588 */ 1589 static void 1590 tcp_time_wait_append(tcp_t *tcp) 1591 { 1592 tcp_squeue_priv_t *tcp_time_wait = 1593 *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, 1594 SQPRIVATE_TCP)); 1595 1596 tcp_timers_stop(tcp); 1597 1598 /* Freed above */ 1599 ASSERT(tcp->tcp_timer_tid == 0); 1600 ASSERT(tcp->tcp_ack_tid == 0); 1601 1602 /* must have happened at the time of detaching the tcp */ 1603 ASSERT(tcp->tcp_ptpahn == NULL); 1604 ASSERT(tcp->tcp_flow_stopped == 0); 1605 ASSERT(tcp->tcp_time_wait_next == NULL); 1606 ASSERT(tcp->tcp_time_wait_prev == NULL); 1607 ASSERT(tcp->tcp_time_wait_expire == NULL); 1608 ASSERT(tcp->tcp_listener == NULL); 1609 1610 tcp->tcp_time_wait_expire = ddi_get_lbolt(); 1611 /* 1612 * The value computed below in tcp->tcp_time_wait_expire may 1613 * appear negative or wrap around. That is ok since our 1614 * interest is only in the difference between the current lbolt 1615 * value and tcp->tcp_time_wait_expire. But the value should not 1616 * be zero, since it means the tcp is not in the TIME_WAIT list. 1617 * The corresponding comparison in tcp_time_wait_collector() uses 1618 * modular arithmetic. 
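 *
 * For example, even if lbolt wraps so that the stored expiry value is
 * numerically larger than a later lbolt reading, the signed
 * difference (now - tcp_time_wait_expire) taken by the collector
 * still becomes non-negative once the interval has elapsed; that is
 * why the collector tests the sign of the difference instead of
 * comparing the two values directly.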
1619 */ 1620 tcp->tcp_time_wait_expire += 1621 drv_usectohz(tcp_time_wait_interval * 1000); 1622 if (tcp->tcp_time_wait_expire == 0) 1623 tcp->tcp_time_wait_expire = 1; 1624 1625 ASSERT(TCP_IS_DETACHED(tcp)); 1626 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1627 ASSERT(tcp->tcp_time_wait_next == NULL); 1628 ASSERT(tcp->tcp_time_wait_prev == NULL); 1629 TCP_DBGSTAT(tcp_time_wait); 1630 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1631 if (tcp_time_wait->tcp_time_wait_head == NULL) { 1632 ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); 1633 tcp_time_wait->tcp_time_wait_head = tcp; 1634 } else { 1635 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1636 ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == 1637 TCPS_TIME_WAIT); 1638 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; 1639 tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; 1640 } 1641 tcp_time_wait->tcp_time_wait_tail = tcp; 1642 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1643 } 1644 1645 /* ARGSUSED */ 1646 void 1647 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) 1648 { 1649 conn_t *connp = (conn_t *)arg; 1650 tcp_t *tcp = connp->conn_tcp; 1651 1652 ASSERT(tcp != NULL); 1653 if (tcp->tcp_state == TCPS_CLOSED) { 1654 return; 1655 } 1656 1657 ASSERT((tcp->tcp_family == AF_INET && 1658 tcp->tcp_ipversion == IPV4_VERSION) || 1659 (tcp->tcp_family == AF_INET6 && 1660 (tcp->tcp_ipversion == IPV4_VERSION || 1661 tcp->tcp_ipversion == IPV6_VERSION))); 1662 ASSERT(!tcp->tcp_listener); 1663 1664 TCP_STAT(tcp_time_wait_reap); 1665 ASSERT(TCP_IS_DETACHED(tcp)); 1666 1667 /* 1668 * Because they have no upstream client to rebind or tcp_close() 1669 * them later, we axe the connection here and now. 1670 */ 1671 tcp_close_detached(tcp); 1672 } 1673 1674 void 1675 tcp_cleanup(tcp_t *tcp) 1676 { 1677 mblk_t *mp; 1678 char *tcp_iphc; 1679 int tcp_iphc_len; 1680 int tcp_hdr_grown; 1681 tcp_sack_info_t *tcp_sack_info; 1682 conn_t *connp = tcp->tcp_connp; 1683 1684 tcp_bind_hash_remove(tcp); 1685 tcp_free(tcp); 1686 1687 /* Release any SSL context */ 1688 if (tcp->tcp_kssl_ent != NULL) { 1689 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 1690 tcp->tcp_kssl_ent = NULL; 1691 } 1692 1693 if (tcp->tcp_kssl_ctx != NULL) { 1694 kssl_release_ctx(tcp->tcp_kssl_ctx); 1695 tcp->tcp_kssl_ctx = NULL; 1696 } 1697 tcp->tcp_kssl_pending = B_FALSE; 1698 1699 conn_delete_ire(connp, NULL); 1700 if (connp->conn_flags & IPCL_TCPCONN) { 1701 if (connp->conn_latch != NULL) 1702 IPLATCH_REFRELE(connp->conn_latch); 1703 if (connp->conn_policy != NULL) 1704 IPPH_REFRELE(connp->conn_policy); 1705 } 1706 1707 /* 1708 * Since we will bzero the entire structure, we need to 1709 * remove it and reinsert it in global hash list. We 1710 * know the walkers can't get to this conn because we 1711 * had set CONDEMNED flag earlier and checked reference 1712 * under conn_lock so walker won't pick it and when we 1713 * go the ipcl_globalhash_remove() below, no walker 1714 * can get to it. 
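 *
 * (The net effect of the save/bzero/restore sequence below is to
 * scrub the conn_t/tcp_t pair back to a freshly-created state while
 * recycling the pieces that are expensive to reallocate -- the timer
 * mblk, the SACK info block and the prebuilt IP header buffer --
 * typically just before the pair is parked on the per-squeue
 * tcp_free_list by the time-wait collector.)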
1715 */ 1716 ipcl_globalhash_remove(connp); 1717 1718 /* Save some state */ 1719 mp = tcp->tcp_timercache; 1720 1721 tcp_sack_info = tcp->tcp_sack_info; 1722 tcp_iphc = tcp->tcp_iphc; 1723 tcp_iphc_len = tcp->tcp_iphc_len; 1724 tcp_hdr_grown = tcp->tcp_hdr_grown; 1725 1726 if (connp->conn_cred != NULL) 1727 crfree(connp->conn_cred); 1728 if (connp->conn_peercred != NULL) 1729 crfree(connp->conn_peercred); 1730 bzero(connp, sizeof (conn_t)); 1731 bzero(tcp, sizeof (tcp_t)); 1732 1733 /* restore the state */ 1734 tcp->tcp_timercache = mp; 1735 1736 tcp->tcp_sack_info = tcp_sack_info; 1737 tcp->tcp_iphc = tcp_iphc; 1738 tcp->tcp_iphc_len = tcp_iphc_len; 1739 tcp->tcp_hdr_grown = tcp_hdr_grown; 1740 1741 1742 tcp->tcp_connp = connp; 1743 1744 connp->conn_tcp = tcp; 1745 connp->conn_flags = IPCL_TCPCONN; 1746 connp->conn_state_flags = CONN_INCIPIENT; 1747 connp->conn_ulp = IPPROTO_TCP; 1748 connp->conn_ref = 1; 1749 1750 ipcl_globalhash_insert(connp); 1751 } 1752 1753 /* 1754 * Blows away all tcps whose TIME_WAIT has expired. List traversal 1755 * is done forwards from the head. 1756 */ 1757 /* ARGSUSED */ 1758 void 1759 tcp_time_wait_collector(void *arg) 1760 { 1761 tcp_t *tcp; 1762 clock_t now; 1763 mblk_t *mp; 1764 conn_t *connp; 1765 kmutex_t *lock; 1766 1767 squeue_t *sqp = (squeue_t *)arg; 1768 tcp_squeue_priv_t *tcp_time_wait = 1769 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 1770 1771 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1772 tcp_time_wait->tcp_time_wait_tid = 0; 1773 1774 if (tcp_time_wait->tcp_free_list != NULL && 1775 tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { 1776 TCP_STAT(tcp_freelist_cleanup); 1777 while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { 1778 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 1779 CONN_DEC_REF(tcp->tcp_connp); 1780 } 1781 tcp_time_wait->tcp_free_list_cnt = 0; 1782 } 1783 1784 /* 1785 * In order to reap time waits reliably, we should use a 1786 * source of time that is not adjustable by the user -- hence 1787 * the call to ddi_get_lbolt(). 1788 */ 1789 now = ddi_get_lbolt(); 1790 while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { 1791 /* 1792 * Compare times using modular arithmetic, since 1793 * lbolt can wrapover. 1794 */ 1795 if ((now - tcp->tcp_time_wait_expire) < 0) { 1796 break; 1797 } 1798 1799 tcp_time_wait_remove(tcp, tcp_time_wait); 1800 1801 connp = tcp->tcp_connp; 1802 ASSERT(connp->conn_fanout != NULL); 1803 lock = &connp->conn_fanout->connf_lock; 1804 /* 1805 * This is essentially a TW reclaim fast path optimization for 1806 * performance where the timewait collector checks under the 1807 * fanout lock (so that no one else can get access to the 1808 * conn_t) that the refcnt is 2 i.e. one for TCP and one for 1809 * the classifier hash list. If ref count is indeed 2, we can 1810 * just remove the conn under the fanout lock and avoid 1811 * cleaning up the conn under the squeue, provided that 1812 * clustering callbacks are not enabled. If clustering is 1813 * enabled, we need to make the clustering callback before 1814 * setting the CONDEMNED flag and after dropping all locks and 1815 * so we forego this optimization and fall back to the slow 1816 * path. Also please see the comments in tcp_closei_local 1817 * regarding the refcnt logic. 1818 * 1819 * Since we are holding the tcp_time_wait_lock, its better 1820 * not to block on the fanout_lock because other connections 1821 * can't add themselves to time_wait list. So we do a 1822 * tryenter instead of mutex_enter. 
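 *
 * (If the tryenter fails, if the refcnt is not exactly 2, or if a
 * clustering disconnect callback is registered, we fall back to the
 * slow path below: bump the conn reference and queue the
 * pre-allocated tcp_closemp onto the connection's squeue so that
 * tcp_timewait_output() reaps it there.)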
1823 */ 1824 if (mutex_tryenter(lock)) { 1825 mutex_enter(&connp->conn_lock); 1826 if ((connp->conn_ref == 2) && 1827 (cl_inet_disconnect == NULL)) { 1828 ipcl_hash_remove_locked(connp, 1829 connp->conn_fanout); 1830 /* 1831 * Set the CONDEMNED flag now itself so that 1832 * the refcnt cannot increase due to any 1833 * walker. But we have still not cleaned up 1834 * conn_ire_cache. This is still ok since 1835 * we are going to clean it up in tcp_cleanup 1836 * immediately and any interface unplumb 1837 * thread will wait till the ire is blown away 1838 */ 1839 connp->conn_state_flags |= CONN_CONDEMNED; 1840 mutex_exit(lock); 1841 mutex_exit(&connp->conn_lock); 1842 if (tcp_time_wait->tcp_free_list_cnt < 1843 tcp_free_list_max_cnt) { 1844 /* Add to head of tcp_free_list */ 1845 mutex_exit( 1846 &tcp_time_wait->tcp_time_wait_lock); 1847 tcp_cleanup(tcp); 1848 mutex_enter( 1849 &tcp_time_wait->tcp_time_wait_lock); 1850 tcp->tcp_time_wait_next = 1851 tcp_time_wait->tcp_free_list; 1852 tcp_time_wait->tcp_free_list = tcp; 1853 tcp_time_wait->tcp_free_list_cnt++; 1854 continue; 1855 } else { 1856 /* Do not add to tcp_free_list */ 1857 mutex_exit( 1858 &tcp_time_wait->tcp_time_wait_lock); 1859 tcp_bind_hash_remove(tcp); 1860 conn_delete_ire(tcp->tcp_connp, NULL); 1861 CONN_DEC_REF(tcp->tcp_connp); 1862 } 1863 } else { 1864 CONN_INC_REF_LOCKED(connp); 1865 mutex_exit(lock); 1866 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1867 mutex_exit(&connp->conn_lock); 1868 /* 1869 * We can reuse the closemp here since conn has 1870 * detached (otherwise we wouldn't even be in 1871 * time_wait list). 1872 */ 1873 mp = &tcp->tcp_closemp; 1874 squeue_fill(connp->conn_sqp, mp, 1875 tcp_timewait_output, connp, 1876 SQTAG_TCP_TIMEWAIT); 1877 } 1878 } else { 1879 mutex_enter(&connp->conn_lock); 1880 CONN_INC_REF_LOCKED(connp); 1881 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1882 mutex_exit(&connp->conn_lock); 1883 /* 1884 * We can reuse the closemp here since conn has 1885 * detached (otherwise we wouldn't even be in 1886 * time_wait list). 1887 */ 1888 mp = &tcp->tcp_closemp; 1889 squeue_fill(connp->conn_sqp, mp, 1890 tcp_timewait_output, connp, 0); 1891 } 1892 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1893 } 1894 1895 if (tcp_time_wait->tcp_free_list != NULL) 1896 tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; 1897 1898 tcp_time_wait->tcp_time_wait_tid = 1899 timeout(tcp_time_wait_collector, sqp, TCP_TIME_WAIT_DELAY); 1900 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1901 } 1902 1903 /* 1904 * Reply to a clients T_CONN_RES TPI message. This function 1905 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES 1906 * on the acceptor STREAM and processed in tcp_wput_accept(). 1907 * Read the block comment on top of tcp_conn_request(). 1908 */ 1909 static void 1910 tcp_accept(tcp_t *listener, mblk_t *mp) 1911 { 1912 tcp_t *acceptor; 1913 tcp_t *eager; 1914 tcp_t *tcp; 1915 struct T_conn_res *tcr; 1916 t_uscalar_t acceptor_id; 1917 t_scalar_t seqnum; 1918 mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ 1919 mblk_t *ok_mp; 1920 mblk_t *mp1; 1921 1922 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1923 tcp_err_ack(listener, mp, TPROTO, 0); 1924 return; 1925 } 1926 tcr = (struct T_conn_res *)mp->b_rptr; 1927 1928 /* 1929 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1930 * read side queue of the streams device underneath us i.e. the 1931 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1932 * look it up in the queue_hash. 
Under LP64 it sends down the 1933 * minor_t of the accepting endpoint. 1934 * 1935 * Once the acceptor/eager are modified (in tcp_accept_swap) the 1936 * fanout hash lock is held. 1937 * This prevents any thread from entering the acceptor queue from 1938 * below (since it has not been hard bound yet i.e. any inbound 1939 * packets will arrive on the listener or default tcp queue and 1940 * go through tcp_lookup). 1941 * The CONN_INC_REF will prevent the acceptor from closing. 1942 * 1943 * XXX It is still possible for a tli application to send down data 1944 * on the accepting stream while another thread calls t_accept. 1945 * This should not be a problem for well-behaved applications since 1946 * the T_OK_ACK is sent after the queue swapping is completed. 1947 * 1948 * If the accepting fd is the same as the listening fd, avoid 1949 * queue hash lookup since that will return an eager listener in a 1950 * already established state. 1951 */ 1952 acceptor_id = tcr->ACCEPTOR_id; 1953 mutex_enter(&listener->tcp_eager_lock); 1954 if (listener->tcp_acceptor_id == acceptor_id) { 1955 eager = listener->tcp_eager_next_q; 1956 /* only count how many T_CONN_INDs so don't count q0 */ 1957 if ((listener->tcp_conn_req_cnt_q != 1) || 1958 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 1959 mutex_exit(&listener->tcp_eager_lock); 1960 tcp_err_ack(listener, mp, TBADF, 0); 1961 return; 1962 } 1963 if (listener->tcp_conn_req_cnt_q0 != 0) { 1964 /* Throw away all the eagers on q0. */ 1965 tcp_eager_cleanup(listener, 1); 1966 } 1967 if (listener->tcp_syn_defense) { 1968 listener->tcp_syn_defense = B_FALSE; 1969 if (listener->tcp_ip_addr_cache != NULL) { 1970 kmem_free(listener->tcp_ip_addr_cache, 1971 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1972 listener->tcp_ip_addr_cache = NULL; 1973 } 1974 } 1975 /* 1976 * Transfer tcp_conn_req_max to the eager so that when 1977 * a disconnect occurs we can revert the endpoint to the 1978 * listen state. 1979 */ 1980 eager->tcp_conn_req_max = listener->tcp_conn_req_max; 1981 ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 1982 /* 1983 * Get a reference on the acceptor just like the 1984 * tcp_acceptor_hash_lookup below. 1985 */ 1986 acceptor = listener; 1987 CONN_INC_REF(acceptor->tcp_connp); 1988 } else { 1989 acceptor = tcp_acceptor_hash_lookup(acceptor_id); 1990 if (acceptor == NULL) { 1991 if (listener->tcp_debug) { 1992 (void) strlog(TCP_MOD_ID, 0, 1, 1993 SL_ERROR|SL_TRACE, 1994 "tcp_accept: did not find acceptor 0x%x\n", 1995 acceptor_id); 1996 } 1997 mutex_exit(&listener->tcp_eager_lock); 1998 tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 1999 return; 2000 } 2001 /* 2002 * Verify acceptor state. The acceptable states for an acceptor 2003 * include TCPS_IDLE and TCPS_BOUND. 2004 */ 2005 switch (acceptor->tcp_state) { 2006 case TCPS_IDLE: 2007 /* FALLTHRU */ 2008 case TCPS_BOUND: 2009 break; 2010 default: 2011 CONN_DEC_REF(acceptor->tcp_connp); 2012 mutex_exit(&listener->tcp_eager_lock); 2013 tcp_err_ack(listener, mp, TOUTSTATE, 0); 2014 return; 2015 } 2016 } 2017 2018 /* The listener must be in TCPS_LISTEN */ 2019 if (listener->tcp_state != TCPS_LISTEN) { 2020 CONN_DEC_REF(acceptor->tcp_connp); 2021 mutex_exit(&listener->tcp_eager_lock); 2022 tcp_err_ack(listener, mp, TOUTSTATE, 0); 2023 return; 2024 } 2025 2026 /* 2027 * Rendezvous with an eager connection request packet hanging off 2028 * 'tcp' that has the 'seqnum' tag. We tagged the detached open 2029 * tcp structure when the connection packet arrived in 2030 * tcp_conn_request(). 
2031 */ 2032 seqnum = tcr->SEQ_number; 2033 eager = listener; 2034 do { 2035 eager = eager->tcp_eager_next_q; 2036 if (eager == NULL) { 2037 CONN_DEC_REF(acceptor->tcp_connp); 2038 mutex_exit(&listener->tcp_eager_lock); 2039 tcp_err_ack(listener, mp, TBADSEQ, 0); 2040 return; 2041 } 2042 } while (eager->tcp_conn_req_seqnum != seqnum); 2043 mutex_exit(&listener->tcp_eager_lock); 2044 2045 /* 2046 * At this point, both acceptor and listener have 2 ref 2047 * that they begin with. Acceptor has one additional ref 2048 * we placed in lookup while listener has 3 additional 2049 * ref for being behind the squeue (tcp_accept() is 2050 * done on listener's squeue); being in classifier hash; 2051 * and eager's ref on listener. 2052 */ 2053 ASSERT(listener->tcp_connp->conn_ref >= 5); 2054 ASSERT(acceptor->tcp_connp->conn_ref >= 3); 2055 2056 /* 2057 * The eager at this point is set in its own squeue and 2058 * could easily have been killed (tcp_accept_finish will 2059 * deal with that) because of a TH_RST so we can only 2060 * ASSERT for a single ref. 2061 */ 2062 ASSERT(eager->tcp_connp->conn_ref >= 1); 2063 2064 /* Pre allocate the stroptions mblk also */ 2065 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI); 2066 if (opt_mp == NULL) { 2067 CONN_DEC_REF(acceptor->tcp_connp); 2068 CONN_DEC_REF(eager->tcp_connp); 2069 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2070 return; 2071 } 2072 DB_TYPE(opt_mp) = M_SETOPTS; 2073 opt_mp->b_wptr += sizeof (struct stroptions); 2074 2075 /* 2076 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO 2077 * from listener to acceptor. The message is chained on opt_mp 2078 * which will be sent onto eager's squeue. 2079 */ 2080 if (listener->tcp_bound_if != 0) { 2081 /* allocate optmgmt req */ 2082 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6, 2083 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if, 2084 sizeof (int)); 2085 if (mp1 != NULL) 2086 linkb(opt_mp, mp1); 2087 } 2088 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { 2089 uint_t on = 1; 2090 2091 /* allocate optmgmt req */ 2092 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6, 2093 IPV6_RECVPKTINFO, (char *)&on, sizeof (on)); 2094 if (mp1 != NULL) 2095 linkb(opt_mp, mp1); 2096 } 2097 2098 /* Re-use mp1 to hold a copy of mp, in case reallocb fails */ 2099 if ((mp1 = copymsg(mp)) == NULL) { 2100 CONN_DEC_REF(acceptor->tcp_connp); 2101 CONN_DEC_REF(eager->tcp_connp); 2102 freemsg(opt_mp); 2103 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2104 return; 2105 } 2106 2107 tcr = (struct T_conn_res *)mp1->b_rptr; 2108 2109 /* 2110 * This is an expanded version of mi_tpi_ok_ack_alloc() 2111 * which allocates a larger mblk and appends the new 2112 * local address to the ok_ack. The address is copied by 2113 * soaccept() for getsockname(). 2114 */ 2115 { 2116 int extra; 2117 2118 extra = (eager->tcp_family == AF_INET) ? 2119 sizeof (sin_t) : sizeof (sin6_t); 2120 2121 /* 2122 * Try to re-use mp, if possible. Otherwise, allocate 2123 * an mblk and return it as ok_mp. In any case, mp 2124 * is no longer usable upon return. 
2125 */ 2126 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 2127 CONN_DEC_REF(acceptor->tcp_connp); 2128 CONN_DEC_REF(eager->tcp_connp); 2129 freemsg(opt_mp); 2130 /* Original mp has been freed by now, so use mp1 */ 2131 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 2132 return; 2133 } 2134 2135 mp = NULL; /* We should never use mp after this point */ 2136 2137 switch (extra) { 2138 case sizeof (sin_t): { 2139 sin_t *sin = (sin_t *)ok_mp->b_wptr; 2140 2141 ok_mp->b_wptr += extra; 2142 sin->sin_family = AF_INET; 2143 sin->sin_port = eager->tcp_lport; 2144 sin->sin_addr.s_addr = 2145 eager->tcp_ipha->ipha_src; 2146 break; 2147 } 2148 case sizeof (sin6_t): { 2149 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 2150 2151 ok_mp->b_wptr += extra; 2152 sin6->sin6_family = AF_INET6; 2153 sin6->sin6_port = eager->tcp_lport; 2154 if (eager->tcp_ipversion == IPV4_VERSION) { 2155 sin6->sin6_flowinfo = 0; 2156 IN6_IPADDR_TO_V4MAPPED( 2157 eager->tcp_ipha->ipha_src, 2158 &sin6->sin6_addr); 2159 } else { 2160 ASSERT(eager->tcp_ip6h != NULL); 2161 sin6->sin6_flowinfo = 2162 eager->tcp_ip6h->ip6_vcf & 2163 ~IPV6_VERS_AND_FLOW_MASK; 2164 sin6->sin6_addr = 2165 eager->tcp_ip6h->ip6_src; 2166 } 2167 break; 2168 } 2169 default: 2170 break; 2171 } 2172 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 2173 } 2174 2175 /* 2176 * If there are no options we know that the T_CONN_RES will 2177 * succeed. However, we can't send the T_OK_ACK upstream until 2178 * the tcp_accept_swap is done since it would be dangerous to 2179 * let the application start using the new fd prior to the swap. 2180 */ 2181 tcp_accept_swap(listener, acceptor, eager); 2182 2183 /* 2184 * tcp_accept_swap unlinks eager from listener but does not drop 2185 * the eager's reference on the listener. 2186 */ 2187 ASSERT(eager->tcp_listener == NULL); 2188 ASSERT(listener->tcp_connp->conn_ref >= 5); 2189 2190 /* 2191 * The eager is now associated with its own queue. Insert in 2192 * the hash so that the connection can be reused for a future 2193 * T_CONN_RES. 2194 */ 2195 tcp_acceptor_hash_insert(acceptor_id, eager); 2196 2197 /* 2198 * We now do the processing of options with T_CONN_RES. 2199 * We delay till now since we wanted to have queue to pass to 2200 * option processing routines that points back to the right 2201 * instance structure which does not happen until after 2202 * tcp_accept_swap(). 2203 * 2204 * Note: 2205 * The sanity of the logic here assumes that whatever options 2206 * are appropriate to inherit from listner=>eager are done 2207 * before this point, and whatever were to be overridden (or not) 2208 * in transfer logic from eager=>acceptor in tcp_accept_swap(). 2209 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 2210 * before its ACCEPTOR_id comes down in T_CONN_RES ] 2211 * This may not be true at this point in time but can be fixed 2212 * independently. This option processing code starts with 2213 * the instantiated acceptor instance and the final queue at 2214 * this point. 2215 */ 2216 2217 if (tcr->OPT_length != 0) { 2218 /* Options to process */ 2219 int t_error = 0; 2220 int sys_error = 0; 2221 int do_disconnect = 0; 2222 2223 if (tcp_conprim_opt_process(eager, mp1, 2224 &do_disconnect, &t_error, &sys_error) < 0) { 2225 eager->tcp_accept_error = 1; 2226 if (do_disconnect) { 2227 /* 2228 * An option failed which does not allow 2229 * connection to be accepted. 2230 * 2231 * We allow T_CONN_RES to succeed and 2232 * put a T_DISCON_IND on the eager queue. 
2233 */ 2234 ASSERT(t_error == 0 && sys_error == 0); 2235 eager->tcp_send_discon_ind = 1; 2236 } else { 2237 ASSERT(t_error != 0); 2238 freemsg(ok_mp); 2239 /* 2240 * Original mp was either freed or set 2241 * to ok_mp above, so use mp1 instead. 2242 */ 2243 tcp_err_ack(listener, mp1, t_error, sys_error); 2244 goto finish; 2245 } 2246 } 2247 /* 2248 * Most likely success in setting options (except if 2249 * eager->tcp_send_discon_ind set). 2250 * mp1 option buffer represented by OPT_length/offset 2251 * potentially modified and contains results of setting 2252 * options at this point 2253 */ 2254 } 2255 2256 /* We no longer need mp1, since all options processing has passed */ 2257 freemsg(mp1); 2258 2259 putnext(listener->tcp_rq, ok_mp); 2260 2261 mutex_enter(&listener->tcp_eager_lock); 2262 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 2263 tcp_t *tail; 2264 mblk_t *conn_ind; 2265 2266 /* 2267 * This path should not be executed if listener and 2268 * acceptor streams are the same. 2269 */ 2270 ASSERT(listener != acceptor); 2271 2272 tcp = listener->tcp_eager_prev_q0; 2273 /* 2274 * listener->tcp_eager_prev_q0 points to the TAIL of the 2275 * deferred T_conn_ind queue. We need to get to the head of 2276 * the queue in order to send up T_conn_ind the same order as 2277 * how the 3WHS is completed. 2278 */ 2279 while (tcp != listener) { 2280 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 2281 break; 2282 else 2283 tcp = tcp->tcp_eager_prev_q0; 2284 } 2285 ASSERT(tcp != listener); 2286 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 2287 ASSERT(conn_ind != NULL); 2288 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 2289 2290 /* Move from q0 to q */ 2291 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 2292 listener->tcp_conn_req_cnt_q0--; 2293 listener->tcp_conn_req_cnt_q++; 2294 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 2295 tcp->tcp_eager_prev_q0; 2296 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 2297 tcp->tcp_eager_next_q0; 2298 tcp->tcp_eager_prev_q0 = NULL; 2299 tcp->tcp_eager_next_q0 = NULL; 2300 tcp->tcp_conn_def_q0 = B_FALSE; 2301 2302 /* 2303 * Insert at end of the queue because sockfs sends 2304 * down T_CONN_RES in chronological order. Leaving 2305 * the older conn indications at front of the queue 2306 * helps reducing search time. 2307 */ 2308 tail = listener->tcp_eager_last_q; 2309 if (tail != NULL) 2310 tail->tcp_eager_next_q = tcp; 2311 else 2312 listener->tcp_eager_next_q = tcp; 2313 listener->tcp_eager_last_q = tcp; 2314 tcp->tcp_eager_next_q = NULL; 2315 mutex_exit(&listener->tcp_eager_lock); 2316 putnext(tcp->tcp_rq, conn_ind); 2317 } else { 2318 mutex_exit(&listener->tcp_eager_lock); 2319 } 2320 2321 /* 2322 * Done with the acceptor - free it 2323 * 2324 * Note: from this point on, no access to listener should be made 2325 * as listener can be equal to acceptor. 2326 */ 2327 finish: 2328 ASSERT(acceptor->tcp_detached); 2329 acceptor->tcp_rq = tcp_g_q; 2330 acceptor->tcp_wq = WR(tcp_g_q); 2331 (void) tcp_clean_death(acceptor, 0, 2); 2332 CONN_DEC_REF(acceptor->tcp_connp); 2333 2334 /* 2335 * In case we already received a FIN we have to make tcp_rput send 2336 * the ordrel_ind. This will also send up a window update if the window 2337 * has opened up. 2338 * 2339 * In the normal case of a successful connection acceptance 2340 * we give the O_T_BIND_REQ to the read side put procedure as an 2341 * indication that this was just accepted. This tells tcp_rput to 2342 * pass up any data queued in tcp_rcv_list. 
2343 * 2344 * In the fringe case where options sent with T_CONN_RES failed and 2345 * we required, we would be indicating a T_DISCON_IND to blow 2346 * away this connection. 2347 */ 2348 2349 /* 2350 * XXX: we currently have a problem if XTI application closes the 2351 * acceptor stream in between. This problem exists in on10-gate also 2352 * and is well know but nothing can be done short of major rewrite 2353 * to fix it. Now it is possible to take care of it by assigning TLI/XTI 2354 * eager same squeue as listener (we can distinguish non socket 2355 * listeners at the time of handling a SYN in tcp_conn_request) 2356 * and do most of the work that tcp_accept_finish does here itself 2357 * and then get behind the acceptor squeue to access the acceptor 2358 * queue. 2359 */ 2360 /* 2361 * We already have a ref on tcp so no need to do one before squeue_fill 2362 */ 2363 squeue_fill(eager->tcp_connp->conn_sqp, opt_mp, 2364 tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH); 2365 } 2366 2367 /* 2368 * Swap information between the eager and acceptor for a TLI/XTI client. 2369 * The sockfs accept is done on the acceptor stream and control goes 2370 * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not 2371 * called. In either case, both the eager and listener are in their own 2372 * perimeter (squeue) and the code has to deal with potential race. 2373 * 2374 * See the block comment on top of tcp_accept() and tcp_wput_accept(). 2375 */ 2376 static void 2377 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) 2378 { 2379 conn_t *econnp, *aconnp; 2380 2381 ASSERT(eager->tcp_rq == listener->tcp_rq); 2382 ASSERT(eager->tcp_detached && !acceptor->tcp_detached); 2383 ASSERT(!eager->tcp_hard_bound); 2384 ASSERT(!TCP_IS_SOCKET(acceptor)); 2385 ASSERT(!TCP_IS_SOCKET(eager)); 2386 ASSERT(!TCP_IS_SOCKET(listener)); 2387 2388 acceptor->tcp_detached = B_TRUE; 2389 /* 2390 * To permit stream re-use by TLI/XTI, the eager needs a copy of 2391 * the acceptor id. 2392 */ 2393 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; 2394 2395 /* remove eager from listen list... */ 2396 mutex_enter(&listener->tcp_eager_lock); 2397 tcp_eager_unlink(eager); 2398 ASSERT(eager->tcp_eager_next_q == NULL && 2399 eager->tcp_eager_last_q == NULL); 2400 ASSERT(eager->tcp_eager_next_q0 == NULL && 2401 eager->tcp_eager_prev_q0 == NULL); 2402 mutex_exit(&listener->tcp_eager_lock); 2403 eager->tcp_rq = acceptor->tcp_rq; 2404 eager->tcp_wq = acceptor->tcp_wq; 2405 2406 econnp = eager->tcp_connp; 2407 aconnp = acceptor->tcp_connp; 2408 2409 eager->tcp_rq->q_ptr = econnp; 2410 eager->tcp_wq->q_ptr = econnp; 2411 2412 /* 2413 * In the TLI/XTI loopback case, we are inside the listener's squeue, 2414 * which might be a different squeue from our peer TCP instance. 2415 * For TCP Fusion, the peer expects that whenever tcp_detached is 2416 * clear, our TCP queues point to the acceptor's queues. Thus, use 2417 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq 2418 * above reach global visibility prior to the clearing of tcp_detached. 
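 *
 * (In other words, the store barrier orders the queue-pointer stores
 * ahead of the store clearing tcp_detached; a peer that observes
 * tcp_detached == B_FALSE can then rely on tcp_rq/tcp_wq already
 * referring to the acceptor's queues, provided it applies the
 * matching read-side ordering.)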
2419 */ 2420 membar_producer(); 2421 eager->tcp_detached = B_FALSE; 2422 2423 ASSERT(eager->tcp_ack_tid == 0); 2424 2425 econnp->conn_dev = aconnp->conn_dev; 2426 if (eager->tcp_cred != NULL) 2427 crfree(eager->tcp_cred); 2428 eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred; 2429 aconnp->conn_cred = NULL; 2430 2431 econnp->conn_zoneid = aconnp->conn_zoneid; 2432 econnp->conn_allzones = aconnp->conn_allzones; 2433 2434 econnp->conn_mac_exempt = aconnp->conn_mac_exempt; 2435 aconnp->conn_mac_exempt = B_FALSE; 2436 2437 ASSERT(aconnp->conn_peercred == NULL); 2438 2439 /* Do the IPC initialization */ 2440 CONN_INC_REF(econnp); 2441 2442 econnp->conn_multicast_loop = aconnp->conn_multicast_loop; 2443 econnp->conn_af_isv6 = aconnp->conn_af_isv6; 2444 econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6; 2445 econnp->conn_ulp = aconnp->conn_ulp; 2446 2447 /* Done with old IPC. Drop its ref on its connp */ 2448 CONN_DEC_REF(aconnp); 2449 } 2450 2451 2452 /* 2453 * Adapt to the information, such as rtt and rtt_sd, provided from the 2454 * ire cached in conn_cache_ire. If no ire cached, do a ire lookup. 2455 * 2456 * Checks for multicast and broadcast destination address. 2457 * Returns zero on failure; non-zero if ok. 2458 * 2459 * Note that the MSS calculation here is based on the info given in 2460 * the IRE. We do not do any calculation based on TCP options. They 2461 * will be handled in tcp_rput_other() and tcp_rput_data() when TCP 2462 * knows which options to use. 2463 * 2464 * Note on how TCP gets its parameters for a connection. 2465 * 2466 * When a tcp_t structure is allocated, it gets all the default parameters. 2467 * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd, 2468 * spipe, rpipe, ... from the route metrics. Route metric overrides the 2469 * default. But if there is an associated tcp_host_param, it will override 2470 * the metrics. 2471 * 2472 * An incoming SYN with a multicast or broadcast destination address, is dropped 2473 * in 1 of 2 places. 2474 * 2475 * 1. If the packet was received over the wire it is dropped in 2476 * ip_rput_process_broadcast() 2477 * 2478 * 2. If the packet was received through internal IP loopback, i.e. the packet 2479 * was generated and received on the same machine, it is dropped in 2480 * ip_wput_local() 2481 * 2482 * An incoming SYN with a multicast or broadcast source address is always 2483 * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to 2484 * reject an attempt to connect to a broadcast or multicast (destination) 2485 * address. 2486 */ 2487 static int 2488 tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) 2489 { 2490 tcp_hsp_t *hsp; 2491 ire_t *ire; 2492 ire_t *sire = NULL; 2493 iulp_t *ire_uinfo = NULL; 2494 uint32_t mss_max; 2495 uint32_t mss; 2496 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 2497 conn_t *connp = tcp->tcp_connp; 2498 boolean_t ire_cacheable = B_FALSE; 2499 zoneid_t zoneid = connp->conn_zoneid; 2500 int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 2501 MATCH_IRE_SECATTR; 2502 ts_label_t *tsl = crgetlabel(CONN_CRED(connp)); 2503 ill_t *ill = NULL; 2504 boolean_t incoming = (ire_mp == NULL); 2505 2506 ASSERT(connp->conn_ire_cache == NULL); 2507 2508 if (tcp->tcp_ipversion == IPV4_VERSION) { 2509 2510 if (CLASSD(tcp->tcp_connp->conn_rem)) { 2511 BUMP_MIB(&ip_mib, ipInDiscards); 2512 return (0); 2513 } 2514 /* 2515 * If IP_NEXTHOP is set, then look for an IRE_CACHE 2516 * for the destination with the nexthop as gateway. 
2517 * ire_ctable_lookup() is used because this particular 2518 * ire, if it exists, will be marked private. 2519 * If that is not available, use the interface ire 2520 * for the nexthop. 2521 * 2522 * TSol: tcp_update_label will detect label mismatches based 2523 * only on the destination's label, but that would not 2524 * detect label mismatches based on the security attributes 2525 * of routes or next hop gateway. Hence we need to pass the 2526 * label to ire_ftable_lookup below in order to locate the 2527 * right prefix (and/or) ire cache. Similarly we also need 2528 * pass the label to the ire_cache_lookup below to locate 2529 * the right ire that also matches on the label. 2530 */ 2531 if (tcp->tcp_connp->conn_nexthop_set) { 2532 ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem, 2533 tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid, 2534 tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW); 2535 if (ire == NULL) { 2536 ire = ire_ftable_lookup( 2537 tcp->tcp_connp->conn_nexthop_v4, 2538 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0, 2539 tsl, match_flags); 2540 if (ire == NULL) 2541 return (0); 2542 } else { 2543 ire_uinfo = &ire->ire_uinfo; 2544 } 2545 } else { 2546 ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, 2547 zoneid, tsl); 2548 if (ire != NULL) { 2549 ire_cacheable = B_TRUE; 2550 ire_uinfo = (ire_mp != NULL) ? 2551 &((ire_t *)ire_mp->b_rptr)->ire_uinfo: 2552 &ire->ire_uinfo; 2553 2554 } else { 2555 if (ire_mp == NULL) { 2556 ire = ire_ftable_lookup( 2557 tcp->tcp_connp->conn_rem, 2558 0, 0, 0, NULL, &sire, zoneid, 0, 2559 tsl, (MATCH_IRE_RECURSIVE | 2560 MATCH_IRE_DEFAULT)); 2561 if (ire == NULL) 2562 return (0); 2563 ire_uinfo = (sire != NULL) ? 2564 &sire->ire_uinfo : 2565 &ire->ire_uinfo; 2566 } else { 2567 ire = (ire_t *)ire_mp->b_rptr; 2568 ire_uinfo = 2569 &((ire_t *) 2570 ire_mp->b_rptr)->ire_uinfo; 2571 } 2572 } 2573 } 2574 ASSERT(ire != NULL); 2575 2576 if ((ire->ire_src_addr == INADDR_ANY) || 2577 (ire->ire_type & IRE_BROADCAST)) { 2578 /* 2579 * ire->ire_mp is non null when ire_mp passed in is used 2580 * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 2581 */ 2582 if (ire->ire_mp == NULL) 2583 ire_refrele(ire); 2584 if (sire != NULL) 2585 ire_refrele(sire); 2586 return (0); 2587 } 2588 2589 if (tcp->tcp_ipha->ipha_src == INADDR_ANY) { 2590 ipaddr_t src_addr; 2591 2592 /* 2593 * ip_bind_connected() has stored the correct source 2594 * address in conn_src. 2595 */ 2596 src_addr = tcp->tcp_connp->conn_src; 2597 tcp->tcp_ipha->ipha_src = src_addr; 2598 /* 2599 * Copy of the src addr. in tcp_t is needed 2600 * for the lookup funcs. 2601 */ 2602 IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6); 2603 } 2604 /* 2605 * Set the fragment bit so that IP will tell us if the MTU 2606 * should change. IP tells us the latest setting of 2607 * ip_path_mtu_discovery through ire_frag_flag. 2608 */ 2609 if (ip_path_mtu_discovery) { 2610 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 2611 htons(IPH_DF); 2612 } 2613 /* 2614 * If ire_uinfo is NULL, this is the IRE_INTERFACE case 2615 * for IP_NEXTHOP. No cache ire has been found for the 2616 * destination and we are working with the nexthop's 2617 * interface ire. Since we need to forward all packets 2618 * to the nexthop first, we "blindly" set tcp_localnet 2619 * to false, eventhough the destination may also be 2620 * onlink. 
2621 */ 2622 if (ire_uinfo == NULL) 2623 tcp->tcp_localnet = 0; 2624 else 2625 tcp->tcp_localnet = (ire->ire_gateway_addr == 0); 2626 } else { 2627 /* 2628 * For incoming connection ire_mp = NULL 2629 * For outgoing connection ire_mp != NULL 2630 * Technically we should check conn_incoming_ill 2631 * when ire_mp is NULL and conn_outgoing_ill when 2632 * ire_mp is non-NULL. But this is performance 2633 * critical path and for IPV*_BOUND_IF, outgoing 2634 * and incoming ill are always set to the same value. 2635 */ 2636 ill_t *dst_ill = NULL; 2637 ipif_t *dst_ipif = NULL; 2638 2639 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 2640 2641 if (connp->conn_outgoing_ill != NULL) { 2642 /* Outgoing or incoming path */ 2643 int err; 2644 2645 dst_ill = conn_get_held_ill(connp, 2646 &connp->conn_outgoing_ill, &err); 2647 if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) { 2648 ip1dbg(("tcp_adapt_ire: ill_lookup failed\n")); 2649 return (0); 2650 } 2651 match_flags |= MATCH_IRE_ILL; 2652 dst_ipif = dst_ill->ill_ipif; 2653 } 2654 ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6, 2655 0, 0, dst_ipif, zoneid, tsl, match_flags); 2656 2657 if (ire != NULL) { 2658 ire_cacheable = B_TRUE; 2659 ire_uinfo = (ire_mp != NULL) ? 2660 &((ire_t *)ire_mp->b_rptr)->ire_uinfo: 2661 &ire->ire_uinfo; 2662 } else { 2663 if (ire_mp == NULL) { 2664 ire = ire_ftable_lookup_v6( 2665 &tcp->tcp_connp->conn_remv6, 2666 0, 0, 0, dst_ipif, &sire, zoneid, 2667 0, tsl, match_flags); 2668 if (ire == NULL) { 2669 if (dst_ill != NULL) 2670 ill_refrele(dst_ill); 2671 return (0); 2672 } 2673 ire_uinfo = (sire != NULL) ? &sire->ire_uinfo : 2674 &ire->ire_uinfo; 2675 } else { 2676 ire = (ire_t *)ire_mp->b_rptr; 2677 ire_uinfo = 2678 &((ire_t *)ire_mp->b_rptr)->ire_uinfo; 2679 } 2680 } 2681 if (dst_ill != NULL) 2682 ill_refrele(dst_ill); 2683 2684 ASSERT(ire != NULL); 2685 ASSERT(ire_uinfo != NULL); 2686 2687 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) || 2688 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 2689 /* 2690 * ire->ire_mp is non null when ire_mp passed in is used 2691 * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 2692 */ 2693 if (ire->ire_mp == NULL) 2694 ire_refrele(ire); 2695 if (sire != NULL) 2696 ire_refrele(sire); 2697 return (0); 2698 } 2699 2700 if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { 2701 in6_addr_t src_addr; 2702 2703 /* 2704 * ip_bind_connected_v6() has stored the correct source 2705 * address per IPv6 addr. selection policy in 2706 * conn_src_v6. 2707 */ 2708 src_addr = tcp->tcp_connp->conn_srcv6; 2709 2710 tcp->tcp_ip6h->ip6_src = src_addr; 2711 /* 2712 * Copy of the src addr. in tcp_t is needed 2713 * for the lookup funcs. 2714 */ 2715 tcp->tcp_ip_src_v6 = src_addr; 2716 ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src, 2717 &connp->conn_srcv6)); 2718 } 2719 tcp->tcp_localnet = 2720 IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6); 2721 } 2722 2723 /* 2724 * This allows applications to fail quickly when connections are made 2725 * to dead hosts. Hosts can be labeled dead by adding a reject route 2726 * with both the RTF_REJECT and RTF_PRIVATE flags set. 2727 */ 2728 if ((ire->ire_flags & RTF_REJECT) && 2729 (ire->ire_flags & RTF_PRIVATE)) 2730 goto error; 2731 2732 /* 2733 * Make use of the cached rtt and rtt_sd values to calculate the 2734 * initial RTO. Note that they are already initialized in 2735 * tcp_init_values(). 
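 *
 * A purely illustrative plug-in of the formula used below: with a
 * cached iulp_rtt of 700 and iulp_rtt_sd of 120, and
 * tcp_rexmit_interval_extra left at 0, the candidate RTO is
 * (700 >> 3) + 120 + (700 >> 5) = 87 + 120 + 21 = 228, which is then
 * clamped to [tcp_rexmit_interval_min, tcp_rexmit_interval_max]
 * before being stored in tcp_rto.
 *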
2736 * If ire_uinfo is NULL, i.e., we do not have a cache ire for 2737 * IP_NEXTHOP, but instead are using the interface ire for the 2738 * nexthop, then we do not use the ire_uinfo from that ire to 2739 * do any initializations. 2740 */ 2741 if (ire_uinfo != NULL) { 2742 if (ire_uinfo->iulp_rtt != 0) { 2743 clock_t rto; 2744 2745 tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt; 2746 tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd; 2747 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 2748 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5); 2749 2750 if (rto > tcp_rexmit_interval_max) { 2751 tcp->tcp_rto = tcp_rexmit_interval_max; 2752 } else if (rto < tcp_rexmit_interval_min) { 2753 tcp->tcp_rto = tcp_rexmit_interval_min; 2754 } else { 2755 tcp->tcp_rto = rto; 2756 } 2757 } 2758 if (ire_uinfo->iulp_ssthresh != 0) 2759 tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh; 2760 else 2761 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 2762 if (ire_uinfo->iulp_spipe > 0) { 2763 tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe, 2764 tcp_max_buf); 2765 if (tcp_snd_lowat_fraction != 0) 2766 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / 2767 tcp_snd_lowat_fraction; 2768 (void) tcp_maxpsz_set(tcp, B_TRUE); 2769 } 2770 /* 2771 * Note that up till now, acceptor always inherits receive 2772 * window from the listener. But if there is a metrics 2773 * associated with a host, we should use that instead of 2774 * inheriting it from listener. Thus we need to pass this 2775 * info back to the caller. 2776 */ 2777 if (ire_uinfo->iulp_rpipe > 0) { 2778 tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, tcp_max_buf); 2779 } 2780 2781 if (ire_uinfo->iulp_rtomax > 0) { 2782 tcp->tcp_second_timer_threshold = 2783 ire_uinfo->iulp_rtomax; 2784 } 2785 2786 /* 2787 * Use the metric option settings, iulp_tstamp_ok and 2788 * iulp_wscale_ok, only for active open. What this means 2789 * is that if the other side uses timestamp or window 2790 * scale option, TCP will also use those options. That 2791 * is for passive open. If the application sets a 2792 * large window, window scale is enabled regardless of 2793 * the value in iulp_wscale_ok. This is the behavior 2794 * since 2.6. So we keep it. 2795 * The only case left in passive open processing is the 2796 * check for SACK. 2797 * For ECN, it should probably be like SACK. But the 2798 * current value is binary, so we treat it like the other 2799 * cases. The metric only controls active open.For passive 2800 * open, the ndd param, tcp_ecn_permitted, controls the 2801 * behavior. 2802 */ 2803 if (!tcp_detached) { 2804 /* 2805 * The if check means that the following can only 2806 * be turned on by the metrics only IRE, but not off. 2807 */ 2808 if (ire_uinfo->iulp_tstamp_ok) 2809 tcp->tcp_snd_ts_ok = B_TRUE; 2810 if (ire_uinfo->iulp_wscale_ok) 2811 tcp->tcp_snd_ws_ok = B_TRUE; 2812 if (ire_uinfo->iulp_sack == 2) 2813 tcp->tcp_snd_sack_ok = B_TRUE; 2814 if (ire_uinfo->iulp_ecn_ok) 2815 tcp->tcp_ecn_ok = B_TRUE; 2816 } else { 2817 /* 2818 * Passive open. 2819 * 2820 * As above, the if check means that SACK can only be 2821 * turned on by the metric only IRE. 2822 */ 2823 if (ire_uinfo->iulp_sack > 0) { 2824 tcp->tcp_snd_sack_ok = B_TRUE; 2825 } 2826 } 2827 } 2828 2829 2830 /* 2831 * XXX: Note that currently, ire_max_frag can be as small as 68 2832 * because of PMTUd. So tcp_mss may go to negative if combined 2833 * length of all those options exceeds 28 bytes. But because 2834 * of the tcp_mss_min check below, we may not have a problem if 2835 * tcp_mss_min is of a reasonable value. 
The default is 1 so 2836 * the negative problem still exists. And the check defeats PMTUd. 2837 * In fact, if PMTUd finds that the MSS should be smaller than 2838 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min 2839 * value. 2840 * 2841 * We do not deal with that now. All those problems related to 2842 * PMTUd will be fixed later. 2843 */ 2844 ASSERT(ire->ire_max_frag != 0); 2845 mss = tcp->tcp_if_mtu = ire->ire_max_frag; 2846 if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) { 2847 if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) { 2848 mss = MIN(mss, IPV6_MIN_MTU); 2849 } 2850 } 2851 2852 /* Sanity check for MSS value. */ 2853 if (tcp->tcp_ipversion == IPV4_VERSION) 2854 mss_max = tcp_mss_max_ipv4; 2855 else 2856 mss_max = tcp_mss_max_ipv6; 2857 2858 if (tcp->tcp_ipversion == IPV6_VERSION && 2859 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 2860 /* 2861 * After receiving an ICMPv6 "packet too big" message with a 2862 * MTU < 1280, and for multirouted IPv6 packets, the IP layer 2863 * will insert a 8-byte fragment header in every packet; we 2864 * reduce the MSS by that amount here. 2865 */ 2866 mss -= sizeof (ip6_frag_t); 2867 } 2868 2869 if (tcp->tcp_ipsec_overhead == 0) 2870 tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); 2871 2872 mss -= tcp->tcp_ipsec_overhead; 2873 2874 if (mss < tcp_mss_min) 2875 mss = tcp_mss_min; 2876 if (mss > mss_max) 2877 mss = mss_max; 2878 2879 /* Note that this is the maximum MSS, excluding all options. */ 2880 tcp->tcp_mss = mss; 2881 2882 /* 2883 * Initialize the ISS here now that we have the full connection ID. 2884 * The RFC 1948 method of initial sequence number generation requires 2885 * knowledge of the full connection ID before setting the ISS. 2886 */ 2887 2888 tcp_iss_init(tcp); 2889 2890 if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL)) 2891 tcp->tcp_loopback = B_TRUE; 2892 2893 if (tcp->tcp_ipversion == IPV4_VERSION) { 2894 hsp = tcp_hsp_lookup(tcp->tcp_remote); 2895 } else { 2896 hsp = tcp_hsp_lookup_ipv6(&tcp->tcp_remote_v6); 2897 } 2898 2899 if (hsp != NULL) { 2900 /* Only modify if we're going to make them bigger */ 2901 if (hsp->tcp_hsp_sendspace > tcp->tcp_xmit_hiwater) { 2902 tcp->tcp_xmit_hiwater = hsp->tcp_hsp_sendspace; 2903 if (tcp_snd_lowat_fraction != 0) 2904 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / 2905 tcp_snd_lowat_fraction; 2906 } 2907 2908 if (hsp->tcp_hsp_recvspace > tcp->tcp_rwnd) { 2909 tcp->tcp_rwnd = hsp->tcp_hsp_recvspace; 2910 } 2911 2912 /* Copy timestamp flag only for active open */ 2913 if (!tcp_detached) 2914 tcp->tcp_snd_ts_ok = hsp->tcp_hsp_tstamp; 2915 } 2916 2917 if (sire != NULL) 2918 IRE_REFRELE(sire); 2919 2920 /* 2921 * If we got an IRE_CACHE and an ILL, go through their properties; 2922 * otherwise, this is deferred until later when we have an IRE_CACHE. 2923 */ 2924 if (tcp->tcp_loopback || 2925 (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) { 2926 /* 2927 * For incoming, see if this tcp may be MDT-capable. For 2928 * outgoing, this process has been taken care of through 2929 * tcp_rput_other. 2930 */ 2931 tcp_ire_ill_check(tcp, ire, ill, incoming); 2932 tcp->tcp_ire_ill_check_done = B_TRUE; 2933 } 2934 2935 mutex_enter(&connp->conn_lock); 2936 /* 2937 * Make sure that conn is not marked incipient 2938 * for incoming connections. A blind 2939 * removal of incipient flag is cheaper than 2940 * check and removal. 2941 */ 2942 connp->conn_state_flags &= ~CONN_INCIPIENT; 2943 2944 /* Must not cache forwarding table routes. 
*/ 2945 if (ire_cacheable) { 2946 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 2947 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 2948 connp->conn_ire_cache = ire; 2949 IRE_UNTRACE_REF(ire); 2950 rw_exit(&ire->ire_bucket->irb_lock); 2951 mutex_exit(&connp->conn_lock); 2952 return (1); 2953 } 2954 rw_exit(&ire->ire_bucket->irb_lock); 2955 } 2956 mutex_exit(&connp->conn_lock); 2957 2958 if (ire->ire_mp == NULL) 2959 ire_refrele(ire); 2960 return (1); 2961 2962 error: 2963 if (ire->ire_mp == NULL) 2964 ire_refrele(ire); 2965 if (sire != NULL) 2966 ire_refrele(sire); 2967 return (0); 2968 } 2969 2970 /* 2971 * tcp_bind is called (holding the writer lock) by tcp_wput_proto to process a 2972 * O_T_BIND_REQ/T_BIND_REQ message. 2973 */ 2974 static void 2975 tcp_bind(tcp_t *tcp, mblk_t *mp) 2976 { 2977 sin_t *sin; 2978 sin6_t *sin6; 2979 mblk_t *mp1; 2980 in_port_t requested_port; 2981 in_port_t allocated_port; 2982 struct T_bind_req *tbr; 2983 boolean_t bind_to_req_port_only; 2984 boolean_t backlog_update = B_FALSE; 2985 boolean_t user_specified; 2986 in6_addr_t v6addr; 2987 ipaddr_t v4addr; 2988 uint_t origipversion; 2989 int err; 2990 queue_t *q = tcp->tcp_wq; 2991 conn_t *connp; 2992 mlp_type_t addrtype, mlptype; 2993 zone_t *zone; 2994 cred_t *cr; 2995 in_port_t mlp_port; 2996 2997 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 2998 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 2999 if (tcp->tcp_debug) { 3000 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3001 "tcp_bind: bad req, len %u", 3002 (uint_t)(mp->b_wptr - mp->b_rptr)); 3003 } 3004 tcp_err_ack(tcp, mp, TPROTO, 0); 3005 return; 3006 } 3007 /* Make sure the largest address fits */ 3008 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); 3009 if (mp1 == NULL) { 3010 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3011 return; 3012 } 3013 mp = mp1; 3014 tbr = (struct T_bind_req *)mp->b_rptr; 3015 if (tcp->tcp_state >= TCPS_BOUND) { 3016 if ((tcp->tcp_state == TCPS_BOUND || 3017 tcp->tcp_state == TCPS_LISTEN) && 3018 tcp->tcp_conn_req_max != tbr->CONIND_number && 3019 tbr->CONIND_number > 0) { 3020 /* 3021 * Handle listen() increasing CONIND_number. 3022 * This is more "liberal" then what the TPI spec 3023 * requires but is needed to avoid a t_unbind 3024 * when handling listen() since the port number 3025 * might be "stolen" between the unbind and bind. 
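 *
 * (A typical trigger, stated informally: an application that is
 * already listening calls listen() again with a larger backlog;
 * sockfs turns that into another T_BIND_REQ carrying the new
 * CONIND_number, and we simply update the backlog via do_bind rather
 * than forcing an unbind/bind cycle.)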
3026 */ 3027 backlog_update = B_TRUE; 3028 goto do_bind; 3029 } 3030 if (tcp->tcp_debug) { 3031 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3032 "tcp_bind: bad state, %d", tcp->tcp_state); 3033 } 3034 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 3035 return; 3036 } 3037 origipversion = tcp->tcp_ipversion; 3038 3039 switch (tbr->ADDR_length) { 3040 case 0: /* request for a generic port */ 3041 tbr->ADDR_offset = sizeof (struct T_bind_req); 3042 if (tcp->tcp_family == AF_INET) { 3043 tbr->ADDR_length = sizeof (sin_t); 3044 sin = (sin_t *)&tbr[1]; 3045 *sin = sin_null; 3046 sin->sin_family = AF_INET; 3047 mp->b_wptr = (uchar_t *)&sin[1]; 3048 tcp->tcp_ipversion = IPV4_VERSION; 3049 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr); 3050 } else { 3051 ASSERT(tcp->tcp_family == AF_INET6); 3052 tbr->ADDR_length = sizeof (sin6_t); 3053 sin6 = (sin6_t *)&tbr[1]; 3054 *sin6 = sin6_null; 3055 sin6->sin6_family = AF_INET6; 3056 mp->b_wptr = (uchar_t *)&sin6[1]; 3057 tcp->tcp_ipversion = IPV6_VERSION; 3058 V6_SET_ZERO(v6addr); 3059 } 3060 requested_port = 0; 3061 break; 3062 3063 case sizeof (sin_t): /* Complete IPv4 address */ 3064 sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset, 3065 sizeof (sin_t)); 3066 if (sin == NULL || !OK_32PTR((char *)sin)) { 3067 if (tcp->tcp_debug) { 3068 (void) strlog(TCP_MOD_ID, 0, 1, 3069 SL_ERROR|SL_TRACE, 3070 "tcp_bind: bad address parameter, " 3071 "offset %d, len %d", 3072 tbr->ADDR_offset, tbr->ADDR_length); 3073 } 3074 tcp_err_ack(tcp, mp, TPROTO, 0); 3075 return; 3076 } 3077 /* 3078 * With sockets sockfs will accept bogus sin_family in 3079 * bind() and replace it with the family used in the socket 3080 * call. 3081 */ 3082 if (sin->sin_family != AF_INET || 3083 tcp->tcp_family != AF_INET) { 3084 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 3085 return; 3086 } 3087 requested_port = ntohs(sin->sin_port); 3088 tcp->tcp_ipversion = IPV4_VERSION; 3089 v4addr = sin->sin_addr.s_addr; 3090 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); 3091 break; 3092 3093 case sizeof (sin6_t): /* Complete IPv6 address */ 3094 sin6 = (sin6_t *)mi_offset_param(mp, 3095 tbr->ADDR_offset, sizeof (sin6_t)); 3096 if (sin6 == NULL || !OK_32PTR((char *)sin6)) { 3097 if (tcp->tcp_debug) { 3098 (void) strlog(TCP_MOD_ID, 0, 1, 3099 SL_ERROR|SL_TRACE, 3100 "tcp_bind: bad IPv6 address parameter, " 3101 "offset %d, len %d", tbr->ADDR_offset, 3102 tbr->ADDR_length); 3103 } 3104 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 3105 return; 3106 } 3107 if (sin6->sin6_family != AF_INET6 || 3108 tcp->tcp_family != AF_INET6) { 3109 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 3110 return; 3111 } 3112 requested_port = ntohs(sin6->sin6_port); 3113 tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 3114 IPV4_VERSION : IPV6_VERSION; 3115 v6addr = sin6->sin6_addr; 3116 break; 3117 3118 default: 3119 if (tcp->tcp_debug) { 3120 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3121 "tcp_bind: bad address length, %d", 3122 tbr->ADDR_length); 3123 } 3124 tcp_err_ack(tcp, mp, TBADADDR, 0); 3125 return; 3126 } 3127 tcp->tcp_bound_source_v6 = v6addr; 3128 3129 /* Check for change in ipversion */ 3130 if (origipversion != tcp->tcp_ipversion) { 3131 ASSERT(tcp->tcp_family == AF_INET6); 3132 err = tcp->tcp_ipversion == IPV6_VERSION ? 3133 tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp); 3134 if (err) { 3135 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3136 return; 3137 } 3138 } 3139 3140 /* 3141 * Initialize family specific fields. Copy of the src addr. 3142 * in tcp_t is needed for the lookup funcs. 
3143 */ 3144 if (tcp->tcp_ipversion == IPV6_VERSION) { 3145 tcp->tcp_ip6h->ip6_src = v6addr; 3146 } else { 3147 IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src); 3148 } 3149 tcp->tcp_ip_src_v6 = v6addr; 3150 3151 /* 3152 * For O_T_BIND_REQ: 3153 * Verify that the target port/addr is available, or choose 3154 * another. 3155 * For T_BIND_REQ: 3156 * Verify that the target port/addr is available or fail. 3157 * In both cases when it succeeds the tcp is inserted in the 3158 * bind hash table. This ensures that the operation is atomic 3159 * under the lock on the hash bucket. 3160 */ 3161 bind_to_req_port_only = requested_port != 0 && 3162 tbr->PRIM_type != O_T_BIND_REQ; 3163 /* 3164 * Get a valid port (within the anonymous range and should not 3165 * be a privileged one) to use if the user has not given a port. 3166 * If multiple threads are here, they may all start with 3167 * with the same initial port. But, it should be fine as long as 3168 * tcp_bindi will ensure that no two threads will be assigned 3169 * the same port. 3170 * 3171 * NOTE: XXX If a privileged process asks for an anonymous port, we 3172 * still check for ports only in the range > tcp_smallest_non_priv_port, 3173 * unless TCP_ANONPRIVBIND option is set. 3174 */ 3175 mlptype = mlptSingle; 3176 mlp_port = requested_port; 3177 if (requested_port == 0) { 3178 requested_port = tcp->tcp_anon_priv_bind ? 3179 tcp_get_next_priv_port(tcp) : 3180 tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); 3181 if (requested_port == 0) { 3182 tcp_err_ack(tcp, mp, TNOADDR, 0); 3183 return; 3184 } 3185 user_specified = B_FALSE; 3186 3187 /* 3188 * If the user went through one of the RPC interfaces to create 3189 * this socket and RPC is MLP in this zone, then give him an 3190 * anonymous MLP. 3191 */ 3192 cr = DB_CREDDEF(mp, tcp->tcp_cred); 3193 connp = tcp->tcp_connp; 3194 if (connp->conn_anon_mlp && is_system_labeled()) { 3195 zone = crgetzone(cr); 3196 addrtype = tsol_mlp_addr_type(zone->zone_id, 3197 IPV6_VERSION, &v6addr); 3198 if (addrtype == mlptSingle) { 3199 tcp_err_ack(tcp, mp, TNOADDR, 0); 3200 return; 3201 } 3202 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 3203 PMAPPORT, addrtype); 3204 mlp_port = PMAPPORT; 3205 } 3206 } else { 3207 int i; 3208 boolean_t priv = B_FALSE; 3209 3210 /* 3211 * If the requested_port is in the well-known privileged range, 3212 * verify that the stream was opened by a privileged user. 
3213 * Note: No locks are held when inspecting tcp_g_*epriv_ports 3214 * but instead the code relies on: 3215 * - the fact that the address of the array and its size never 3216 * changes 3217 * - the atomic assignment of the elements of the array 3218 */ 3219 cr = DB_CREDDEF(mp, tcp->tcp_cred); 3220 if (requested_port < tcp_smallest_nonpriv_port) { 3221 priv = B_TRUE; 3222 } else { 3223 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 3224 if (requested_port == 3225 tcp_g_epriv_ports[i]) { 3226 priv = B_TRUE; 3227 break; 3228 } 3229 } 3230 } 3231 if (priv) { 3232 if (secpolicy_net_privaddr(cr, requested_port) != 0) { 3233 if (tcp->tcp_debug) { 3234 (void) strlog(TCP_MOD_ID, 0, 1, 3235 SL_ERROR|SL_TRACE, 3236 "tcp_bind: no priv for port %d", 3237 requested_port); 3238 } 3239 tcp_err_ack(tcp, mp, TACCES, 0); 3240 return; 3241 } 3242 } 3243 user_specified = B_TRUE; 3244 3245 connp = tcp->tcp_connp; 3246 if (is_system_labeled()) { 3247 zone = crgetzone(cr); 3248 addrtype = tsol_mlp_addr_type(zone->zone_id, 3249 IPV6_VERSION, &v6addr); 3250 if (addrtype == mlptSingle) { 3251 tcp_err_ack(tcp, mp, TNOADDR, 0); 3252 return; 3253 } 3254 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 3255 requested_port, addrtype); 3256 } 3257 } 3258 3259 if (mlptype != mlptSingle) { 3260 if (secpolicy_net_bindmlp(cr) != 0) { 3261 if (tcp->tcp_debug) { 3262 (void) strlog(TCP_MOD_ID, 0, 1, 3263 SL_ERROR|SL_TRACE, 3264 "tcp_bind: no priv for multilevel port %d", 3265 requested_port); 3266 } 3267 tcp_err_ack(tcp, mp, TACCES, 0); 3268 return; 3269 } 3270 3271 /* 3272 * If we're specifically binding a shared IP address and the 3273 * port is MLP on shared addresses, then check to see if this 3274 * zone actually owns the MLP. Reject if not. 3275 */ 3276 if (mlptype == mlptShared && addrtype == mlptShared) { 3277 zoneid_t mlpzone; 3278 3279 mlpzone = tsol_mlp_findzone(IPPROTO_TCP, 3280 htons(mlp_port)); 3281 if (connp->conn_zoneid != mlpzone) { 3282 if (tcp->tcp_debug) { 3283 (void) strlog(TCP_MOD_ID, 0, 1, 3284 SL_ERROR|SL_TRACE, 3285 "tcp_bind: attempt to bind port " 3286 "%d on shared addr in zone %d " 3287 "(should be %d)", 3288 mlp_port, connp->conn_zoneid, 3289 mlpzone); 3290 } 3291 tcp_err_ack(tcp, mp, TACCES, 0); 3292 return; 3293 } 3294 } 3295 3296 if (!user_specified) { 3297 err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3298 requested_port, B_TRUE); 3299 if (err != 0) { 3300 if (tcp->tcp_debug) { 3301 (void) strlog(TCP_MOD_ID, 0, 1, 3302 SL_ERROR|SL_TRACE, 3303 "tcp_bind: cannot establish anon " 3304 "MLP for port %d", 3305 requested_port); 3306 } 3307 tcp_err_ack(tcp, mp, TSYSERR, err); 3308 return; 3309 } 3310 connp->conn_anon_port = B_TRUE; 3311 } 3312 connp->conn_mlp_type = mlptype; 3313 } 3314 3315 allocated_port = tcp_bindi(tcp, requested_port, &v6addr, 3316 tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified); 3317 3318 if (allocated_port == 0) { 3319 connp->conn_mlp_type = mlptSingle; 3320 if (connp->conn_anon_port) { 3321 connp->conn_anon_port = B_FALSE; 3322 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3323 requested_port, B_FALSE); 3324 } 3325 if (bind_to_req_port_only) { 3326 if (tcp->tcp_debug) { 3327 (void) strlog(TCP_MOD_ID, 0, 1, 3328 SL_ERROR|SL_TRACE, 3329 "tcp_bind: requested addr busy"); 3330 } 3331 tcp_err_ack(tcp, mp, TADDRBUSY, 0); 3332 } else { 3333 /* If we are out of ports, fail the bind. 
*/ 3334 if (tcp->tcp_debug) { 3335 (void) strlog(TCP_MOD_ID, 0, 1, 3336 SL_ERROR|SL_TRACE, 3337 "tcp_bind: out of ports?"); 3338 } 3339 tcp_err_ack(tcp, mp, TNOADDR, 0); 3340 } 3341 return; 3342 } 3343 ASSERT(tcp->tcp_state == TCPS_BOUND); 3344 do_bind: 3345 if (!backlog_update) { 3346 if (tcp->tcp_family == AF_INET) 3347 sin->sin_port = htons(allocated_port); 3348 else 3349 sin6->sin6_port = htons(allocated_port); 3350 } 3351 if (tcp->tcp_family == AF_INET) { 3352 if (tbr->CONIND_number != 0) { 3353 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3354 sizeof (sin_t)); 3355 } else { 3356 /* Just verify the local IP address */ 3357 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, IP_ADDR_LEN); 3358 } 3359 } else { 3360 if (tbr->CONIND_number != 0) { 3361 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3362 sizeof (sin6_t)); 3363 } else { 3364 /* Just verify the local IP address */ 3365 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3366 IPV6_ADDR_LEN); 3367 } 3368 } 3369 if (mp1 == NULL) { 3370 if (connp->conn_anon_port) { 3371 connp->conn_anon_port = B_FALSE; 3372 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3373 requested_port, B_FALSE); 3374 } 3375 connp->conn_mlp_type = mlptSingle; 3376 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3377 return; 3378 } 3379 3380 tbr->PRIM_type = T_BIND_ACK; 3381 mp->b_datap->db_type = M_PCPROTO; 3382 3383 /* Chain in the reply mp for tcp_rput() */ 3384 mp1->b_cont = mp; 3385 mp = mp1; 3386 3387 tcp->tcp_conn_req_max = tbr->CONIND_number; 3388 if (tcp->tcp_conn_req_max) { 3389 if (tcp->tcp_conn_req_max < tcp_conn_req_min) 3390 tcp->tcp_conn_req_max = tcp_conn_req_min; 3391 if (tcp->tcp_conn_req_max > tcp_conn_req_max_q) 3392 tcp->tcp_conn_req_max = tcp_conn_req_max_q; 3393 /* 3394 * If this is a listener, do not reset the eager list 3395 * and other stuffs. Note that we don't check if the 3396 * existing eager list meets the new tcp_conn_req_max 3397 * requirement. 3398 */ 3399 if (tcp->tcp_state != TCPS_LISTEN) { 3400 tcp->tcp_state = TCPS_LISTEN; 3401 /* Initialize the chain. Don't need the eager_lock */ 3402 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 3403 tcp->tcp_second_ctimer_threshold = 3404 tcp_ip_abort_linterval; 3405 } 3406 } 3407 3408 /* 3409 * We can call ip_bind directly which returns a T_BIND_ACK mp. The 3410 * processing continues in tcp_rput_other(). 3411 */ 3412 if (tcp->tcp_family == AF_INET6) { 3413 ASSERT(tcp->tcp_connp->conn_af_isv6); 3414 mp = ip_bind_v6(q, mp, tcp->tcp_connp, &tcp->tcp_sticky_ipp); 3415 } else { 3416 ASSERT(!tcp->tcp_connp->conn_af_isv6); 3417 mp = ip_bind_v4(q, mp, tcp->tcp_connp); 3418 } 3419 /* 3420 * If the bind cannot complete immediately 3421 * IP will arrange to call tcp_rput_other 3422 * when the bind completes. 3423 */ 3424 if (mp != NULL) { 3425 tcp_rput_other(tcp, mp); 3426 } else { 3427 /* 3428 * Bind will be resumed later. Need to ensure 3429 * that conn doesn't disappear when that happens. 3430 * This will be decremented in ip_resume_tcp_bind(). 3431 */ 3432 CONN_INC_REF(tcp->tcp_connp); 3433 } 3434 } 3435 3436 3437 /* 3438 * If the "bind_to_req_port_only" parameter is set, if the requested port 3439 * number is available, return it, If not return 0 3440 * 3441 * If "bind_to_req_port_only" parameter is not set and 3442 * If the requested port number is available, return it. If not, return 3443 * the first anonymous port we happen across. If no anonymous ports are 3444 * available, return 0. addr is the requested local address, if any. 
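 *
 * A hypothetical caller's view of the two modes (illustrative values,
 * mirroring how tcp_bind() invokes this function):
 *
 *	port = tcp_bindi(tcp, 8080, &v6addr, tcp->tcp_reuseaddr,
 *	    B_FALSE, B_TRUE, B_TRUE);
 *		returns 8080, or 0 if 8080 is busy (T_BIND_REQ case)
 *	port = tcp_bindi(tcp, 8080, &v6addr, tcp->tcp_reuseaddr,
 *	    B_FALSE, B_FALSE, B_TRUE);
 *		returns 8080 if free, otherwise some anonymous port;
 *		0 only if the anonymous range is exhausted
 *		(O_T_BIND_REQ case)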
3445 * 3446 * In either case, when succeeding update the tcp_t to record the port number 3447 * and insert it in the bind hash table. 3448 * 3449 * Note that TCP over IPv4 and IPv6 sockets can use the same port number 3450 * without setting SO_REUSEADDR. This is needed so that they 3451 * can be viewed as two independent transport protocols. 3452 */ 3453 static in_port_t 3454 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 3455 int reuseaddr, boolean_t quick_connect, 3456 boolean_t bind_to_req_port_only, boolean_t user_specified) 3457 { 3458 /* number of times we have run around the loop */ 3459 int count = 0; 3460 /* maximum number of times to run around the loop */ 3461 int loopmax; 3462 conn_t *connp = tcp->tcp_connp; 3463 zoneid_t zoneid = connp->conn_zoneid; 3464 3465 /* 3466 * Lookup for free addresses is done in a loop and "loopmax" 3467 * influences how long we spin in the loop 3468 */ 3469 if (bind_to_req_port_only) { 3470 /* 3471 * If the requested port is busy, don't bother to look 3472 * for a new one. Setting loop maximum count to 1 has 3473 * that effect. 3474 */ 3475 loopmax = 1; 3476 } else { 3477 /* 3478 * If the requested port is busy, look for a free one 3479 * in the anonymous port range. 3480 * Set loopmax appropriately so that one does not look 3481 * forever in the case all of the anonymous ports are in use. 3482 */ 3483 if (tcp->tcp_anon_priv_bind) { 3484 /* 3485 * loopmax = 3486 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 3487 */ 3488 loopmax = IPPORT_RESERVED - tcp_min_anonpriv_port; 3489 } else { 3490 loopmax = (tcp_largest_anon_port - 3491 tcp_smallest_anon_port + 1); 3492 } 3493 } 3494 do { 3495 uint16_t lport; 3496 tf_t *tbf; 3497 tcp_t *ltcp; 3498 conn_t *lconnp; 3499 3500 lport = htons(port); 3501 3502 /* 3503 * Ensure that the tcp_t is not currently in the bind hash. 3504 * Hold the lock on the hash bucket to ensure that 3505 * the duplicate check plus the insertion is an atomic 3506 * operation. 3507 * 3508 * This function does an inline lookup on the bind hash list 3509 * Make sure that we access only members of tcp_t 3510 * and that we don't look at tcp_tcp, since we are not 3511 * doing a CONN_INC_REF. 3512 */ 3513 tcp_bind_hash_remove(tcp); 3514 tbf = &tcp_bind_fanout[TCP_BIND_HASH(lport)]; 3515 mutex_enter(&tbf->tf_lock); 3516 for (ltcp = tbf->tf_tcp; ltcp != NULL; 3517 ltcp = ltcp->tcp_bind_hash) { 3518 boolean_t not_socket; 3519 boolean_t exclbind; 3520 3521 if (lport != ltcp->tcp_lport) 3522 continue; 3523 3524 lconnp = ltcp->tcp_connp; 3525 3526 /* 3527 * On a labeled system, we must treat bindings to ports 3528 * on shared IP addresses by sockets with MAC exemption 3529 * privilege as being in all zones, as there's 3530 * otherwise no way to identify the right receiver. 3531 */ 3532 if (!IPCL_ZONE_MATCH(ltcp->tcp_connp, zoneid) && 3533 !lconnp->conn_mac_exempt && 3534 !connp->conn_mac_exempt) 3535 continue; 3536 3537 /* 3538 * If TCP_EXCLBIND is set for either the bound or 3539 * binding endpoint, the semantics of bind 3540 * is changed according to the following. 3541 * 3542 * spec = specified address (v4 or v6) 3543 * unspec = unspecified address (v4 or v6) 3544 * A = specified addresses are different for endpoints 3545 * 3546 * bound bind to allowed 3547 * ------------------------------------- 3548 * unspec unspec no 3549 * unspec spec no 3550 * spec unspec no 3551 * spec spec yes if A 3552 * 3553 * For labeled systems, SO_MAC_EXEMPT behaves the same 3554 * as TCP_EXCLBIND, except that zoneid is ignored. 
3555 * 3556 * Note: 3557 * 3558 * 1. Because of TLI semantics, an endpoint can go 3559 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or 3560 * TCPS_BOUND, depending on whether it is originally 3561 * a listener or not. That is why we need to check 3562 * for states greater than or equal to TCPS_BOUND 3563 * here. 3564 * 3565 * 2. Ideally, we should only check for state equals 3566 * to TCPS_LISTEN. And the following check should be 3567 * added. 3568 * 3569 * if (ltcp->tcp_state == TCPS_LISTEN || 3570 * !reuseaddr || !ltcp->tcp_reuseaddr) { 3571 * ... 3572 * } 3573 * 3574 * The semantics will be changed to this. If the 3575 * endpoint on the list is in state not equal to 3576 * TCPS_LISTEN and both endpoints have SO_REUSEADDR 3577 * set, let the bind succeed. 3578 * 3579 * Because of (1), we cannot do that for TLI 3580 * endpoints. But we can do that for socket endpoints. 3581 * If in future, we can change this going back 3582 * semantics, we can use the above check for TLI also. 3583 */ 3584 not_socket = !(TCP_IS_SOCKET(ltcp) && 3585 TCP_IS_SOCKET(tcp)); 3586 exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind; 3587 3588 if (lconnp->conn_mac_exempt || connp->conn_mac_exempt || 3589 (exclbind && (not_socket || 3590 ltcp->tcp_state <= TCPS_ESTABLISHED))) { 3591 if (V6_OR_V4_INADDR_ANY( 3592 ltcp->tcp_bound_source_v6) || 3593 V6_OR_V4_INADDR_ANY(*laddr) || 3594 IN6_ARE_ADDR_EQUAL(laddr, 3595 <cp->tcp_bound_source_v6)) { 3596 break; 3597 } 3598 continue; 3599 } 3600 3601 /* 3602 * Check ipversion to allow IPv4 and IPv6 sockets to 3603 * have disjoint port number spaces, if *_EXCLBIND 3604 * is not set and only if the application binds to a 3605 * specific port. We use the same autoassigned port 3606 * number space for IPv4 and IPv6 sockets. 3607 */ 3608 if (tcp->tcp_ipversion != ltcp->tcp_ipversion && 3609 bind_to_req_port_only) 3610 continue; 3611 3612 /* 3613 * Ideally, we should make sure that the source 3614 * address, remote address, and remote port in the 3615 * four tuple for this tcp-connection is unique. 3616 * However, trying to find out the local source 3617 * address would require too much code duplication 3618 * with IP, since IP needs needs to have that code 3619 * to support userland TCP implementations. 3620 */ 3621 if (quick_connect && 3622 (ltcp->tcp_state > TCPS_LISTEN) && 3623 ((tcp->tcp_fport != ltcp->tcp_fport) || 3624 !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, 3625 <cp->tcp_remote_v6))) 3626 continue; 3627 3628 if (!reuseaddr) { 3629 /* 3630 * No socket option SO_REUSEADDR. 3631 * If existing port is bound to 3632 * a non-wildcard IP address 3633 * and the requesting stream is 3634 * bound to a distinct 3635 * different IP addresses 3636 * (non-wildcard, also), keep 3637 * going. 3638 */ 3639 if (!V6_OR_V4_INADDR_ANY(*laddr) && 3640 !V6_OR_V4_INADDR_ANY( 3641 ltcp->tcp_bound_source_v6) && 3642 !IN6_ARE_ADDR_EQUAL(laddr, 3643 <cp->tcp_bound_source_v6)) 3644 continue; 3645 if (ltcp->tcp_state >= TCPS_BOUND) { 3646 /* 3647 * This port is being used and 3648 * its state is >= TCPS_BOUND, 3649 * so we can't bind to it. 3650 */ 3651 break; 3652 } 3653 } else { 3654 /* 3655 * socket option SO_REUSEADDR is set on the 3656 * binding tcp_t. 3657 * 3658 * If two streams are bound to 3659 * same IP address or both addr 3660 * and bound source are wildcards 3661 * (INADDR_ANY), we want to stop 3662 * searching. 3663 * We have found a match of IP source 3664 * address and source port, which is 3665 * refused regardless of the 3666 * SO_REUSEADDR setting, so we break. 
3667 */ 3668 if (IN6_ARE_ADDR_EQUAL(laddr, 3669 <cp->tcp_bound_source_v6) && 3670 (ltcp->tcp_state == TCPS_LISTEN || 3671 ltcp->tcp_state == TCPS_BOUND)) 3672 break; 3673 } 3674 } 3675 if (ltcp != NULL) { 3676 /* The port number is busy */ 3677 mutex_exit(&tbf->tf_lock); 3678 } else { 3679 /* 3680 * This port is ours. Insert in fanout and mark as 3681 * bound to prevent others from getting the port 3682 * number. 3683 */ 3684 tcp->tcp_state = TCPS_BOUND; 3685 tcp->tcp_lport = htons(port); 3686 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 3687 3688 ASSERT(&tcp_bind_fanout[TCP_BIND_HASH( 3689 tcp->tcp_lport)] == tbf); 3690 tcp_bind_hash_insert(tbf, tcp, 1); 3691 3692 mutex_exit(&tbf->tf_lock); 3693 3694 /* 3695 * We don't want tcp_next_port_to_try to "inherit" 3696 * a port number supplied by the user in a bind. 3697 */ 3698 if (user_specified) 3699 return (port); 3700 3701 /* 3702 * This is the only place where tcp_next_port_to_try 3703 * is updated. After the update, it may or may not 3704 * be in the valid range. 3705 */ 3706 if (!tcp->tcp_anon_priv_bind) 3707 tcp_next_port_to_try = port + 1; 3708 return (port); 3709 } 3710 3711 if (tcp->tcp_anon_priv_bind) { 3712 port = tcp_get_next_priv_port(tcp); 3713 } else { 3714 if (count == 0 && user_specified) { 3715 /* 3716 * We may have to return an anonymous port. So 3717 * get one to start with. 3718 */ 3719 port = 3720 tcp_update_next_port(tcp_next_port_to_try, 3721 tcp, B_TRUE); 3722 user_specified = B_FALSE; 3723 } else { 3724 port = tcp_update_next_port(port + 1, tcp, 3725 B_FALSE); 3726 } 3727 } 3728 if (port == 0) 3729 break; 3730 3731 /* 3732 * Don't let this loop run forever in the case where 3733 * all of the anonymous ports are in use. 3734 */ 3735 } while (++count < loopmax); 3736 return (0); 3737 } 3738 3739 /* 3740 * We are dying for some reason. Try to do it gracefully. (May be called 3741 * as writer.) 3742 * 3743 * Return -1 if the structure was not cleaned up (if the cleanup had to be 3744 * done by a service procedure). 3745 * TBD - Should the return value distinguish between the tcp_t being 3746 * freed and it being reinitialized? 3747 */ 3748 static int 3749 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) 3750 { 3751 mblk_t *mp; 3752 queue_t *q; 3753 3754 TCP_CLD_STAT(tag); 3755 3756 #if TCP_TAG_CLEAN_DEATH 3757 tcp->tcp_cleandeathtag = tag; 3758 #endif 3759 3760 if (tcp->tcp_fused) 3761 tcp_unfuse(tcp); 3762 3763 if (tcp->tcp_linger_tid != 0 && 3764 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 3765 tcp_stop_lingering(tcp); 3766 } 3767 3768 ASSERT(tcp != NULL); 3769 ASSERT((tcp->tcp_family == AF_INET && 3770 tcp->tcp_ipversion == IPV4_VERSION) || 3771 (tcp->tcp_family == AF_INET6 && 3772 (tcp->tcp_ipversion == IPV4_VERSION || 3773 tcp->tcp_ipversion == IPV6_VERSION))); 3774 3775 if (TCP_IS_DETACHED(tcp)) { 3776 if (tcp->tcp_hard_binding) { 3777 /* 3778 * Its an eager that we are dealing with. We close the 3779 * eager but in case a conn_ind has already gone to the 3780 * listener, let tcp_accept_finish() send a discon_ind 3781 * to the listener and drop the last reference. If the 3782 * listener doesn't even know about the eager i.e. the 3783 * conn_ind hasn't gone up, blow away the eager and drop 3784 * the last reference as well. If the conn_ind has gone 3785 * up, state should be BOUND. tcp_accept_finish 3786 * will figure out that the connection has received a 3787 * RST and will send a DISCON_IND to the application. 
3788 */ 3789 tcp_closei_local(tcp); 3790 if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) { 3791 CONN_DEC_REF(tcp->tcp_connp); 3792 } else { 3793 tcp->tcp_state = TCPS_BOUND; 3794 } 3795 } else { 3796 tcp_close_detached(tcp); 3797 } 3798 return (0); 3799 } 3800 3801 TCP_STAT(tcp_clean_death_nondetached); 3802 3803 /* 3804 * If T_ORDREL_IND has not been sent yet (done when service routine 3805 * is run) postpone cleaning up the endpoint until service routine 3806 * has sent up the T_ORDREL_IND. Avoid clearing out an existing 3807 * client_errno since tcp_close uses the client_errno field. 3808 */ 3809 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 3810 if (err != 0) 3811 tcp->tcp_client_errno = err; 3812 3813 tcp->tcp_deferred_clean_death = B_TRUE; 3814 return (-1); 3815 } 3816 3817 q = tcp->tcp_rq; 3818 3819 /* Trash all inbound data */ 3820 flushq(q, FLUSHALL); 3821 3822 /* 3823 * If we are at least part way open and there is error 3824 * (err==0 implies no error) 3825 * notify our client by a T_DISCON_IND. 3826 */ 3827 if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) { 3828 if (tcp->tcp_state >= TCPS_ESTABLISHED && 3829 !TCP_IS_SOCKET(tcp)) { 3830 /* 3831 * Send M_FLUSH according to TPI. Because sockets will 3832 * (and must) ignore FLUSHR we do that only for TPI 3833 * endpoints and sockets in STREAMS mode. 3834 */ 3835 (void) putnextctl1(q, M_FLUSH, FLUSHR); 3836 } 3837 if (tcp->tcp_debug) { 3838 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 3839 "tcp_clean_death: discon err %d", err); 3840 } 3841 mp = mi_tpi_discon_ind(NULL, err, 0); 3842 if (mp != NULL) { 3843 putnext(q, mp); 3844 } else { 3845 if (tcp->tcp_debug) { 3846 (void) strlog(TCP_MOD_ID, 0, 1, 3847 SL_ERROR|SL_TRACE, 3848 "tcp_clean_death, sending M_ERROR"); 3849 } 3850 (void) putnextctl1(q, M_ERROR, EPROTO); 3851 } 3852 if (tcp->tcp_state <= TCPS_SYN_RCVD) { 3853 /* SYN_SENT or SYN_RCVD */ 3854 BUMP_MIB(&tcp_mib, tcpAttemptFails); 3855 } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) { 3856 /* ESTABLISHED or CLOSE_WAIT */ 3857 BUMP_MIB(&tcp_mib, tcpEstabResets); 3858 } 3859 } 3860 3861 tcp_reinit(tcp); 3862 return (-1); 3863 } 3864 3865 /* 3866 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout 3867 * to expire, stop the wait and finish the close. 3868 */ 3869 static void 3870 tcp_stop_lingering(tcp_t *tcp) 3871 { 3872 clock_t delta = 0; 3873 3874 tcp->tcp_linger_tid = 0; 3875 if (tcp->tcp_state > TCPS_LISTEN) { 3876 tcp_acceptor_hash_remove(tcp); 3877 if (tcp->tcp_flow_stopped) { 3878 tcp_clrqfull(tcp); 3879 } 3880 3881 if (tcp->tcp_timer_tid != 0) { 3882 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 3883 tcp->tcp_timer_tid = 0; 3884 } 3885 /* 3886 * Need to cancel those timers which will not be used when 3887 * TCP is detached. This has to be done before the tcp_wq 3888 * is set to the global queue. 3889 */ 3890 tcp_timers_stop(tcp); 3891 3892 3893 tcp->tcp_detached = B_TRUE; 3894 tcp->tcp_rq = tcp_g_q; 3895 tcp->tcp_wq = WR(tcp_g_q); 3896 3897 if (tcp->tcp_state == TCPS_TIME_WAIT) { 3898 tcp_time_wait_append(tcp); 3899 TCP_DBGSTAT(tcp_detach_time_wait); 3900 goto finish; 3901 } 3902 3903 /* 3904 * If delta is zero the timer event wasn't executed and was 3905 * successfully canceled. In this case we need to restart it 3906 * with the minimal delta possible. 3907 */ 3908 if (delta >= 0) { 3909 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 3910 delta ? 
delta : 1); 3911 } 3912 } else { 3913 tcp_closei_local(tcp); 3914 CONN_DEC_REF(tcp->tcp_connp); 3915 } 3916 finish: 3917 /* Signal closing thread that it can complete close */ 3918 mutex_enter(&tcp->tcp_closelock); 3919 tcp->tcp_detached = B_TRUE; 3920 tcp->tcp_rq = tcp_g_q; 3921 tcp->tcp_wq = WR(tcp_g_q); 3922 tcp->tcp_closed = 1; 3923 cv_signal(&tcp->tcp_closecv); 3924 mutex_exit(&tcp->tcp_closelock); 3925 } 3926 3927 /* 3928 * Handle lingering timeouts. This function is called when the SO_LINGER timeout 3929 * expires. 3930 */ 3931 static void 3932 tcp_close_linger_timeout(void *arg) 3933 { 3934 conn_t *connp = (conn_t *)arg; 3935 tcp_t *tcp = connp->conn_tcp; 3936 3937 tcp->tcp_client_errno = ETIMEDOUT; 3938 tcp_stop_lingering(tcp); 3939 } 3940 3941 static int 3942 tcp_close(queue_t *q, int flags) 3943 { 3944 conn_t *connp = Q_TO_CONN(q); 3945 tcp_t *tcp = connp->conn_tcp; 3946 mblk_t *mp = &tcp->tcp_closemp; 3947 boolean_t conn_ioctl_cleanup_reqd = B_FALSE; 3948 3949 ASSERT(WR(q)->q_next == NULL); 3950 ASSERT(connp->conn_ref >= 2); 3951 ASSERT((connp->conn_flags & IPCL_TCPMOD) == 0); 3952 3953 /* 3954 * We are being closed as /dev/tcp or /dev/tcp6. 3955 * 3956 * Mark the conn as closing. ill_pending_mp_add will not 3957 * add any mp to the pending mp list, after this conn has 3958 * started closing. Same for sq_pending_mp_add 3959 */ 3960 mutex_enter(&connp->conn_lock); 3961 connp->conn_state_flags |= CONN_CLOSING; 3962 if (connp->conn_oper_pending_ill != NULL) 3963 conn_ioctl_cleanup_reqd = B_TRUE; 3964 CONN_INC_REF_LOCKED(connp); 3965 mutex_exit(&connp->conn_lock); 3966 tcp->tcp_closeflags = (uint8_t)flags; 3967 ASSERT(connp->conn_ref >= 3); 3968 3969 (*tcp_squeue_close_proc)(connp->conn_sqp, mp, 3970 tcp_close_output, connp, SQTAG_IP_TCP_CLOSE); 3971 3972 mutex_enter(&tcp->tcp_closelock); 3973 3974 while (!tcp->tcp_closed) 3975 cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock); 3976 mutex_exit(&tcp->tcp_closelock); 3977 /* 3978 * In the case of listener streams that have eagers in the q or q0 3979 * we wait for the eagers to drop their reference to us. tcp_rq and 3980 * tcp_wq of the eagers point to our queues. By waiting for the 3981 * refcnt to drop to 1, we are sure that the eagers have cleaned 3982 * up their queue pointers and also dropped their references to us. 3983 */ 3984 if (tcp->tcp_wait_for_eagers) { 3985 mutex_enter(&connp->conn_lock); 3986 while (connp->conn_ref != 1) { 3987 cv_wait(&connp->conn_cv, &connp->conn_lock); 3988 } 3989 mutex_exit(&connp->conn_lock); 3990 } 3991 /* 3992 * ioctl cleanup. The mp is queued in the 3993 * ill_pending_mp or in the sq_pending_mp. 3994 */ 3995 if (conn_ioctl_cleanup_reqd) 3996 conn_ioctl_cleanup(connp); 3997 3998 qprocsoff(q); 3999 inet_minor_free(ip_minor_arena, connp->conn_dev); 4000 4001 tcp->tcp_cpid = -1; 4002 4003 /* 4004 * Drop IP's reference on the conn. This is the last reference 4005 * on the connp if the state was less than established. If the 4006 * connection has gone into timewait state, then we will have 4007 * one ref for the TCP and one more ref (total of two) for the 4008 * classifier connected hash list (a timewait connections stays 4009 * in connected hash till closed). 4010 * 4011 * We can't assert the references because there might be other 4012 * transient reference places because of some walkers or queued 4013 * packets in squeue for the timewait state. 
4014 */ 4015 CONN_DEC_REF(connp); 4016 q->q_ptr = WR(q)->q_ptr = NULL; 4017 return (0); 4018 } 4019 4020 static int 4021 tcpclose_accept(queue_t *q) 4022 { 4023 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); 4024 4025 /* 4026 * We had opened an acceptor STREAM for sockfs which is 4027 * now being closed due to some error. 4028 */ 4029 qprocsoff(q); 4030 inet_minor_free(ip_minor_arena, (dev_t)q->q_ptr); 4031 q->q_ptr = WR(q)->q_ptr = NULL; 4032 return (0); 4033 } 4034 4035 4036 /* 4037 * Called by streams close routine via squeues when our client blows off her 4038 * descriptor, we take this to mean: "close the stream state NOW, close the tcp 4039 * connection politely" When SO_LINGER is set (with a non-zero linger time and 4040 * it is not a nonblocking socket) then this routine sleeps until the FIN is 4041 * acked. 4042 * 4043 * NOTE: tcp_close potentially returns error when lingering. 4044 * However, the stream head currently does not pass these errors 4045 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK 4046 * errors to the application (from tsleep()) and not errors 4047 * like ECONNRESET caused by receiving a reset packet. 4048 */ 4049 4050 /* ARGSUSED */ 4051 static void 4052 tcp_close_output(void *arg, mblk_t *mp, void *arg2) 4053 { 4054 char *msg; 4055 conn_t *connp = (conn_t *)arg; 4056 tcp_t *tcp = connp->conn_tcp; 4057 clock_t delta = 0; 4058 4059 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 4060 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 4061 4062 /* Cancel any pending timeout */ 4063 if (tcp->tcp_ordrelid != 0) { 4064 if (tcp->tcp_timeout) { 4065 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ordrelid); 4066 } 4067 tcp->tcp_ordrelid = 0; 4068 tcp->tcp_timeout = B_FALSE; 4069 } 4070 4071 mutex_enter(&tcp->tcp_eager_lock); 4072 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 4073 /* Cleanup for listener */ 4074 tcp_eager_cleanup(tcp, 0); 4075 tcp->tcp_wait_for_eagers = 1; 4076 } 4077 mutex_exit(&tcp->tcp_eager_lock); 4078 4079 connp->conn_mdt_ok = B_FALSE; 4080 tcp->tcp_mdt = B_FALSE; 4081 4082 msg = NULL; 4083 switch (tcp->tcp_state) { 4084 case TCPS_CLOSED: 4085 case TCPS_IDLE: 4086 case TCPS_BOUND: 4087 case TCPS_LISTEN: 4088 break; 4089 case TCPS_SYN_SENT: 4090 msg = "tcp_close, during connect"; 4091 break; 4092 case TCPS_SYN_RCVD: 4093 /* 4094 * Close during the connect 3-way handshake 4095 * but here there may or may not be pending data 4096 * already on queue. Process almost same as in 4097 * the ESTABLISHED state. 4098 */ 4099 /* FALLTHRU */ 4100 default: 4101 if (tcp->tcp_fused) 4102 tcp_unfuse(tcp); 4103 4104 /* 4105 * If SO_LINGER has set a zero linger time, abort the 4106 * connection with a reset. 4107 */ 4108 if (tcp->tcp_linger && tcp->tcp_lingertime == 0) { 4109 msg = "tcp_close, zero lingertime"; 4110 break; 4111 } 4112 4113 ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding); 4114 /* 4115 * Abort connection if there is unread data queued. 4116 */ 4117 if (tcp->tcp_rcv_list || tcp->tcp_reass_head) { 4118 msg = "tcp_close, unread data"; 4119 break; 4120 } 4121 /* 4122 * tcp_hard_bound is now cleared thus all packets go through 4123 * tcp_lookup. This fact is used by tcp_detach below. 4124 * 4125 * We have done a qwait() above which could have possibly 4126 * drained more messages in turn causing transition to a 4127 * different state. Check whether we have to do the rest 4128 * of the processing or not. 
4129 */ 4130 if (tcp->tcp_state <= TCPS_LISTEN) 4131 break; 4132 4133 /* 4134 * Transmit the FIN before detaching the tcp_t. 4135 * After tcp_detach returns this queue/perimeter 4136 * no longer owns the tcp_t thus others can modify it. 4137 */ 4138 (void) tcp_xmit_end(tcp); 4139 4140 /* 4141 * If lingering on close then wait until the fin is acked, 4142 * the SO_LINGER time passes, or a reset is sent/received. 4143 */ 4144 if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && 4145 !(tcp->tcp_fin_acked) && 4146 tcp->tcp_state >= TCPS_ESTABLISHED) { 4147 if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { 4148 tcp->tcp_client_errno = EWOULDBLOCK; 4149 } else if (tcp->tcp_client_errno == 0) { 4150 4151 ASSERT(tcp->tcp_linger_tid == 0); 4152 4153 tcp->tcp_linger_tid = TCP_TIMER(tcp, 4154 tcp_close_linger_timeout, 4155 tcp->tcp_lingertime * hz); 4156 4157 /* tcp_close_linger_timeout will finish close */ 4158 if (tcp->tcp_linger_tid == 0) 4159 tcp->tcp_client_errno = ENOSR; 4160 else 4161 return; 4162 } 4163 4164 /* 4165 * Check if we need to detach or just close 4166 * the instance. 4167 */ 4168 if (tcp->tcp_state <= TCPS_LISTEN) 4169 break; 4170 } 4171 4172 /* 4173 * Make sure that no other thread will access the tcp_rq of 4174 * this instance (through lookups etc.) as tcp_rq will go 4175 * away shortly. 4176 */ 4177 tcp_acceptor_hash_remove(tcp); 4178 4179 if (tcp->tcp_flow_stopped) { 4180 tcp_clrqfull(tcp); 4181 } 4182 4183 if (tcp->tcp_timer_tid != 0) { 4184 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 4185 tcp->tcp_timer_tid = 0; 4186 } 4187 /* 4188 * Need to cancel those timers which will not be used when 4189 * TCP is detached. This has to be done before the tcp_wq 4190 * is set to the global queue. 4191 */ 4192 tcp_timers_stop(tcp); 4193 4194 tcp->tcp_detached = B_TRUE; 4195 if (tcp->tcp_state == TCPS_TIME_WAIT) { 4196 tcp_time_wait_append(tcp); 4197 TCP_DBGSTAT(tcp_detach_time_wait); 4198 ASSERT(connp->conn_ref >= 3); 4199 goto finish; 4200 } 4201 4202 /* 4203 * If delta is zero the timer event wasn't executed and was 4204 * successfully canceled. In this case we need to restart it 4205 * with the minimal delta possible. 4206 */ 4207 if (delta >= 0) 4208 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 4209 delta ? delta : 1); 4210 4211 ASSERT(connp->conn_ref >= 3); 4212 goto finish; 4213 } 4214 4215 /* Detach did not complete. Still need to remove q from stream. */ 4216 if (msg) { 4217 if (tcp->tcp_state == TCPS_ESTABLISHED || 4218 tcp->tcp_state == TCPS_CLOSE_WAIT) 4219 BUMP_MIB(&tcp_mib, tcpEstabResets); 4220 if (tcp->tcp_state == TCPS_SYN_SENT || 4221 tcp->tcp_state == TCPS_SYN_RCVD) 4222 BUMP_MIB(&tcp_mib, tcpAttemptFails); 4223 tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST); 4224 } 4225 4226 tcp_closei_local(tcp); 4227 CONN_DEC_REF(connp); 4228 ASSERT(connp->conn_ref >= 2); 4229 4230 finish: 4231 /* 4232 * Although packets are always processed on the correct 4233 * tcp's perimeter and access is serialized via squeue's, 4234 * IP still needs a queue when sending packets in time_wait 4235 * state so use WR(tcp_g_q) till ip_output() can be 4236 * changed to deal with just connp. For read side, we 4237 * could have set tcp_rq to NULL but there are some cases 4238 * in tcp_rput_data() from early days of this code which 4239 * do a putnext without checking if tcp is closed. Those 4240 * need to be identified before both tcp_rq and tcp_wq 4241 * can be set to NULL and tcp_q_q can disappear forever. 
4242 */ 4243 mutex_enter(&tcp->tcp_closelock); 4244 /* 4245 * Don't change the queues in the case of a listener that has 4246 * eagers in its q or q0. It could surprise the eagers. 4247 * Instead wait for the eagers outside the squeue. 4248 */ 4249 if (!tcp->tcp_wait_for_eagers) { 4250 tcp->tcp_detached = B_TRUE; 4251 tcp->tcp_rq = tcp_g_q; 4252 tcp->tcp_wq = WR(tcp_g_q); 4253 } 4254 4255 /* Signal tcp_close() to finish closing. */ 4256 tcp->tcp_closed = 1; 4257 cv_signal(&tcp->tcp_closecv); 4258 mutex_exit(&tcp->tcp_closelock); 4259 } 4260 4261 4262 /* 4263 * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp. 4264 * Some stream heads get upset if they see these later on as anything but NULL. 4265 */ 4266 static void 4267 tcp_close_mpp(mblk_t **mpp) 4268 { 4269 mblk_t *mp; 4270 4271 if ((mp = *mpp) != NULL) { 4272 do { 4273 mp->b_next = NULL; 4274 mp->b_prev = NULL; 4275 } while ((mp = mp->b_cont) != NULL); 4276 4277 mp = *mpp; 4278 *mpp = NULL; 4279 freemsg(mp); 4280 } 4281 } 4282 4283 /* Do detached close. */ 4284 static void 4285 tcp_close_detached(tcp_t *tcp) 4286 { 4287 if (tcp->tcp_fused) 4288 tcp_unfuse(tcp); 4289 4290 /* 4291 * Clustering code serializes TCP disconnect callbacks and 4292 * cluster tcp list walks by blocking a TCP disconnect callback 4293 * if a cluster tcp list walk is in progress. This ensures 4294 * accurate accounting of TCPs in the cluster code even though 4295 * the TCP list walk itself is not atomic. 4296 */ 4297 tcp_closei_local(tcp); 4298 CONN_DEC_REF(tcp->tcp_connp); 4299 } 4300 4301 /* 4302 * Stop all TCP timers, and free the timer mblks if requested. 4303 */ 4304 void 4305 tcp_timers_stop(tcp_t *tcp) 4306 { 4307 if (tcp->tcp_timer_tid != 0) { 4308 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 4309 tcp->tcp_timer_tid = 0; 4310 } 4311 if (tcp->tcp_ka_tid != 0) { 4312 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); 4313 tcp->tcp_ka_tid = 0; 4314 } 4315 if (tcp->tcp_ack_tid != 0) { 4316 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 4317 tcp->tcp_ack_tid = 0; 4318 } 4319 if (tcp->tcp_push_tid != 0) { 4320 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 4321 tcp->tcp_push_tid = 0; 4322 } 4323 } 4324 4325 /* 4326 * The tcp_t is going away. Remove it from all lists and set it 4327 * to TCPS_CLOSED. The freeing up of memory is deferred until 4328 * tcp_inactive. This is needed since a thread in tcp_rput might have 4329 * done a CONN_INC_REF on this structure before it was removed from the 4330 * hashes. 4331 */ 4332 static void 4333 tcp_closei_local(tcp_t *tcp) 4334 { 4335 ire_t *ire; 4336 conn_t *connp = tcp->tcp_connp; 4337 4338 if (!TCP_IS_SOCKET(tcp)) 4339 tcp_acceptor_hash_remove(tcp); 4340 4341 UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs); 4342 tcp->tcp_ibsegs = 0; 4343 UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs); 4344 tcp->tcp_obsegs = 0; 4345 4346 /* 4347 * If we are an eager connection hanging off a listener that 4348 * hasn't formally accepted the connection yet, get off his 4349 * list and blow off any data that we have accumulated. 4350 */ 4351 if (tcp->tcp_listener != NULL) { 4352 tcp_t *listener = tcp->tcp_listener; 4353 mutex_enter(&listener->tcp_eager_lock); 4354 /* 4355 * tcp_eager_conn_ind == NULL means that the 4356 * conn_ind has already gone to listener. At 4357 * this point, eager will be closed but we 4358 * leave it in listeners eager list so that 4359 * if listener decides to close without doing 4360 * accept, we can clean this up. 
In tcp_wput_accept 4361 * we take case of the case of accept on closed 4362 * eager. 4363 */ 4364 if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) { 4365 tcp_eager_unlink(tcp); 4366 mutex_exit(&listener->tcp_eager_lock); 4367 /* 4368 * We don't want to have any pointers to the 4369 * listener queue, after we have released our 4370 * reference on the listener 4371 */ 4372 tcp->tcp_rq = tcp_g_q; 4373 tcp->tcp_wq = WR(tcp_g_q); 4374 CONN_DEC_REF(listener->tcp_connp); 4375 } else { 4376 mutex_exit(&listener->tcp_eager_lock); 4377 } 4378 } 4379 4380 /* Stop all the timers */ 4381 tcp_timers_stop(tcp); 4382 4383 if (tcp->tcp_state == TCPS_LISTEN) { 4384 if (tcp->tcp_ip_addr_cache) { 4385 kmem_free((void *)tcp->tcp_ip_addr_cache, 4386 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 4387 tcp->tcp_ip_addr_cache = NULL; 4388 } 4389 } 4390 if (tcp->tcp_flow_stopped) 4391 tcp_clrqfull(tcp); 4392 4393 tcp_bind_hash_remove(tcp); 4394 /* 4395 * If the tcp_time_wait_collector (which runs outside the squeue) 4396 * is trying to remove this tcp from the time wait list, we will 4397 * block in tcp_time_wait_remove while trying to acquire the 4398 * tcp_time_wait_lock. The logic in tcp_time_wait_collector also 4399 * requires the ipcl_hash_remove to be ordered after the 4400 * tcp_time_wait_remove for the refcnt checks to work correctly. 4401 */ 4402 if (tcp->tcp_state == TCPS_TIME_WAIT) 4403 tcp_time_wait_remove(tcp, NULL); 4404 CL_INET_DISCONNECT(tcp); 4405 ipcl_hash_remove(connp); 4406 4407 /* 4408 * Delete the cached ire in conn_ire_cache and also mark 4409 * the conn as CONDEMNED 4410 */ 4411 mutex_enter(&connp->conn_lock); 4412 connp->conn_state_flags |= CONN_CONDEMNED; 4413 ire = connp->conn_ire_cache; 4414 connp->conn_ire_cache = NULL; 4415 mutex_exit(&connp->conn_lock); 4416 if (ire != NULL) 4417 IRE_REFRELE_NOTR(ire); 4418 4419 /* Need to cleanup any pending ioctls */ 4420 ASSERT(tcp->tcp_time_wait_next == NULL); 4421 ASSERT(tcp->tcp_time_wait_prev == NULL); 4422 ASSERT(tcp->tcp_time_wait_expire == 0); 4423 tcp->tcp_state = TCPS_CLOSED; 4424 4425 /* Release any SSL context */ 4426 if (tcp->tcp_kssl_ent != NULL) { 4427 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 4428 tcp->tcp_kssl_ent = NULL; 4429 } 4430 if (tcp->tcp_kssl_ctx != NULL) { 4431 kssl_release_ctx(tcp->tcp_kssl_ctx); 4432 tcp->tcp_kssl_ctx = NULL; 4433 } 4434 tcp->tcp_kssl_pending = B_FALSE; 4435 } 4436 4437 /* 4438 * tcp is dying (called from ipcl_conn_destroy and error cases). 4439 * Free the tcp_t in either case. 
4440 */ 4441 void 4442 tcp_free(tcp_t *tcp) 4443 { 4444 mblk_t *mp; 4445 ip6_pkt_t *ipp; 4446 4447 ASSERT(tcp != NULL); 4448 ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL); 4449 4450 tcp->tcp_rq = NULL; 4451 tcp->tcp_wq = NULL; 4452 4453 tcp_close_mpp(&tcp->tcp_xmit_head); 4454 tcp_close_mpp(&tcp->tcp_reass_head); 4455 if (tcp->tcp_rcv_list != NULL) { 4456 /* Free b_next chain */ 4457 tcp_close_mpp(&tcp->tcp_rcv_list); 4458 } 4459 if ((mp = tcp->tcp_urp_mp) != NULL) { 4460 freemsg(mp); 4461 } 4462 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 4463 freemsg(mp); 4464 } 4465 4466 if (tcp->tcp_fused_sigurg_mp != NULL) { 4467 freeb(tcp->tcp_fused_sigurg_mp); 4468 tcp->tcp_fused_sigurg_mp = NULL; 4469 } 4470 4471 if (tcp->tcp_sack_info != NULL) { 4472 if (tcp->tcp_notsack_list != NULL) { 4473 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 4474 } 4475 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 4476 } 4477 4478 if (tcp->tcp_hopopts != NULL) { 4479 mi_free(tcp->tcp_hopopts); 4480 tcp->tcp_hopopts = NULL; 4481 tcp->tcp_hopoptslen = 0; 4482 } 4483 ASSERT(tcp->tcp_hopoptslen == 0); 4484 if (tcp->tcp_dstopts != NULL) { 4485 mi_free(tcp->tcp_dstopts); 4486 tcp->tcp_dstopts = NULL; 4487 tcp->tcp_dstoptslen = 0; 4488 } 4489 ASSERT(tcp->tcp_dstoptslen == 0); 4490 if (tcp->tcp_rtdstopts != NULL) { 4491 mi_free(tcp->tcp_rtdstopts); 4492 tcp->tcp_rtdstopts = NULL; 4493 tcp->tcp_rtdstoptslen = 0; 4494 } 4495 ASSERT(tcp->tcp_rtdstoptslen == 0); 4496 if (tcp->tcp_rthdr != NULL) { 4497 mi_free(tcp->tcp_rthdr); 4498 tcp->tcp_rthdr = NULL; 4499 tcp->tcp_rthdrlen = 0; 4500 } 4501 ASSERT(tcp->tcp_rthdrlen == 0); 4502 4503 ipp = &tcp->tcp_sticky_ipp; 4504 if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | 4505 IPPF_RTHDR)) 4506 ip6_pkt_free(ipp); 4507 4508 /* 4509 * Free memory associated with the tcp/ip header template. 4510 */ 4511 4512 if (tcp->tcp_iphc != NULL) 4513 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 4514 4515 /* 4516 * Following is really a blowing away a union. 4517 * It happens to have exactly two members of identical size 4518 * the following code is enough. 4519 */ 4520 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 4521 4522 if (tcp->tcp_tracebuf != NULL) { 4523 kmem_free(tcp->tcp_tracebuf, sizeof (tcptrch_t)); 4524 tcp->tcp_tracebuf = NULL; 4525 } 4526 } 4527 4528 4529 /* 4530 * Put a connection confirmation message upstream built from the 4531 * address information within 'iph' and 'tcph'. Report our success or failure. 4532 */ 4533 static boolean_t 4534 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, 4535 mblk_t **defermp) 4536 { 4537 sin_t sin; 4538 sin6_t sin6; 4539 mblk_t *mp; 4540 char *optp = NULL; 4541 int optlen = 0; 4542 cred_t *cr; 4543 4544 if (defermp != NULL) 4545 *defermp = NULL; 4546 4547 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { 4548 /* 4549 * Return in T_CONN_CON results of option negotiation through 4550 * the T_CONN_REQ. Note: If there is an real end-to-end option 4551 * negotiation, then what is received from remote end needs 4552 * to be taken into account but there is no such thing (yet?) 4553 * in our TCP/IP. 4554 * Note: We do not use mi_offset_param() here as 4555 * tcp_opts_conn_req contents do not directly come from 4556 * an application and are either generated in kernel or 4557 * from user input that was already verified. 
4558 */ 4559 mp = tcp->tcp_conn.tcp_opts_conn_req; 4560 optp = (char *)(mp->b_rptr + 4561 ((struct T_conn_req *)mp->b_rptr)->OPT_offset); 4562 optlen = (int) 4563 ((struct T_conn_req *)mp->b_rptr)->OPT_length; 4564 } 4565 4566 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { 4567 ipha_t *ipha = (ipha_t *)iphdr; 4568 4569 /* packet is IPv4 */ 4570 if (tcp->tcp_family == AF_INET) { 4571 sin = sin_null; 4572 sin.sin_addr.s_addr = ipha->ipha_src; 4573 sin.sin_port = *(uint16_t *)tcph->th_lport; 4574 sin.sin_family = AF_INET; 4575 mp = mi_tpi_conn_con(NULL, (char *)&sin, 4576 (int)sizeof (sin_t), optp, optlen); 4577 } else { 4578 sin6 = sin6_null; 4579 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); 4580 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4581 sin6.sin6_family = AF_INET6; 4582 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 4583 (int)sizeof (sin6_t), optp, optlen); 4584 4585 } 4586 } else { 4587 ip6_t *ip6h = (ip6_t *)iphdr; 4588 4589 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); 4590 ASSERT(tcp->tcp_family == AF_INET6); 4591 sin6 = sin6_null; 4592 sin6.sin6_addr = ip6h->ip6_src; 4593 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4594 sin6.sin6_family = AF_INET6; 4595 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 4596 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 4597 (int)sizeof (sin6_t), optp, optlen); 4598 } 4599 4600 if (!mp) 4601 return (B_FALSE); 4602 4603 if ((cr = DB_CRED(idmp)) != NULL) { 4604 mblk_setcred(mp, cr); 4605 DB_CPID(mp) = DB_CPID(idmp); 4606 } 4607 4608 if (defermp == NULL) 4609 putnext(tcp->tcp_rq, mp); 4610 else 4611 *defermp = mp; 4612 4613 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 4614 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 4615 return (B_TRUE); 4616 } 4617 4618 /* 4619 * Defense for the SYN attack - 4620 * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest 4621 * one that doesn't have the dontdrop bit set. 4622 * 2. Don't drop a SYN request before its first timeout. This gives every 4623 * request at least til the first timeout to complete its 3-way handshake. 4624 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many 4625 * requests currently on the queue that has timed out. This will be used 4626 * as an indicator of whether an attack is under way, so that appropriate 4627 * actions can be taken. (It's incremented in tcp_timer() and decremented 4628 * either when eager goes into ESTABLISHED, or gets freed up.) 4629 * 4. 
The current threshold is - # of timeout > q0len/4 => SYN alert on 4630 * # of timeout drops back to <= q0len/32 => SYN alert off 4631 */ 4632 static boolean_t 4633 tcp_drop_q0(tcp_t *tcp) 4634 { 4635 tcp_t *eager; 4636 4637 ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); 4638 ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); 4639 /* 4640 * New one is added after next_q0 so prev_q0 points to the oldest 4641 * Also do not drop any established connections that are deferred on 4642 * q0 due to q being full 4643 */ 4644 4645 eager = tcp->tcp_eager_prev_q0; 4646 while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) { 4647 eager = eager->tcp_eager_prev_q0; 4648 if (eager == tcp) { 4649 eager = tcp->tcp_eager_prev_q0; 4650 break; 4651 } 4652 } 4653 if (eager->tcp_syn_rcvd_timeout == 0) 4654 return (B_FALSE); 4655 4656 if (tcp->tcp_debug) { 4657 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 4658 "tcp_drop_q0: listen half-open queue (max=%d) overflow" 4659 " (%d pending) on %s, drop one", tcp_conn_req_max_q0, 4660 tcp->tcp_conn_req_cnt_q0, 4661 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 4662 } 4663 4664 BUMP_MIB(&tcp_mib, tcpHalfOpenDrop); 4665 4666 /* 4667 * need to do refhold here because the selected eager could 4668 * be removed by someone else if we release the eager lock. 4669 */ 4670 CONN_INC_REF(eager->tcp_connp); 4671 mutex_exit(&tcp->tcp_eager_lock); 4672 4673 /* Mark the IRE created for this SYN request temporary */ 4674 tcp_ip_ire_mark_advice(eager); 4675 (void) tcp_clean_death(eager, ETIMEDOUT, 5); 4676 CONN_DEC_REF(eager->tcp_connp); 4677 4678 mutex_enter(&tcp->tcp_eager_lock); 4679 return (B_TRUE); 4680 } 4681 4682 int 4683 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, 4684 tcph_t *tcph, uint_t ipvers, mblk_t *idmp) 4685 { 4686 tcp_t *ltcp = lconnp->conn_tcp; 4687 tcp_t *tcp = connp->conn_tcp; 4688 mblk_t *tpi_mp; 4689 ipha_t *ipha; 4690 ip6_t *ip6h; 4691 sin6_t sin6; 4692 in6_addr_t v6dst; 4693 int err; 4694 int ifindex = 0; 4695 cred_t *cr; 4696 4697 if (ipvers == IPV4_VERSION) { 4698 ipha = (ipha_t *)mp->b_rptr; 4699 4700 connp->conn_send = ip_output; 4701 connp->conn_recv = tcp_input; 4702 4703 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); 4704 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); 4705 4706 sin6 = sin6_null; 4707 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); 4708 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 4709 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4710 sin6.sin6_family = AF_INET6; 4711 sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst, 4712 lconnp->conn_zoneid); 4713 if (tcp->tcp_recvdstaddr) { 4714 sin6_t sin6d; 4715 4716 sin6d = sin6_null; 4717 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, 4718 &sin6d.sin6_addr); 4719 sin6d.sin6_port = *(uint16_t *)tcph->th_fport; 4720 sin6d.sin6_family = AF_INET; 4721 tpi_mp = mi_tpi_extconn_ind(NULL, 4722 (char *)&sin6d, sizeof (sin6_t), 4723 (char *)&tcp, 4724 (t_scalar_t)sizeof (intptr_t), 4725 (char *)&sin6d, sizeof (sin6_t), 4726 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4727 } else { 4728 tpi_mp = mi_tpi_conn_ind(NULL, 4729 (char *)&sin6, sizeof (sin6_t), 4730 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4731 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4732 } 4733 } else { 4734 ip6h = (ip6_t *)mp->b_rptr; 4735 4736 connp->conn_send = ip_output_v6; 4737 connp->conn_recv = tcp_input; 4738 4739 connp->conn_srcv6 = ip6h->ip6_dst; 4740 connp->conn_remv6 = ip6h->ip6_src; 4741 4742 /* db_cksumstuff is set at ip_fanout_tcp_v6 */ 4743 ifindex = (int)DB_CKSUMSTUFF(mp); 4744 DB_CKSUMSTUFF(mp) 
= 0; 4745 4746 sin6 = sin6_null; 4747 sin6.sin6_addr = ip6h->ip6_src; 4748 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4749 sin6.sin6_family = AF_INET6; 4750 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 4751 sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 4752 lconnp->conn_zoneid); 4753 4754 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 4755 /* Pass up the scope_id of remote addr */ 4756 sin6.sin6_scope_id = ifindex; 4757 } else { 4758 sin6.sin6_scope_id = 0; 4759 } 4760 if (tcp->tcp_recvdstaddr) { 4761 sin6_t sin6d; 4762 4763 sin6d = sin6_null; 4764 sin6.sin6_addr = ip6h->ip6_dst; 4765 sin6d.sin6_port = *(uint16_t *)tcph->th_fport; 4766 sin6d.sin6_family = AF_INET; 4767 tpi_mp = mi_tpi_extconn_ind(NULL, 4768 (char *)&sin6d, sizeof (sin6_t), 4769 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4770 (char *)&sin6d, sizeof (sin6_t), 4771 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4772 } else { 4773 tpi_mp = mi_tpi_conn_ind(NULL, 4774 (char *)&sin6, sizeof (sin6_t), 4775 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4776 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4777 } 4778 } 4779 4780 if (tpi_mp == NULL) 4781 return (ENOMEM); 4782 4783 connp->conn_fport = *(uint16_t *)tcph->th_lport; 4784 connp->conn_lport = *(uint16_t *)tcph->th_fport; 4785 connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER); 4786 connp->conn_fully_bound = B_FALSE; 4787 4788 if (tcp_trace) 4789 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP); 4790 4791 /* Inherit information from the "parent" */ 4792 tcp->tcp_ipversion = ltcp->tcp_ipversion; 4793 tcp->tcp_family = ltcp->tcp_family; 4794 tcp->tcp_wq = ltcp->tcp_wq; 4795 tcp->tcp_rq = ltcp->tcp_rq; 4796 tcp->tcp_mss = tcp_mss_def_ipv6; 4797 tcp->tcp_detached = B_TRUE; 4798 if ((err = tcp_init_values(tcp)) != 0) { 4799 freemsg(tpi_mp); 4800 return (err); 4801 } 4802 4803 if (ipvers == IPV4_VERSION) { 4804 if ((err = tcp_header_init_ipv4(tcp)) != 0) { 4805 freemsg(tpi_mp); 4806 return (err); 4807 } 4808 ASSERT(tcp->tcp_ipha != NULL); 4809 } else { 4810 /* ifindex must be already set */ 4811 ASSERT(ifindex != 0); 4812 4813 if (ltcp->tcp_bound_if != 0) { 4814 /* 4815 * Set newtcp's bound_if equal to 4816 * listener's value. If ifindex is 4817 * not the same as ltcp->tcp_bound_if, 4818 * it must be a packet for the ipmp group 4819 * of interfaces 4820 */ 4821 tcp->tcp_bound_if = ltcp->tcp_bound_if; 4822 } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 4823 tcp->tcp_bound_if = ifindex; 4824 } 4825 4826 tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary; 4827 tcp->tcp_recvifindex = 0; 4828 tcp->tcp_recvhops = 0xffffffffU; 4829 ASSERT(tcp->tcp_ip6h != NULL); 4830 } 4831 4832 tcp->tcp_lport = ltcp->tcp_lport; 4833 4834 if (ltcp->tcp_ipversion == tcp->tcp_ipversion) { 4835 if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) { 4836 /* 4837 * Listener had options of some sort; eager inherits. 4838 * Free up the eager template and allocate one 4839 * of the right size. 
4840 */ 4841 if (tcp->tcp_hdr_grown) { 4842 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 4843 } else { 4844 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 4845 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 4846 } 4847 tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len, 4848 KM_NOSLEEP); 4849 if (tcp->tcp_iphc == NULL) { 4850 tcp->tcp_iphc_len = 0; 4851 freemsg(tpi_mp); 4852 return (ENOMEM); 4853 } 4854 tcp->tcp_iphc_len = ltcp->tcp_iphc_len; 4855 tcp->tcp_hdr_grown = B_TRUE; 4856 } 4857 tcp->tcp_hdr_len = ltcp->tcp_hdr_len; 4858 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; 4859 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 4860 tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops; 4861 tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf; 4862 4863 /* 4864 * Copy the IP+TCP header template from listener to eager 4865 */ 4866 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); 4867 if (tcp->tcp_ipversion == IPV6_VERSION) { 4868 if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt == 4869 IPPROTO_RAW) { 4870 tcp->tcp_ip6h = 4871 (ip6_t *)(tcp->tcp_iphc + 4872 sizeof (ip6i_t)); 4873 } else { 4874 tcp->tcp_ip6h = 4875 (ip6_t *)(tcp->tcp_iphc); 4876 } 4877 tcp->tcp_ipha = NULL; 4878 } else { 4879 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 4880 tcp->tcp_ip6h = NULL; 4881 } 4882 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + 4883 tcp->tcp_ip_hdr_len); 4884 } else { 4885 /* 4886 * only valid case when ipversion of listener and 4887 * eager differ is when listener is IPv6 and 4888 * eager is IPv4. 4889 * Eager header template has been initialized to the 4890 * maximum v4 header sizes, which includes space for 4891 * TCP and IP options. 4892 */ 4893 ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) && 4894 (tcp->tcp_ipversion == IPV4_VERSION)); 4895 ASSERT(tcp->tcp_iphc_len >= 4896 TCP_MAX_COMBINED_HEADER_LENGTH); 4897 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 4898 /* copy IP header fields individually */ 4899 tcp->tcp_ipha->ipha_ttl = 4900 ltcp->tcp_ip6h->ip6_hops; 4901 bcopy(ltcp->tcp_tcph->th_lport, 4902 tcp->tcp_tcph->th_lport, sizeof (ushort_t)); 4903 } 4904 4905 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); 4906 bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport, 4907 sizeof (in_port_t)); 4908 4909 if (ltcp->tcp_lport == 0) { 4910 tcp->tcp_lport = *(in_port_t *)tcph->th_fport; 4911 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, 4912 sizeof (in_port_t)); 4913 } 4914 4915 if (tcp->tcp_ipversion == IPV4_VERSION) { 4916 ASSERT(ipha != NULL); 4917 tcp->tcp_ipha->ipha_dst = ipha->ipha_src; 4918 tcp->tcp_ipha->ipha_src = ipha->ipha_dst; 4919 4920 /* Source routing option copyover (reverse it) */ 4921 if (tcp_rev_src_routes) 4922 tcp_opt_reverse(tcp, ipha); 4923 } else { 4924 ASSERT(ip6h != NULL); 4925 tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src; 4926 tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst; 4927 } 4928 4929 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 4930 /* 4931 * If the SYN contains a credential, it's a loopback packet; attach 4932 * the credential to the TPI message. 
4933 */ 4934 if ((cr = DB_CRED(idmp)) != NULL) { 4935 mblk_setcred(tpi_mp, cr); 4936 DB_CPID(tpi_mp) = DB_CPID(idmp); 4937 } 4938 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; 4939 4940 /* Inherit the listener's SSL protection state */ 4941 4942 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { 4943 kssl_hold_ent(tcp->tcp_kssl_ent); 4944 tcp->tcp_kssl_pending = B_TRUE; 4945 } 4946 4947 return (0); 4948 } 4949 4950 4951 int 4952 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, 4953 tcph_t *tcph, mblk_t *idmp) 4954 { 4955 tcp_t *ltcp = lconnp->conn_tcp; 4956 tcp_t *tcp = connp->conn_tcp; 4957 sin_t sin; 4958 mblk_t *tpi_mp = NULL; 4959 int err; 4960 cred_t *cr; 4961 4962 sin = sin_null; 4963 sin.sin_addr.s_addr = ipha->ipha_src; 4964 sin.sin_port = *(uint16_t *)tcph->th_lport; 4965 sin.sin_family = AF_INET; 4966 if (ltcp->tcp_recvdstaddr) { 4967 sin_t sind; 4968 4969 sind = sin_null; 4970 sind.sin_addr.s_addr = ipha->ipha_dst; 4971 sind.sin_port = *(uint16_t *)tcph->th_fport; 4972 sind.sin_family = AF_INET; 4973 tpi_mp = mi_tpi_extconn_ind(NULL, 4974 (char *)&sind, sizeof (sin_t), (char *)&tcp, 4975 (t_scalar_t)sizeof (intptr_t), (char *)&sind, 4976 sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4977 } else { 4978 tpi_mp = mi_tpi_conn_ind(NULL, 4979 (char *)&sin, sizeof (sin_t), 4980 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4981 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4982 } 4983 4984 if (tpi_mp == NULL) { 4985 return (ENOMEM); 4986 } 4987 4988 connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER); 4989 connp->conn_send = ip_output; 4990 connp->conn_recv = tcp_input; 4991 connp->conn_fully_bound = B_FALSE; 4992 4993 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); 4994 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); 4995 connp->conn_fport = *(uint16_t *)tcph->th_lport; 4996 connp->conn_lport = *(uint16_t *)tcph->th_fport; 4997 4998 if (tcp_trace) { 4999 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP); 5000 } 5001 5002 /* Inherit information from the "parent" */ 5003 tcp->tcp_ipversion = ltcp->tcp_ipversion; 5004 tcp->tcp_family = ltcp->tcp_family; 5005 tcp->tcp_wq = ltcp->tcp_wq; 5006 tcp->tcp_rq = ltcp->tcp_rq; 5007 tcp->tcp_mss = tcp_mss_def_ipv4; 5008 tcp->tcp_detached = B_TRUE; 5009 if ((err = tcp_init_values(tcp)) != 0) { 5010 freemsg(tpi_mp); 5011 return (err); 5012 } 5013 5014 /* 5015 * Let's make sure that eager tcp template has enough space to 5016 * copy IPv4 listener's tcp template. Since the conn_t structure is 5017 * preserved and tcp_iphc_len is also preserved, an eager conn_t may 5018 * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or 5019 * more (in case of re-allocation of conn_t with tcp-IPv6 template with 5020 * extension headers or with ip6i_t struct). Note that bcopy() below 5021 * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_ 5022 * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener. 
5023 */ 5024 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 5025 ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH); 5026 5027 tcp->tcp_hdr_len = ltcp->tcp_hdr_len; 5028 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; 5029 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 5030 tcp->tcp_ttl = ltcp->tcp_ttl; 5031 tcp->tcp_tos = ltcp->tcp_tos; 5032 5033 /* Copy the IP+TCP header template from listener to eager */ 5034 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); 5035 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 5036 tcp->tcp_ip6h = NULL; 5037 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + 5038 tcp->tcp_ip_hdr_len); 5039 5040 /* Initialize the IP addresses and Ports */ 5041 tcp->tcp_ipha->ipha_dst = ipha->ipha_src; 5042 tcp->tcp_ipha->ipha_src = ipha->ipha_dst; 5043 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); 5044 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t)); 5045 5046 /* Source routing option copyover (reverse it) */ 5047 if (tcp_rev_src_routes) 5048 tcp_opt_reverse(tcp, ipha); 5049 5050 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 5051 5052 /* 5053 * If the SYN contains a credential, it's a loopback packet; attach 5054 * the credential to the TPI message. 5055 */ 5056 if ((cr = DB_CRED(idmp)) != NULL) { 5057 mblk_setcred(tpi_mp, cr); 5058 DB_CPID(tpi_mp) = DB_CPID(idmp); 5059 } 5060 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; 5061 5062 /* Inherit the listener's SSL protection state */ 5063 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { 5064 kssl_hold_ent(tcp->tcp_kssl_ent); 5065 tcp->tcp_kssl_pending = B_TRUE; 5066 } 5067 5068 return (0); 5069 } 5070 5071 /* 5072 * sets up conn for ipsec. 5073 * if the first mblk is M_CTL it is consumed and mpp is updated. 5074 * in case of error mpp is freed. 
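 *
 * A sketch of the expected call pattern, mirroring the STRUIO_POLICY
 * branch of tcp_conn_request() further down:
 *
 *	econnp = tcp_get_ipsec_conn(tcp, arg2, &mp);
 *	if (econnp == NULL)
 *		return;		(mp has already been freed on failure)
 *
 * On success the caller must continue with the possibly-updated *mpp,
 * i.e. with the leading M_CTL already stripped off.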
5075 */ 5076 conn_t * 5077 tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) 5078 { 5079 conn_t *connp = tcp->tcp_connp; 5080 conn_t *econnp; 5081 squeue_t *new_sqp; 5082 mblk_t *first_mp = *mpp; 5083 mblk_t *mp = *mpp; 5084 boolean_t mctl_present = B_FALSE; 5085 uint_t ipvers; 5086 5087 econnp = tcp_get_conn(sqp); 5088 if (econnp == NULL) { 5089 freemsg(first_mp); 5090 return (NULL); 5091 } 5092 if (DB_TYPE(mp) == M_CTL) { 5093 if (mp->b_cont == NULL || 5094 mp->b_cont->b_datap->db_type != M_DATA) { 5095 freemsg(first_mp); 5096 return (NULL); 5097 } 5098 mp = mp->b_cont; 5099 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) { 5100 freemsg(first_mp); 5101 return (NULL); 5102 } 5103 5104 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 5105 first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY; 5106 mctl_present = B_TRUE; 5107 } else { 5108 ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY); 5109 mp->b_datap->db_struioflag &= ~STRUIO_POLICY; 5110 } 5111 5112 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5113 DB_CKSUMSTART(mp) = 0; 5114 5115 ASSERT(OK_32PTR(mp->b_rptr)); 5116 ipvers = IPH_HDR_VERSION(mp->b_rptr); 5117 if (ipvers == IPV4_VERSION) { 5118 uint16_t *up; 5119 uint32_t ports; 5120 ipha_t *ipha; 5121 5122 ipha = (ipha_t *)mp->b_rptr; 5123 up = (uint16_t *)((uchar_t *)ipha + 5124 IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET); 5125 ports = *(uint32_t *)up; 5126 IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP, 5127 ipha->ipha_dst, ipha->ipha_src, ports); 5128 } else { 5129 uint16_t *up; 5130 uint32_t ports; 5131 uint16_t ip_hdr_len; 5132 uint8_t *nexthdrp; 5133 ip6_t *ip6h; 5134 tcph_t *tcph; 5135 5136 ip6h = (ip6_t *)mp->b_rptr; 5137 if (ip6h->ip6_nxt == IPPROTO_TCP) { 5138 ip_hdr_len = IPV6_HDR_LEN; 5139 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len, 5140 &nexthdrp) || *nexthdrp != IPPROTO_TCP) { 5141 CONN_DEC_REF(econnp); 5142 freemsg(first_mp); 5143 return (NULL); 5144 } 5145 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5146 up = (uint16_t *)tcph->th_lport; 5147 ports = *(uint32_t *)up; 5148 IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP, 5149 ip6h->ip6_dst, ip6h->ip6_src, ports); 5150 } 5151 5152 /* 5153 * The caller already ensured that there is a sqp present. 5154 */ 5155 econnp->conn_sqp = new_sqp; 5156 5157 if (connp->conn_policy != NULL) { 5158 ipsec_in_t *ii; 5159 ii = (ipsec_in_t *)(first_mp->b_rptr); 5160 ASSERT(ii->ipsec_in_policy == NULL); 5161 IPPH_REFHOLD(connp->conn_policy); 5162 ii->ipsec_in_policy = connp->conn_policy; 5163 5164 first_mp->b_datap->db_type = IPSEC_POLICY_SET; 5165 if (!ip_bind_ipsec_policy_set(econnp, first_mp)) { 5166 CONN_DEC_REF(econnp); 5167 freemsg(first_mp); 5168 return (NULL); 5169 } 5170 } 5171 5172 if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) { 5173 CONN_DEC_REF(econnp); 5174 freemsg(first_mp); 5175 return (NULL); 5176 } 5177 5178 /* 5179 * If we know we have some policy, pass the "IPSEC" 5180 * options size TCP uses this adjust the MSS. 5181 */ 5182 econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp); 5183 if (mctl_present) { 5184 freeb(first_mp); 5185 *mpp = mp; 5186 } 5187 5188 return (econnp); 5189 } 5190 5191 /* 5192 * tcp_get_conn/tcp_free_conn 5193 * 5194 * tcp_get_conn is used to get a clean tcp connection structure. 5195 * It tries to reuse the connections put on the freelist by the 5196 * time_wait_collector failing which it goes to kmem_cache. This 5197 * way has two benefits compared to just allocating from and 5198 * freeing to kmem_cache. 
5199 * 1) The time_wait_collector can free (which includes the cleanup) 5200 * outside the squeue. So when the interrupt comes, we have a clean 5201 * connection sitting in the freelist. Obviously, this buys us 5202 * performance. 5203 * 5204 * 2) Defence against DoS attacks. Allocating a tcp/conn in tcp_conn_request 5205 * has multiple disadvantages - tying up the squeue during alloc, and the 5206 * fact that IPSec policy initialization has to happen here, which 5207 * requires us to send an M_CTL and check for it, i.e. real ugliness. 5208 * But allocating the conn/tcp in IP land is also not the best since 5209 * we can't check the 'q' and 'q0', which are protected by the squeue, and 5210 * blindly allocate memory which might have to be freed here if we are 5211 * not allowed to accept the connection. By using the freelist and 5212 * putting the conn/tcp back in the freelist, we don't pay a penalty for 5213 * allocating memory without checking 'q/q0' and freeing it if we can't 5214 * accept the connection. 5215 * 5216 * Care should be taken to put the conn back in the same squeue's freelist 5217 * from which it was allocated. Best results are obtained if the conn is 5218 * allocated from the listener's squeue and freed to the same. The time wait 5219 * collector will free up the freelist if a connection ends up sitting 5220 * there for too long. 5221 */ 5222 void * 5223 tcp_get_conn(void *arg) 5224 { 5225 tcp_t *tcp = NULL; 5226 conn_t *connp = NULL; 5227 squeue_t *sqp = (squeue_t *)arg; 5228 tcp_squeue_priv_t *tcp_time_wait; 5229 5230 tcp_time_wait = 5231 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 5232 5233 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 5234 tcp = tcp_time_wait->tcp_free_list; 5235 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0)); 5236 if (tcp != NULL) { 5237 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 5238 tcp_time_wait->tcp_free_list_cnt--; 5239 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 5240 tcp->tcp_time_wait_next = NULL; 5241 connp = tcp->tcp_connp; 5242 connp->conn_flags |= IPCL_REUSED; 5243 return ((void *)connp); 5244 } 5245 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 5246 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) 5247 return (NULL); 5248 return ((void *)connp); 5249 } 5250 5251 /* 5252 * Update the cached label for the given tcp_t. This should be called once per 5253 * connection, and before any packets are sent or tcp_process_options is 5254 * invoked. Returns B_FALSE if the correct label could not be constructed.
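 *
 * The intended use, as in tcp_conn_request() below:
 *
 *	if (is_system_labeled()) {
 *		cr = (peer credential from the SYN, or conn_cred);
 *		if (!tcp_update_label(eager, cr))
 *			(drop the connection);
 *	}
 *
 * The label derived from the credential is cached in the header template
 * (an IPv4 option, or an IPv6 sticky option) and reused for every packet
 * sent on the connection.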
5255 */ 5256 static boolean_t 5257 tcp_update_label(tcp_t *tcp, const cred_t *cr) 5258 { 5259 conn_t *connp = tcp->tcp_connp; 5260 5261 if (tcp->tcp_ipversion == IPV4_VERSION) { 5262 uchar_t optbuf[IP_MAX_OPT_LENGTH]; 5263 int added; 5264 5265 if (tsol_compute_label(cr, tcp->tcp_remote, optbuf, 5266 connp->conn_mac_exempt) != 0) 5267 return (B_FALSE); 5268 5269 added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len); 5270 if (added == -1) 5271 return (B_FALSE); 5272 tcp->tcp_hdr_len += added; 5273 tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added); 5274 tcp->tcp_ip_hdr_len += added; 5275 if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) { 5276 tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3; 5277 added = tsol_prepend_option(optbuf, tcp->tcp_ipha, 5278 tcp->tcp_hdr_len); 5279 if (added == -1) 5280 return (B_FALSE); 5281 tcp->tcp_hdr_len += added; 5282 tcp->tcp_tcph = (tcph_t *) 5283 ((uchar_t *)tcp->tcp_tcph + added); 5284 tcp->tcp_ip_hdr_len += added; 5285 } 5286 } else { 5287 uchar_t optbuf[TSOL_MAX_IPV6_OPTION]; 5288 5289 if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf, 5290 connp->conn_mac_exempt) != 0) 5291 return (B_FALSE); 5292 if (tsol_update_sticky(&tcp->tcp_sticky_ipp, 5293 &tcp->tcp_label_len, optbuf) != 0) 5294 return (B_FALSE); 5295 if (tcp_build_hdrs(tcp->tcp_rq, tcp) != 0) 5296 return (B_FALSE); 5297 } 5298 5299 connp->conn_ulp_labeled = 1; 5300 5301 return (B_TRUE); 5302 } 5303 5304 /* BEGIN CSTYLED */ 5305 /* 5306 * 5307 * The sockfs ACCEPT path: 5308 * ======================= 5309 * 5310 * The eager is now established in its own perimeter as soon as SYN is 5311 * received in tcp_conn_request(). When sockfs receives conn_ind, it 5312 * completes the accept processing on the acceptor STREAM. The sending 5313 * of conn_ind part is common for both sockfs listener and a TLI/XTI 5314 * listener but a TLI/XTI listener completes the accept processing 5315 * on the listener perimeter. 5316 * 5317 * Common control flow for 3 way handshake: 5318 * ---------------------------------------- 5319 * 5320 * incoming SYN (listener perimeter) -> tcp_rput_data() 5321 * -> tcp_conn_request() 5322 * 5323 * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data() 5324 * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() 5325 * 5326 * Sockfs ACCEPT Path: 5327 * ------------------- 5328 * 5329 * open acceptor stream (ip_tcpopen allocates tcp_wput_accept() 5330 * as STREAM entry point) 5331 * 5332 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept() 5333 * 5334 * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager 5335 * association (we are not behind eager's squeue but sockfs is protecting us 5336 * and no one knows about this stream yet. The STREAMS entry point q->q_info 5337 * is changed to point at tcp_wput(). 5338 * 5339 * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to 5340 * listener (done on listener's perimeter). 5341 * 5342 * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish 5343 * accept. 5344 * 5345 * TLI/XTI client ACCEPT path: 5346 * --------------------------- 5347 * 5348 * soaccept() sends T_CONN_RES on the listener STREAM. 5349 * 5350 * tcp_accept() -> tcp_accept_swap() complete the processing and send 5351 * the bind_mp to eager perimeter to finish accept (tcp_rput_other()). 5352 * 5353 * Locks: 5354 * ====== 5355 * 5356 * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0 and 5357 * and listeners->tcp_eager_next_q. 
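 *
 * The usual pattern (see tcp_eager_blowoff() and tcp_eager_cleanup() below)
 * is to walk the eager lists only while holding that lock, and to take a
 * reference on any eager that will be used after the lock is dropped:
 *
 *	mutex_enter(&listener->tcp_eager_lock);
 *	for (eager = listener->tcp_eager_next_q; eager != NULL;
 *	    eager = eager->tcp_eager_next_q) {
 *		CONN_INC_REF(eager->tcp_connp);
 *		... queue work against the eager ...
 *	}
 *	mutex_exit(&listener->tcp_eager_lock);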
5358 * 5359 * Referencing: 5360 * ============ 5361 * 5362 * 1) We start out in tcp_conn_request by eager placing a ref on 5363 * listener and listener adding eager to listeners->tcp_eager_next_q0. 5364 * 5365 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before 5366 * doing so we place a ref on the eager. This ref is finally dropped at the 5367 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the 5368 * reference is dropped by the squeue framework. 5369 * 5370 * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish 5371 * 5372 * The reference must be released by the same entity that added the reference 5373 * In the above scheme, the eager is the entity that adds and releases the 5374 * references. Note that tcp_accept_finish executes in the squeue of the eager 5375 * (albeit after it is attached to the acceptor stream). Though 1. executes 5376 * in the listener's squeue, the eager is nascent at this point and the 5377 * reference can be considered to have been added on behalf of the eager. 5378 * 5379 * Eager getting a Reset or listener closing: 5380 * ========================================== 5381 * 5382 * Once the listener and eager are linked, the listener never does the unlink. 5383 * If the listener needs to close, tcp_eager_cleanup() is called which queues 5384 * a message on all eager perimeter. The eager then does the unlink, clears 5385 * any pointers to the listener's queue and drops the reference to the 5386 * listener. The listener waits in tcp_close outside the squeue until its 5387 * refcount has dropped to 1. This ensures that the listener has waited for 5388 * all eagers to clear their association with the listener. 5389 * 5390 * Similarly, if eager decides to go away, it can unlink itself and close. 5391 * When the T_CONN_RES comes down, we check if eager has closed. Note that 5392 * the reference to eager is still valid because of the extra ref we put 5393 * in tcp_send_conn_ind. 5394 * 5395 * Listener can always locate the eager under the protection 5396 * of the listener->tcp_eager_lock, and then do a refhold 5397 * on the eager during the accept processing. 5398 * 5399 * The acceptor stream accesses the eager in the accept processing 5400 * based on the ref placed on eager before sending T_conn_ind. 5401 * The only entity that can negate this refhold is a listener close 5402 * which is mutually exclusive with an active acceptor stream. 5403 * 5404 * Eager's reference on the listener 5405 * =================================== 5406 * 5407 * If the accept happens (even on a closed eager) the eager drops its 5408 * reference on the listener at the start of tcp_accept_finish. If the 5409 * eager is killed due to an incoming RST before the T_conn_ind is sent up, 5410 * the reference is dropped in tcp_closei_local. If the listener closes, 5411 * the reference is dropped in tcp_eager_kill. In all cases the reference 5412 * is dropped while executing in the eager's context (squeue). 5413 */ 5414 /* END CSTYLED */ 5415 5416 /* Process the SYN packet, mp, directed at the listener 'tcp' */ 5417 5418 /* 5419 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. 5420 * tcp_rput_data will not see any SYN packets. 
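 *
 * Like the other squeue-callable functions in this file, it uses the
 * (void *arg, mblk_t *mp, void *arg2) convention: arg is the listener's
 * conn_t and arg2 is the squeue the packet arrived on.  A typical hand-off
 * is the one in tcp_conn_request_unbound() below:
 *
 *	squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp,
 *	    SQTAG_TCP_CONN_REQ_UNBOUND);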
5421 */ 5422 /* ARGSUSED */ 5423 void 5424 tcp_conn_request(void *arg, mblk_t *mp, void *arg2) 5425 { 5426 tcph_t *tcph; 5427 uint32_t seg_seq; 5428 tcp_t *eager; 5429 uint_t ipvers; 5430 ipha_t *ipha; 5431 ip6_t *ip6h; 5432 int err; 5433 conn_t *econnp = NULL; 5434 squeue_t *new_sqp; 5435 mblk_t *mp1; 5436 uint_t ip_hdr_len; 5437 conn_t *connp = (conn_t *)arg; 5438 tcp_t *tcp = connp->conn_tcp; 5439 ire_t *ire; 5440 cred_t *credp; 5441 5442 if (tcp->tcp_state != TCPS_LISTEN) 5443 goto error2; 5444 5445 ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0); 5446 5447 mutex_enter(&tcp->tcp_eager_lock); 5448 if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { 5449 mutex_exit(&tcp->tcp_eager_lock); 5450 TCP_STAT(tcp_listendrop); 5451 BUMP_MIB(&tcp_mib, tcpListenDrop); 5452 if (tcp->tcp_debug) { 5453 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 5454 "tcp_conn_request: listen backlog (max=%d) " 5455 "overflow (%d pending) on %s", 5456 tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, 5457 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 5458 } 5459 goto error2; 5460 } 5461 5462 if (tcp->tcp_conn_req_cnt_q0 >= 5463 tcp->tcp_conn_req_max + tcp_conn_req_max_q0) { 5464 /* 5465 * Q0 is full. Drop a pending half-open req from the queue 5466 * to make room for the new SYN req. Also mark the time we 5467 * drop a SYN. 5468 * 5469 * A more aggressive defense against SYN attack will 5470 * be to set the "tcp_syn_defense" flag now. 5471 */ 5472 TCP_STAT(tcp_listendropq0); 5473 tcp->tcp_last_rcv_lbolt = lbolt64; 5474 if (!tcp_drop_q0(tcp)) { 5475 mutex_exit(&tcp->tcp_eager_lock); 5476 BUMP_MIB(&tcp_mib, tcpListenDropQ0); 5477 if (tcp->tcp_debug) { 5478 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 5479 "tcp_conn_request: listen half-open queue " 5480 "(max=%d) full (%d pending) on %s", 5481 tcp_conn_req_max_q0, 5482 tcp->tcp_conn_req_cnt_q0, 5483 tcp_display(tcp, NULL, 5484 DISP_PORT_ONLY)); 5485 } 5486 goto error2; 5487 } 5488 } 5489 mutex_exit(&tcp->tcp_eager_lock); 5490 5491 /* 5492 * IP adds STRUIO_EAGER and ensures that the received packet is 5493 * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6 5494 * link local address. If IPSec is enabled, db_struioflag has 5495 * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER); 5496 * otherwise an error case if neither of them is set. 5497 */ 5498 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 5499 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5500 DB_CKSUMSTART(mp) = 0; 5501 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 5502 econnp = (conn_t *)tcp_get_conn(arg2); 5503 if (econnp == NULL) 5504 goto error2; 5505 econnp->conn_sqp = new_sqp; 5506 } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) { 5507 /* 5508 * mp is updated in tcp_get_ipsec_conn(). 5509 */ 5510 econnp = tcp_get_ipsec_conn(tcp, arg2, &mp); 5511 if (econnp == NULL) { 5512 /* 5513 * mp freed by tcp_get_ipsec_conn. 
5514 */ 5515 return; 5516 } 5517 } else { 5518 goto error2; 5519 } 5520 5521 ASSERT(DB_TYPE(mp) == M_DATA); 5522 5523 ipvers = IPH_HDR_VERSION(mp->b_rptr); 5524 ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION); 5525 ASSERT(OK_32PTR(mp->b_rptr)); 5526 if (ipvers == IPV4_VERSION) { 5527 ipha = (ipha_t *)mp->b_rptr; 5528 ip_hdr_len = IPH_HDR_LENGTH(ipha); 5529 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5530 } else { 5531 ip6h = (ip6_t *)mp->b_rptr; 5532 ip_hdr_len = ip_hdr_length_v6(mp, ip6h); 5533 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5534 } 5535 5536 if (tcp->tcp_family == AF_INET) { 5537 ASSERT(ipvers == IPV4_VERSION); 5538 err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp); 5539 } else { 5540 err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp); 5541 } 5542 5543 if (err) 5544 goto error3; 5545 5546 eager = econnp->conn_tcp; 5547 5548 /* Inherit various TCP parameters from the listener */ 5549 eager->tcp_naglim = tcp->tcp_naglim; 5550 eager->tcp_first_timer_threshold = 5551 tcp->tcp_first_timer_threshold; 5552 eager->tcp_second_timer_threshold = 5553 tcp->tcp_second_timer_threshold; 5554 5555 eager->tcp_first_ctimer_threshold = 5556 tcp->tcp_first_ctimer_threshold; 5557 eager->tcp_second_ctimer_threshold = 5558 tcp->tcp_second_ctimer_threshold; 5559 5560 /* 5561 * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics. 5562 * If it does not, the eager's receive window will be set to the 5563 * listener's receive window later in this function. 5564 */ 5565 eager->tcp_rwnd = 0; 5566 5567 /* 5568 * Inherit listener's tcp_init_cwnd. Need to do this before 5569 * calling tcp_process_options() where tcp_mss_set() is called 5570 * to set the initial cwnd. 5571 */ 5572 eager->tcp_init_cwnd = tcp->tcp_init_cwnd; 5573 5574 /* 5575 * Zones: tcp_adapt_ire() and tcp_send_data() both need the 5576 * zone id before the accept is completed in tcp_wput_accept(). 5577 */ 5578 econnp->conn_zoneid = connp->conn_zoneid; 5579 econnp->conn_allzones = connp->conn_allzones; 5580 5581 /* Copy nexthop information from listener to eager */ 5582 if (connp->conn_nexthop_set) { 5583 econnp->conn_nexthop_set = connp->conn_nexthop_set; 5584 econnp->conn_nexthop_v4 = connp->conn_nexthop_v4; 5585 } 5586 5587 /* 5588 * TSOL: tsol_input_proc() needs the eager's cred before the 5589 * eager is accepted 5590 */ 5591 econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred; 5592 crhold(credp); 5593 5594 /* 5595 * If the caller has the process-wide flag set, then default to MAC 5596 * exempt mode. This allows read-down to unlabeled hosts. 
5597 */ 5598 if (getpflags(NET_MAC_AWARE, credp) != 0) 5599 econnp->conn_mac_exempt = B_TRUE; 5600 5601 if (is_system_labeled()) { 5602 cred_t *cr; 5603 5604 if (connp->conn_mlp_type != mlptSingle) { 5605 cr = econnp->conn_peercred = DB_CRED(mp); 5606 if (cr != NULL) 5607 crhold(cr); 5608 else 5609 cr = econnp->conn_cred; 5610 DTRACE_PROBE2(mlp_syn_accept, conn_t *, 5611 econnp, cred_t *, cr) 5612 } else { 5613 cr = econnp->conn_cred; 5614 DTRACE_PROBE2(syn_accept, conn_t *, 5615 econnp, cred_t *, cr) 5616 } 5617 5618 if (!tcp_update_label(eager, cr)) { 5619 DTRACE_PROBE3( 5620 tx__ip__log__error__connrequest__tcp, 5621 char *, "eager connp(1) label on SYN mp(2) failed", 5622 conn_t *, econnp, mblk_t *, mp); 5623 goto error3; 5624 } 5625 } 5626 5627 eager->tcp_hard_binding = B_TRUE; 5628 5629 tcp_bind_hash_insert(&tcp_bind_fanout[ 5630 TCP_BIND_HASH(eager->tcp_lport)], eager, 0); 5631 5632 CL_INET_CONNECT(eager); 5633 5634 /* 5635 * No need to check for multicast destination since ip will only pass 5636 * up multicasts to those that have expressed interest 5637 * TODO: what about rejecting broadcasts? 5638 * Also check that source is not a multicast or broadcast address. 5639 */ 5640 eager->tcp_state = TCPS_SYN_RCVD; 5641 5642 5643 /* 5644 * There should be no ire in the mp as we are being called after 5645 * receiving the SYN. 5646 */ 5647 ASSERT(tcp_ire_mp(mp) == NULL); 5648 5649 /* 5650 * Adapt our mss, ttl, ... according to information provided in IRE. 5651 */ 5652 5653 if (tcp_adapt_ire(eager, NULL) == 0) { 5654 /* Undo the bind_hash_insert */ 5655 tcp_bind_hash_remove(eager); 5656 goto error3; 5657 } 5658 5659 /* Process all TCP options. */ 5660 tcp_process_options(eager, tcph); 5661 5662 /* Is the other end ECN capable? */ 5663 if (tcp_ecn_permitted >= 1 && 5664 (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 5665 eager->tcp_ecn_ok = B_TRUE; 5666 } 5667 5668 /* 5669 * listener->tcp_rq->q_hiwat should be the default window size or a 5670 * window size changed via SO_RCVBUF option. First round up the 5671 * eager's tcp_rwnd to the nearest MSS. Then find out the window 5672 * scale option value if needed. Call tcp_rwnd_set() to finish the 5673 * setting. 5674 * 5675 * Note if there is a rpipe metric associated with the remote host, 5676 * we should not inherit receive window size from listener. 5677 */ 5678 eager->tcp_rwnd = MSS_ROUNDUP( 5679 (eager->tcp_rwnd == 0 ? tcp->tcp_rq->q_hiwat : 5680 eager->tcp_rwnd), eager->tcp_mss); 5681 if (eager->tcp_snd_ws_ok) 5682 tcp_set_ws_value(eager); 5683 /* 5684 * Note that this is the only place tcp_rwnd_set() is called for 5685 * accepting a connection. We need to call it here instead of 5686 * after the 3-way handshake because we need to tell the other 5687 * side our rwnd in the SYN-ACK segment. 5688 */ 5689 (void) tcp_rwnd_set(eager, eager->tcp_rwnd); 5690 5691 /* 5692 * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ 5693 * via soaccept()->soinheritoptions() which essentially applies 5694 * all the listener options to the new STREAM. The options that we 5695 * need to take care of are: 5696 * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST, 5697 * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER, 5698 * SO_SNDBUF, SO_RCVBUF. 5699 * 5700 * SO_RCVBUF: tcp_rwnd_set() above takes care of it. 5701 * SO_SNDBUF: Set the tcp_xmit_hiwater for the eager. When 5702 * tcp_maxpsz_set() gets called later from 5703 * tcp_accept_finish(), the option takes effect. 
5704 * 5705 */ 5706 /* Set the TCP options */ 5707 eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater; 5708 eager->tcp_dgram_errind = tcp->tcp_dgram_errind; 5709 eager->tcp_oobinline = tcp->tcp_oobinline; 5710 eager->tcp_reuseaddr = tcp->tcp_reuseaddr; 5711 eager->tcp_broadcast = tcp->tcp_broadcast; 5712 eager->tcp_useloopback = tcp->tcp_useloopback; 5713 eager->tcp_dontroute = tcp->tcp_dontroute; 5714 eager->tcp_linger = tcp->tcp_linger; 5715 eager->tcp_lingertime = tcp->tcp_lingertime; 5716 if (tcp->tcp_ka_enabled) 5717 eager->tcp_ka_enabled = 1; 5718 5719 /* Set the IP options */ 5720 econnp->conn_broadcast = connp->conn_broadcast; 5721 econnp->conn_loopback = connp->conn_loopback; 5722 econnp->conn_dontroute = connp->conn_dontroute; 5723 econnp->conn_reuseaddr = connp->conn_reuseaddr; 5724 5725 /* Put a ref on the listener for the eager. */ 5726 CONN_INC_REF(connp); 5727 mutex_enter(&tcp->tcp_eager_lock); 5728 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 5729 eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 5730 tcp->tcp_eager_next_q0 = eager; 5731 eager->tcp_eager_prev_q0 = tcp; 5732 5733 /* Set tcp_listener before adding it to tcp_conn_fanout */ 5734 eager->tcp_listener = tcp; 5735 eager->tcp_saved_listener = tcp; 5736 5737 /* 5738 * Tag this detached tcp vector for later retrieval 5739 * by our listener client in tcp_accept(). 5740 */ 5741 eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum; 5742 tcp->tcp_conn_req_cnt_q0++; 5743 if (++tcp->tcp_conn_req_seqnum == -1) { 5744 /* 5745 * -1 is "special" and defined in TPI as something 5746 * that should never be used in T_CONN_IND 5747 */ 5748 ++tcp->tcp_conn_req_seqnum; 5749 } 5750 mutex_exit(&tcp->tcp_eager_lock); 5751 5752 if (tcp->tcp_syn_defense) { 5753 /* Don't drop the SYN that comes from a good IP source */ 5754 ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache); 5755 if (addr_cache != NULL && eager->tcp_remote == 5756 addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) { 5757 eager->tcp_dontdrop = B_TRUE; 5758 } 5759 } 5760 5761 /* 5762 * We need to insert the eager in its own perimeter but as soon 5763 * as we do that, we expose the eager to the classifier and 5764 * should not touch any field outside the eager's perimeter. 5765 * So do all the work necessary before inserting the eager 5766 * in its own perimeter. Be optimistic that ipcl_conn_insert() 5767 * will succeed but undo everything if it fails. 5768 */ 5769 seg_seq = ABE32_TO_U32(tcph->th_seq); 5770 eager->tcp_irs = seg_seq; 5771 eager->tcp_rack = seg_seq; 5772 eager->tcp_rnxt = seg_seq + 1; 5773 U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack); 5774 BUMP_MIB(&tcp_mib, tcpPassiveOpens); 5775 eager->tcp_state = TCPS_SYN_RCVD; 5776 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, 5777 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); 5778 if (mp1 == NULL) 5779 goto error1; 5780 DB_CPID(mp1) = tcp->tcp_cpid; 5781 5782 /* 5783 * We need to start the rto timer. In normal case, we start 5784 * the timer after sending the packet on the wire (or at 5785 * least believing that packet was sent by waiting for 5786 * CALL_IP_WPUT() to return). Since this is the first packet 5787 * being sent on the wire for the eager, our initial tcp_rto 5788 * is at least tcp_rexmit_interval_min which is a fairly 5789 * large value to allow the algorithm to adjust slowly to large 5790 * fluctuations of RTT during first few transmissions. 
5791 * 5792 * Starting the timer first and then sending the packet in this 5793 * case shouldn't make much difference since tcp_rexmit_interval_min 5794 * is of the order of several 100ms and starting the timer 5795 * first and then sending the packet will result in difference 5796 * of few micro seconds. 5797 * 5798 * Without this optimization, we are forced to hold the fanout 5799 * lock across the ipcl_bind_insert() and sending the packet 5800 * so that we don't race against an incoming packet (maybe RST) 5801 * for this eager. 5802 */ 5803 5804 TCP_RECORD_TRACE(eager, mp1, TCP_TRACE_SEND_PKT); 5805 TCP_TIMER_RESTART(eager, eager->tcp_rto); 5806 5807 5808 /* 5809 * Insert the eager in its own perimeter now. We are ready to deal 5810 * with any packets on eager. 5811 */ 5812 if (eager->tcp_ipversion == IPV4_VERSION) { 5813 if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) { 5814 goto error; 5815 } 5816 } else { 5817 if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) { 5818 goto error; 5819 } 5820 } 5821 5822 /* mark conn as fully-bound */ 5823 econnp->conn_fully_bound = B_TRUE; 5824 5825 /* Send the SYN-ACK */ 5826 tcp_send_data(eager, eager->tcp_wq, mp1); 5827 freemsg(mp); 5828 5829 return; 5830 error: 5831 (void) TCP_TIMER_CANCEL(eager, eager->tcp_timer_tid); 5832 freemsg(mp1); 5833 error1: 5834 /* Undo what we did above */ 5835 mutex_enter(&tcp->tcp_eager_lock); 5836 tcp_eager_unlink(eager); 5837 mutex_exit(&tcp->tcp_eager_lock); 5838 /* Drop eager's reference on the listener */ 5839 CONN_DEC_REF(connp); 5840 5841 /* 5842 * Delete the cached ire in conn_ire_cache and also mark 5843 * the conn as CONDEMNED 5844 */ 5845 mutex_enter(&econnp->conn_lock); 5846 econnp->conn_state_flags |= CONN_CONDEMNED; 5847 ire = econnp->conn_ire_cache; 5848 econnp->conn_ire_cache = NULL; 5849 mutex_exit(&econnp->conn_lock); 5850 if (ire != NULL) 5851 IRE_REFRELE_NOTR(ire); 5852 5853 /* 5854 * tcp_accept_comm inserts the eager to the bind_hash 5855 * we need to remove it from the hash if ipcl_conn_insert 5856 * fails. 5857 */ 5858 tcp_bind_hash_remove(eager); 5859 /* Drop the eager ref placed in tcp_open_detached */ 5860 CONN_DEC_REF(econnp); 5861 5862 /* 5863 * If a connection already exists, send the mp to that connections so 5864 * that it can be appropriately dealt with. 5865 */ 5866 if ((econnp = ipcl_classify(mp, connp->conn_zoneid)) != NULL) { 5867 if (!IPCL_IS_CONNECTED(econnp)) { 5868 /* 5869 * Something bad happened. ipcl_conn_insert() 5870 * failed because a connection already existed 5871 * in connected hash but we can't find it 5872 * anymore (someone blew it away). Just 5873 * free this message and hopefully remote 5874 * will retransmit at which time the SYN can be 5875 * treated as a new connection or dealth with 5876 * a TH_RST if a connection already exists. 5877 */ 5878 freemsg(mp); 5879 } else { 5880 squeue_fill(econnp->conn_sqp, mp, tcp_input, 5881 econnp, SQTAG_TCP_CONN_REQ); 5882 } 5883 } else { 5884 /* Nobody wants this packet */ 5885 freemsg(mp); 5886 } 5887 return; 5888 error2: 5889 freemsg(mp); 5890 return; 5891 error3: 5892 CONN_DEC_REF(econnp); 5893 freemsg(mp); 5894 } 5895 5896 /* 5897 * In an ideal case of vertical partition in NUMA architecture, its 5898 * beneficial to have the listener and all the incoming connections 5899 * tied to the same squeue. 
The other constraint is that incoming 5900 * connections should be tied to the squeue attached to interrupted 5901 * CPU for obvious locality reason so this leaves the listener to 5902 * be tied to the same squeue. Our only problem is that when listener 5903 * is binding, the CPU that will get interrupted by the NIC whose 5904 * IP address the listener is binding to is not even known. So 5905 * the code below allows us to change that binding at the time the 5906 * CPU is interrupted by virtue of incoming connection's squeue. 5907 * 5908 * This is usefull only in case of a listener bound to a specific IP 5909 * address. For other kind of listeners, they get bound the 5910 * very first time and there is no attempt to rebind them. 5911 */ 5912 void 5913 tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) 5914 { 5915 conn_t *connp = (conn_t *)arg; 5916 squeue_t *sqp = (squeue_t *)arg2; 5917 squeue_t *new_sqp; 5918 uint32_t conn_flags; 5919 5920 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 5921 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5922 } else { 5923 goto done; 5924 } 5925 5926 if (connp->conn_fanout == NULL) 5927 goto done; 5928 5929 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) { 5930 mutex_enter(&connp->conn_fanout->connf_lock); 5931 mutex_enter(&connp->conn_lock); 5932 /* 5933 * No one from read or write side can access us now 5934 * except for already queued packets on this squeue. 5935 * But since we haven't changed the squeue yet, they 5936 * can't execute. If they are processed after we have 5937 * changed the squeue, they are sent back to the 5938 * correct squeue down below. 5939 */ 5940 if (connp->conn_sqp != new_sqp) { 5941 while (connp->conn_sqp != new_sqp) 5942 (void) casptr(&connp->conn_sqp, sqp, new_sqp); 5943 } 5944 5945 do { 5946 conn_flags = connp->conn_flags; 5947 conn_flags |= IPCL_FULLY_BOUND; 5948 (void) cas32(&connp->conn_flags, connp->conn_flags, 5949 conn_flags); 5950 } while (!(connp->conn_flags & IPCL_FULLY_BOUND)); 5951 5952 mutex_exit(&connp->conn_fanout->connf_lock); 5953 mutex_exit(&connp->conn_lock); 5954 } 5955 5956 done: 5957 if (connp->conn_sqp != sqp) { 5958 CONN_INC_REF(connp); 5959 squeue_fill(connp->conn_sqp, mp, 5960 connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND); 5961 } else { 5962 tcp_conn_request(connp, mp, sqp); 5963 } 5964 } 5965 5966 /* 5967 * Successful connect request processing begins when our client passes 5968 * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes 5969 * our T_OK_ACK reply message upstream. The control flow looks like this: 5970 * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_connect() -> IP 5971 * upstream <- tcp_rput() <- IP 5972 * After various error checks are completed, tcp_connect() lays 5973 * the target address and port into the composite header template, 5974 * preallocates the T_OK_ACK reply message, construct a full 12 byte bind 5975 * request followed by an IRE request, and passes the three mblk message 5976 * down to IP looking like this: 5977 * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client 5978 * Processing continues in tcp_rput() when we receive the following message: 5979 * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client 5980 * After consuming the first two mblks, tcp_rput() calls tcp_timer(), 5981 * to fire off the connection request, and then passes the T_OK_ACK mblk 5982 * upstream that we filled in below. There are, of course, numerous 5983 * error conditions along the way which truncate the processing described 5984 * above. 
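 *
 * For reference, the T_CONN_REQ mblk handled here is laid out roughly as
 * follows (offsets relative to mp->b_rptr):
 *
 *	struct T_conn_req	PRIM_type, DEST_length, DEST_offset,
 *				OPT_length, OPT_offset
 *	at DEST_offset		a sin_t or sin6_t, DEST_length bytes
 *	at OPT_offset		options, OPT_length bytes
 *
 * which is why the destination address is located below with
 * mi_offset_param(mp, tcr->DEST_offset, ...).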
5985 */ 5986 static void 5987 tcp_connect(tcp_t *tcp, mblk_t *mp) 5988 { 5989 sin_t *sin; 5990 sin6_t *sin6; 5991 queue_t *q = tcp->tcp_wq; 5992 struct T_conn_req *tcr; 5993 ipaddr_t *dstaddrp; 5994 in_port_t dstport; 5995 uint_t srcid; 5996 5997 tcr = (struct T_conn_req *)mp->b_rptr; 5998 5999 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 6000 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 6001 tcp_err_ack(tcp, mp, TPROTO, 0); 6002 return; 6003 } 6004 6005 /* 6006 * Determine packet type based on type of address passed in 6007 * the request should contain an IPv4 or IPv6 address. 6008 * Make sure that address family matches the type of 6009 * family of the the address passed down 6010 */ 6011 switch (tcr->DEST_length) { 6012 default: 6013 tcp_err_ack(tcp, mp, TBADADDR, 0); 6014 return; 6015 6016 case (sizeof (sin_t) - sizeof (sin->sin_zero)): { 6017 /* 6018 * XXX: The check for valid DEST_length was not there 6019 * in earlier releases and some buggy 6020 * TLI apps (e.g Sybase) got away with not feeding 6021 * in sin_zero part of address. 6022 * We allow that bug to keep those buggy apps humming. 6023 * Test suites require the check on DEST_length. 6024 * We construct a new mblk with valid DEST_length 6025 * free the original so the rest of the code does 6026 * not have to keep track of this special shorter 6027 * length address case. 6028 */ 6029 mblk_t *nmp; 6030 struct T_conn_req *ntcr; 6031 sin_t *nsin; 6032 6033 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + 6034 tcr->OPT_length, BPRI_HI); 6035 if (nmp == NULL) { 6036 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 6037 return; 6038 } 6039 ntcr = (struct T_conn_req *)nmp->b_rptr; 6040 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ 6041 ntcr->PRIM_type = T_CONN_REQ; 6042 ntcr->DEST_length = sizeof (sin_t); 6043 ntcr->DEST_offset = sizeof (struct T_conn_req); 6044 6045 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); 6046 *nsin = sin_null; 6047 /* Get pointer to shorter address to copy from original mp */ 6048 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 6049 tcr->DEST_length); /* extract DEST_length worth of sin_t */ 6050 if (sin == NULL || !OK_32PTR((char *)sin)) { 6051 freemsg(nmp); 6052 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6053 return; 6054 } 6055 nsin->sin_family = sin->sin_family; 6056 nsin->sin_port = sin->sin_port; 6057 nsin->sin_addr = sin->sin_addr; 6058 /* Note:nsin->sin_zero zero-fill with sin_null assign above */ 6059 nmp->b_wptr = (uchar_t *)&nsin[1]; 6060 if (tcr->OPT_length != 0) { 6061 ntcr->OPT_length = tcr->OPT_length; 6062 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; 6063 bcopy((uchar_t *)tcr + tcr->OPT_offset, 6064 (uchar_t *)ntcr + ntcr->OPT_offset, 6065 tcr->OPT_length); 6066 nmp->b_wptr += tcr->OPT_length; 6067 } 6068 freemsg(mp); /* original mp freed */ 6069 mp = nmp; /* re-initialize original variables */ 6070 tcr = ntcr; 6071 } 6072 /* FALLTHRU */ 6073 6074 case sizeof (sin_t): 6075 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 6076 sizeof (sin_t)); 6077 if (sin == NULL || !OK_32PTR((char *)sin)) { 6078 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6079 return; 6080 } 6081 if (tcp->tcp_family != AF_INET || 6082 sin->sin_family != AF_INET) { 6083 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6084 return; 6085 } 6086 if (sin->sin_port == 0) { 6087 tcp_err_ack(tcp, mp, TBADADDR, 0); 6088 return; 6089 } 6090 if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) { 6091 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6092 return; 6093 } 6094 6095 break; 6096 6097 
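	/*
	 * AF_INET6 connect: a full sockaddr_in6 is supplied.  Family and
	 * port are validated here; IPv4-mapped destinations are handled
	 * further down by switching the header template to IPv4.
	 */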
case sizeof (sin6_t): 6098 sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset, 6099 sizeof (sin6_t)); 6100 if (sin6 == NULL || !OK_32PTR((char *)sin6)) { 6101 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6102 return; 6103 } 6104 if (tcp->tcp_family != AF_INET6 || 6105 sin6->sin6_family != AF_INET6) { 6106 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6107 return; 6108 } 6109 if (sin6->sin6_port == 0) { 6110 tcp_err_ack(tcp, mp, TBADADDR, 0); 6111 return; 6112 } 6113 break; 6114 } 6115 /* 6116 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we 6117 * should key on their sequence number and cut them loose. 6118 */ 6119 6120 /* 6121 * If options passed in, feed it for verification and handling 6122 */ 6123 if (tcr->OPT_length != 0) { 6124 mblk_t *ok_mp; 6125 mblk_t *discon_mp; 6126 mblk_t *conn_opts_mp; 6127 int t_error, sys_error, do_disconnect; 6128 6129 conn_opts_mp = NULL; 6130 6131 if (tcp_conprim_opt_process(tcp, mp, 6132 &do_disconnect, &t_error, &sys_error) < 0) { 6133 if (do_disconnect) { 6134 ASSERT(t_error == 0 && sys_error == 0); 6135 discon_mp = mi_tpi_discon_ind(NULL, 6136 ECONNREFUSED, 0); 6137 if (!discon_mp) { 6138 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 6139 TSYSERR, ENOMEM); 6140 return; 6141 } 6142 ok_mp = mi_tpi_ok_ack_alloc(mp); 6143 if (!ok_mp) { 6144 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6145 TSYSERR, ENOMEM); 6146 return; 6147 } 6148 qreply(q, ok_mp); 6149 qreply(q, discon_mp); /* no flush! */ 6150 } else { 6151 ASSERT(t_error != 0); 6152 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, 6153 sys_error); 6154 } 6155 return; 6156 } 6157 /* 6158 * Success in setting options, the mp option buffer represented 6159 * by OPT_length/offset has been potentially modified and 6160 * contains results of option processing. We copy it in 6161 * another mp to save it for potentially influencing returning 6162 * it in T_CONN_CONN. 6163 */ 6164 if (tcr->OPT_length != 0) { /* there are resulting options */ 6165 conn_opts_mp = copyb(mp); 6166 if (!conn_opts_mp) { 6167 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 6168 TSYSERR, ENOMEM); 6169 return; 6170 } 6171 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); 6172 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; 6173 /* 6174 * Note: 6175 * These resulting option negotiation can include any 6176 * end-to-end negotiation options but there no such 6177 * thing (yet?) in our TCP/IP. 6178 */ 6179 } 6180 } 6181 6182 /* 6183 * If we're connecting to an IPv4-mapped IPv6 address, we need to 6184 * make sure that the template IP header in the tcp structure is an 6185 * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We 6186 * need to this before we call tcp_bindi() so that the port lookup 6187 * code will look for ports in the correct port space (IPv4 and 6188 * IPv6 have separate port spaces). 
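 * (An IPv4-mapped IPv6 address has the form ::ffff:a.b.c.d, i.e. an IPv4
 * destination expressed through an AF_INET6 socket.)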
6189 */ 6190 if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION && 6191 IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 6192 int err = 0; 6193 6194 err = tcp_header_init_ipv4(tcp); 6195 if (err != 0) { 6196 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6197 goto connect_failed; 6198 } 6199 if (tcp->tcp_lport != 0) 6200 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 6201 } 6202 6203 switch (tcp->tcp_state) { 6204 case TCPS_IDLE: 6205 /* 6206 * We support quick connect, refer to comments in 6207 * tcp_connect_*() 6208 */ 6209 /* FALLTHRU */ 6210 case TCPS_BOUND: 6211 case TCPS_LISTEN: 6212 if (tcp->tcp_family == AF_INET6) { 6213 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 6214 tcp_connect_ipv6(tcp, mp, 6215 &sin6->sin6_addr, 6216 sin6->sin6_port, sin6->sin6_flowinfo, 6217 sin6->__sin6_src_id, sin6->sin6_scope_id); 6218 return; 6219 } 6220 /* 6221 * Destination adress is mapped IPv6 address. 6222 * Source bound address should be unspecified or 6223 * IPv6 mapped address as well. 6224 */ 6225 if (!IN6_IS_ADDR_UNSPECIFIED( 6226 &tcp->tcp_bound_source_v6) && 6227 !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) { 6228 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, 6229 EADDRNOTAVAIL); 6230 break; 6231 } 6232 dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); 6233 dstport = sin6->sin6_port; 6234 srcid = sin6->__sin6_src_id; 6235 } else { 6236 dstaddrp = &sin->sin_addr.s_addr; 6237 dstport = sin->sin_port; 6238 srcid = 0; 6239 } 6240 6241 tcp_connect_ipv4(tcp, mp, dstaddrp, dstport, srcid); 6242 return; 6243 default: 6244 mp = mi_tpi_err_ack_alloc(mp, TOUTSTATE, 0); 6245 break; 6246 } 6247 /* 6248 * Note: Code below is the "failure" case 6249 */ 6250 /* return error ack and blow away saved option results if any */ 6251 connect_failed: 6252 if (mp != NULL) 6253 putnext(tcp->tcp_rq, mp); 6254 else { 6255 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6256 TSYSERR, ENOMEM); 6257 } 6258 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6259 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6260 } 6261 6262 /* 6263 * Handle connect to IPv4 destinations, including connections for AF_INET6 6264 * sockets connecting to IPv4 mapped IPv6 destinations. 6265 */ 6266 static void 6267 tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, 6268 uint_t srcid) 6269 { 6270 tcph_t *tcph; 6271 mblk_t *mp1; 6272 ipaddr_t dstaddr = *dstaddrp; 6273 int32_t oldstate; 6274 uint16_t lport; 6275 6276 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 6277 6278 /* Check for attempt to connect to INADDR_ANY */ 6279 if (dstaddr == INADDR_ANY) { 6280 /* 6281 * SunOS 4.x and 4.3 BSD allow an application 6282 * to connect a TCP socket to INADDR_ANY. 6283 * When they do this, the kernel picks the 6284 * address of one interface and uses it 6285 * instead. The kernel usually ends up 6286 * picking the address of the loopback 6287 * interface. This is an undocumented feature. 6288 * However, we provide the same thing here 6289 * in order to have source and binary 6290 * compatibility with SunOS 4.x. 6291 * Update the T_CONN_REQ (sin/sin6) since it is used to 6292 * generate the T_CONN_CON. 
6293 */ 6294 dstaddr = htonl(INADDR_LOOPBACK); 6295 *dstaddrp = dstaddr; 6296 } 6297 6298 /* Handle __sin6_src_id if socket not bound to an IP address */ 6299 if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) { 6300 ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6, 6301 tcp->tcp_connp->conn_zoneid); 6302 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6, 6303 tcp->tcp_ipha->ipha_src); 6304 } 6305 6306 /* 6307 * Don't let an endpoint connect to itself. Note that 6308 * the test here does not catch the case where the 6309 * source IP addr was left unspecified by the user. In 6310 * this case, the source addr is set in tcp_adapt_ire() 6311 * using the reply to the T_BIND message that we send 6312 * down to IP here and the check is repeated in tcp_rput_other. 6313 */ 6314 if (dstaddr == tcp->tcp_ipha->ipha_src && 6315 dstport == tcp->tcp_lport) { 6316 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6317 goto failed; 6318 } 6319 6320 tcp->tcp_ipha->ipha_dst = dstaddr; 6321 IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6); 6322 6323 /* 6324 * Massage a source route if any putting the first hop 6325 * in iph_dst. Compute a starting value for the checksum which 6326 * takes into account that the original iph_dst should be 6327 * included in the checksum but that ip will include the 6328 * first hop in the source route in the tcp checksum. 6329 */ 6330 tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha); 6331 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 6332 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + 6333 (tcp->tcp_ipha->ipha_dst & 0xffff)); 6334 if ((int)tcp->tcp_sum < 0) 6335 tcp->tcp_sum--; 6336 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 6337 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 6338 (tcp->tcp_sum >> 16)); 6339 tcph = tcp->tcp_tcph; 6340 *(uint16_t *)tcph->th_fport = dstport; 6341 tcp->tcp_fport = dstport; 6342 6343 oldstate = tcp->tcp_state; 6344 /* 6345 * At this point the remote destination address and remote port fields 6346 * in the tcp-four-tuple have been filled in the tcp structure. Now we 6347 * have to see which state tcp was in so we can take apropriate action. 6348 */ 6349 if (oldstate == TCPS_IDLE) { 6350 /* 6351 * We support a quick connect capability here, allowing 6352 * clients to transition directly from IDLE to SYN_SENT 6353 * tcp_bindi will pick an unused port, insert the connection 6354 * in the bind hash and transition to BOUND state. 6355 */ 6356 lport = tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); 6357 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, 6358 B_FALSE, B_FALSE); 6359 if (lport == 0) { 6360 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0); 6361 goto failed; 6362 } 6363 } 6364 tcp->tcp_state = TCPS_SYN_SENT; 6365 6366 /* 6367 * TODO: allow data with connect requests 6368 * by unlinking M_DATA trailers here and 6369 * linking them in behind the T_OK_ACK mblk. 6370 * The tcp_rput() bind ack handler would then 6371 * feed them to tcp_wput_data() rather than call 6372 * tcp_timer(). 6373 */ 6374 mp = mi_tpi_ok_ack_alloc(mp); 6375 if (!mp) { 6376 tcp->tcp_state = oldstate; 6377 goto failed; 6378 } 6379 if (tcp->tcp_family == AF_INET) { 6380 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 6381 sizeof (ipa_conn_t)); 6382 } else { 6383 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 6384 sizeof (ipa6_conn_t)); 6385 } 6386 if (mp1) { 6387 /* Hang onto the T_OK_ACK for later. 
*/ 6388 linkb(mp1, mp); 6389 mblk_setcred(mp1, tcp->tcp_cred); 6390 if (tcp->tcp_family == AF_INET) 6391 mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp); 6392 else { 6393 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, 6394 &tcp->tcp_sticky_ipp); 6395 } 6396 BUMP_MIB(&tcp_mib, tcpActiveOpens); 6397 tcp->tcp_active_open = 1; 6398 /* 6399 * If the bind cannot complete immediately 6400 * IP will arrange to call tcp_rput_other 6401 * when the bind completes. 6402 */ 6403 if (mp1 != NULL) 6404 tcp_rput_other(tcp, mp1); 6405 return; 6406 } 6407 /* Error case */ 6408 tcp->tcp_state = oldstate; 6409 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6410 6411 failed: 6412 /* return error ack and blow away saved option results if any */ 6413 if (mp != NULL) 6414 putnext(tcp->tcp_rq, mp); 6415 else { 6416 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6417 TSYSERR, ENOMEM); 6418 } 6419 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6420 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6421 6422 } 6423 6424 /* 6425 * Handle connect to IPv6 destinations. 6426 */ 6427 static void 6428 tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, 6429 in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id) 6430 { 6431 tcph_t *tcph; 6432 mblk_t *mp1; 6433 ip6_rthdr_t *rth; 6434 int32_t oldstate; 6435 uint16_t lport; 6436 6437 ASSERT(tcp->tcp_family == AF_INET6); 6438 6439 /* 6440 * If we're here, it means that the destination address is a native 6441 * IPv6 address. Return an error if tcp_ipversion is not IPv6. A 6442 * reason why it might not be IPv6 is if the socket was bound to an 6443 * IPv4-mapped IPv6 address. 6444 */ 6445 if (tcp->tcp_ipversion != IPV6_VERSION) { 6446 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6447 goto failed; 6448 } 6449 6450 /* 6451 * Interpret a zero destination to mean loopback. 6452 * Update the T_CONN_REQ (sin/sin6) since it is used to 6453 * generate the T_CONN_CON. 6454 */ 6455 if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) { 6456 *dstaddrp = ipv6_loopback; 6457 } 6458 6459 /* Handle __sin6_src_id if socket not bound to an IP address */ 6460 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { 6461 ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src, 6462 tcp->tcp_connp->conn_zoneid); 6463 tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src; 6464 } 6465 6466 /* 6467 * Take care of the scope_id now and add ip6i_t 6468 * if ip6i_t is not already allocated through TCP 6469 * sticky options. At this point tcp_ip6h does not 6470 * have dst info, thus use dstaddrp. 6471 */ 6472 if (scope_id != 0 && 6473 IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { 6474 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 6475 ip6i_t *ip6i; 6476 6477 ipp->ipp_ifindex = scope_id; 6478 ip6i = (ip6i_t *)tcp->tcp_iphc; 6479 6480 if ((ipp->ipp_fields & IPPF_HAS_IP6I) && 6481 ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) { 6482 /* Already allocated */ 6483 ip6i->ip6i_flags |= IP6I_IFINDEX; 6484 ip6i->ip6i_ifindex = ipp->ipp_ifindex; 6485 ipp->ipp_fields |= IPPF_SCOPE_ID; 6486 } else { 6487 int reterr; 6488 6489 ipp->ipp_fields |= IPPF_SCOPE_ID; 6490 if (ipp->ipp_fields & IPPF_HAS_IP6I) 6491 ip2dbg(("tcp_connect_v6: SCOPE_ID set\n")); 6492 reterr = tcp_build_hdrs(tcp->tcp_rq, tcp); 6493 if (reterr != 0) 6494 goto failed; 6495 ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n")); 6496 } 6497 } 6498 6499 /* 6500 * Don't let an endpoint connect to itself. Note that 6501 * the test here does not catch the case where the 6502 * source IP addr was left unspecified by the user. 
In 6503 * this case, the source addr is set in tcp_adapt_ire() 6504 * using the reply to the T_BIND message that we send 6505 * down to IP here and the check is repeated in tcp_rput_other. 6506 */ 6507 if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) && 6508 (dstport == tcp->tcp_lport)) { 6509 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6510 goto failed; 6511 } 6512 6513 tcp->tcp_ip6h->ip6_dst = *dstaddrp; 6514 tcp->tcp_remote_v6 = *dstaddrp; 6515 tcp->tcp_ip6h->ip6_vcf = 6516 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 6517 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 6518 6519 6520 /* 6521 * Massage a routing header (if present) putting the first hop 6522 * in ip6_dst. Compute a starting value for the checksum which 6523 * takes into account that the original ip6_dst should be 6524 * included in the checksum but that ip will include the 6525 * first hop in the source route in the tcp checksum. 6526 */ 6527 rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph); 6528 if (rth != NULL) { 6529 6530 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth); 6531 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 6532 (tcp->tcp_sum >> 16)); 6533 } else { 6534 tcp->tcp_sum = 0; 6535 } 6536 6537 tcph = tcp->tcp_tcph; 6538 *(uint16_t *)tcph->th_fport = dstport; 6539 tcp->tcp_fport = dstport; 6540 6541 oldstate = tcp->tcp_state; 6542 /* 6543 * At this point the remote destination address and remote port fields 6544 * in the tcp-four-tuple have been filled in the tcp structure. Now we 6545 * have to see which state tcp was in so we can take apropriate action. 6546 */ 6547 if (oldstate == TCPS_IDLE) { 6548 /* 6549 * We support a quick connect capability here, allowing 6550 * clients to transition directly from IDLE to SYN_SENT 6551 * tcp_bindi will pick an unused port, insert the connection 6552 * in the bind hash and transition to BOUND state. 6553 */ 6554 lport = tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); 6555 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, 6556 B_FALSE, B_FALSE); 6557 if (lport == 0) { 6558 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0); 6559 goto failed; 6560 } 6561 } 6562 tcp->tcp_state = TCPS_SYN_SENT; 6563 /* 6564 * TODO: allow data with connect requests 6565 * by unlinking M_DATA trailers here and 6566 * linking them in behind the T_OK_ACK mblk. 6567 * The tcp_rput() bind ack handler would then 6568 * feed them to tcp_wput_data() rather than call 6569 * tcp_timer(). 6570 */ 6571 mp = mi_tpi_ok_ack_alloc(mp); 6572 if (!mp) { 6573 tcp->tcp_state = oldstate; 6574 goto failed; 6575 } 6576 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t)); 6577 if (mp1) { 6578 /* Hang onto the T_OK_ACK for later. 
*/ 6579 linkb(mp1, mp); 6580 mblk_setcred(mp1, tcp->tcp_cred); 6581 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, 6582 &tcp->tcp_sticky_ipp); 6583 BUMP_MIB(&tcp_mib, tcpActiveOpens); 6584 tcp->tcp_active_open = 1; 6585 /* ip_bind_v6() may return ACK or ERROR */ 6586 if (mp1 != NULL) 6587 tcp_rput_other(tcp, mp1); 6588 return; 6589 } 6590 /* Error case */ 6591 tcp->tcp_state = oldstate; 6592 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6593 6594 failed: 6595 /* return error ack and blow away saved option results if any */ 6596 if (mp != NULL) 6597 putnext(tcp->tcp_rq, mp); 6598 else { 6599 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6600 TSYSERR, ENOMEM); 6601 } 6602 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6603 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6604 } 6605 6606 /* 6607 * We need a stream q for detached closing tcp connections 6608 * to use. Our client hereby indicates that this q is the 6609 * one to use. 6610 */ 6611 static void 6612 tcp_def_q_set(tcp_t *tcp, mblk_t *mp) 6613 { 6614 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 6615 queue_t *q = tcp->tcp_wq; 6616 6617 mp->b_datap->db_type = M_IOCACK; 6618 iocp->ioc_count = 0; 6619 mutex_enter(&tcp_g_q_lock); 6620 if (tcp_g_q != NULL) { 6621 mutex_exit(&tcp_g_q_lock); 6622 iocp->ioc_error = EALREADY; 6623 } else { 6624 mblk_t *mp1; 6625 6626 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0); 6627 if (mp1 == NULL) { 6628 mutex_exit(&tcp_g_q_lock); 6629 iocp->ioc_error = ENOMEM; 6630 } else { 6631 tcp_g_q = tcp->tcp_rq; 6632 mutex_exit(&tcp_g_q_lock); 6633 iocp->ioc_error = 0; 6634 iocp->ioc_rval = 0; 6635 /* 6636 * We are passing tcp_sticky_ipp as NULL 6637 * as it is not useful for tcp_default queue 6638 */ 6639 mp1 = ip_bind_v6(q, mp1, tcp->tcp_connp, NULL); 6640 if (mp1 != NULL) 6641 tcp_rput_other(tcp, mp1); 6642 } 6643 } 6644 qreply(q, mp); 6645 } 6646 6647 /* 6648 * Our client hereby directs us to reject the connection request 6649 * that tcp_conn_request() marked with 'seqnum'. Rejection consists 6650 * of sending the appropriate RST, not an ICMP error. 6651 */ 6652 static void 6653 tcp_disconnect(tcp_t *tcp, mblk_t *mp) 6654 { 6655 tcp_t *ltcp = NULL; 6656 t_scalar_t seqnum; 6657 conn_t *connp; 6658 6659 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 6660 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { 6661 tcp_err_ack(tcp, mp, TPROTO, 0); 6662 return; 6663 } 6664 6665 /* 6666 * Right now, upper modules pass down a T_DISCON_REQ to TCP, 6667 * when the stream is in BOUND state. Do not send a reset, 6668 * since the destination IP address is not valid, and it can 6669 * be the initialized value of all zeros (broadcast address). 6670 * 6671 * If TCP has sent down a bind request to IP and has not 6672 * received the reply, reject the request. Otherwise, TCP 6673 * will be confused. 6674 */ 6675 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) { 6676 if (tcp->tcp_debug) { 6677 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 6678 "tcp_disconnect: bad state, %d", tcp->tcp_state); 6679 } 6680 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 6681 return; 6682 } 6683 6684 seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number; 6685 6686 if (seqnum == -1 || tcp->tcp_conn_req_max == 0) { 6687 6688 /* 6689 * According to TPI, for non-listeners, ignore seqnum 6690 * and disconnect. 6691 * Following interpretation of -1 seqnum is historical 6692 * and implied TPI ? (TPI only states that for T_CONN_IND, 6693 * a valid seqnum should not be -1). 
6694 * 6695 * -1 means disconnect everything 6696 * regardless even on a listener. 6697 */ 6698 6699 int old_state = tcp->tcp_state; 6700 6701 /* 6702 * The connection can't be on the tcp_time_wait_head list 6703 * since it is not detached. 6704 */ 6705 ASSERT(tcp->tcp_time_wait_next == NULL); 6706 ASSERT(tcp->tcp_time_wait_prev == NULL); 6707 ASSERT(tcp->tcp_time_wait_expire == 0); 6708 ltcp = NULL; 6709 /* 6710 * If it used to be a listener, check to make sure no one else 6711 * has taken the port before switching back to LISTEN state. 6712 */ 6713 if (tcp->tcp_ipversion == IPV4_VERSION) { 6714 connp = ipcl_lookup_listener_v4(tcp->tcp_lport, 6715 tcp->tcp_ipha->ipha_src, 6716 tcp->tcp_connp->conn_zoneid); 6717 if (connp != NULL) 6718 ltcp = connp->conn_tcp; 6719 } else { 6720 /* Allow tcp_bound_if listeners? */ 6721 connp = ipcl_lookup_listener_v6(tcp->tcp_lport, 6722 &tcp->tcp_ip6h->ip6_src, 0, 6723 tcp->tcp_connp->conn_zoneid); 6724 if (connp != NULL) 6725 ltcp = connp->conn_tcp; 6726 } 6727 if (tcp->tcp_conn_req_max && ltcp == NULL) { 6728 tcp->tcp_state = TCPS_LISTEN; 6729 } else if (old_state > TCPS_BOUND) { 6730 tcp->tcp_conn_req_max = 0; 6731 tcp->tcp_state = TCPS_BOUND; 6732 } 6733 if (ltcp != NULL) 6734 CONN_DEC_REF(ltcp->tcp_connp); 6735 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) { 6736 BUMP_MIB(&tcp_mib, tcpAttemptFails); 6737 } else if (old_state == TCPS_ESTABLISHED || 6738 old_state == TCPS_CLOSE_WAIT) { 6739 BUMP_MIB(&tcp_mib, tcpEstabResets); 6740 } 6741 6742 if (tcp->tcp_fused) 6743 tcp_unfuse(tcp); 6744 6745 mutex_enter(&tcp->tcp_eager_lock); 6746 if ((tcp->tcp_conn_req_cnt_q0 != 0) || 6747 (tcp->tcp_conn_req_cnt_q != 0)) { 6748 tcp_eager_cleanup(tcp, 0); 6749 } 6750 mutex_exit(&tcp->tcp_eager_lock); 6751 6752 tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt, 6753 tcp->tcp_rnxt, TH_RST | TH_ACK); 6754 6755 tcp_reinit(tcp); 6756 6757 if (old_state >= TCPS_ESTABLISHED) { 6758 /* Send M_FLUSH according to TPI */ 6759 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 6760 } 6761 mp = mi_tpi_ok_ack_alloc(mp); 6762 if (mp) 6763 putnext(tcp->tcp_rq, mp); 6764 return; 6765 } else if (!tcp_eager_blowoff(tcp, seqnum)) { 6766 tcp_err_ack(tcp, mp, TBADSEQ, 0); 6767 return; 6768 } 6769 if (tcp->tcp_state >= TCPS_ESTABLISHED) { 6770 /* Send M_FLUSH according to TPI */ 6771 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 6772 } 6773 mp = mi_tpi_ok_ack_alloc(mp); 6774 if (mp) 6775 putnext(tcp->tcp_rq, mp); 6776 } 6777 6778 /* 6779 * Diagnostic routine used to return a string associated with the tcp state. 6780 * Note that if the caller does not supply a buffer, it will use an internal 6781 * static string. This means that if multiple threads call this function at 6782 * the same time, output can be corrupted... Note also that this function 6783 * does not check the size of the supplied buffer. The caller has to make 6784 * sure that it is big enough. 
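 *
 * Typical use, as in the listen-drop diagnostics earlier in this file:
 *
 *	(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
 *	    "... %s", tcp_display(tcp, NULL, DISP_PORT_ONLY));
 *
 * Passing a NULL buffer selects the shared static buffer described above,
 * which is only safe for occasional debug output.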
6785 */ 6786 static char * 6787 tcp_display(tcp_t *tcp, char *sup_buf, char format) 6788 { 6789 char buf1[30]; 6790 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; 6791 char *buf; 6792 char *cp; 6793 in6_addr_t local, remote; 6794 char local_addrbuf[INET6_ADDRSTRLEN]; 6795 char remote_addrbuf[INET6_ADDRSTRLEN]; 6796 6797 if (sup_buf != NULL) 6798 buf = sup_buf; 6799 else 6800 buf = priv_buf; 6801 6802 if (tcp == NULL) 6803 return ("NULL_TCP"); 6804 switch (tcp->tcp_state) { 6805 case TCPS_CLOSED: 6806 cp = "TCP_CLOSED"; 6807 break; 6808 case TCPS_IDLE: 6809 cp = "TCP_IDLE"; 6810 break; 6811 case TCPS_BOUND: 6812 cp = "TCP_BOUND"; 6813 break; 6814 case TCPS_LISTEN: 6815 cp = "TCP_LISTEN"; 6816 break; 6817 case TCPS_SYN_SENT: 6818 cp = "TCP_SYN_SENT"; 6819 break; 6820 case TCPS_SYN_RCVD: 6821 cp = "TCP_SYN_RCVD"; 6822 break; 6823 case TCPS_ESTABLISHED: 6824 cp = "TCP_ESTABLISHED"; 6825 break; 6826 case TCPS_CLOSE_WAIT: 6827 cp = "TCP_CLOSE_WAIT"; 6828 break; 6829 case TCPS_FIN_WAIT_1: 6830 cp = "TCP_FIN_WAIT_1"; 6831 break; 6832 case TCPS_CLOSING: 6833 cp = "TCP_CLOSING"; 6834 break; 6835 case TCPS_LAST_ACK: 6836 cp = "TCP_LAST_ACK"; 6837 break; 6838 case TCPS_FIN_WAIT_2: 6839 cp = "TCP_FIN_WAIT_2"; 6840 break; 6841 case TCPS_TIME_WAIT: 6842 cp = "TCP_TIME_WAIT"; 6843 break; 6844 default: 6845 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 6846 cp = buf1; 6847 break; 6848 } 6849 switch (format) { 6850 case DISP_ADDR_AND_PORT: 6851 if (tcp->tcp_ipversion == IPV4_VERSION) { 6852 /* 6853 * Note that we use the remote address in the tcp_b 6854 * structure. This means that it will print out 6855 * the real destination address, not the next hop's 6856 * address if source routing is used. 6857 */ 6858 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local); 6859 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote); 6860 6861 } else { 6862 local = tcp->tcp_ip_src_v6; 6863 remote = tcp->tcp_remote_v6; 6864 } 6865 (void) inet_ntop(AF_INET6, &local, local_addrbuf, 6866 sizeof (local_addrbuf)); 6867 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, 6868 sizeof (remote_addrbuf)); 6869 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", 6870 local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, 6871 ntohs(tcp->tcp_fport), cp); 6872 break; 6873 case DISP_PORT_ONLY: 6874 default: 6875 (void) mi_sprintf(buf, "[%u, %u] %s", 6876 ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); 6877 break; 6878 } 6879 6880 return (buf); 6881 } 6882 6883 /* 6884 * Called via squeue to get on to eager's perimeter to send a 6885 * TH_RST. The listener wants the eager to disappear either 6886 * by means of tcp_eager_blowoff() or tcp_eager_cleanup() 6887 * being called. 6888 */ 6889 /* ARGSUSED */ 6890 void 6891 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) 6892 { 6893 conn_t *econnp = (conn_t *)arg; 6894 tcp_t *eager = econnp->conn_tcp; 6895 tcp_t *listener = eager->tcp_listener; 6896 6897 /* 6898 * We could be called because listener is closing. Since 6899 * the eager is using listener's queue's, its not safe. 6900 * Better use the default queue just to send the TH_RST 6901 * out. 
6902 */ 6903 eager->tcp_rq = tcp_g_q; 6904 eager->tcp_wq = WR(tcp_g_q); 6905 6906 if (eager->tcp_state > TCPS_LISTEN) { 6907 tcp_xmit_ctl("tcp_eager_kill, can't wait", 6908 eager, eager->tcp_snxt, 0, TH_RST); 6909 } 6910 6911 /* We are here because listener wants this eager gone */ 6912 if (listener != NULL) { 6913 mutex_enter(&listener->tcp_eager_lock); 6914 tcp_eager_unlink(eager); 6915 if (eager->tcp_conn.tcp_eager_conn_ind == NULL) { 6916 /* 6917 * The eager has sent a conn_ind up to the 6918 * listener but listener decides to close 6919 * instead. We need to drop the extra ref 6920 * placed on eager in tcp_rput_data() before 6921 * sending the conn_ind to listener. 6922 */ 6923 CONN_DEC_REF(econnp); 6924 } 6925 mutex_exit(&listener->tcp_eager_lock); 6926 CONN_DEC_REF(listener->tcp_connp); 6927 } 6928 6929 if (eager->tcp_state > TCPS_BOUND) 6930 tcp_close_detached(eager); 6931 } 6932 6933 /* 6934 * Reset any eager connection hanging off this listener marked 6935 * with 'seqnum' and then reclaim it's resources. 6936 */ 6937 static boolean_t 6938 tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) 6939 { 6940 tcp_t *eager; 6941 mblk_t *mp; 6942 6943 TCP_STAT(tcp_eager_blowoff_calls); 6944 eager = listener; 6945 mutex_enter(&listener->tcp_eager_lock); 6946 do { 6947 eager = eager->tcp_eager_next_q; 6948 if (eager == NULL) { 6949 mutex_exit(&listener->tcp_eager_lock); 6950 return (B_FALSE); 6951 } 6952 } while (eager->tcp_conn_req_seqnum != seqnum); 6953 CONN_INC_REF(eager->tcp_connp); 6954 mutex_exit(&listener->tcp_eager_lock); 6955 mp = &eager->tcp_closemp; 6956 squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, 6957 eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF); 6958 return (B_TRUE); 6959 } 6960 6961 /* 6962 * Reset any eager connection hanging off this listener 6963 * and then reclaim it's resources. 6964 */ 6965 static void 6966 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) 6967 { 6968 tcp_t *eager; 6969 mblk_t *mp; 6970 6971 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 6972 6973 if (!q0_only) { 6974 /* First cleanup q */ 6975 TCP_STAT(tcp_eager_blowoff_q); 6976 eager = listener->tcp_eager_next_q; 6977 while (eager != NULL) { 6978 CONN_INC_REF(eager->tcp_connp); 6979 mp = &eager->tcp_closemp; 6980 squeue_fill(eager->tcp_connp->conn_sqp, mp, 6981 tcp_eager_kill, eager->tcp_connp, 6982 SQTAG_TCP_EAGER_CLEANUP); 6983 eager = eager->tcp_eager_next_q; 6984 } 6985 } 6986 /* Then cleanup q0 */ 6987 TCP_STAT(tcp_eager_blowoff_q0); 6988 eager = listener->tcp_eager_next_q0; 6989 while (eager != listener) { 6990 CONN_INC_REF(eager->tcp_connp); 6991 mp = &eager->tcp_closemp; 6992 squeue_fill(eager->tcp_connp->conn_sqp, mp, 6993 tcp_eager_kill, eager->tcp_connp, 6994 SQTAG_TCP_EAGER_CLEANUP_Q0); 6995 eager = eager->tcp_eager_next_q0; 6996 } 6997 } 6998 6999 /* 7000 * If we are an eager connection hanging off a listener that hasn't 7001 * formally accepted the connection yet, get off his list and blow off 7002 * any data that we have accumulated. 
7003 */ 7004 static void 7005 tcp_eager_unlink(tcp_t *tcp) 7006 { 7007 tcp_t *listener = tcp->tcp_listener; 7008 7009 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 7010 ASSERT(listener != NULL); 7011 if (tcp->tcp_eager_next_q0 != NULL) { 7012 ASSERT(tcp->tcp_eager_prev_q0 != NULL); 7013 7014 /* Remove the eager tcp from q0 */ 7015 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 7016 tcp->tcp_eager_prev_q0; 7017 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 7018 tcp->tcp_eager_next_q0; 7019 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 7020 listener->tcp_conn_req_cnt_q0--; 7021 7022 tcp->tcp_eager_next_q0 = NULL; 7023 tcp->tcp_eager_prev_q0 = NULL; 7024 7025 if (tcp->tcp_syn_rcvd_timeout != 0) { 7026 /* we have timed out before */ 7027 ASSERT(listener->tcp_syn_rcvd_timeout > 0); 7028 listener->tcp_syn_rcvd_timeout--; 7029 } 7030 } else { 7031 tcp_t **tcpp = &listener->tcp_eager_next_q; 7032 tcp_t *prev = NULL; 7033 7034 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 7035 if (tcpp[0] == tcp) { 7036 if (listener->tcp_eager_last_q == tcp) { 7037 /* 7038 * If we are unlinking the last 7039 * element on the list, adjust 7040 * tail pointer. Set tail pointer 7041 * to nil when list is empty. 7042 */ 7043 ASSERT(tcp->tcp_eager_next_q == NULL); 7044 if (listener->tcp_eager_last_q == 7045 listener->tcp_eager_next_q) { 7046 listener->tcp_eager_last_q = 7047 NULL; 7048 } else { 7049 /* 7050 * We won't get here if there 7051 * is only one eager in the 7052 * list. 7053 */ 7054 ASSERT(prev != NULL); 7055 listener->tcp_eager_last_q = 7056 prev; 7057 } 7058 } 7059 tcpp[0] = tcp->tcp_eager_next_q; 7060 tcp->tcp_eager_next_q = NULL; 7061 tcp->tcp_eager_last_q = NULL; 7062 ASSERT(listener->tcp_conn_req_cnt_q > 0); 7063 listener->tcp_conn_req_cnt_q--; 7064 break; 7065 } 7066 prev = tcpp[0]; 7067 } 7068 } 7069 tcp->tcp_listener = NULL; 7070 } 7071 7072 /* Shorthand to generate and send TPI error acks to our client */ 7073 static void 7074 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) 7075 { 7076 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 7077 putnext(tcp->tcp_rq, mp); 7078 } 7079 7080 /* Shorthand to generate and send TPI error acks to our client */ 7081 static void 7082 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 7083 int t_error, int sys_error) 7084 { 7085 struct T_error_ack *teackp; 7086 7087 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 7088 M_PCPROTO, T_ERROR_ACK)) != NULL) { 7089 teackp = (struct T_error_ack *)mp->b_rptr; 7090 teackp->ERROR_prim = primitive; 7091 teackp->TLI_error = t_error; 7092 teackp->UNIX_error = sys_error; 7093 putnext(tcp->tcp_rq, mp); 7094 } 7095 } 7096 7097 /* 7098 * Note: No locks are held when inspecting tcp_g_*epriv_ports 7099 * but instead the code relies on: 7100 * - the fact that the address of the array and its size never changes 7101 * - the atomic assignment of the elements of the array 7102 */ 7103 /* ARGSUSED */ 7104 static int 7105 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 7106 { 7107 int i; 7108 7109 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7110 if (tcp_g_epriv_ports[i] != 0) 7111 (void) mi_mpprintf(mp, "%d ", tcp_g_epriv_ports[i]); 7112 } 7113 return (0); 7114 } 7115 7116 /* 7117 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple 7118 * threads from changing it at the same time. 
7119 */ 7120 /* ARGSUSED */ 7121 static int 7122 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 7123 cred_t *cr) 7124 { 7125 long new_value; 7126 int i; 7127 7128 /* 7129 * Fail the request if the new value does not lie within the 7130 * port number limits. 7131 */ 7132 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 7133 new_value <= 0 || new_value >= 65536) { 7134 return (EINVAL); 7135 } 7136 7137 mutex_enter(&tcp_epriv_port_lock); 7138 /* Check if the value is already in the list */ 7139 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7140 if (new_value == tcp_g_epriv_ports[i]) { 7141 mutex_exit(&tcp_epriv_port_lock); 7142 return (EEXIST); 7143 } 7144 } 7145 /* Find an empty slot */ 7146 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7147 if (tcp_g_epriv_ports[i] == 0) 7148 break; 7149 } 7150 if (i == tcp_g_num_epriv_ports) { 7151 mutex_exit(&tcp_epriv_port_lock); 7152 return (EOVERFLOW); 7153 } 7154 /* Set the new value */ 7155 tcp_g_epriv_ports[i] = (uint16_t)new_value; 7156 mutex_exit(&tcp_epriv_port_lock); 7157 return (0); 7158 } 7159 7160 /* 7161 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple 7162 * threads from changing it at the same time. 7163 */ 7164 /* ARGSUSED */ 7165 static int 7166 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 7167 cred_t *cr) 7168 { 7169 long new_value; 7170 int i; 7171 7172 /* 7173 * Fail the request if the new value does not lie within the 7174 * port number limits. 7175 */ 7176 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 || 7177 new_value >= 65536) { 7178 return (EINVAL); 7179 } 7180 7181 mutex_enter(&tcp_epriv_port_lock); 7182 /* Check that the value is already in the list */ 7183 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7184 if (tcp_g_epriv_ports[i] == new_value) 7185 break; 7186 } 7187 if (i == tcp_g_num_epriv_ports) { 7188 mutex_exit(&tcp_epriv_port_lock); 7189 return (ESRCH); 7190 } 7191 /* Clear the value */ 7192 tcp_g_epriv_ports[i] = 0; 7193 mutex_exit(&tcp_epriv_port_lock); 7194 return (0); 7195 } 7196 7197 /* Return the TPI/TLI equivalent of our current tcp_state */ 7198 static int 7199 tcp_tpistate(tcp_t *tcp) 7200 { 7201 switch (tcp->tcp_state) { 7202 case TCPS_IDLE: 7203 return (TS_UNBND); 7204 case TCPS_LISTEN: 7205 /* 7206 * Return whether there are outstanding T_CONN_IND waiting 7207 * for the matching T_CONN_RES. Therefore don't count q0. 7208 */ 7209 if (tcp->tcp_conn_req_cnt_q > 0) 7210 return (TS_WRES_CIND); 7211 else 7212 return (TS_IDLE); 7213 case TCPS_BOUND: 7214 return (TS_IDLE); 7215 case TCPS_SYN_SENT: 7216 return (TS_WCON_CREQ); 7217 case TCPS_SYN_RCVD: 7218 /* 7219 * Note: assumption: this has to the active open SYN_RCVD. 7220 * The passive instance is detached in SYN_RCVD stage of 7221 * incoming connection processing so we cannot get request 7222 * for T_info_ack on it. 7223 */ 7224 return (TS_WACK_CRES); 7225 case TCPS_ESTABLISHED: 7226 return (TS_DATA_XFER); 7227 case TCPS_CLOSE_WAIT: 7228 return (TS_WREQ_ORDREL); 7229 case TCPS_FIN_WAIT_1: 7230 return (TS_WIND_ORDREL); 7231 case TCPS_FIN_WAIT_2: 7232 return (TS_WIND_ORDREL); 7233 7234 case TCPS_CLOSING: 7235 case TCPS_LAST_ACK: 7236 case TCPS_TIME_WAIT: 7237 case TCPS_CLOSED: 7238 /* 7239 * Following TS_WACK_DREQ7 is a rendition of "not 7240 * yet TS_IDLE" TPI state. 
There is no best match to any 7241 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we 7242 * choose a value that will map to TLI/XTI level 7243 * state of TSTATECHNG (state is in the process of changing) which 7244 * captures what this dummy state represents. 7245 */ 7246 return (TS_WACK_DREQ7); 7247 default: 7248 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", 7249 tcp->tcp_state, tcp_display(tcp, NULL, 7250 DISP_PORT_ONLY)); 7251 return (TS_UNBND); 7252 } 7253 } 7254 7255 static void 7256 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) 7257 { 7258 if (tcp->tcp_family == AF_INET6) 7259 *tia = tcp_g_t_info_ack_v6; 7260 else 7261 *tia = tcp_g_t_info_ack; 7262 tia->CURRENT_state = tcp_tpistate(tcp); 7263 tia->OPT_size = tcp_max_optsize; 7264 if (tcp->tcp_mss == 0) { 7265 /* Not yet set - tcp_open does not set mss */ 7266 if (tcp->tcp_ipversion == IPV4_VERSION) 7267 tia->TIDU_size = tcp_mss_def_ipv4; 7268 else 7269 tia->TIDU_size = tcp_mss_def_ipv6; 7270 } else { 7271 tia->TIDU_size = tcp->tcp_mss; 7272 } 7273 /* TODO: Default ETSDU is 1. Is that correct for tcp? */ 7274 } 7275 7276 /* 7277 * This routine responds to T_CAPABILITY_REQ messages. It is called by 7278 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from 7279 * tcp_g_t_info_ack. The current state of the stream is copied from 7280 * tcp_state. 7281 */ 7282 static void 7283 tcp_capability_req(tcp_t *tcp, mblk_t *mp) 7284 { 7285 t_uscalar_t cap_bits1; 7286 struct T_capability_ack *tcap; 7287 7288 if (MBLKL(mp) < sizeof (struct T_capability_req)) { 7289 freemsg(mp); 7290 return; 7291 } 7292 7293 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 7294 7295 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 7296 mp->b_datap->db_type, T_CAPABILITY_ACK); 7297 if (mp == NULL) 7298 return; 7299 7300 tcap = (struct T_capability_ack *)mp->b_rptr; 7301 tcap->CAP_bits1 = 0; 7302 7303 if (cap_bits1 & TC1_INFO) { 7304 tcp_copy_info(&tcap->INFO_ack, tcp); 7305 tcap->CAP_bits1 |= TC1_INFO; 7306 } 7307 7308 if (cap_bits1 & TC1_ACCEPTOR_ID) { 7309 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; 7310 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; 7311 } 7312 7313 putnext(tcp->tcp_rq, mp); 7314 } 7315 7316 /* 7317 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. 7318 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. 7319 * The current state of the stream is copied from tcp_state.
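 *
 * Purely as an illustration (user-level TLI/XTI, not part of this file):
 * the request normally originates from t_open(3NSL) or t_getinfo(3NSL),
 * which send a T_INFO_REQ (or a T_CAPABILITY_REQ carrying TC1_INFO) down
 * the stream, roughly:
 *
 *	struct t_info info;
 *	int fd = t_open("/dev/tcp", O_RDWR, &info);
 *
 * after which fields such as info.tsdu reflect the T_INFO_ACK built here.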
7320 */ 7321 static void 7322 tcp_info_req(tcp_t *tcp, mblk_t *mp) 7323 { 7324 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 7325 T_INFO_ACK); 7326 if (!mp) { 7327 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 7328 return; 7329 } 7330 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); 7331 putnext(tcp->tcp_rq, mp); 7332 } 7333 7334 /* Respond to the TPI addr request */ 7335 static void 7336 tcp_addr_req(tcp_t *tcp, mblk_t *mp) 7337 { 7338 sin_t *sin; 7339 mblk_t *ackmp; 7340 struct T_addr_ack *taa; 7341 7342 /* Make it large enough for worst case */ 7343 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 7344 2 * sizeof (sin6_t), 1); 7345 if (ackmp == NULL) { 7346 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 7347 return; 7348 } 7349 7350 if (tcp->tcp_ipversion == IPV6_VERSION) { 7351 tcp_addr_req_ipv6(tcp, ackmp); 7352 return; 7353 } 7354 taa = (struct T_addr_ack *)ackmp->b_rptr; 7355 7356 bzero(taa, sizeof (struct T_addr_ack)); 7357 ackmp->b_wptr = (uchar_t *)&taa[1]; 7358 7359 taa->PRIM_type = T_ADDR_ACK; 7360 ackmp->b_datap->db_type = M_PCPROTO; 7361 7362 /* 7363 * Note: Following code assumes 32 bit alignment of basic 7364 * data structures like sin_t and struct T_addr_ack. 7365 */ 7366 if (tcp->tcp_state >= TCPS_BOUND) { 7367 /* 7368 * Fill in local address 7369 */ 7370 taa->LOCADDR_length = sizeof (sin_t); 7371 taa->LOCADDR_offset = sizeof (*taa); 7372 7373 sin = (sin_t *)&taa[1]; 7374 7375 /* Fill zeroes and then intialize non-zero fields */ 7376 *sin = sin_null; 7377 7378 sin->sin_family = AF_INET; 7379 7380 sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; 7381 sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport; 7382 7383 ackmp->b_wptr = (uchar_t *)&sin[1]; 7384 7385 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 7386 /* 7387 * Fill in Remote address 7388 */ 7389 taa->REMADDR_length = sizeof (sin_t); 7390 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + 7391 taa->LOCADDR_length); 7392 7393 sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset); 7394 *sin = sin_null; 7395 sin->sin_family = AF_INET; 7396 sin->sin_addr.s_addr = tcp->tcp_remote; 7397 sin->sin_port = tcp->tcp_fport; 7398 7399 ackmp->b_wptr = (uchar_t *)&sin[1]; 7400 } 7401 } 7402 putnext(tcp->tcp_rq, ackmp); 7403 } 7404 7405 /* Assumes that tcp_addr_req gets enough space and alignment */ 7406 static void 7407 tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp) 7408 { 7409 sin6_t *sin6; 7410 struct T_addr_ack *taa; 7411 7412 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 7413 ASSERT(OK_32PTR(ackmp->b_rptr)); 7414 ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) + 7415 2 * sizeof (sin6_t)); 7416 7417 taa = (struct T_addr_ack *)ackmp->b_rptr; 7418 7419 bzero(taa, sizeof (struct T_addr_ack)); 7420 ackmp->b_wptr = (uchar_t *)&taa[1]; 7421 7422 taa->PRIM_type = T_ADDR_ACK; 7423 ackmp->b_datap->db_type = M_PCPROTO; 7424 7425 /* 7426 * Note: Following code assumes 32 bit alignment of basic 7427 * data structures like sin6_t and struct T_addr_ack. 
7428 */ 7429 if (tcp->tcp_state >= TCPS_BOUND) { 7430 /* 7431 * Fill in local address 7432 */ 7433 taa->LOCADDR_length = sizeof (sin6_t); 7434 taa->LOCADDR_offset = sizeof (*taa); 7435 7436 sin6 = (sin6_t *)&taa[1]; 7437 *sin6 = sin6_null; 7438 7439 sin6->sin6_family = AF_INET6; 7440 sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; 7441 sin6->sin6_port = tcp->tcp_lport; 7442 7443 ackmp->b_wptr = (uchar_t *)&sin6[1]; 7444 7445 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 7446 /* 7447 * Fill in Remote address 7448 */ 7449 taa->REMADDR_length = sizeof (sin6_t); 7450 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + 7451 taa->LOCADDR_length); 7452 7453 sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset); 7454 *sin6 = sin6_null; 7455 sin6->sin6_family = AF_INET6; 7456 sin6->sin6_flowinfo = 7457 tcp->tcp_ip6h->ip6_vcf & 7458 ~IPV6_VERS_AND_FLOW_MASK; 7459 sin6->sin6_addr = tcp->tcp_remote_v6; 7460 sin6->sin6_port = tcp->tcp_fport; 7461 7462 ackmp->b_wptr = (uchar_t *)&sin6[1]; 7463 } 7464 } 7465 putnext(tcp->tcp_rq, ackmp); 7466 } 7467 7468 /* 7469 * Handle reinitialization of a tcp structure. 7470 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE. 7471 */ 7472 static void 7473 tcp_reinit(tcp_t *tcp) 7474 { 7475 mblk_t *mp; 7476 int err; 7477 7478 TCP_STAT(tcp_reinit_calls); 7479 7480 /* tcp_reinit should never be called for detached tcp_t's */ 7481 ASSERT(tcp->tcp_listener == NULL); 7482 ASSERT((tcp->tcp_family == AF_INET && 7483 tcp->tcp_ipversion == IPV4_VERSION) || 7484 (tcp->tcp_family == AF_INET6 && 7485 (tcp->tcp_ipversion == IPV4_VERSION || 7486 tcp->tcp_ipversion == IPV6_VERSION))); 7487 7488 /* Cancel outstanding timers */ 7489 tcp_timers_stop(tcp); 7490 7491 /* 7492 * Reset everything in the state vector, after updating global 7493 * MIB data from instance counters. 7494 */ 7495 UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs); 7496 tcp->tcp_ibsegs = 0; 7497 UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs); 7498 tcp->tcp_obsegs = 0; 7499 7500 tcp_close_mpp(&tcp->tcp_xmit_head); 7501 if (tcp->tcp_snd_zcopy_aware) 7502 tcp_zcopy_notify(tcp); 7503 tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL; 7504 tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; 7505 if (tcp->tcp_flow_stopped && 7506 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 7507 tcp_clrqfull(tcp); 7508 } 7509 tcp_close_mpp(&tcp->tcp_reass_head); 7510 tcp->tcp_reass_tail = NULL; 7511 if (tcp->tcp_rcv_list != NULL) { 7512 /* Free b_next chain */ 7513 tcp_close_mpp(&tcp->tcp_rcv_list); 7514 tcp->tcp_rcv_last_head = NULL; 7515 tcp->tcp_rcv_last_tail = NULL; 7516 tcp->tcp_rcv_cnt = 0; 7517 } 7518 tcp->tcp_rcv_last_tail = NULL; 7519 7520 if ((mp = tcp->tcp_urp_mp) != NULL) { 7521 freemsg(mp); 7522 tcp->tcp_urp_mp = NULL; 7523 } 7524 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 7525 freemsg(mp); 7526 tcp->tcp_urp_mark_mp = NULL; 7527 } 7528 if (tcp->tcp_fused_sigurg_mp != NULL) { 7529 freeb(tcp->tcp_fused_sigurg_mp); 7530 tcp->tcp_fused_sigurg_mp = NULL; 7531 } 7532 7533 /* 7534 * Following is a union with two members which are 7535 * identical types and size so the following cleanup 7536 * is enough. 7537 */ 7538 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 7539 7540 CL_INET_DISCONNECT(tcp); 7541 7542 /* 7543 * The connection can't be on the tcp_time_wait_head list 7544 * since it is not detached. 
7545 */ 7546 ASSERT(tcp->tcp_time_wait_next == NULL); 7547 ASSERT(tcp->tcp_time_wait_prev == NULL); 7548 ASSERT(tcp->tcp_time_wait_expire == 0); 7549 7550 if (tcp->tcp_kssl_pending) { 7551 tcp->tcp_kssl_pending = B_FALSE; 7552 7553 /* Don't reset if the initialized by bind. */ 7554 if (tcp->tcp_kssl_ent != NULL) { 7555 kssl_release_ent(tcp->tcp_kssl_ent, NULL, 7556 KSSL_NO_PROXY); 7557 } 7558 } 7559 if (tcp->tcp_kssl_ctx != NULL) { 7560 kssl_release_ctx(tcp->tcp_kssl_ctx); 7561 tcp->tcp_kssl_ctx = NULL; 7562 } 7563 7564 /* 7565 * Reset/preserve other values 7566 */ 7567 tcp_reinit_values(tcp); 7568 ipcl_hash_remove(tcp->tcp_connp); 7569 conn_delete_ire(tcp->tcp_connp, NULL); 7570 7571 if (tcp->tcp_conn_req_max != 0) { 7572 /* 7573 * This is the case when a TLI program uses the same 7574 * transport end point to accept a connection. This 7575 * makes the TCP both a listener and acceptor. When 7576 * this connection is closed, we need to set the state 7577 * back to TCPS_LISTEN. Make sure that the eager list 7578 * is reinitialized. 7579 * 7580 * Note that this stream is still bound to the four 7581 * tuples of the previous connection in IP. If a new 7582 * SYN with different foreign address comes in, IP will 7583 * not find it and will send it to the global queue. In 7584 * the global queue, TCP will do a tcp_lookup_listener() 7585 * to find this stream. This works because this stream 7586 * is only removed from connected hash. 7587 * 7588 */ 7589 tcp->tcp_state = TCPS_LISTEN; 7590 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 7591 tcp->tcp_connp->conn_recv = tcp_conn_request; 7592 if (tcp->tcp_family == AF_INET6) { 7593 ASSERT(tcp->tcp_connp->conn_af_isv6); 7594 (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP, 7595 &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport); 7596 } else { 7597 ASSERT(!tcp->tcp_connp->conn_af_isv6); 7598 (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP, 7599 tcp->tcp_ipha->ipha_src, tcp->tcp_lport); 7600 } 7601 } else { 7602 tcp->tcp_state = TCPS_BOUND; 7603 } 7604 7605 /* 7606 * Initialize to default values 7607 * Can't fail since enough header template space already allocated 7608 * at open(). 7609 */ 7610 err = tcp_init_values(tcp); 7611 ASSERT(err == 0); 7612 /* Restore state in tcp_tcph */ 7613 bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN); 7614 if (tcp->tcp_ipversion == IPV4_VERSION) 7615 tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source; 7616 else 7617 tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6; 7618 /* 7619 * Copy of the src addr. in tcp_t is needed in tcp_t 7620 * since the lookup funcs can only lookup on tcp_t 7621 */ 7622 tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6; 7623 7624 ASSERT(tcp->tcp_ptpbhn != NULL); 7625 tcp->tcp_rq->q_hiwat = tcp_recv_hiwat; 7626 tcp->tcp_rwnd = tcp_recv_hiwat; 7627 tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ? 7628 tcp_mss_def_ipv6 : tcp_mss_def_ipv4; 7629 } 7630 7631 /* 7632 * Force values to zero that need be zero. 7633 * Do not touch values asociated with the BOUND or LISTEN state 7634 * since the connection will end up in that state after the reinit. 7635 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t 7636 * structure! 
7637 */ 7638 static void 7639 tcp_reinit_values(tcp) 7640 tcp_t *tcp; 7641 { 7642 #ifndef lint 7643 #define DONTCARE(x) 7644 #define PRESERVE(x) 7645 #else 7646 #define DONTCARE(x) ((x) = (x)) 7647 #define PRESERVE(x) ((x) = (x)) 7648 #endif /* lint */ 7649 7650 PRESERVE(tcp->tcp_bind_hash); 7651 PRESERVE(tcp->tcp_ptpbhn); 7652 PRESERVE(tcp->tcp_acceptor_hash); 7653 PRESERVE(tcp->tcp_ptpahn); 7654 7655 /* Should be ASSERT NULL on these with new code! */ 7656 ASSERT(tcp->tcp_time_wait_next == NULL); 7657 ASSERT(tcp->tcp_time_wait_prev == NULL); 7658 ASSERT(tcp->tcp_time_wait_expire == 0); 7659 PRESERVE(tcp->tcp_state); 7660 PRESERVE(tcp->tcp_rq); 7661 PRESERVE(tcp->tcp_wq); 7662 7663 ASSERT(tcp->tcp_xmit_head == NULL); 7664 ASSERT(tcp->tcp_xmit_last == NULL); 7665 ASSERT(tcp->tcp_unsent == 0); 7666 ASSERT(tcp->tcp_xmit_tail == NULL); 7667 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 7668 7669 tcp->tcp_snxt = 0; /* Displayed in mib */ 7670 tcp->tcp_suna = 0; /* Displayed in mib */ 7671 tcp->tcp_swnd = 0; 7672 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */ 7673 7674 ASSERT(tcp->tcp_ibsegs == 0); 7675 ASSERT(tcp->tcp_obsegs == 0); 7676 7677 if (tcp->tcp_iphc != NULL) { 7678 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 7679 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 7680 } 7681 7682 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */ 7683 DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */ 7684 DONTCARE(tcp->tcp_ipha); 7685 DONTCARE(tcp->tcp_ip6h); 7686 DONTCARE(tcp->tcp_ip_hdr_len); 7687 DONTCARE(tcp->tcp_tcph); 7688 DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */ 7689 tcp->tcp_valid_bits = 0; 7690 7691 DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */ 7692 DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */ 7693 DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */ 7694 tcp->tcp_last_rcv_lbolt = 0; 7695 7696 tcp->tcp_init_cwnd = 0; 7697 7698 tcp->tcp_urp_last_valid = 0; 7699 tcp->tcp_hard_binding = 0; 7700 tcp->tcp_hard_bound = 0; 7701 PRESERVE(tcp->tcp_cred); 7702 PRESERVE(tcp->tcp_cpid); 7703 PRESERVE(tcp->tcp_exclbind); 7704 7705 tcp->tcp_fin_acked = 0; 7706 tcp->tcp_fin_rcvd = 0; 7707 tcp->tcp_fin_sent = 0; 7708 tcp->tcp_ordrel_done = 0; 7709 7710 tcp->tcp_debug = 0; 7711 tcp->tcp_dontroute = 0; 7712 tcp->tcp_broadcast = 0; 7713 7714 tcp->tcp_useloopback = 0; 7715 tcp->tcp_reuseaddr = 0; 7716 tcp->tcp_oobinline = 0; 7717 tcp->tcp_dgram_errind = 0; 7718 7719 tcp->tcp_detached = 0; 7720 tcp->tcp_bind_pending = 0; 7721 tcp->tcp_unbind_pending = 0; 7722 tcp->tcp_deferred_clean_death = 0; 7723 7724 tcp->tcp_snd_ws_ok = B_FALSE; 7725 tcp->tcp_snd_ts_ok = B_FALSE; 7726 tcp->tcp_linger = 0; 7727 tcp->tcp_ka_enabled = 0; 7728 tcp->tcp_zero_win_probe = 0; 7729 7730 tcp->tcp_loopback = 0; 7731 tcp->tcp_localnet = 0; 7732 tcp->tcp_syn_defense = 0; 7733 tcp->tcp_set_timer = 0; 7734 7735 tcp->tcp_active_open = 0; 7736 ASSERT(tcp->tcp_timeout == B_FALSE); 7737 tcp->tcp_rexmit = B_FALSE; 7738 tcp->tcp_xmit_zc_clean = B_FALSE; 7739 7740 tcp->tcp_snd_sack_ok = B_FALSE; 7741 PRESERVE(tcp->tcp_recvdstaddr); 7742 tcp->tcp_hwcksum = B_FALSE; 7743 7744 tcp->tcp_ire_ill_check_done = B_FALSE; 7745 DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */ 7746 7747 tcp->tcp_mdt = B_FALSE; 7748 tcp->tcp_mdt_hdr_head = 0; 7749 tcp->tcp_mdt_hdr_tail = 0; 7750 7751 tcp->tcp_conn_def_q0 = 0; 7752 tcp->tcp_ip_forward_progress = B_FALSE; 7753 tcp->tcp_anon_priv_bind = 0; 7754 tcp->tcp_ecn_ok = B_FALSE; 7755 7756 tcp->tcp_cwr = B_FALSE; 7757 
tcp->tcp_ecn_echo_on = B_FALSE; 7758 7759 if (tcp->tcp_sack_info != NULL) { 7760 if (tcp->tcp_notsack_list != NULL) { 7761 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 7762 } 7763 kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info); 7764 tcp->tcp_sack_info = NULL; 7765 } 7766 7767 tcp->tcp_rcv_ws = 0; 7768 tcp->tcp_snd_ws = 0; 7769 tcp->tcp_ts_recent = 0; 7770 tcp->tcp_rnxt = 0; /* Displayed in mib */ 7771 DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */ 7772 tcp->tcp_if_mtu = 0; 7773 7774 ASSERT(tcp->tcp_reass_head == NULL); 7775 ASSERT(tcp->tcp_reass_tail == NULL); 7776 7777 tcp->tcp_cwnd_cnt = 0; 7778 7779 ASSERT(tcp->tcp_rcv_list == NULL); 7780 ASSERT(tcp->tcp_rcv_last_head == NULL); 7781 ASSERT(tcp->tcp_rcv_last_tail == NULL); 7782 ASSERT(tcp->tcp_rcv_cnt == 0); 7783 7784 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */ 7785 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */ 7786 tcp->tcp_csuna = 0; 7787 7788 tcp->tcp_rto = 0; /* Displayed in MIB */ 7789 DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */ 7790 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */ 7791 tcp->tcp_rtt_update = 0; 7792 7793 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 7794 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 7795 7796 tcp->tcp_rack = 0; /* Displayed in mib */ 7797 tcp->tcp_rack_cnt = 0; 7798 tcp->tcp_rack_cur_max = 0; 7799 tcp->tcp_rack_abs_max = 0; 7800 7801 tcp->tcp_max_swnd = 0; 7802 7803 ASSERT(tcp->tcp_listener == NULL); 7804 7805 DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */ 7806 7807 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */ 7808 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */ 7809 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */ 7810 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */ 7811 7812 ASSERT(tcp->tcp_conn_req_cnt_q == 0); 7813 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0); 7814 PRESERVE(tcp->tcp_conn_req_max); 7815 PRESERVE(tcp->tcp_conn_req_seqnum); 7816 7817 DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */ 7818 DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */ 7819 DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */ 7820 DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */ 7821 DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */ 7822 7823 tcp->tcp_lingertime = 0; 7824 7825 DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */ 7826 ASSERT(tcp->tcp_urp_mp == NULL); 7827 ASSERT(tcp->tcp_urp_mark_mp == NULL); 7828 ASSERT(tcp->tcp_fused_sigurg_mp == NULL); 7829 7830 ASSERT(tcp->tcp_eager_next_q == NULL); 7831 ASSERT(tcp->tcp_eager_last_q == NULL); 7832 ASSERT((tcp->tcp_eager_next_q0 == NULL && 7833 tcp->tcp_eager_prev_q0 == NULL) || 7834 tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0); 7835 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 7836 7837 tcp->tcp_client_errno = 0; 7838 7839 DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */ 7840 7841 tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */ 7842 7843 PRESERVE(tcp->tcp_bound_source_v6); 7844 tcp->tcp_last_sent_len = 0; 7845 tcp->tcp_dupack_cnt = 0; 7846 7847 tcp->tcp_fport = 0; /* Displayed in MIB */ 7848 PRESERVE(tcp->tcp_lport); 7849 7850 PRESERVE(tcp->tcp_acceptor_lockp); 7851 7852 ASSERT(tcp->tcp_ordrelid == 0); 7853 PRESERVE(tcp->tcp_acceptor_id); 7854 DONTCARE(tcp->tcp_ipsec_overhead); 7855 7856 /* 7857 * If tcp_tracing flag is ON (i.e. 
We have a trace buffer 7858 * in tcp structure and now tracing), Re-initialize all 7859 * members of tcp_traceinfo. 7860 */ 7861 if (tcp->tcp_tracebuf != NULL) { 7862 bzero(tcp->tcp_tracebuf, sizeof (tcptrch_t)); 7863 } 7864 7865 PRESERVE(tcp->tcp_family); 7866 if (tcp->tcp_family == AF_INET6) { 7867 tcp->tcp_ipversion = IPV6_VERSION; 7868 tcp->tcp_mss = tcp_mss_def_ipv6; 7869 } else { 7870 tcp->tcp_ipversion = IPV4_VERSION; 7871 tcp->tcp_mss = tcp_mss_def_ipv4; 7872 } 7873 7874 tcp->tcp_bound_if = 0; 7875 tcp->tcp_ipv6_recvancillary = 0; 7876 tcp->tcp_recvifindex = 0; 7877 tcp->tcp_recvhops = 0; 7878 tcp->tcp_closed = 0; 7879 tcp->tcp_cleandeathtag = 0; 7880 if (tcp->tcp_hopopts != NULL) { 7881 mi_free(tcp->tcp_hopopts); 7882 tcp->tcp_hopopts = NULL; 7883 tcp->tcp_hopoptslen = 0; 7884 } 7885 ASSERT(tcp->tcp_hopoptslen == 0); 7886 if (tcp->tcp_dstopts != NULL) { 7887 mi_free(tcp->tcp_dstopts); 7888 tcp->tcp_dstopts = NULL; 7889 tcp->tcp_dstoptslen = 0; 7890 } 7891 ASSERT(tcp->tcp_dstoptslen == 0); 7892 if (tcp->tcp_rtdstopts != NULL) { 7893 mi_free(tcp->tcp_rtdstopts); 7894 tcp->tcp_rtdstopts = NULL; 7895 tcp->tcp_rtdstoptslen = 0; 7896 } 7897 ASSERT(tcp->tcp_rtdstoptslen == 0); 7898 if (tcp->tcp_rthdr != NULL) { 7899 mi_free(tcp->tcp_rthdr); 7900 tcp->tcp_rthdr = NULL; 7901 tcp->tcp_rthdrlen = 0; 7902 } 7903 ASSERT(tcp->tcp_rthdrlen == 0); 7904 PRESERVE(tcp->tcp_drop_opt_ack_cnt); 7905 7906 /* Reset fusion-related fields */ 7907 tcp->tcp_fused = B_FALSE; 7908 tcp->tcp_unfusable = B_FALSE; 7909 tcp->tcp_fused_sigurg = B_FALSE; 7910 tcp->tcp_direct_sockfs = B_FALSE; 7911 tcp->tcp_fuse_syncstr_stopped = B_FALSE; 7912 tcp->tcp_fuse_syncstr_plugged = B_FALSE; 7913 tcp->tcp_loopback_peer = NULL; 7914 tcp->tcp_fuse_rcv_hiwater = 0; 7915 tcp->tcp_fuse_rcv_unread_hiwater = 0; 7916 tcp->tcp_fuse_rcv_unread_cnt = 0; 7917 7918 tcp->tcp_in_ack_unsent = 0; 7919 tcp->tcp_cork = B_FALSE; 7920 7921 PRESERVE(tcp->tcp_squeue_bytes); 7922 7923 ASSERT(tcp->tcp_kssl_ctx == NULL); 7924 ASSERT(!tcp->tcp_kssl_pending); 7925 PRESERVE(tcp->tcp_kssl_ent); 7926 7927 #undef DONTCARE 7928 #undef PRESERVE 7929 } 7930 7931 /* 7932 * Allocate necessary resources and initialize state vector. 7933 * Guaranteed not to fail so that when an error is returned, 7934 * the caller doesn't need to do any additional cleanup. 7935 */ 7936 int 7937 tcp_init(tcp_t *tcp, queue_t *q) 7938 { 7939 int err; 7940 7941 tcp->tcp_rq = q; 7942 tcp->tcp_wq = WR(q); 7943 tcp->tcp_state = TCPS_IDLE; 7944 if ((err = tcp_init_values(tcp)) != 0) 7945 tcp_timers_stop(tcp); 7946 return (err); 7947 } 7948 7949 static int 7950 tcp_init_values(tcp_t *tcp) 7951 { 7952 int err; 7953 7954 ASSERT((tcp->tcp_family == AF_INET && 7955 tcp->tcp_ipversion == IPV4_VERSION) || 7956 (tcp->tcp_family == AF_INET6 && 7957 (tcp->tcp_ipversion == IPV4_VERSION || 7958 tcp->tcp_ipversion == IPV6_VERSION))); 7959 7960 /* 7961 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 7962 * will be close to tcp_rexmit_interval_initial. By doing this, we 7963 * allow the algorithm to adjust slowly to large fluctuations of RTT 7964 * during first few transmissions of a connection as seen in slow 7965 * links. 
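 *
 * A worked example (assuming the common defaults of
 * tcp_rexmit_interval_initial = 3000 ms with tcp_rexmit_interval_extra and
 * tcp_conn_grace_period both zero; all three are tunable): tcp_rtt_sa
 * becomes 3000 << 2 = 12000 and tcp_rtt_sd becomes 3000 >> 1 = 1500, so
 * the initial RTO is (12000 >> 3) + 1500 + 0 + (12000 >> 5) + 0 =
 * 1500 + 1500 + 375 = 3375 ms, i.e. close to the configured initial
 * interval, and it is then clamped to at least tcp_rexmit_interval_min.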
7966 */ 7967 tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2; 7968 tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1; 7969 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 7970 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 7971 tcp_conn_grace_period; 7972 if (tcp->tcp_rto < tcp_rexmit_interval_min) 7973 tcp->tcp_rto = tcp_rexmit_interval_min; 7974 tcp->tcp_timer_backoff = 0; 7975 tcp->tcp_ms_we_have_waited = 0; 7976 tcp->tcp_last_recv_time = lbolt; 7977 tcp->tcp_cwnd_max = tcp_cwnd_max_; 7978 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 7979 tcp->tcp_snd_burst = TCP_CWND_INFINITE; 7980 7981 tcp->tcp_maxpsz = tcp_maxpsz_multiplier; 7982 7983 tcp->tcp_first_timer_threshold = tcp_ip_notify_interval; 7984 tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval; 7985 tcp->tcp_second_timer_threshold = tcp_ip_abort_interval; 7986 /* 7987 * Fix it to tcp_ip_abort_linterval later if it turns out to be a 7988 * passive open. 7989 */ 7990 tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval; 7991 7992 tcp->tcp_naglim = tcp_naglim_def; 7993 7994 /* NOTE: ISS is now set in tcp_adapt_ire(). */ 7995 7996 tcp->tcp_mdt_hdr_head = 0; 7997 tcp->tcp_mdt_hdr_tail = 0; 7998 7999 /* Reset fusion-related fields */ 8000 tcp->tcp_fused = B_FALSE; 8001 tcp->tcp_unfusable = B_FALSE; 8002 tcp->tcp_fused_sigurg = B_FALSE; 8003 tcp->tcp_direct_sockfs = B_FALSE; 8004 tcp->tcp_fuse_syncstr_stopped = B_FALSE; 8005 tcp->tcp_fuse_syncstr_plugged = B_FALSE; 8006 tcp->tcp_loopback_peer = NULL; 8007 tcp->tcp_fuse_rcv_hiwater = 0; 8008 tcp->tcp_fuse_rcv_unread_hiwater = 0; 8009 tcp->tcp_fuse_rcv_unread_cnt = 0; 8010 8011 /* Initialize the header template */ 8012 if (tcp->tcp_ipversion == IPV4_VERSION) { 8013 err = tcp_header_init_ipv4(tcp); 8014 } else { 8015 err = tcp_header_init_ipv6(tcp); 8016 } 8017 if (err) 8018 return (err); 8019 8020 /* 8021 * Init the window scale to the max so tcp_rwnd_set() won't pare 8022 * down tcp_rwnd. tcp_adapt_ire() will set the right value later. 8023 */ 8024 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 8025 tcp->tcp_xmit_lowater = tcp_xmit_lowat; 8026 tcp->tcp_xmit_hiwater = tcp_xmit_hiwat; 8027 8028 tcp->tcp_cork = B_FALSE; 8029 /* 8030 * Init the tcp_debug option. This value determines whether TCP 8031 * calls strlog() to print out debug messages. Doing this 8032 * initialization here means that this value is not inherited thru 8033 * tcp_reinit(). 8034 */ 8035 tcp->tcp_debug = tcp_dbg; 8036 8037 tcp->tcp_ka_interval = tcp_keepalive_interval; 8038 tcp->tcp_ka_abort_thres = tcp_keepalive_abort_interval; 8039 8040 return (0); 8041 } 8042 8043 /* 8044 * Initialize the IPv4 header. Loses any record of any IP options. 8045 */ 8046 static int 8047 tcp_header_init_ipv4(tcp_t *tcp) 8048 { 8049 tcph_t *tcph; 8050 uint32_t sum; 8051 conn_t *connp; 8052 8053 /* 8054 * This is a simple initialization. If there's 8055 * already a template, it should never be too small, 8056 * so reuse it. Otherwise, allocate space for the new one. 
8057 */ 8058 if (tcp->tcp_iphc == NULL) { 8059 ASSERT(tcp->tcp_iphc_len == 0); 8060 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 8061 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); 8062 if (tcp->tcp_iphc == NULL) { 8063 tcp->tcp_iphc_len = 0; 8064 return (ENOMEM); 8065 } 8066 } 8067 8068 /* options are gone; may need a new label */ 8069 connp = tcp->tcp_connp; 8070 connp->conn_mlp_type = mlptSingle; 8071 connp->conn_ulp_labeled = !is_system_labeled(); 8072 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 8073 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 8074 tcp->tcp_ip6h = NULL; 8075 tcp->tcp_ipversion = IPV4_VERSION; 8076 tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t); 8077 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 8078 tcp->tcp_ip_hdr_len = sizeof (ipha_t); 8079 tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t)); 8080 tcp->tcp_ipha->ipha_version_and_hdr_length 8081 = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS; 8082 tcp->tcp_ipha->ipha_ident = 0; 8083 8084 tcp->tcp_ttl = (uchar_t)tcp_ipv4_ttl; 8085 tcp->tcp_tos = 0; 8086 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; 8087 tcp->tcp_ipha->ipha_ttl = (uchar_t)tcp_ipv4_ttl; 8088 tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP; 8089 8090 tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t)); 8091 tcp->tcp_tcph = tcph; 8092 tcph->th_offset_and_rsrvd[0] = (5 << 4); 8093 /* 8094 * IP wants our header length in the checksum field to 8095 * allow it to perform a single pseudo-header+checksum 8096 * calculation on behalf of TCP. 8097 * Include the adjustment for a source route once IP_OPTIONS is set. 8098 */ 8099 sum = sizeof (tcph_t) + tcp->tcp_sum; 8100 sum = (sum >> 16) + (sum & 0xFFFF); 8101 U16_TO_ABE16(sum, tcph->th_sum); 8102 return (0); 8103 } 8104 8105 /* 8106 * Initialize the IPv6 header. Loses any record of any IPv6 extension headers. 8107 */ 8108 static int 8109 tcp_header_init_ipv6(tcp_t *tcp) 8110 { 8111 tcph_t *tcph; 8112 uint32_t sum; 8113 conn_t *connp; 8114 8115 /* 8116 * This is a simple initialization. If there's 8117 * already a template, it should never be too small, 8118 * so reuse it. Otherwise, allocate space for the new one. 8119 * Ensure that there is enough space to "downgrade" the tcp_t 8120 * to an IPv4 tcp_t. This requires having space for a full load 8121 * of IPv4 options, as well as a full load of TCP options 8122 * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space 8123 * than a v6 header and a TCP header with a full load of TCP options 8124 * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes). 8125 * We want to avoid reallocation in the "downgraded" case when 8126 * processing outbound IPv4 options. 
8127 */ 8128 if (tcp->tcp_iphc == NULL) { 8129 ASSERT(tcp->tcp_iphc_len == 0); 8130 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 8131 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); 8132 if (tcp->tcp_iphc == NULL) { 8133 tcp->tcp_iphc_len = 0; 8134 return (ENOMEM); 8135 } 8136 } 8137 8138 /* options are gone; may need a new label */ 8139 connp = tcp->tcp_connp; 8140 connp->conn_mlp_type = mlptSingle; 8141 connp->conn_ulp_labeled = !is_system_labeled(); 8142 8143 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 8144 tcp->tcp_ipversion = IPV6_VERSION; 8145 tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t); 8146 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 8147 tcp->tcp_ip_hdr_len = IPV6_HDR_LEN; 8148 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; 8149 tcp->tcp_ipha = NULL; 8150 8151 /* Initialize the header template */ 8152 8153 tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 8154 tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t)); 8155 tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP; 8156 tcp->tcp_ip6h->ip6_hops = (uint8_t)tcp_ipv6_hoplimit; 8157 8158 tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN); 8159 tcp->tcp_tcph = tcph; 8160 tcph->th_offset_and_rsrvd[0] = (5 << 4); 8161 /* 8162 * IP wants our header length in the checksum field to 8163 * allow it to perform a single psuedo-header+checksum 8164 * calculation on behalf of TCP. 8165 * Include the adjustment for a source route when IPV6_RTHDR is set. 8166 */ 8167 sum = sizeof (tcph_t) + tcp->tcp_sum; 8168 sum = (sum >> 16) + (sum & 0xFFFF); 8169 U16_TO_ABE16(sum, tcph->th_sum); 8170 return (0); 8171 } 8172 8173 /* At minimum we need 4 bytes in the TCP header for the lookup */ 8174 #define ICMP_MIN_TCP_HDR 12 8175 8176 /* 8177 * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages 8178 * passed up by IP. The message is always received on the correct tcp_t. 8179 * Assumes that IP has pulled up everything up to and including the ICMP header. 8180 */ 8181 void 8182 tcp_icmp_error(tcp_t *tcp, mblk_t *mp) 8183 { 8184 icmph_t *icmph; 8185 ipha_t *ipha; 8186 int iph_hdr_length; 8187 tcph_t *tcph; 8188 boolean_t ipsec_mctl = B_FALSE; 8189 boolean_t secure; 8190 mblk_t *first_mp = mp; 8191 uint32_t new_mss; 8192 uint32_t ratio; 8193 size_t mp_size = MBLKL(mp); 8194 uint32_t seg_ack; 8195 uint32_t seg_seq; 8196 8197 /* Assume IP provides aligned packets - otherwise toss */ 8198 if (!OK_32PTR(mp->b_rptr)) { 8199 freemsg(mp); 8200 return; 8201 } 8202 8203 /* 8204 * Since ICMP errors are normal data marked with M_CTL when sent 8205 * to TCP or UDP, we have to look for a IPSEC_IN value to identify 8206 * packets starting with an ipsec_info_t, see ipsec_info.h. 8207 */ 8208 if ((mp_size == sizeof (ipsec_info_t)) && 8209 (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) { 8210 ASSERT(mp->b_cont != NULL); 8211 mp = mp->b_cont; 8212 /* IP should have done this */ 8213 ASSERT(OK_32PTR(mp->b_rptr)); 8214 mp_size = MBLKL(mp); 8215 ipsec_mctl = B_TRUE; 8216 } 8217 8218 /* 8219 * Verify that we have a complete outer IP header. If not, drop it. 8220 */ 8221 if (mp_size < sizeof (ipha_t)) { 8222 noticmpv4: 8223 freemsg(first_mp); 8224 return; 8225 } 8226 8227 ipha = (ipha_t *)mp->b_rptr; 8228 /* 8229 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent 8230 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. 
8231 */ 8232 switch (IPH_HDR_VERSION(ipha)) { 8233 case IPV6_VERSION: 8234 tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl); 8235 return; 8236 case IPV4_VERSION: 8237 break; 8238 default: 8239 goto noticmpv4; 8240 } 8241 8242 /* Skip past the outer IP and ICMP headers */ 8243 iph_hdr_length = IPH_HDR_LENGTH(ipha); 8244 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 8245 /* 8246 * If we don't have the correct outer IP header length or if the ULP 8247 * is not IPPROTO_ICMP or if we don't have a complete inner IP header 8248 * send it upstream. 8249 */ 8250 if (iph_hdr_length < sizeof (ipha_t) || 8251 ipha->ipha_protocol != IPPROTO_ICMP || 8252 (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { 8253 goto noticmpv4; 8254 } 8255 ipha = (ipha_t *)&icmph[1]; 8256 8257 /* Skip past the inner IP and find the ULP header */ 8258 iph_hdr_length = IPH_HDR_LENGTH(ipha); 8259 tcph = (tcph_t *)((char *)ipha + iph_hdr_length); 8260 /* 8261 * If we don't have the correct inner IP header length or if the ULP 8262 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR 8263 * bytes of TCP header, drop it. 8264 */ 8265 if (iph_hdr_length < sizeof (ipha_t) || 8266 ipha->ipha_protocol != IPPROTO_TCP || 8267 (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) { 8268 goto noticmpv4; 8269 } 8270 8271 if (TCP_IS_DETACHED_NONEAGER(tcp)) { 8272 if (ipsec_mctl) { 8273 secure = ipsec_in_is_secure(first_mp); 8274 } else { 8275 secure = B_FALSE; 8276 } 8277 if (secure) { 8278 /* 8279 * If we are willing to accept this in clear 8280 * we don't have to verify policy. 8281 */ 8282 if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) { 8283 if (!tcp_check_policy(tcp, first_mp, 8284 ipha, NULL, secure, ipsec_mctl)) { 8285 /* 8286 * tcp_check_policy called 8287 * ip_drop_packet() on failure. 8288 */ 8289 return; 8290 } 8291 } 8292 } 8293 } else if (ipsec_mctl) { 8294 /* 8295 * This is a hard_bound connection. IP has already 8296 * verified policy. We don't have to do it again. 8297 */ 8298 freeb(first_mp); 8299 first_mp = mp; 8300 ipsec_mctl = B_FALSE; 8301 } 8302 8303 seg_ack = ABE32_TO_U32(tcph->th_ack); 8304 seg_seq = ABE32_TO_U32(tcph->th_seq); 8305 /* 8306 * TCP SHOULD check that the TCP sequence number contained in 8307 * payload of the ICMP error message is within the range 8308 * SND.UNA <= SEG.SEQ < SND.NXT. and also SEG.ACK <= RECV.NXT 8309 */ 8310 if (SEQ_LT(seg_seq, tcp->tcp_suna) || 8311 SEQ_GEQ(seg_seq, tcp->tcp_snxt) || 8312 SEQ_GT(seg_ack, tcp->tcp_rnxt)) { 8313 /* 8314 * If the ICMP message is bogus, should we kill the 8315 * connection, or should we just drop the bogus ICMP 8316 * message? It would probably make more sense to just 8317 * drop the message so that if this one managed to get 8318 * in, the real connection should not suffer. 8319 */ 8320 goto noticmpv4; 8321 } 8322 8323 switch (icmph->icmph_type) { 8324 case ICMP_DEST_UNREACHABLE: 8325 switch (icmph->icmph_code) { 8326 case ICMP_FRAGMENTATION_NEEDED: 8327 /* 8328 * Reduce the MSS based on the new MTU. This will 8329 * eliminate any fragmentation locally. 8330 * N.B. There may well be some funny side-effects on 8331 * the local send policy and the remote receive policy. 8332 * Pending further research, we provide 8333 * tcp_ignore_path_mtu just in case this proves 8334 * disastrous somewhere. 8335 * 8336 * After updating the MSS, retransmit part of the 8337 * dropped segment using the new mss by calling 8338 * tcp_wput_data(). Need to adjust all those 8339 * params to make sure tcp_wput_data() work properly. 
8340 */ 8341 if (tcp_ignore_path_mtu) 8342 break; 8343 8344 /* 8345 * Decrease the MSS by time stamp options 8346 * IP options and IPSEC options. tcp_hdr_len 8347 * includes time stamp option and IP option 8348 * length. 8349 */ 8350 8351 new_mss = ntohs(icmph->icmph_du_mtu) - 8352 tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead; 8353 8354 /* 8355 * Only update the MSS if the new one is 8356 * smaller than the previous one. This is 8357 * to avoid problems when getting multiple 8358 * ICMP errors for the same MTU. 8359 */ 8360 if (new_mss >= tcp->tcp_mss) 8361 break; 8362 8363 /* 8364 * Stop doing PMTU if new_mss is less than 68 8365 * or less than tcp_mss_min. 8366 * The value 68 comes from rfc 1191. 8367 */ 8368 if (new_mss < MAX(68, tcp_mss_min)) 8369 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 8370 0; 8371 8372 ratio = tcp->tcp_cwnd / tcp->tcp_mss; 8373 ASSERT(ratio >= 1); 8374 tcp_mss_set(tcp, new_mss); 8375 8376 /* 8377 * Make sure we have something to 8378 * send. 8379 */ 8380 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && 8381 (tcp->tcp_xmit_head != NULL)) { 8382 /* 8383 * Shrink tcp_cwnd in 8384 * proportion to the old MSS/new MSS. 8385 */ 8386 tcp->tcp_cwnd = ratio * tcp->tcp_mss; 8387 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 8388 (tcp->tcp_unsent == 0)) { 8389 tcp->tcp_rexmit_max = tcp->tcp_fss; 8390 } else { 8391 tcp->tcp_rexmit_max = tcp->tcp_snxt; 8392 } 8393 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 8394 tcp->tcp_rexmit = B_TRUE; 8395 tcp->tcp_dupack_cnt = 0; 8396 tcp->tcp_snd_burst = TCP_CWND_SS; 8397 tcp_ss_rexmit(tcp); 8398 } 8399 break; 8400 case ICMP_PORT_UNREACHABLE: 8401 case ICMP_PROTOCOL_UNREACHABLE: 8402 switch (tcp->tcp_state) { 8403 case TCPS_SYN_SENT: 8404 case TCPS_SYN_RCVD: 8405 /* 8406 * ICMP can snipe away incipient 8407 * TCP connections as long as 8408 * seq number is same as initial 8409 * send seq number. 8410 */ 8411 if (seg_seq == tcp->tcp_iss) { 8412 (void) tcp_clean_death(tcp, 8413 ECONNREFUSED, 6); 8414 } 8415 break; 8416 } 8417 break; 8418 case ICMP_HOST_UNREACHABLE: 8419 case ICMP_NET_UNREACHABLE: 8420 /* Record the error in case we finally time out. */ 8421 if (icmph->icmph_code == ICMP_HOST_UNREACHABLE) 8422 tcp->tcp_client_errno = EHOSTUNREACH; 8423 else 8424 tcp->tcp_client_errno = ENETUNREACH; 8425 if (tcp->tcp_state == TCPS_SYN_RCVD) { 8426 if (tcp->tcp_listener != NULL && 8427 tcp->tcp_listener->tcp_syn_defense) { 8428 /* 8429 * Ditch the half-open connection if we 8430 * suspect a SYN attack is under way. 8431 */ 8432 tcp_ip_ire_mark_advice(tcp); 8433 (void) tcp_clean_death(tcp, 8434 tcp->tcp_client_errno, 7); 8435 } 8436 } 8437 break; 8438 default: 8439 break; 8440 } 8441 break; 8442 case ICMP_SOURCE_QUENCH: { 8443 /* 8444 * use a global boolean to control 8445 * whether TCP should respond to ICMP_SOURCE_QUENCH. 8446 * The default is false. 8447 */ 8448 if (tcp_icmp_source_quench) { 8449 /* 8450 * Reduce the sending rate as if we got a 8451 * retransmit timeout 8452 */ 8453 uint32_t npkt; 8454 8455 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / 8456 tcp->tcp_mss; 8457 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss; 8458 tcp->tcp_cwnd = tcp->tcp_mss; 8459 tcp->tcp_cwnd_cnt = 0; 8460 } 8461 break; 8462 } 8463 } 8464 freemsg(first_mp); 8465 } 8466 8467 /* 8468 * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6 8469 * error messages passed up by IP. 8470 * Assumes that IP has pulled up all the extension headers as well 8471 * as the ICMPv6 header. 
8472 */ 8473 static void 8474 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl) 8475 { 8476 icmp6_t *icmp6; 8477 ip6_t *ip6h; 8478 uint16_t iph_hdr_length; 8479 tcpha_t *tcpha; 8480 uint8_t *nexthdrp; 8481 uint32_t new_mss; 8482 uint32_t ratio; 8483 boolean_t secure; 8484 mblk_t *first_mp = mp; 8485 size_t mp_size; 8486 uint32_t seg_ack; 8487 uint32_t seg_seq; 8488 8489 /* 8490 * The caller has determined if this is an IPSEC_IN packet and 8491 * set ipsec_mctl appropriately (see tcp_icmp_error). 8492 */ 8493 if (ipsec_mctl) 8494 mp = mp->b_cont; 8495 8496 mp_size = MBLKL(mp); 8497 8498 /* 8499 * Verify that we have a complete IP header. If not, send it upstream. 8500 */ 8501 if (mp_size < sizeof (ip6_t)) { 8502 noticmpv6: 8503 freemsg(first_mp); 8504 return; 8505 } 8506 8507 /* 8508 * Verify this is an ICMPV6 packet, else send it upstream. 8509 */ 8510 ip6h = (ip6_t *)mp->b_rptr; 8511 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 8512 iph_hdr_length = IPV6_HDR_LEN; 8513 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, 8514 &nexthdrp) || 8515 *nexthdrp != IPPROTO_ICMPV6) { 8516 goto noticmpv6; 8517 } 8518 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 8519 ip6h = (ip6_t *)&icmp6[1]; 8520 /* 8521 * Verify if we have a complete ICMP and inner IP header. 8522 */ 8523 if ((uchar_t *)&ip6h[1] > mp->b_wptr) 8524 goto noticmpv6; 8525 8526 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) 8527 goto noticmpv6; 8528 tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length); 8529 /* 8530 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't 8531 * have at least ICMP_MIN_TCP_HDR bytes of TCP header drop the 8532 * packet. 8533 */ 8534 if ((*nexthdrp != IPPROTO_TCP) || 8535 ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) { 8536 goto noticmpv6; 8537 } 8538 8539 /* 8540 * ICMP errors come on the right queue or come on 8541 * listener/global queue for detached connections and 8542 * get switched to the right queue. If it comes on the 8543 * right queue, policy check has already been done by IP 8544 * and thus free the first_mp without verifying the policy. 8545 * If it has come for a non-hard bound connection, we need 8546 * to verify policy as IP may not have done it. 8547 */ 8548 if (!tcp->tcp_hard_bound) { 8549 if (ipsec_mctl) { 8550 secure = ipsec_in_is_secure(first_mp); 8551 } else { 8552 secure = B_FALSE; 8553 } 8554 if (secure) { 8555 /* 8556 * If we are willing to accept this in clear 8557 * we don't have to verify policy. 8558 */ 8559 if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) { 8560 if (!tcp_check_policy(tcp, first_mp, 8561 NULL, ip6h, secure, ipsec_mctl)) { 8562 /* 8563 * tcp_check_policy called 8564 * ip_drop_packet() on failure. 8565 */ 8566 return; 8567 } 8568 } 8569 } 8570 } else if (ipsec_mctl) { 8571 /* 8572 * This is a hard_bound connection. IP has already 8573 * verified policy. We don't have to do it again. 8574 */ 8575 freeb(first_mp); 8576 first_mp = mp; 8577 ipsec_mctl = B_FALSE; 8578 } 8579 8580 seg_ack = ntohl(tcpha->tha_ack); 8581 seg_seq = ntohl(tcpha->tha_seq); 8582 /* 8583 * TCP SHOULD check that the TCP sequence number contained in 8584 * payload of the ICMP error message is within the range 8585 * SND.UNA <= SEG.SEQ < SND.NXT. and also SEG.ACK <= RECV.NXT 8586 */ 8587 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt) || 8588 SEQ_GT(seg_ack, tcp->tcp_rnxt)) { 8589 /* 8590 * If the ICMP message is bogus, should we kill the 8591 * connection, or should we just drop the bogus ICMP 8592 * message? 
It would probably make more sense to just 8593 * drop the message so that if this one managed to get 8594 * in, the real connection should not suffer. 8595 */ 8596 goto noticmpv6; 8597 } 8598 8599 switch (icmp6->icmp6_type) { 8600 case ICMP6_PACKET_TOO_BIG: 8601 /* 8602 * Reduce the MSS based on the new MTU. This will 8603 * eliminate any fragmentation locally. 8604 * N.B. There may well be some funny side-effects on 8605 * the local send policy and the remote receive policy. 8606 * Pending further research, we provide 8607 * tcp_ignore_path_mtu just in case this proves 8608 * disastrous somewhere. 8609 * 8610 * After updating the MSS, retransmit part of the 8611 * dropped segment using the new mss by calling 8612 * tcp_wput_data(). Need to adjust all those 8613 * params to make sure tcp_wput_data() work properly. 8614 */ 8615 if (tcp_ignore_path_mtu) 8616 break; 8617 8618 /* 8619 * Decrease the MSS by time stamp options 8620 * IP options and IPSEC options. tcp_hdr_len 8621 * includes time stamp option and IP option 8622 * length. 8623 */ 8624 new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len - 8625 tcp->tcp_ipsec_overhead; 8626 8627 /* 8628 * Only update the MSS if the new one is 8629 * smaller than the previous one. This is 8630 * to avoid problems when getting multiple 8631 * ICMP errors for the same MTU. 8632 */ 8633 if (new_mss >= tcp->tcp_mss) 8634 break; 8635 8636 ratio = tcp->tcp_cwnd / tcp->tcp_mss; 8637 ASSERT(ratio >= 1); 8638 tcp_mss_set(tcp, new_mss); 8639 8640 /* 8641 * Make sure we have something to 8642 * send. 8643 */ 8644 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && 8645 (tcp->tcp_xmit_head != NULL)) { 8646 /* 8647 * Shrink tcp_cwnd in 8648 * proportion to the old MSS/new MSS. 8649 */ 8650 tcp->tcp_cwnd = ratio * tcp->tcp_mss; 8651 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 8652 (tcp->tcp_unsent == 0)) { 8653 tcp->tcp_rexmit_max = tcp->tcp_fss; 8654 } else { 8655 tcp->tcp_rexmit_max = tcp->tcp_snxt; 8656 } 8657 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 8658 tcp->tcp_rexmit = B_TRUE; 8659 tcp->tcp_dupack_cnt = 0; 8660 tcp->tcp_snd_burst = TCP_CWND_SS; 8661 tcp_ss_rexmit(tcp); 8662 } 8663 break; 8664 8665 case ICMP6_DST_UNREACH: 8666 switch (icmp6->icmp6_code) { 8667 case ICMP6_DST_UNREACH_NOPORT: 8668 if (((tcp->tcp_state == TCPS_SYN_SENT) || 8669 (tcp->tcp_state == TCPS_SYN_RCVD)) && 8670 (seg_seq == tcp->tcp_iss)) { 8671 (void) tcp_clean_death(tcp, 8672 ECONNREFUSED, 8); 8673 } 8674 break; 8675 8676 case ICMP6_DST_UNREACH_ADMIN: 8677 case ICMP6_DST_UNREACH_NOROUTE: 8678 case ICMP6_DST_UNREACH_BEYONDSCOPE: 8679 case ICMP6_DST_UNREACH_ADDR: 8680 /* Record the error in case we finally time out. */ 8681 tcp->tcp_client_errno = EHOSTUNREACH; 8682 if (((tcp->tcp_state == TCPS_SYN_SENT) || 8683 (tcp->tcp_state == TCPS_SYN_RCVD)) && 8684 (seg_seq == tcp->tcp_iss)) { 8685 if (tcp->tcp_listener != NULL && 8686 tcp->tcp_listener->tcp_syn_defense) { 8687 /* 8688 * Ditch the half-open connection if we 8689 * suspect a SYN attack is under way. 
8690 */ 8691 tcp_ip_ire_mark_advice(tcp); 8692 (void) tcp_clean_death(tcp, 8693 tcp->tcp_client_errno, 9); 8694 } 8695 } 8696 8697 8698 break; 8699 default: 8700 break; 8701 } 8702 break; 8703 8704 case ICMP6_PARAM_PROB: 8705 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ 8706 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && 8707 (uchar_t *)ip6h + icmp6->icmp6_pptr == 8708 (uchar_t *)nexthdrp) { 8709 if (tcp->tcp_state == TCPS_SYN_SENT || 8710 tcp->tcp_state == TCPS_SYN_RCVD) { 8711 (void) tcp_clean_death(tcp, 8712 ECONNREFUSED, 10); 8713 } 8714 break; 8715 } 8716 break; 8717 8718 case ICMP6_TIME_EXCEEDED: 8719 default: 8720 break; 8721 } 8722 freemsg(first_mp); 8723 } 8724 8725 /* 8726 * IP recognizes seven kinds of bind requests: 8727 * 8728 * - A zero-length address binds only to the protocol number. 8729 * 8730 * - A 4-byte address is treated as a request to 8731 * validate that the address is a valid local IPv4 8732 * address, appropriate for an application to bind to. 8733 * IP does the verification, but does not make any note 8734 * of the address at this time. 8735 * 8736 * - A 16-byte address is treated as a request 8737 * to validate a local IPv6 address, as in the 4-byte 8738 * address case above. 8739 * 8740 * - A 16-byte sockaddr_in to validate the local IPv4 address and also 8741 * use it for the inbound fanout of packets. 8742 * 8743 * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also 8744 * use it for the inbound fanout of packets. 8745 * 8746 * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout 8747 * information consisting of local and remote addresses 8748 * and ports. In this case, the addresses are both 8749 * validated as appropriate for this operation, and, if 8750 * so, the information is retained for use in the 8751 * inbound fanout. 8752 * 8753 * - A 36-byte address (ipa6_conn_t) containing complete IPv6 8754 * fanout information, like the 12-byte case above. 8755 * 8756 * IP will also fill in the IRE request mblk with information 8757 * regarding our peer. In all cases, we notify IP of our protocol 8758 * type by appending a single protocol byte to the bind request.
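 *
 * As an illustrative sketch (inferred from tcp_ip_bind_mp() below, not an
 * additional request type): for the full IPv4 fanout case the request is a
 * single M_PROTO mblk laid out as
 *
 *	[ struct T_bind_req | ipa_conn_t (laddr, faddr, lport, fport) |
 *	  one trailing IPPROTO_TCP byte ]
 *
 * with b_cont pointing to an ire_t-sized mblk of type IRE_DB_REQ_TYPE
 * that IP fills in with information about our peer.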
8759 */ 8760 static mblk_t * 8761 tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, t_scalar_t addr_length) 8762 { 8763 char *cp; 8764 mblk_t *mp; 8765 struct T_bind_req *tbr; 8766 ipa_conn_t *ac; 8767 ipa6_conn_t *ac6; 8768 sin_t *sin; 8769 sin6_t *sin6; 8770 8771 ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ); 8772 ASSERT((tcp->tcp_family == AF_INET && 8773 tcp->tcp_ipversion == IPV4_VERSION) || 8774 (tcp->tcp_family == AF_INET6 && 8775 (tcp->tcp_ipversion == IPV4_VERSION || 8776 tcp->tcp_ipversion == IPV6_VERSION))); 8777 8778 mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI); 8779 if (!mp) 8780 return (mp); 8781 mp->b_datap->db_type = M_PROTO; 8782 tbr = (struct T_bind_req *)mp->b_rptr; 8783 tbr->PRIM_type = bind_prim; 8784 tbr->ADDR_offset = sizeof (*tbr); 8785 tbr->CONIND_number = 0; 8786 tbr->ADDR_length = addr_length; 8787 cp = (char *)&tbr[1]; 8788 switch (addr_length) { 8789 case sizeof (ipa_conn_t): 8790 ASSERT(tcp->tcp_family == AF_INET); 8791 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 8792 8793 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); 8794 if (mp->b_cont == NULL) { 8795 freemsg(mp); 8796 return (NULL); 8797 } 8798 mp->b_cont->b_wptr += sizeof (ire_t); 8799 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; 8800 8801 /* cp known to be 32 bit aligned */ 8802 ac = (ipa_conn_t *)cp; 8803 ac->ac_laddr = tcp->tcp_ipha->ipha_src; 8804 ac->ac_faddr = tcp->tcp_remote; 8805 ac->ac_fport = tcp->tcp_fport; 8806 ac->ac_lport = tcp->tcp_lport; 8807 tcp->tcp_hard_binding = 1; 8808 break; 8809 8810 case sizeof (ipa6_conn_t): 8811 ASSERT(tcp->tcp_family == AF_INET6); 8812 8813 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); 8814 if (mp->b_cont == NULL) { 8815 freemsg(mp); 8816 return (NULL); 8817 } 8818 mp->b_cont->b_wptr += sizeof (ire_t); 8819 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; 8820 8821 /* cp known to be 32 bit aligned */ 8822 ac6 = (ipa6_conn_t *)cp; 8823 if (tcp->tcp_ipversion == IPV4_VERSION) { 8824 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 8825 &ac6->ac6_laddr); 8826 } else { 8827 ac6->ac6_laddr = tcp->tcp_ip6h->ip6_src; 8828 } 8829 ac6->ac6_faddr = tcp->tcp_remote_v6; 8830 ac6->ac6_fport = tcp->tcp_fport; 8831 ac6->ac6_lport = tcp->tcp_lport; 8832 tcp->tcp_hard_binding = 1; 8833 break; 8834 8835 case sizeof (sin_t): 8836 /* 8837 * NOTE: IPV6_ADDR_LEN also has same size. 8838 * Use family to discriminate. 8839 */ 8840 if (tcp->tcp_family == AF_INET) { 8841 sin = (sin_t *)cp; 8842 8843 *sin = sin_null; 8844 sin->sin_family = AF_INET; 8845 sin->sin_addr.s_addr = tcp->tcp_bound_source; 8846 sin->sin_port = tcp->tcp_lport; 8847 break; 8848 } else { 8849 *(in6_addr_t *)cp = tcp->tcp_bound_source_v6; 8850 } 8851 break; 8852 8853 case sizeof (sin6_t): 8854 ASSERT(tcp->tcp_family == AF_INET6); 8855 sin6 = (sin6_t *)cp; 8856 8857 *sin6 = sin6_null; 8858 sin6->sin6_family = AF_INET6; 8859 sin6->sin6_addr = tcp->tcp_bound_source_v6; 8860 sin6->sin6_port = tcp->tcp_lport; 8861 break; 8862 8863 case IP_ADDR_LEN: 8864 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 8865 *(uint32_t *)cp = tcp->tcp_ipha->ipha_src; 8866 break; 8867 8868 } 8869 /* Add protocol number to end */ 8870 cp[addr_length] = (char)IPPROTO_TCP; 8871 mp->b_wptr = (uchar_t *)&cp[addr_length + 1]; 8872 return (mp); 8873 } 8874 8875 /* 8876 * Notify IP that we are having trouble with this connection. IP should 8877 * blow the IRE away and start over. 
8878 */ 8879 static void 8880 tcp_ip_notify(tcp_t *tcp) 8881 { 8882 struct iocblk *iocp; 8883 ipid_t *ipid; 8884 mblk_t *mp; 8885 8886 /* IPv6 has NUD thus notification to delete the IRE is not needed */ 8887 if (tcp->tcp_ipversion == IPV6_VERSION) 8888 return; 8889 8890 mp = mkiocb(IP_IOCTL); 8891 if (mp == NULL) 8892 return; 8893 8894 iocp = (struct iocblk *)mp->b_rptr; 8895 iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst); 8896 8897 mp->b_cont = allocb(iocp->ioc_count, BPRI_HI); 8898 if (!mp->b_cont) { 8899 freeb(mp); 8900 return; 8901 } 8902 8903 ipid = (ipid_t *)mp->b_cont->b_rptr; 8904 mp->b_cont->b_wptr += iocp->ioc_count; 8905 bzero(ipid, sizeof (*ipid)); 8906 ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY; 8907 ipid->ipid_ire_type = IRE_CACHE; 8908 ipid->ipid_addr_offset = sizeof (ipid_t); 8909 ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst); 8910 /* 8911 * Note: in the case of source routing we want to blow away the 8912 * route to the first source route hop. 8913 */ 8914 bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1], 8915 sizeof (tcp->tcp_ipha->ipha_dst)); 8916 8917 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 8918 } 8919 8920 /* Unlink and return any mblk that looks like it contains an ire */ 8921 static mblk_t * 8922 tcp_ire_mp(mblk_t *mp) 8923 { 8924 mblk_t *prev_mp; 8925 8926 for (;;) { 8927 prev_mp = mp; 8928 mp = mp->b_cont; 8929 if (mp == NULL) 8930 break; 8931 switch (DB_TYPE(mp)) { 8932 case IRE_DB_TYPE: 8933 case IRE_DB_REQ_TYPE: 8934 if (prev_mp != NULL) 8935 prev_mp->b_cont = mp->b_cont; 8936 mp->b_cont = NULL; 8937 return (mp); 8938 default: 8939 break; 8940 } 8941 } 8942 return (mp); 8943 } 8944 8945 /* 8946 * Timer callback routine for keepalive probe. We do a fake resend of 8947 * last ACKed byte. Then set a timer using RTO. When the timer expires, 8948 * check to see if we have heard anything from the other end for the last 8949 * RTO period. If we have, set the timer to expire for another 8950 * tcp_keepalive_intrvl and check again. If we have not, set a timer using 8951 * RTO << 1 and check again when it expires. Keep exponentially increasing 8952 * the timeout if we have not heard from the other side. If for more than 8953 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything, 8954 * kill the connection unless the keepalive abort threshold is 0. In 8955 * that case, we will probe "forever." 8956 */ 8957 static void 8958 tcp_keepalive_killer(void *arg) 8959 { 8960 mblk_t *mp; 8961 conn_t *connp = (conn_t *)arg; 8962 tcp_t *tcp = connp->conn_tcp; 8963 int32_t firetime; 8964 int32_t idletime; 8965 int32_t ka_intrvl; 8966 8967 tcp->tcp_ka_tid = 0; 8968 8969 if (tcp->tcp_fused) 8970 return; 8971 8972 BUMP_MIB(&tcp_mib, tcpTimKeepalive); 8973 ka_intrvl = tcp->tcp_ka_interval; 8974 8975 /* 8976 * Keepalive probe should only be sent if the application has not 8977 * done a close on the connection. 8978 */ 8979 if (tcp->tcp_state > TCPS_CLOSE_WAIT) { 8980 return; 8981 } 8982 /* Timer fired too early, restart it. */ 8983 if (tcp->tcp_state < TCPS_ESTABLISHED) { 8984 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 8985 MSEC_TO_TICK(ka_intrvl)); 8986 return; 8987 } 8988 8989 idletime = TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time); 8990 /* 8991 * If we have not heard from the other side for a long 8992 * time, kill the connection unless the keepalive abort 8993 * threshold is 0. In that case, we will probe "forever." 
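 *
 * As a worked example (illustrative values, not necessarily the tunable
 * defaults): with a keepalive interval of two hours and an abort threshold
 * of eight minutes, a connection that has been silent for more than two
 * hours and eight minutes fails the check below and is torn down with
 * ETIMEDOUT (or the previously recorded tcp_client_errno, if any).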
8994 */ 8995 if (tcp->tcp_ka_abort_thres != 0 && 8996 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { 8997 BUMP_MIB(&tcp_mib, tcpTimKeepaliveDrop); 8998 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? 8999 tcp->tcp_client_errno : ETIMEDOUT, 11); 9000 return; 9001 } 9002 9003 if (tcp->tcp_snxt == tcp->tcp_suna && 9004 idletime >= ka_intrvl) { 9005 /* Fake resend of last ACKed byte. */ 9006 mblk_t *mp1 = allocb(1, BPRI_LO); 9007 9008 if (mp1 != NULL) { 9009 *mp1->b_wptr++ = '\0'; 9010 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL, 9011 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE); 9012 freeb(mp1); 9013 /* 9014 * if allocation failed, fall through to start the 9015 * timer back. 9016 */ 9017 if (mp != NULL) { 9018 TCP_RECORD_TRACE(tcp, mp, 9019 TCP_TRACE_SEND_PKT); 9020 tcp_send_data(tcp, tcp->tcp_wq, mp); 9021 BUMP_MIB(&tcp_mib, tcpTimKeepaliveProbe); 9022 if (tcp->tcp_ka_last_intrvl != 0) { 9023 /* 9024 * We should probe again at least 9025 * in ka_intrvl, but not more than 9026 * tcp_rexmit_interval_max. 9027 */ 9028 firetime = MIN(ka_intrvl - 1, 9029 tcp->tcp_ka_last_intrvl << 1); 9030 if (firetime > tcp_rexmit_interval_max) 9031 firetime = 9032 tcp_rexmit_interval_max; 9033 } else { 9034 firetime = tcp->tcp_rto; 9035 } 9036 tcp->tcp_ka_tid = TCP_TIMER(tcp, 9037 tcp_keepalive_killer, 9038 MSEC_TO_TICK(firetime)); 9039 tcp->tcp_ka_last_intrvl = firetime; 9040 return; 9041 } 9042 } 9043 } else { 9044 tcp->tcp_ka_last_intrvl = 0; 9045 } 9046 9047 /* firetime can be negative if (mp1 == NULL || mp == NULL) */ 9048 if ((firetime = ka_intrvl - idletime) < 0) { 9049 firetime = ka_intrvl; 9050 } 9051 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 9052 MSEC_TO_TICK(firetime)); 9053 } 9054 9055 int 9056 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) 9057 { 9058 queue_t *q = tcp->tcp_rq; 9059 int32_t mss = tcp->tcp_mss; 9060 int maxpsz; 9061 9062 if (TCP_IS_DETACHED(tcp)) 9063 return (mss); 9064 9065 if (tcp->tcp_fused) { 9066 maxpsz = tcp_fuse_maxpsz_set(tcp); 9067 mss = INFPSZ; 9068 } else if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) { 9069 /* 9070 * Set the sd_qn_maxpsz according to the socket send buffer 9071 * size, and sd_maxblk to INFPSZ (-1). This will essentially 9072 * instruct the stream head to copyin user data into contiguous 9073 * kernel-allocated buffers without breaking it up into smaller 9074 * chunks. We round up the buffer size to the nearest SMSS. 9075 */ 9076 maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss); 9077 if (tcp->tcp_kssl_ctx == NULL) 9078 mss = INFPSZ; 9079 else 9080 mss = SSL3_MAX_RECORD_LEN; 9081 } else { 9082 /* 9083 * Set sd_qn_maxpsz to approx half the (receivers) buffer 9084 * (and a multiple of the mss). This instructs the stream 9085 * head to break down larger than SMSS writes into SMSS- 9086 * size mblks, up to tcp_maxpsz_multiplier mblks at a time. 9087 */ 9088 maxpsz = tcp->tcp_maxpsz * mss; 9089 if (maxpsz > tcp->tcp_xmit_hiwater/2) { 9090 maxpsz = tcp->tcp_xmit_hiwater/2; 9091 /* Round up to nearest mss */ 9092 maxpsz = MSS_ROUNDUP(maxpsz, mss); 9093 } 9094 } 9095 (void) setmaxps(q, maxpsz); 9096 tcp->tcp_wq->q_maxpsz = maxpsz; 9097 9098 if (set_maxblk) 9099 (void) mi_set_sth_maxblk(q, mss); 9100 9101 return (mss); 9102 } 9103 9104 /* 9105 * Extract option values from a tcp header. We put any found values into the 9106 * tcpopt struct and return a bitmask saying which options were found. 
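 *
 * A minimal sketch of a hypothetical caller (the real callers live
 * elsewhere in this file), using only names defined here:
 *
 *	tcp_opt_t	tcpopt;
 *	int		options;
 *
 *	tcpopt.tcp = NULL;			-- skip SACK block processing
 *	options = tcp_parse_options(tcph, &tcpopt);
 *	if (options & TCP_OPT_MSS_PRESENT)
 *		mss = tcpopt.tcp_opt_mss;	-- caller clamps to min/max
 *	if (options & TCP_OPT_WSCALE_PRESENT)
 *		snd_ws = tcpopt.tcp_opt_wscale;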
9107 */ 9108 static int 9109 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) 9110 { 9111 uchar_t *endp; 9112 int len; 9113 uint32_t mss; 9114 uchar_t *up = (uchar_t *)tcph; 9115 int found = 0; 9116 int32_t sack_len; 9117 tcp_seq sack_begin, sack_end; 9118 tcp_t *tcp; 9119 9120 endp = up + TCP_HDR_LENGTH(tcph); 9121 up += TCP_MIN_HEADER_LENGTH; 9122 while (up < endp) { 9123 len = endp - up; 9124 switch (*up) { 9125 case TCPOPT_EOL: 9126 break; 9127 9128 case TCPOPT_NOP: 9129 up++; 9130 continue; 9131 9132 case TCPOPT_MAXSEG: 9133 if (len < TCPOPT_MAXSEG_LEN || 9134 up[1] != TCPOPT_MAXSEG_LEN) 9135 break; 9136 9137 mss = BE16_TO_U16(up+2); 9138 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 9139 tcpopt->tcp_opt_mss = mss; 9140 found |= TCP_OPT_MSS_PRESENT; 9141 9142 up += TCPOPT_MAXSEG_LEN; 9143 continue; 9144 9145 case TCPOPT_WSCALE: 9146 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 9147 break; 9148 9149 if (up[2] > TCP_MAX_WINSHIFT) 9150 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 9151 else 9152 tcpopt->tcp_opt_wscale = up[2]; 9153 found |= TCP_OPT_WSCALE_PRESENT; 9154 9155 up += TCPOPT_WS_LEN; 9156 continue; 9157 9158 case TCPOPT_SACK_PERMITTED: 9159 if (len < TCPOPT_SACK_OK_LEN || 9160 up[1] != TCPOPT_SACK_OK_LEN) 9161 break; 9162 found |= TCP_OPT_SACK_OK_PRESENT; 9163 up += TCPOPT_SACK_OK_LEN; 9164 continue; 9165 9166 case TCPOPT_SACK: 9167 if (len <= 2 || up[1] <= 2 || len < up[1]) 9168 break; 9169 9170 /* If TCP is not interested in SACK blks... */ 9171 if ((tcp = tcpopt->tcp) == NULL) { 9172 up += up[1]; 9173 continue; 9174 } 9175 sack_len = up[1] - TCPOPT_HEADER_LEN; 9176 up += TCPOPT_HEADER_LEN; 9177 9178 /* 9179 * If the list is empty, allocate one and assume 9180 * nothing is sack'ed. 9181 */ 9182 ASSERT(tcp->tcp_sack_info != NULL); 9183 if (tcp->tcp_notsack_list == NULL) { 9184 tcp_notsack_update(&(tcp->tcp_notsack_list), 9185 tcp->tcp_suna, tcp->tcp_snxt, 9186 &(tcp->tcp_num_notsack_blk), 9187 &(tcp->tcp_cnt_notsack_list)); 9188 9189 /* 9190 * Make sure tcp_notsack_list is not NULL. 9191 * This happens when kmem_alloc(KM_NOSLEEP) 9192 * returns NULL. 9193 */ 9194 if (tcp->tcp_notsack_list == NULL) { 9195 up += sack_len; 9196 continue; 9197 } 9198 tcp->tcp_fack = tcp->tcp_suna; 9199 } 9200 9201 while (sack_len > 0) { 9202 if (up + 8 > endp) { 9203 up = endp; 9204 break; 9205 } 9206 sack_begin = BE32_TO_U32(up); 9207 up += 4; 9208 sack_end = BE32_TO_U32(up); 9209 up += 4; 9210 sack_len -= 8; 9211 /* 9212 * Bounds checking. Make sure the SACK 9213 * info is within tcp_suna and tcp_snxt. 9214 * If this SACK blk is out of bound, ignore 9215 * it but continue to parse the following 9216 * blks. 
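 *
 * For example (illustrative sequence numbers): with tcp_suna at 1000
 * and tcp_snxt at 5000, a reported block [800, 900) is already
 * acknowledged and is skipped, a block [4800, 5200) claims data beyond
 * what we have sent and is skipped, while [1200, 1400) is recorded in
 * the notsack list and may advance tcp_fack.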
9217 */ 9218 if (SEQ_LEQ(sack_end, sack_begin) || 9219 SEQ_LT(sack_begin, tcp->tcp_suna) || 9220 SEQ_GT(sack_end, tcp->tcp_snxt)) { 9221 continue; 9222 } 9223 tcp_notsack_insert(&(tcp->tcp_notsack_list), 9224 sack_begin, sack_end, 9225 &(tcp->tcp_num_notsack_blk), 9226 &(tcp->tcp_cnt_notsack_list)); 9227 if (SEQ_GT(sack_end, tcp->tcp_fack)) { 9228 tcp->tcp_fack = sack_end; 9229 } 9230 } 9231 found |= TCP_OPT_SACK_PRESENT; 9232 continue; 9233 9234 case TCPOPT_TSTAMP: 9235 if (len < TCPOPT_TSTAMP_LEN || 9236 up[1] != TCPOPT_TSTAMP_LEN) 9237 break; 9238 9239 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); 9240 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); 9241 9242 found |= TCP_OPT_TSTAMP_PRESENT; 9243 9244 up += TCPOPT_TSTAMP_LEN; 9245 continue; 9246 9247 default: 9248 if (len <= 1 || len < (int)up[1] || up[1] == 0) 9249 break; 9250 up += up[1]; 9251 continue; 9252 } 9253 break; 9254 } 9255 return (found); 9256 } 9257 9258 /* 9259 * Set the mss associated with a particular tcp based on its current value, 9260 * and a new one passed in. Observe minimums and maximums, and reset 9261 * other state variables that we want to view as multiples of mss. 9262 * 9263 * This function is called in various places mainly because 9264 * 1) Various stuffs, tcp_mss, tcp_cwnd, ... need to be adjusted when the 9265 * other side's SYN/SYN-ACK packet arrives. 9266 * 2) PMTUd may get us a new MSS. 9267 * 3) If the other side stops sending us timestamp option, we need to 9268 * increase the MSS size to use the extra bytes available. 9269 */ 9270 static void 9271 tcp_mss_set(tcp_t *tcp, uint32_t mss) 9272 { 9273 uint32_t mss_max; 9274 9275 if (tcp->tcp_ipversion == IPV4_VERSION) 9276 mss_max = tcp_mss_max_ipv4; 9277 else 9278 mss_max = tcp_mss_max_ipv6; 9279 9280 if (mss < tcp_mss_min) 9281 mss = tcp_mss_min; 9282 if (mss > mss_max) 9283 mss = mss_max; 9284 /* 9285 * Unless naglim has been set by our client to 9286 * a non-mss value, force naglim to track mss. 9287 * This can help to aggregate small writes. 9288 */ 9289 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) 9290 tcp->tcp_naglim = mss; 9291 /* 9292 * TCP should be able to buffer at least 4 MSS data for obvious 9293 * performance reason. 9294 */ 9295 if ((mss << 2) > tcp->tcp_xmit_hiwater) 9296 tcp->tcp_xmit_hiwater = mss << 2; 9297 9298 /* 9299 * Check if we need to apply the tcp_init_cwnd here. If 9300 * it is set and the MSS gets bigger (should not happen 9301 * normally), we need to adjust the resulting tcp_cwnd properly. 9302 * The new tcp_cwnd should not get bigger. 9303 */ 9304 if (tcp->tcp_init_cwnd == 0) { 9305 tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss, 9306 MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))); 9307 } else { 9308 if (tcp->tcp_mss < mss) { 9309 tcp->tcp_cwnd = MAX(1, 9310 (tcp->tcp_init_cwnd * tcp->tcp_mss / mss)) * mss; 9311 } else { 9312 tcp->tcp_cwnd = tcp->tcp_init_cwnd * mss; 9313 } 9314 } 9315 tcp->tcp_mss = mss; 9316 tcp->tcp_cwnd_cnt = 0; 9317 (void) tcp_maxpsz_set(tcp, B_TRUE); 9318 } 9319 9320 static int 9321 tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9322 { 9323 tcp_t *tcp = NULL; 9324 conn_t *connp; 9325 int err; 9326 dev_t conn_dev; 9327 zoneid_t zoneid = getzoneid(); 9328 9329 /* 9330 * Special case for install: miniroot needs to be able to access files 9331 * via NFS as though it were always in the global zone. 
9332 */ 9333 if (credp == kcred && nfs_global_client_only != 0) 9334 zoneid = GLOBAL_ZONEID; 9335 9336 if (q->q_ptr != NULL) 9337 return (0); 9338 9339 if (sflag == MODOPEN) { 9340 /* 9341 * This is a special case. The purpose of a modopen 9342 * is to allow just the T_SVR4_OPTMGMT_REQ to pass 9343 * through for MIB browsers. Everything else is failed. 9344 */ 9345 connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt)); 9346 9347 if (connp == NULL) 9348 return (ENOMEM); 9349 9350 connp->conn_flags |= IPCL_TCPMOD; 9351 connp->conn_cred = credp; 9352 connp->conn_zoneid = zoneid; 9353 q->q_ptr = WR(q)->q_ptr = connp; 9354 crhold(credp); 9355 q->q_qinfo = &tcp_mod_rinit; 9356 WR(q)->q_qinfo = &tcp_mod_winit; 9357 qprocson(q); 9358 return (0); 9359 } 9360 9361 if ((conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) 9362 return (EBUSY); 9363 9364 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 9365 9366 if (flag & SO_ACCEPTOR) { 9367 q->q_qinfo = &tcp_acceptor_rinit; 9368 q->q_ptr = (void *)conn_dev; 9369 WR(q)->q_qinfo = &tcp_acceptor_winit; 9370 WR(q)->q_ptr = (void *)conn_dev; 9371 qprocson(q); 9372 return (0); 9373 } 9374 9375 connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt)); 9376 if (connp == NULL) { 9377 inet_minor_free(ip_minor_arena, conn_dev); 9378 q->q_ptr = NULL; 9379 return (ENOSR); 9380 } 9381 connp->conn_sqp = IP_SQUEUE_GET(lbolt); 9382 tcp = connp->conn_tcp; 9383 9384 q->q_ptr = WR(q)->q_ptr = connp; 9385 if (getmajor(*devp) == TCP6_MAJ) { 9386 connp->conn_flags |= (IPCL_TCP6|IPCL_ISV6); 9387 connp->conn_send = ip_output_v6; 9388 connp->conn_af_isv6 = B_TRUE; 9389 connp->conn_pkt_isv6 = B_TRUE; 9390 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; 9391 tcp->tcp_ipversion = IPV6_VERSION; 9392 tcp->tcp_family = AF_INET6; 9393 tcp->tcp_mss = tcp_mss_def_ipv6; 9394 } else { 9395 connp->conn_flags |= IPCL_TCP4; 9396 connp->conn_send = ip_output; 9397 connp->conn_af_isv6 = B_FALSE; 9398 connp->conn_pkt_isv6 = B_FALSE; 9399 tcp->tcp_ipversion = IPV4_VERSION; 9400 tcp->tcp_family = AF_INET; 9401 tcp->tcp_mss = tcp_mss_def_ipv4; 9402 } 9403 9404 /* 9405 * TCP keeps a copy of cred for cache locality reasons but 9406 * we put a reference only once. If connp->conn_cred 9407 * becomes invalid, tcp_cred should also be set to NULL. 9408 */ 9409 tcp->tcp_cred = connp->conn_cred = credp; 9410 crhold(connp->conn_cred); 9411 tcp->tcp_cpid = curproc->p_pid; 9412 connp->conn_zoneid = zoneid; 9413 connp->conn_mlp_type = mlptSingle; 9414 connp->conn_ulp_labeled = !is_system_labeled(); 9415 9416 /* 9417 * If the caller has the process-wide flag set, then default to MAC 9418 * exempt mode. This allows read-down to unlabeled hosts. 9419 */ 9420 if (getpflags(NET_MAC_AWARE, credp) != 0) 9421 connp->conn_mac_exempt = B_TRUE; 9422 9423 connp->conn_dev = conn_dev; 9424 9425 ASSERT(q->q_qinfo == &tcp_rinit); 9426 ASSERT(WR(q)->q_qinfo == &tcp_winit); 9427 9428 if (flag & SO_SOCKSTR) { 9429 /* 9430 * No need to insert a socket in tcp acceptor hash. 9431 * If it was a socket acceptor stream, we dealt with 9432 * it above. A socket listener can never accept a 9433 * connection and doesn't need acceptor_id. 
9434 */ 9435 connp->conn_flags |= IPCL_SOCKET; 9436 tcp->tcp_issocket = 1; 9437 WR(q)->q_qinfo = &tcp_sock_winit; 9438 } else { 9439 #ifdef _ILP32 9440 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); 9441 #else 9442 tcp->tcp_acceptor_id = conn_dev; 9443 #endif /* _ILP32 */ 9444 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 9445 } 9446 9447 if (tcp_trace) 9448 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_SLEEP); 9449 9450 err = tcp_init(tcp, q); 9451 if (err != 0) { 9452 inet_minor_free(ip_minor_arena, connp->conn_dev); 9453 tcp_acceptor_hash_remove(tcp); 9454 CONN_DEC_REF(connp); 9455 q->q_ptr = WR(q)->q_ptr = NULL; 9456 return (err); 9457 } 9458 9459 RD(q)->q_hiwat = tcp_recv_hiwat; 9460 tcp->tcp_rwnd = tcp_recv_hiwat; 9461 9462 /* Non-zero default values */ 9463 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 9464 /* 9465 * Put the ref for TCP. Ref for IP was already put 9466 * by ipcl_conn_create. Also Make the conn_t globally 9467 * visible to walkers 9468 */ 9469 mutex_enter(&connp->conn_lock); 9470 CONN_INC_REF_LOCKED(connp); 9471 ASSERT(connp->conn_ref == 2); 9472 connp->conn_state_flags &= ~CONN_INCIPIENT; 9473 mutex_exit(&connp->conn_lock); 9474 9475 qprocson(q); 9476 return (0); 9477 } 9478 9479 /* 9480 * Some TCP options can be "set" by requesting them in the option 9481 * buffer. This is needed for XTI feature test though we do not 9482 * allow it in general. We interpret that this mechanism is more 9483 * applicable to OSI protocols and need not be allowed in general. 9484 * This routine filters out options for which it is not allowed (most) 9485 * and lets through those (few) for which it is. [ The XTI interface 9486 * test suite specifics will imply that any XTI_GENERIC level XTI_* if 9487 * ever implemented will have to be allowed here ]. 9488 */ 9489 static boolean_t 9490 tcp_allow_connopt_set(int level, int name) 9491 { 9492 9493 switch (level) { 9494 case IPPROTO_TCP: 9495 switch (name) { 9496 case TCP_NODELAY: 9497 return (B_TRUE); 9498 default: 9499 return (B_FALSE); 9500 } 9501 /*NOTREACHED*/ 9502 default: 9503 return (B_FALSE); 9504 } 9505 /*NOTREACHED*/ 9506 } 9507 9508 /* 9509 * This routine gets default values of certain options whose default 9510 * values are maintained by protocol specific code 9511 */ 9512 /* ARGSUSED */ 9513 int 9514 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 9515 { 9516 int32_t *i1 = (int32_t *)ptr; 9517 9518 switch (level) { 9519 case IPPROTO_TCP: 9520 switch (name) { 9521 case TCP_NOTIFY_THRESHOLD: 9522 *i1 = tcp_ip_notify_interval; 9523 break; 9524 case TCP_ABORT_THRESHOLD: 9525 *i1 = tcp_ip_abort_interval; 9526 break; 9527 case TCP_CONN_NOTIFY_THRESHOLD: 9528 *i1 = tcp_ip_notify_cinterval; 9529 break; 9530 case TCP_CONN_ABORT_THRESHOLD: 9531 *i1 = tcp_ip_abort_cinterval; 9532 break; 9533 default: 9534 return (-1); 9535 } 9536 break; 9537 case IPPROTO_IP: 9538 switch (name) { 9539 case IP_TTL: 9540 *i1 = tcp_ipv4_ttl; 9541 break; 9542 default: 9543 return (-1); 9544 } 9545 break; 9546 case IPPROTO_IPV6: 9547 switch (name) { 9548 case IPV6_UNICAST_HOPS: 9549 *i1 = tcp_ipv6_hoplimit; 9550 break; 9551 default: 9552 return (-1); 9553 } 9554 break; 9555 default: 9556 return (-1); 9557 } 9558 return (sizeof (int)); 9559 } 9560 9561 9562 /* 9563 * TCP routine to get the values of options. 
9564 */ 9565 int 9566 tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 9567 { 9568 int *i1 = (int *)ptr; 9569 conn_t *connp = Q_TO_CONN(q); 9570 tcp_t *tcp = connp->conn_tcp; 9571 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 9572 9573 switch (level) { 9574 case SOL_SOCKET: 9575 switch (name) { 9576 case SO_LINGER: { 9577 struct linger *lgr = (struct linger *)ptr; 9578 9579 lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0; 9580 lgr->l_linger = tcp->tcp_lingertime; 9581 } 9582 return (sizeof (struct linger)); 9583 case SO_DEBUG: 9584 *i1 = tcp->tcp_debug ? SO_DEBUG : 0; 9585 break; 9586 case SO_KEEPALIVE: 9587 *i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0; 9588 break; 9589 case SO_DONTROUTE: 9590 *i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0; 9591 break; 9592 case SO_USELOOPBACK: 9593 *i1 = tcp->tcp_useloopback ? SO_USELOOPBACK : 0; 9594 break; 9595 case SO_BROADCAST: 9596 *i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0; 9597 break; 9598 case SO_REUSEADDR: 9599 *i1 = tcp->tcp_reuseaddr ? SO_REUSEADDR : 0; 9600 break; 9601 case SO_OOBINLINE: 9602 *i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0; 9603 break; 9604 case SO_DGRAM_ERRIND: 9605 *i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0; 9606 break; 9607 case SO_TYPE: 9608 *i1 = SOCK_STREAM; 9609 break; 9610 case SO_SNDBUF: 9611 *i1 = tcp->tcp_xmit_hiwater; 9612 break; 9613 case SO_RCVBUF: 9614 *i1 = RD(q)->q_hiwat; 9615 break; 9616 case SO_SND_COPYAVOID: 9617 *i1 = tcp->tcp_snd_zcopy_on ? 9618 SO_SND_COPYAVOID : 0; 9619 break; 9620 case SO_ALLZONES: 9621 *i1 = connp->conn_allzones ? 1 : 0; 9622 break; 9623 case SO_ANON_MLP: 9624 *i1 = connp->conn_anon_mlp; 9625 break; 9626 case SO_MAC_EXEMPT: 9627 *i1 = connp->conn_mac_exempt; 9628 break; 9629 case SO_EXCLBIND: 9630 *i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0; 9631 break; 9632 default: 9633 return (-1); 9634 } 9635 break; 9636 case IPPROTO_TCP: 9637 switch (name) { 9638 case TCP_NODELAY: 9639 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; 9640 break; 9641 case TCP_MAXSEG: 9642 *i1 = tcp->tcp_mss; 9643 break; 9644 case TCP_NOTIFY_THRESHOLD: 9645 *i1 = (int)tcp->tcp_first_timer_threshold; 9646 break; 9647 case TCP_ABORT_THRESHOLD: 9648 *i1 = tcp->tcp_second_timer_threshold; 9649 break; 9650 case TCP_CONN_NOTIFY_THRESHOLD: 9651 *i1 = tcp->tcp_first_ctimer_threshold; 9652 break; 9653 case TCP_CONN_ABORT_THRESHOLD: 9654 *i1 = tcp->tcp_second_ctimer_threshold; 9655 break; 9656 case TCP_RECVDSTADDR: 9657 *i1 = tcp->tcp_recvdstaddr; 9658 break; 9659 case TCP_ANONPRIVBIND: 9660 *i1 = tcp->tcp_anon_priv_bind; 9661 break; 9662 case TCP_EXCLBIND: 9663 *i1 = tcp->tcp_exclbind ? TCP_EXCLBIND : 0; 9664 break; 9665 case TCP_INIT_CWND: 9666 *i1 = tcp->tcp_init_cwnd; 9667 break; 9668 case TCP_KEEPALIVE_THRESHOLD: 9669 *i1 = tcp->tcp_ka_interval; 9670 break; 9671 case TCP_KEEPALIVE_ABORT_THRESHOLD: 9672 *i1 = tcp->tcp_ka_abort_thres; 9673 break; 9674 case TCP_CORK: 9675 *i1 = tcp->tcp_cork; 9676 break; 9677 default: 9678 return (-1); 9679 } 9680 break; 9681 case IPPROTO_IP: 9682 if (tcp->tcp_family != AF_INET) 9683 return (-1); 9684 switch (name) { 9685 case IP_OPTIONS: 9686 case T_IP_OPTIONS: { 9687 /* 9688 * This is compatible with BSD in that in only return 9689 * the reverse source route with the final destination 9690 * as the last entry. The first 4 bytes of the option 9691 * will contain the final destination. 
9692 */ 9693 int opt_len; 9694 9695 opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha; 9696 opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH; 9697 ASSERT(opt_len >= 0); 9698 /* Caller ensures enough space */ 9699 if (opt_len > 0) { 9700 /* 9701 * TODO: Do we have to handle getsockopt on an 9702 * initiator as well? 9703 */ 9704 return (ip_opt_get_user(tcp->tcp_ipha, ptr)); 9705 } 9706 return (0); 9707 } 9708 case IP_TOS: 9709 case T_IP_TOS: 9710 *i1 = (int)tcp->tcp_ipha->ipha_type_of_service; 9711 break; 9712 case IP_TTL: 9713 *i1 = (int)tcp->tcp_ipha->ipha_ttl; 9714 break; 9715 case IP_NEXTHOP: 9716 /* Handled at IP level */ 9717 return (-EINVAL); 9718 default: 9719 return (-1); 9720 } 9721 break; 9722 case IPPROTO_IPV6: 9723 /* 9724 * IPPROTO_IPV6 options are only supported for sockets 9725 * that are using IPv6 on the wire. 9726 */ 9727 if (tcp->tcp_ipversion != IPV6_VERSION) { 9728 return (-1); 9729 } 9730 switch (name) { 9731 case IPV6_UNICAST_HOPS: 9732 *i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops; 9733 break; /* goto sizeof (int) option return */ 9734 case IPV6_BOUND_IF: 9735 /* Zero if not set */ 9736 *i1 = tcp->tcp_bound_if; 9737 break; /* goto sizeof (int) option return */ 9738 case IPV6_RECVPKTINFO: 9739 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) 9740 *i1 = 1; 9741 else 9742 *i1 = 0; 9743 break; /* goto sizeof (int) option return */ 9744 case IPV6_RECVTCLASS: 9745 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS) 9746 *i1 = 1; 9747 else 9748 *i1 = 0; 9749 break; /* goto sizeof (int) option return */ 9750 case IPV6_RECVHOPLIMIT: 9751 if (tcp->tcp_ipv6_recvancillary & 9752 TCP_IPV6_RECVHOPLIMIT) 9753 *i1 = 1; 9754 else 9755 *i1 = 0; 9756 break; /* goto sizeof (int) option return */ 9757 case IPV6_RECVHOPOPTS: 9758 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) 9759 *i1 = 1; 9760 else 9761 *i1 = 0; 9762 break; /* goto sizeof (int) option return */ 9763 case IPV6_RECVDSTOPTS: 9764 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS) 9765 *i1 = 1; 9766 else 9767 *i1 = 0; 9768 break; /* goto sizeof (int) option return */ 9769 case _OLD_IPV6_RECVDSTOPTS: 9770 if (tcp->tcp_ipv6_recvancillary & 9771 TCP_OLD_IPV6_RECVDSTOPTS) 9772 *i1 = 1; 9773 else 9774 *i1 = 0; 9775 break; /* goto sizeof (int) option return */ 9776 case IPV6_RECVRTHDR: 9777 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) 9778 *i1 = 1; 9779 else 9780 *i1 = 0; 9781 break; /* goto sizeof (int) option return */ 9782 case IPV6_RECVRTHDRDSTOPTS: 9783 if (tcp->tcp_ipv6_recvancillary & 9784 TCP_IPV6_RECVRTDSTOPTS) 9785 *i1 = 1; 9786 else 9787 *i1 = 0; 9788 break; /* goto sizeof (int) option return */ 9789 case IPV6_PKTINFO: { 9790 /* XXX assumes that caller has room for max size! 
*/ 9791 struct in6_pktinfo *pkti; 9792 9793 pkti = (struct in6_pktinfo *)ptr; 9794 if (ipp->ipp_fields & IPPF_IFINDEX) 9795 pkti->ipi6_ifindex = ipp->ipp_ifindex; 9796 else 9797 pkti->ipi6_ifindex = 0; 9798 if (ipp->ipp_fields & IPPF_ADDR) 9799 pkti->ipi6_addr = ipp->ipp_addr; 9800 else 9801 pkti->ipi6_addr = ipv6_all_zeros; 9802 return (sizeof (struct in6_pktinfo)); 9803 } 9804 case IPV6_TCLASS: 9805 if (ipp->ipp_fields & IPPF_TCLASS) 9806 *i1 = ipp->ipp_tclass; 9807 else 9808 *i1 = IPV6_FLOW_TCLASS( 9809 IPV6_DEFAULT_VERS_AND_FLOW); 9810 break; /* goto sizeof (int) option return */ 9811 case IPV6_NEXTHOP: { 9812 sin6_t *sin6 = (sin6_t *)ptr; 9813 9814 if (!(ipp->ipp_fields & IPPF_NEXTHOP)) 9815 return (0); 9816 *sin6 = sin6_null; 9817 sin6->sin6_family = AF_INET6; 9818 sin6->sin6_addr = ipp->ipp_nexthop; 9819 return (sizeof (sin6_t)); 9820 } 9821 case IPV6_HOPOPTS: 9822 if (!(ipp->ipp_fields & IPPF_HOPOPTS)) 9823 return (0); 9824 if (ipp->ipp_hopoptslen <= tcp->tcp_label_len) 9825 return (0); 9826 bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len, 9827 ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len); 9828 if (tcp->tcp_label_len > 0) { 9829 ptr[0] = ((char *)ipp->ipp_hopopts)[0]; 9830 ptr[1] = (ipp->ipp_hopoptslen - 9831 tcp->tcp_label_len + 7) / 8 - 1; 9832 } 9833 return (ipp->ipp_hopoptslen - tcp->tcp_label_len); 9834 case IPV6_RTHDRDSTOPTS: 9835 if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) 9836 return (0); 9837 bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); 9838 return (ipp->ipp_rtdstoptslen); 9839 case IPV6_RTHDR: 9840 if (!(ipp->ipp_fields & IPPF_RTHDR)) 9841 return (0); 9842 bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); 9843 return (ipp->ipp_rthdrlen); 9844 case IPV6_DSTOPTS: 9845 if (!(ipp->ipp_fields & IPPF_DSTOPTS)) 9846 return (0); 9847 bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); 9848 return (ipp->ipp_dstoptslen); 9849 case IPV6_SRC_PREFERENCES: 9850 return (ip6_get_src_preferences(connp, 9851 (uint32_t *)ptr)); 9852 case IPV6_PATHMTU: { 9853 struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr; 9854 9855 if (tcp->tcp_state < TCPS_ESTABLISHED) 9856 return (-1); 9857 9858 return (ip_fill_mtuinfo(&connp->conn_remv6, 9859 connp->conn_fport, mtuinfo)); 9860 } 9861 default: 9862 return (-1); 9863 } 9864 break; 9865 default: 9866 return (-1); 9867 } 9868 return (sizeof (int)); 9869 } 9870 9871 /* 9872 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 9873 * Parameters are assumed to be verified by the caller. 9874 */ 9875 /* ARGSUSED */ 9876 int 9877 tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, 9878 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 9879 void *thisdg_attrs, cred_t *cr, mblk_t *mblk) 9880 { 9881 conn_t *connp = Q_TO_CONN(q); 9882 tcp_t *tcp = connp->conn_tcp; 9883 int *i1 = (int *)invalp; 9884 boolean_t onoff = (*i1 == 0) ? 0 : 1; 9885 boolean_t checkonly; 9886 int reterr; 9887 9888 switch (optset_context) { 9889 case SETFN_OPTCOM_CHECKONLY: 9890 checkonly = B_TRUE; 9891 /* 9892 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 9893 * inlen != 0 implies value supplied and 9894 * we have to "pretend" to set it. 9895 * inlen == 0 implies that there is no 9896 * value part in T_CHECK request and just validation 9897 * done elsewhere should be enough, we just return here. 
9898 */ 9899 if (inlen == 0) { 9900 *outlenp = 0; 9901 return (0); 9902 } 9903 break; 9904 case SETFN_OPTCOM_NEGOTIATE: 9905 checkonly = B_FALSE; 9906 break; 9907 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 9908 case SETFN_CONN_NEGOTIATE: 9909 checkonly = B_FALSE; 9910 /* 9911 * Negotiating local and "association-related" options 9912 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 9913 * primitives is allowed by XTI, but we choose 9914 * to not implement this style negotiation for Internet 9915 * protocols (We interpret it is a must for OSI world but 9916 * optional for Internet protocols) for all options. 9917 * [ Will do only for the few options that enable test 9918 * suites that our XTI implementation of this feature 9919 * works for transports that do allow it ] 9920 */ 9921 if (!tcp_allow_connopt_set(level, name)) { 9922 *outlenp = 0; 9923 return (EINVAL); 9924 } 9925 break; 9926 default: 9927 /* 9928 * We should never get here 9929 */ 9930 *outlenp = 0; 9931 return (EINVAL); 9932 } 9933 9934 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 9935 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 9936 9937 /* 9938 * For TCP, we should have no ancillary data sent down 9939 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 9940 * has to be zero. 9941 */ 9942 ASSERT(thisdg_attrs == NULL); 9943 9944 /* 9945 * For fixed length options, no sanity check 9946 * of passed in length is done. It is assumed *_optcom_req() 9947 * routines do the right thing. 9948 */ 9949 9950 switch (level) { 9951 case SOL_SOCKET: 9952 switch (name) { 9953 case SO_LINGER: { 9954 struct linger *lgr = (struct linger *)invalp; 9955 9956 if (!checkonly) { 9957 if (lgr->l_onoff) { 9958 tcp->tcp_linger = 1; 9959 tcp->tcp_lingertime = lgr->l_linger; 9960 } else { 9961 tcp->tcp_linger = 0; 9962 tcp->tcp_lingertime = 0; 9963 } 9964 /* struct copy */ 9965 *(struct linger *)outvalp = *lgr; 9966 } else { 9967 if (!lgr->l_onoff) { 9968 ((struct linger *)outvalp)->l_onoff = 0; 9969 ((struct linger *)outvalp)->l_linger = 0; 9970 } else { 9971 /* struct copy */ 9972 *(struct linger *)outvalp = *lgr; 9973 } 9974 } 9975 *outlenp = sizeof (struct linger); 9976 return (0); 9977 } 9978 case SO_DEBUG: 9979 if (!checkonly) 9980 tcp->tcp_debug = onoff; 9981 break; 9982 case SO_KEEPALIVE: 9983 if (checkonly) { 9984 /* T_CHECK case */ 9985 break; 9986 } 9987 9988 if (!onoff) { 9989 if (tcp->tcp_ka_enabled) { 9990 if (tcp->tcp_ka_tid != 0) { 9991 (void) TCP_TIMER_CANCEL(tcp, 9992 tcp->tcp_ka_tid); 9993 tcp->tcp_ka_tid = 0; 9994 } 9995 tcp->tcp_ka_enabled = 0; 9996 } 9997 break; 9998 } 9999 if (!tcp->tcp_ka_enabled) { 10000 /* Crank up the keepalive timer */ 10001 tcp->tcp_ka_last_intrvl = 0; 10002 tcp->tcp_ka_tid = TCP_TIMER(tcp, 10003 tcp_keepalive_killer, 10004 MSEC_TO_TICK(tcp->tcp_ka_interval)); 10005 tcp->tcp_ka_enabled = 1; 10006 } 10007 break; 10008 case SO_DONTROUTE: 10009 /* 10010 * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are 10011 * only of interest to IP. We track them here only so 10012 * that we can report their current value. 
10013 */ 10014 if (!checkonly) { 10015 tcp->tcp_dontroute = onoff; 10016 tcp->tcp_connp->conn_dontroute = onoff; 10017 } 10018 break; 10019 case SO_USELOOPBACK: 10020 if (!checkonly) { 10021 tcp->tcp_useloopback = onoff; 10022 tcp->tcp_connp->conn_loopback = onoff; 10023 } 10024 break; 10025 case SO_BROADCAST: 10026 if (!checkonly) { 10027 tcp->tcp_broadcast = onoff; 10028 tcp->tcp_connp->conn_broadcast = onoff; 10029 } 10030 break; 10031 case SO_REUSEADDR: 10032 if (!checkonly) { 10033 tcp->tcp_reuseaddr = onoff; 10034 tcp->tcp_connp->conn_reuseaddr = onoff; 10035 } 10036 break; 10037 case SO_OOBINLINE: 10038 if (!checkonly) 10039 tcp->tcp_oobinline = onoff; 10040 break; 10041 case SO_DGRAM_ERRIND: 10042 if (!checkonly) 10043 tcp->tcp_dgram_errind = onoff; 10044 break; 10045 case SO_SNDBUF: { 10046 tcp_t *peer_tcp; 10047 10048 if (*i1 > tcp_max_buf) { 10049 *outlenp = 0; 10050 return (ENOBUFS); 10051 } 10052 if (checkonly) 10053 break; 10054 10055 tcp->tcp_xmit_hiwater = *i1; 10056 if (tcp_snd_lowat_fraction != 0) 10057 tcp->tcp_xmit_lowater = 10058 tcp->tcp_xmit_hiwater / 10059 tcp_snd_lowat_fraction; 10060 (void) tcp_maxpsz_set(tcp, B_TRUE); 10061 /* 10062 * If we are flow-controlled, recheck the condition. 10063 * There are apps that increase SO_SNDBUF size when 10064 * flow-controlled (EWOULDBLOCK), and expect the flow 10065 * control condition to be lifted right away. 10066 * 10067 * For the fused tcp loopback case, in order to avoid 10068 * a race with the peer's tcp_fuse_rrw() we need to 10069 * hold its fuse_lock while accessing tcp_flow_stopped. 10070 */ 10071 peer_tcp = tcp->tcp_loopback_peer; 10072 ASSERT(!tcp->tcp_fused || peer_tcp != NULL); 10073 if (tcp->tcp_fused) 10074 mutex_enter(&peer_tcp->tcp_fuse_lock); 10075 10076 if (tcp->tcp_flow_stopped && 10077 TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) { 10078 tcp_clrqfull(tcp); 10079 } 10080 if (tcp->tcp_fused) 10081 mutex_exit(&peer_tcp->tcp_fuse_lock); 10082 break; 10083 } 10084 case SO_RCVBUF: 10085 if (*i1 > tcp_max_buf) { 10086 *outlenp = 0; 10087 return (ENOBUFS); 10088 } 10089 /* Silently ignore zero */ 10090 if (!checkonly && *i1 != 0) { 10091 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 10092 (void) tcp_rwnd_set(tcp, *i1); 10093 } 10094 /* 10095 * XXX should we return the rwnd here 10096 * and tcp_opt_get ? 10097 */ 10098 break; 10099 case SO_SND_COPYAVOID: 10100 if (!checkonly) { 10101 /* we only allow enable at most once for now */ 10102 if (tcp->tcp_loopback || 10103 (!tcp->tcp_snd_zcopy_aware && 10104 (onoff != 1 || !tcp_zcopy_check(tcp)))) { 10105 *outlenp = 0; 10106 return (EOPNOTSUPP); 10107 } 10108 tcp->tcp_snd_zcopy_aware = 1; 10109 } 10110 break; 10111 case SO_ALLZONES: 10112 /* Handled at the IP level */ 10113 return (-EINVAL); 10114 case SO_ANON_MLP: 10115 if (!checkonly) { 10116 mutex_enter(&connp->conn_lock); 10117 connp->conn_anon_mlp = onoff; 10118 mutex_exit(&connp->conn_lock); 10119 } 10120 break; 10121 case SO_MAC_EXEMPT: 10122 if (secpolicy_net_mac_aware(cr) != 0 || 10123 IPCL_IS_BOUND(connp)) 10124 return (EACCES); 10125 if (!checkonly) { 10126 mutex_enter(&connp->conn_lock); 10127 connp->conn_mac_exempt = onoff; 10128 mutex_exit(&connp->conn_lock); 10129 } 10130 break; 10131 case SO_EXCLBIND: 10132 if (!checkonly) 10133 tcp->tcp_exclbind = onoff; 10134 break; 10135 default: 10136 *outlenp = 0; 10137 return (EINVAL); 10138 } 10139 break; 10140 case IPPROTO_TCP: 10141 switch (name) { 10142 case TCP_NODELAY: 10143 if (!checkonly) 10144 tcp->tcp_naglim = *i1 ? 
1 : tcp->tcp_mss; 10145 break; 10146 case TCP_NOTIFY_THRESHOLD: 10147 if (!checkonly) 10148 tcp->tcp_first_timer_threshold = *i1; 10149 break; 10150 case TCP_ABORT_THRESHOLD: 10151 if (!checkonly) 10152 tcp->tcp_second_timer_threshold = *i1; 10153 break; 10154 case TCP_CONN_NOTIFY_THRESHOLD: 10155 if (!checkonly) 10156 tcp->tcp_first_ctimer_threshold = *i1; 10157 break; 10158 case TCP_CONN_ABORT_THRESHOLD: 10159 if (!checkonly) 10160 tcp->tcp_second_ctimer_threshold = *i1; 10161 break; 10162 case TCP_RECVDSTADDR: 10163 if (tcp->tcp_state > TCPS_LISTEN) 10164 return (EOPNOTSUPP); 10165 if (!checkonly) 10166 tcp->tcp_recvdstaddr = onoff; 10167 break; 10168 case TCP_ANONPRIVBIND: 10169 if ((reterr = secpolicy_net_privaddr(cr, 0)) != 0) { 10170 *outlenp = 0; 10171 return (reterr); 10172 } 10173 if (!checkonly) { 10174 tcp->tcp_anon_priv_bind = onoff; 10175 } 10176 break; 10177 case TCP_EXCLBIND: 10178 if (!checkonly) 10179 tcp->tcp_exclbind = onoff; 10180 break; /* goto sizeof (int) option return */ 10181 case TCP_INIT_CWND: { 10182 uint32_t init_cwnd = *((uint32_t *)invalp); 10183 10184 if (checkonly) 10185 break; 10186 10187 /* 10188 * Only allow socket with network configuration 10189 * privilege to set the initial cwnd to be larger 10190 * than allowed by RFC 3390. 10191 */ 10192 if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 10193 tcp->tcp_init_cwnd = init_cwnd; 10194 break; 10195 } 10196 if ((reterr = secpolicy_net_config(cr, B_TRUE)) != 0) { 10197 *outlenp = 0; 10198 return (reterr); 10199 } 10200 if (init_cwnd > TCP_MAX_INIT_CWND) { 10201 *outlenp = 0; 10202 return (EINVAL); 10203 } 10204 tcp->tcp_init_cwnd = init_cwnd; 10205 break; 10206 } 10207 case TCP_KEEPALIVE_THRESHOLD: 10208 if (checkonly) 10209 break; 10210 10211 if (*i1 < tcp_keepalive_interval_low || 10212 *i1 > tcp_keepalive_interval_high) { 10213 *outlenp = 0; 10214 return (EINVAL); 10215 } 10216 if (*i1 != tcp->tcp_ka_interval) { 10217 tcp->tcp_ka_interval = *i1; 10218 /* 10219 * Check if we need to restart the 10220 * keepalive timer. 10221 */ 10222 if (tcp->tcp_ka_tid != 0) { 10223 ASSERT(tcp->tcp_ka_enabled); 10224 (void) TCP_TIMER_CANCEL(tcp, 10225 tcp->tcp_ka_tid); 10226 tcp->tcp_ka_last_intrvl = 0; 10227 tcp->tcp_ka_tid = TCP_TIMER(tcp, 10228 tcp_keepalive_killer, 10229 MSEC_TO_TICK(tcp->tcp_ka_interval)); 10230 } 10231 } 10232 break; 10233 case TCP_KEEPALIVE_ABORT_THRESHOLD: 10234 if (!checkonly) { 10235 if (*i1 < tcp_keepalive_abort_interval_low || 10236 *i1 > tcp_keepalive_abort_interval_high) { 10237 *outlenp = 0; 10238 return (EINVAL); 10239 } 10240 tcp->tcp_ka_abort_thres = *i1; 10241 } 10242 break; 10243 case TCP_CORK: 10244 if (!checkonly) { 10245 /* 10246 * if tcp->tcp_cork was set and is now 10247 * being unset, we have to make sure that 10248 * the remaining data gets sent out. 
Also 10249 * unset tcp->tcp_cork so that tcp_wput_data() 10250 * can send data even if it is less than mss 10251 */ 10252 if (tcp->tcp_cork && onoff == 0 && 10253 tcp->tcp_unsent > 0) { 10254 tcp->tcp_cork = B_FALSE; 10255 tcp_wput_data(tcp, NULL, B_FALSE); 10256 } 10257 tcp->tcp_cork = onoff; 10258 } 10259 break; 10260 default: 10261 *outlenp = 0; 10262 return (EINVAL); 10263 } 10264 break; 10265 case IPPROTO_IP: 10266 if (tcp->tcp_family != AF_INET) { 10267 *outlenp = 0; 10268 return (ENOPROTOOPT); 10269 } 10270 switch (name) { 10271 case IP_OPTIONS: 10272 case T_IP_OPTIONS: 10273 reterr = tcp_opt_set_header(tcp, checkonly, 10274 invalp, inlen); 10275 if (reterr) { 10276 *outlenp = 0; 10277 return (reterr); 10278 } 10279 /* OK return - copy input buffer into output buffer */ 10280 if (invalp != outvalp) { 10281 /* don't trust bcopy for identical src/dst */ 10282 bcopy(invalp, outvalp, inlen); 10283 } 10284 *outlenp = inlen; 10285 return (0); 10286 case IP_TOS: 10287 case T_IP_TOS: 10288 if (!checkonly) { 10289 tcp->tcp_ipha->ipha_type_of_service = 10290 (uchar_t)*i1; 10291 tcp->tcp_tos = (uchar_t)*i1; 10292 } 10293 break; 10294 case IP_TTL: 10295 if (!checkonly) { 10296 tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1; 10297 tcp->tcp_ttl = (uchar_t)*i1; 10298 } 10299 break; 10300 case IP_BOUND_IF: 10301 case IP_NEXTHOP: 10302 /* Handled at the IP level */ 10303 return (-EINVAL); 10304 case IP_SEC_OPT: 10305 /* 10306 * We should not allow policy setting after 10307 * we start listening for connections. 10308 */ 10309 if (tcp->tcp_state == TCPS_LISTEN) { 10310 return (EINVAL); 10311 } else { 10312 /* Handled at the IP level */ 10313 return (-EINVAL); 10314 } 10315 default: 10316 *outlenp = 0; 10317 return (EINVAL); 10318 } 10319 break; 10320 case IPPROTO_IPV6: { 10321 ip6_pkt_t *ipp; 10322 10323 /* 10324 * IPPROTO_IPV6 options are only supported for sockets 10325 * that are using IPv6 on the wire. 10326 */ 10327 if (tcp->tcp_ipversion != IPV6_VERSION) { 10328 *outlenp = 0; 10329 return (ENOPROTOOPT); 10330 } 10331 /* 10332 * Only sticky options; no ancillary data 10333 */ 10334 ASSERT(thisdg_attrs == NULL); 10335 ipp = &tcp->tcp_sticky_ipp; 10336 10337 switch (name) { 10338 case IPV6_UNICAST_HOPS: 10339 /* -1 means use default */ 10340 if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { 10341 *outlenp = 0; 10342 return (EINVAL); 10343 } 10344 if (!checkonly) { 10345 if (*i1 == -1) { 10346 tcp->tcp_ip6h->ip6_hops = 10347 ipp->ipp_unicast_hops = 10348 (uint8_t)tcp_ipv6_hoplimit; 10349 ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; 10350 /* Pass modified value to IP. 
*/ 10351 *i1 = tcp->tcp_ip6h->ip6_hops; 10352 } else { 10353 tcp->tcp_ip6h->ip6_hops = 10354 ipp->ipp_unicast_hops = 10355 (uint8_t)*i1; 10356 ipp->ipp_fields |= IPPF_UNICAST_HOPS; 10357 } 10358 reterr = tcp_build_hdrs(q, tcp); 10359 if (reterr != 0) 10360 return (reterr); 10361 } 10362 break; 10363 case IPV6_BOUND_IF: 10364 if (!checkonly) { 10365 int error = 0; 10366 10367 tcp->tcp_bound_if = *i1; 10368 error = ip_opt_set_ill(tcp->tcp_connp, *i1, 10369 B_TRUE, checkonly, level, name, mblk); 10370 if (error != 0) { 10371 *outlenp = 0; 10372 return (error); 10373 } 10374 } 10375 break; 10376 /* 10377 * Set boolean switches for ancillary data delivery 10378 */ 10379 case IPV6_RECVPKTINFO: 10380 if (!checkonly) { 10381 if (onoff) 10382 tcp->tcp_ipv6_recvancillary |= 10383 TCP_IPV6_RECVPKTINFO; 10384 else 10385 tcp->tcp_ipv6_recvancillary &= 10386 ~TCP_IPV6_RECVPKTINFO; 10387 /* Force it to be sent up with the next msg */ 10388 tcp->tcp_recvifindex = 0; 10389 } 10390 break; 10391 case IPV6_RECVTCLASS: 10392 if (!checkonly) { 10393 if (onoff) 10394 tcp->tcp_ipv6_recvancillary |= 10395 TCP_IPV6_RECVTCLASS; 10396 else 10397 tcp->tcp_ipv6_recvancillary &= 10398 ~TCP_IPV6_RECVTCLASS; 10399 } 10400 break; 10401 case IPV6_RECVHOPLIMIT: 10402 if (!checkonly) { 10403 if (onoff) 10404 tcp->tcp_ipv6_recvancillary |= 10405 TCP_IPV6_RECVHOPLIMIT; 10406 else 10407 tcp->tcp_ipv6_recvancillary &= 10408 ~TCP_IPV6_RECVHOPLIMIT; 10409 /* Force it to be sent up with the next msg */ 10410 tcp->tcp_recvhops = 0xffffffffU; 10411 } 10412 break; 10413 case IPV6_RECVHOPOPTS: 10414 if (!checkonly) { 10415 if (onoff) 10416 tcp->tcp_ipv6_recvancillary |= 10417 TCP_IPV6_RECVHOPOPTS; 10418 else 10419 tcp->tcp_ipv6_recvancillary &= 10420 ~TCP_IPV6_RECVHOPOPTS; 10421 } 10422 break; 10423 case IPV6_RECVDSTOPTS: 10424 if (!checkonly) { 10425 if (onoff) 10426 tcp->tcp_ipv6_recvancillary |= 10427 TCP_IPV6_RECVDSTOPTS; 10428 else 10429 tcp->tcp_ipv6_recvancillary &= 10430 ~TCP_IPV6_RECVDSTOPTS; 10431 } 10432 break; 10433 case _OLD_IPV6_RECVDSTOPTS: 10434 if (!checkonly) { 10435 if (onoff) 10436 tcp->tcp_ipv6_recvancillary |= 10437 TCP_OLD_IPV6_RECVDSTOPTS; 10438 else 10439 tcp->tcp_ipv6_recvancillary &= 10440 ~TCP_OLD_IPV6_RECVDSTOPTS; 10441 } 10442 break; 10443 case IPV6_RECVRTHDR: 10444 if (!checkonly) { 10445 if (onoff) 10446 tcp->tcp_ipv6_recvancillary |= 10447 TCP_IPV6_RECVRTHDR; 10448 else 10449 tcp->tcp_ipv6_recvancillary &= 10450 ~TCP_IPV6_RECVRTHDR; 10451 } 10452 break; 10453 case IPV6_RECVRTHDRDSTOPTS: 10454 if (!checkonly) { 10455 if (onoff) 10456 tcp->tcp_ipv6_recvancillary |= 10457 TCP_IPV6_RECVRTDSTOPTS; 10458 else 10459 tcp->tcp_ipv6_recvancillary &= 10460 ~TCP_IPV6_RECVRTDSTOPTS; 10461 } 10462 break; 10463 case IPV6_PKTINFO: 10464 if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) 10465 return (EINVAL); 10466 if (checkonly) 10467 break; 10468 10469 if (inlen == 0) { 10470 ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); 10471 } else { 10472 struct in6_pktinfo *pkti; 10473 10474 pkti = (struct in6_pktinfo *)invalp; 10475 /* 10476 * RFC 3542 states that ipi6_addr must be 10477 * the unspecified address when setting the 10478 * IPV6_PKTINFO sticky socket option on a 10479 * TCP socket. 10480 */ 10481 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 10482 return (EINVAL); 10483 /* 10484 * ip6_set_pktinfo() validates the source 10485 * address and interface index. 
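 *
 * A sketch of hypothetical application-level usage, consistent with the
 * RFC 3542 rule above (the address must be left unspecified, so only the
 * interface index is meaningful):
 *
 *	struct in6_pktinfo p;
 *
 *	(void) memset(&p, 0, sizeof (p));	-- ipi6_addr stays unspecified
 *	p.ipi6_ifindex = ifindex;		-- hypothetical interface index
 *	(void) setsockopt(fd, IPPROTO_IPV6, IPV6_PKTINFO, &p, sizeof (p));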
10486 */ 10487 reterr = ip6_set_pktinfo(cr, tcp->tcp_connp, 10488 pkti, mblk); 10489 if (reterr != 0) 10490 return (reterr); 10491 ipp->ipp_ifindex = pkti->ipi6_ifindex; 10492 ipp->ipp_addr = pkti->ipi6_addr; 10493 if (ipp->ipp_ifindex != 0) 10494 ipp->ipp_fields |= IPPF_IFINDEX; 10495 else 10496 ipp->ipp_fields &= ~IPPF_IFINDEX; 10497 if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) 10498 ipp->ipp_fields |= IPPF_ADDR; 10499 else 10500 ipp->ipp_fields &= ~IPPF_ADDR; 10501 } 10502 reterr = tcp_build_hdrs(q, tcp); 10503 if (reterr != 0) 10504 return (reterr); 10505 break; 10506 case IPV6_TCLASS: 10507 if (inlen != 0 && inlen != sizeof (int)) 10508 return (EINVAL); 10509 if (checkonly) 10510 break; 10511 10512 if (inlen == 0) { 10513 ipp->ipp_fields &= ~IPPF_TCLASS; 10514 } else { 10515 if (*i1 > 255 || *i1 < -1) 10516 return (EINVAL); 10517 if (*i1 == -1) { 10518 ipp->ipp_tclass = 0; 10519 *i1 = 0; 10520 } else { 10521 ipp->ipp_tclass = *i1; 10522 } 10523 ipp->ipp_fields |= IPPF_TCLASS; 10524 } 10525 reterr = tcp_build_hdrs(q, tcp); 10526 if (reterr != 0) 10527 return (reterr); 10528 break; 10529 case IPV6_NEXTHOP: 10530 /* 10531 * IP will verify that the nexthop is reachable 10532 * and fail for sticky options. 10533 */ 10534 if (inlen != 0 && inlen != sizeof (sin6_t)) 10535 return (EINVAL); 10536 if (checkonly) 10537 break; 10538 10539 if (inlen == 0) { 10540 ipp->ipp_fields &= ~IPPF_NEXTHOP; 10541 } else { 10542 sin6_t *sin6 = (sin6_t *)invalp; 10543 10544 if (sin6->sin6_family != AF_INET6) 10545 return (EAFNOSUPPORT); 10546 if (IN6_IS_ADDR_V4MAPPED( 10547 &sin6->sin6_addr)) 10548 return (EADDRNOTAVAIL); 10549 ipp->ipp_nexthop = sin6->sin6_addr; 10550 if (!IN6_IS_ADDR_UNSPECIFIED( 10551 &ipp->ipp_nexthop)) 10552 ipp->ipp_fields |= IPPF_NEXTHOP; 10553 else 10554 ipp->ipp_fields &= ~IPPF_NEXTHOP; 10555 } 10556 reterr = tcp_build_hdrs(q, tcp); 10557 if (reterr != 0) 10558 return (reterr); 10559 break; 10560 case IPV6_HOPOPTS: { 10561 ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; 10562 10563 /* 10564 * Sanity checks - minimum size, size a multiple of 10565 * eight bytes, and matching size passed in. 10566 */ 10567 if (inlen != 0 && 10568 inlen != (8 * (hopts->ip6h_len + 1))) 10569 return (EINVAL); 10570 10571 if (checkonly) 10572 break; 10573 10574 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10575 (uchar_t **)&ipp->ipp_hopopts, 10576 &ipp->ipp_hopoptslen, tcp->tcp_label_len); 10577 if (reterr != 0) 10578 return (reterr); 10579 if (ipp->ipp_hopoptslen == 0) 10580 ipp->ipp_fields &= ~IPPF_HOPOPTS; 10581 else 10582 ipp->ipp_fields |= IPPF_HOPOPTS; 10583 reterr = tcp_build_hdrs(q, tcp); 10584 if (reterr != 0) 10585 return (reterr); 10586 break; 10587 } 10588 case IPV6_RTHDRDSTOPTS: { 10589 ip6_dest_t *dopts = (ip6_dest_t *)invalp; 10590 10591 /* 10592 * Sanity checks - minimum size, size a multiple of 10593 * eight bytes, and matching size passed in. 
10594 */ 10595 if (inlen != 0 && 10596 inlen != (8 * (dopts->ip6d_len + 1))) 10597 return (EINVAL); 10598 10599 if (checkonly) 10600 break; 10601 10602 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10603 (uchar_t **)&ipp->ipp_rtdstopts, 10604 &ipp->ipp_rtdstoptslen, 0); 10605 if (reterr != 0) 10606 return (reterr); 10607 if (ipp->ipp_rtdstoptslen == 0) 10608 ipp->ipp_fields &= ~IPPF_RTDSTOPTS; 10609 else 10610 ipp->ipp_fields |= IPPF_RTDSTOPTS; 10611 reterr = tcp_build_hdrs(q, tcp); 10612 if (reterr != 0) 10613 return (reterr); 10614 break; 10615 } 10616 case IPV6_DSTOPTS: { 10617 ip6_dest_t *dopts = (ip6_dest_t *)invalp; 10618 10619 /* 10620 * Sanity checks - minimum size, size a multiple of 10621 * eight bytes, and matching size passed in. 10622 */ 10623 if (inlen != 0 && 10624 inlen != (8 * (dopts->ip6d_len + 1))) 10625 return (EINVAL); 10626 10627 if (checkonly) 10628 break; 10629 10630 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10631 (uchar_t **)&ipp->ipp_dstopts, 10632 &ipp->ipp_dstoptslen, 0); 10633 if (reterr != 0) 10634 return (reterr); 10635 if (ipp->ipp_dstoptslen == 0) 10636 ipp->ipp_fields &= ~IPPF_DSTOPTS; 10637 else 10638 ipp->ipp_fields |= IPPF_DSTOPTS; 10639 reterr = tcp_build_hdrs(q, tcp); 10640 if (reterr != 0) 10641 return (reterr); 10642 break; 10643 } 10644 case IPV6_RTHDR: { 10645 ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; 10646 10647 /* 10648 * Sanity checks - minimum size, size a multiple of 10649 * eight bytes, and matching size passed in. 10650 */ 10651 if (inlen != 0 && 10652 inlen != (8 * (rt->ip6r_len + 1))) 10653 return (EINVAL); 10654 10655 if (checkonly) 10656 break; 10657 10658 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10659 (uchar_t **)&ipp->ipp_rthdr, 10660 &ipp->ipp_rthdrlen, 0); 10661 if (reterr != 0) 10662 return (reterr); 10663 if (ipp->ipp_rthdrlen == 0) 10664 ipp->ipp_fields &= ~IPPF_RTHDR; 10665 else 10666 ipp->ipp_fields |= IPPF_RTHDR; 10667 reterr = tcp_build_hdrs(q, tcp); 10668 if (reterr != 0) 10669 return (reterr); 10670 break; 10671 } 10672 case IPV6_V6ONLY: 10673 if (!checkonly) 10674 tcp->tcp_connp->conn_ipv6_v6only = onoff; 10675 break; 10676 case IPV6_USE_MIN_MTU: 10677 if (inlen != sizeof (int)) 10678 return (EINVAL); 10679 10680 if (*i1 < -1 || *i1 > 1) 10681 return (EINVAL); 10682 10683 if (checkonly) 10684 break; 10685 10686 ipp->ipp_fields |= IPPF_USE_MIN_MTU; 10687 ipp->ipp_use_min_mtu = *i1; 10688 break; 10689 case IPV6_BOUND_PIF: 10690 /* Handled at the IP level */ 10691 return (-EINVAL); 10692 case IPV6_SEC_OPT: 10693 /* 10694 * We should not allow policy setting after 10695 * we start listening for connections. 
10696 */ 10697 if (tcp->tcp_state == TCPS_LISTEN) { 10698 return (EINVAL); 10699 } else { 10700 /* Handled at the IP level */ 10701 return (-EINVAL); 10702 } 10703 case IPV6_SRC_PREFERENCES: 10704 if (inlen != sizeof (uint32_t)) 10705 return (EINVAL); 10706 reterr = ip6_set_src_preferences(tcp->tcp_connp, 10707 *(uint32_t *)invalp); 10708 if (reterr != 0) { 10709 *outlenp = 0; 10710 return (reterr); 10711 } 10712 break; 10713 default: 10714 *outlenp = 0; 10715 return (EINVAL); 10716 } 10717 break; 10718 } /* end IPPROTO_IPV6 */ 10719 default: 10720 *outlenp = 0; 10721 return (EINVAL); 10722 } 10723 /* 10724 * Common case of OK return with outval same as inval 10725 */ 10726 if (invalp != outvalp) { 10727 /* don't trust bcopy for identical src/dst */ 10728 (void) bcopy(invalp, outvalp, inlen); 10729 } 10730 *outlenp = inlen; 10731 return (0); 10732 } 10733 10734 /* 10735 * Update tcp_sticky_hdrs based on tcp_sticky_ipp. 10736 * The headers include ip6i_t (if needed), ip6_t, any sticky extension 10737 * headers, and the maximum size tcp header (to avoid reallocation 10738 * on the fly for additional tcp options). 10739 * Returns failure if can't allocate memory. 10740 */ 10741 static int 10742 tcp_build_hdrs(queue_t *q, tcp_t *tcp) 10743 { 10744 char *hdrs; 10745 uint_t hdrs_len; 10746 ip6i_t *ip6i; 10747 char buf[TCP_MAX_HDR_LENGTH]; 10748 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 10749 in6_addr_t src, dst; 10750 10751 /* 10752 * save the existing tcp header and source/dest IP addresses 10753 */ 10754 bcopy(tcp->tcp_tcph, buf, tcp->tcp_tcp_hdr_len); 10755 src = tcp->tcp_ip6h->ip6_src; 10756 dst = tcp->tcp_ip6h->ip6_dst; 10757 hdrs_len = ip_total_hdrs_len_v6(ipp) + TCP_MAX_HDR_LENGTH; 10758 ASSERT(hdrs_len != 0); 10759 if (hdrs_len > tcp->tcp_iphc_len) { 10760 /* Need to reallocate */ 10761 hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP); 10762 if (hdrs == NULL) 10763 return (ENOMEM); 10764 if (tcp->tcp_iphc != NULL) { 10765 if (tcp->tcp_hdr_grown) { 10766 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 10767 } else { 10768 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 10769 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 10770 } 10771 tcp->tcp_iphc_len = 0; 10772 } 10773 ASSERT(tcp->tcp_iphc_len == 0); 10774 tcp->tcp_iphc = hdrs; 10775 tcp->tcp_iphc_len = hdrs_len; 10776 tcp->tcp_hdr_grown = B_TRUE; 10777 } 10778 ip_build_hdrs_v6((uchar_t *)tcp->tcp_iphc, 10779 hdrs_len - TCP_MAX_HDR_LENGTH, ipp, IPPROTO_TCP); 10780 10781 /* Set header fields not in ipp */ 10782 if (ipp->ipp_fields & IPPF_HAS_IP6I) { 10783 ip6i = (ip6i_t *)tcp->tcp_iphc; 10784 tcp->tcp_ip6h = (ip6_t *)&ip6i[1]; 10785 } else { 10786 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; 10787 } 10788 /* 10789 * tcp->tcp_ip_hdr_len will include ip6i_t if there is one. 10790 * 10791 * tcp->tcp_tcp_hdr_len doesn't change here. 10792 */ 10793 tcp->tcp_ip_hdr_len = hdrs_len - TCP_MAX_HDR_LENGTH; 10794 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + tcp->tcp_ip_hdr_len); 10795 tcp->tcp_hdr_len = tcp->tcp_ip_hdr_len + tcp->tcp_tcp_hdr_len; 10796 10797 bcopy(buf, tcp->tcp_tcph, tcp->tcp_tcp_hdr_len); 10798 10799 tcp->tcp_ip6h->ip6_src = src; 10800 tcp->tcp_ip6h->ip6_dst = dst; 10801 10802 /* 10803 * If the hop limit was not set by ip_build_hdrs_v6(), set it to 10804 * the default value for TCP. 
10805 */ 10806 if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS)) 10807 tcp->tcp_ip6h->ip6_hops = tcp_ipv6_hoplimit; 10808 10809 /* 10810 * If we're setting extension headers after a connection 10811 * has been established, and if we have a routing header 10812 * among the extension headers, call ip_massage_options_v6 to 10813 * manipulate the routing header/ip6_dst set the checksum 10814 * difference in the tcp header template. 10815 * (This happens in tcp_connect_ipv6 if the routing header 10816 * is set prior to the connect.) 10817 * Set the tcp_sum to zero first in case we've cleared a 10818 * routing header or don't have one at all. 10819 */ 10820 tcp->tcp_sum = 0; 10821 if ((tcp->tcp_state >= TCPS_SYN_SENT) && 10822 (tcp->tcp_ipp_fields & IPPF_RTHDR)) { 10823 ip6_rthdr_t *rth = ip_find_rthdr_v6(tcp->tcp_ip6h, 10824 (uint8_t *)tcp->tcp_tcph); 10825 if (rth != NULL) { 10826 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, 10827 rth); 10828 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 10829 (tcp->tcp_sum >> 16)); 10830 } 10831 } 10832 10833 /* Try to get everything in a single mblk */ 10834 (void) mi_set_sth_wroff(RD(q), hdrs_len + tcp_wroff_xtra); 10835 return (0); 10836 } 10837 10838 /* 10839 * Transfer any source route option from ipha to buf/dst in reversed form. 10840 */ 10841 static int 10842 tcp_opt_rev_src_route(ipha_t *ipha, char *buf, uchar_t *dst) 10843 { 10844 ipoptp_t opts; 10845 uchar_t *opt; 10846 uint8_t optval; 10847 uint8_t optlen; 10848 uint32_t len = 0; 10849 10850 for (optval = ipoptp_first(&opts, ipha); 10851 optval != IPOPT_EOL; 10852 optval = ipoptp_next(&opts)) { 10853 opt = opts.ipoptp_cur; 10854 optlen = opts.ipoptp_len; 10855 switch (optval) { 10856 int off1, off2; 10857 case IPOPT_SSRR: 10858 case IPOPT_LSRR: 10859 10860 /* Reverse source route */ 10861 /* 10862 * First entry should be the next to last one in the 10863 * current source route (the last entry is our 10864 * address.) 10865 * The last entry should be the final destination. 10866 */ 10867 buf[IPOPT_OPTVAL] = (uint8_t)optval; 10868 buf[IPOPT_OLEN] = (uint8_t)optlen; 10869 off1 = IPOPT_MINOFF_SR - 1; 10870 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; 10871 if (off2 < 0) { 10872 /* No entries in source route */ 10873 break; 10874 } 10875 bcopy(opt + off2, dst, IP_ADDR_LEN); 10876 /* 10877 * Note: use src since ipha has not had its src 10878 * and dst reversed (it is in the state it was 10879 * received. 10880 */ 10881 bcopy(&ipha->ipha_src, buf + off2, 10882 IP_ADDR_LEN); 10883 off2 -= IP_ADDR_LEN; 10884 10885 while (off2 > 0) { 10886 bcopy(opt + off2, buf + off1, 10887 IP_ADDR_LEN); 10888 off1 += IP_ADDR_LEN; 10889 off2 -= IP_ADDR_LEN; 10890 } 10891 buf[IPOPT_OFFSET] = IPOPT_MINOFF_SR; 10892 buf += optlen; 10893 len += optlen; 10894 break; 10895 } 10896 } 10897 done: 10898 /* Pad the resulting options */ 10899 while (len & 0x3) { 10900 *buf++ = IPOPT_EOL; 10901 len++; 10902 } 10903 return (len); 10904 } 10905 10906 10907 /* 10908 * Extract and revert a source route from ipha (if any) 10909 * and then update the relevant fields in both tcp_t and the standard header. 
10910 */ 10911 static void 10912 tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha) 10913 { 10914 char buf[TCP_MAX_HDR_LENGTH]; 10915 uint_t tcph_len; 10916 int len; 10917 10918 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 10919 len = IPH_HDR_LENGTH(ipha); 10920 if (len == IP_SIMPLE_HDR_LENGTH) 10921 /* Nothing to do */ 10922 return; 10923 if (len > IP_SIMPLE_HDR_LENGTH + TCP_MAX_IP_OPTIONS_LENGTH || 10924 (len & 0x3)) 10925 return; 10926 10927 tcph_len = tcp->tcp_tcp_hdr_len; 10928 bcopy(tcp->tcp_tcph, buf, tcph_len); 10929 tcp->tcp_sum = (tcp->tcp_ipha->ipha_dst >> 16) + 10930 (tcp->tcp_ipha->ipha_dst & 0xffff); 10931 len = tcp_opt_rev_src_route(ipha, (char *)tcp->tcp_ipha + 10932 IP_SIMPLE_HDR_LENGTH, (uchar_t *)&tcp->tcp_ipha->ipha_dst); 10933 len += IP_SIMPLE_HDR_LENGTH; 10934 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + 10935 (tcp->tcp_ipha->ipha_dst & 0xffff)); 10936 if ((int)tcp->tcp_sum < 0) 10937 tcp->tcp_sum--; 10938 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 10939 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16)); 10940 tcp->tcp_tcph = (tcph_t *)((char *)tcp->tcp_ipha + len); 10941 bcopy(buf, tcp->tcp_tcph, tcph_len); 10942 tcp->tcp_ip_hdr_len = len; 10943 tcp->tcp_ipha->ipha_version_and_hdr_length = 10944 (IP_VERSION << 4) | (len >> 2); 10945 len += tcph_len; 10946 tcp->tcp_hdr_len = len; 10947 } 10948 10949 /* 10950 * Copy the standard header into its new location, 10951 * lay in the new options and then update the relevant 10952 * fields in both tcp_t and the standard header. 10953 */ 10954 static int 10955 tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) 10956 { 10957 uint_t tcph_len; 10958 uint8_t *ip_optp; 10959 tcph_t *new_tcph; 10960 10961 if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) 10962 return (EINVAL); 10963 10964 if (len > IP_MAX_OPT_LENGTH - tcp->tcp_label_len) 10965 return (EINVAL); 10966 10967 if (checkonly) { 10968 /* 10969 * do not really set, just pretend to - T_CHECK 10970 */ 10971 return (0); 10972 } 10973 10974 ip_optp = (uint8_t *)tcp->tcp_ipha + IP_SIMPLE_HDR_LENGTH; 10975 if (tcp->tcp_label_len > 0) { 10976 int padlen; 10977 uint8_t opt; 10978 10979 /* convert list termination to no-ops */ 10980 padlen = tcp->tcp_label_len - ip_optp[IPOPT_OLEN]; 10981 ip_optp += ip_optp[IPOPT_OLEN]; 10982 opt = len > 0 ? IPOPT_NOP : IPOPT_EOL; 10983 while (--padlen >= 0) 10984 *ip_optp++ = opt; 10985 } 10986 tcph_len = tcp->tcp_tcp_hdr_len; 10987 new_tcph = (tcph_t *)(ip_optp + len); 10988 ovbcopy(tcp->tcp_tcph, new_tcph, tcph_len); 10989 tcp->tcp_tcph = new_tcph; 10990 bcopy(ptr, ip_optp, len); 10991 10992 len += IP_SIMPLE_HDR_LENGTH + tcp->tcp_label_len; 10993 10994 tcp->tcp_ip_hdr_len = len; 10995 tcp->tcp_ipha->ipha_version_and_hdr_length = 10996 (IP_VERSION << 4) | (len >> 2); 10997 tcp->tcp_hdr_len = len + tcph_len; 10998 if (!TCP_IS_DETACHED(tcp)) { 10999 /* Always allocate room for all options. */ 11000 (void) mi_set_sth_wroff(tcp->tcp_rq, 11001 TCP_MAX_COMBINED_HEADER_LENGTH + tcp_wroff_xtra); 11002 } 11003 return (0); 11004 } 11005 11006 /* Get callback routine passed to nd_load by tcp_param_register */ 11007 /* ARGSUSED */ 11008 static int 11009 tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 11010 { 11011 tcpparam_t *tcppa = (tcpparam_t *)cp; 11012 11013 (void) mi_mpprintf(mp, "%u", tcppa->tcp_param_val); 11014 return (0); 11015 } 11016 11017 /* 11018 * Walk through the param array specified registering each element with the 11019 * named dispatch handler. 
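 *
 * As a usage sketch (the ndd(1M) invocations below are illustrative; the
 * actual parameter names come from the array that is passed in), each
 * registered tunable can then be read or set from userland:
 *
 *	ndd /dev/tcp tcp_wroff_xtra
 *	ndd -set /dev/tcp tcp_wroff_xtra 32
 *
 * The set path ends up in tcp_param_set() or tcp_param_set_aligned() below,
 * which range-check the new value against tcp_param_min/tcp_param_max.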
11020 */ 11021 static boolean_t 11022 tcp_param_register(tcpparam_t *tcppa, int cnt) 11023 { 11024 for (; cnt-- > 0; tcppa++) { 11025 if (tcppa->tcp_param_name && tcppa->tcp_param_name[0]) { 11026 if (!nd_load(&tcp_g_nd, tcppa->tcp_param_name, 11027 tcp_param_get, tcp_param_set, 11028 (caddr_t)tcppa)) { 11029 nd_free(&tcp_g_nd); 11030 return (B_FALSE); 11031 } 11032 } 11033 } 11034 if (!nd_load(&tcp_g_nd, tcp_wroff_xtra_param.tcp_param_name, 11035 tcp_param_get, tcp_param_set_aligned, 11036 (caddr_t)&tcp_wroff_xtra_param)) { 11037 nd_free(&tcp_g_nd); 11038 return (B_FALSE); 11039 } 11040 if (!nd_load(&tcp_g_nd, tcp_mdt_head_param.tcp_param_name, 11041 tcp_param_get, tcp_param_set_aligned, 11042 (caddr_t)&tcp_mdt_head_param)) { 11043 nd_free(&tcp_g_nd); 11044 return (B_FALSE); 11045 } 11046 if (!nd_load(&tcp_g_nd, tcp_mdt_tail_param.tcp_param_name, 11047 tcp_param_get, tcp_param_set_aligned, 11048 (caddr_t)&tcp_mdt_tail_param)) { 11049 nd_free(&tcp_g_nd); 11050 return (B_FALSE); 11051 } 11052 if (!nd_load(&tcp_g_nd, tcp_mdt_max_pbufs_param.tcp_param_name, 11053 tcp_param_get, tcp_param_set, 11054 (caddr_t)&tcp_mdt_max_pbufs_param)) { 11055 nd_free(&tcp_g_nd); 11056 return (B_FALSE); 11057 } 11058 if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports", 11059 tcp_extra_priv_ports_get, NULL, NULL)) { 11060 nd_free(&tcp_g_nd); 11061 return (B_FALSE); 11062 } 11063 if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports_add", 11064 NULL, tcp_extra_priv_ports_add, NULL)) { 11065 nd_free(&tcp_g_nd); 11066 return (B_FALSE); 11067 } 11068 if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports_del", 11069 NULL, tcp_extra_priv_ports_del, NULL)) { 11070 nd_free(&tcp_g_nd); 11071 return (B_FALSE); 11072 } 11073 if (!nd_load(&tcp_g_nd, "tcp_status", tcp_status_report, NULL, 11074 NULL)) { 11075 nd_free(&tcp_g_nd); 11076 return (B_FALSE); 11077 } 11078 if (!nd_load(&tcp_g_nd, "tcp_bind_hash", tcp_bind_hash_report, 11079 NULL, NULL)) { 11080 nd_free(&tcp_g_nd); 11081 return (B_FALSE); 11082 } 11083 if (!nd_load(&tcp_g_nd, "tcp_listen_hash", tcp_listen_hash_report, 11084 NULL, NULL)) { 11085 nd_free(&tcp_g_nd); 11086 return (B_FALSE); 11087 } 11088 if (!nd_load(&tcp_g_nd, "tcp_conn_hash", tcp_conn_hash_report, 11089 NULL, NULL)) { 11090 nd_free(&tcp_g_nd); 11091 return (B_FALSE); 11092 } 11093 if (!nd_load(&tcp_g_nd, "tcp_acceptor_hash", tcp_acceptor_hash_report, 11094 NULL, NULL)) { 11095 nd_free(&tcp_g_nd); 11096 return (B_FALSE); 11097 } 11098 if (!nd_load(&tcp_g_nd, "tcp_host_param", tcp_host_param_report, 11099 tcp_host_param_set, NULL)) { 11100 nd_free(&tcp_g_nd); 11101 return (B_FALSE); 11102 } 11103 if (!nd_load(&tcp_g_nd, "tcp_host_param_ipv6", tcp_host_param_report, 11104 tcp_host_param_set_ipv6, NULL)) { 11105 nd_free(&tcp_g_nd); 11106 return (B_FALSE); 11107 } 11108 if (!nd_load(&tcp_g_nd, "tcp_1948_phrase", NULL, tcp_1948_phrase_set, 11109 NULL)) { 11110 nd_free(&tcp_g_nd); 11111 return (B_FALSE); 11112 } 11113 if (!nd_load(&tcp_g_nd, "tcp_reserved_port_list", 11114 tcp_reserved_port_list, NULL, NULL)) { 11115 nd_free(&tcp_g_nd); 11116 return (B_FALSE); 11117 } 11118 /* 11119 * Dummy ndd variables - only to convey obsolescence information 11120 * through printing of their name (no get or set routines) 11121 * XXX Remove in future releases ? 
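 * For example, listing the parameters with 'ndd /dev/tcp \?' (illustrative
 * invocation) still prints the name
 * "tcp_close_wait_interval(obsoleted - use tcp_time_wait_interval)",
 * which is the only hint an administrator gets to move to the new name.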
11122 */ 11123 if (!nd_load(&tcp_g_nd, 11124 "tcp_close_wait_interval(obsoleted - " 11125 "use tcp_time_wait_interval)", NULL, NULL, NULL)) { 11126 nd_free(&tcp_g_nd); 11127 return (B_FALSE); 11128 } 11129 return (B_TRUE); 11130 } 11131 11132 /* ndd set routine for tcp_wroff_xtra, tcp_mdt_hdr_{head,tail}_min. */ 11133 /* ARGSUSED */ 11134 static int 11135 tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 11136 cred_t *cr) 11137 { 11138 long new_value; 11139 tcpparam_t *tcppa = (tcpparam_t *)cp; 11140 11141 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11142 new_value < tcppa->tcp_param_min || 11143 new_value > tcppa->tcp_param_max) { 11144 return (EINVAL); 11145 } 11146 /* 11147 * Need to make sure new_value is a multiple of 4. If it is not, 11148 * round it up. For future 64 bit requirement, we actually make it 11149 * a multiple of 8. 11150 */ 11151 if (new_value & 0x7) { 11152 new_value = (new_value & ~0x7) + 0x8; 11153 } 11154 tcppa->tcp_param_val = new_value; 11155 return (0); 11156 } 11157 11158 /* Set callback routine passed to nd_load by tcp_param_register */ 11159 /* ARGSUSED */ 11160 static int 11161 tcp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 11162 { 11163 long new_value; 11164 tcpparam_t *tcppa = (tcpparam_t *)cp; 11165 11166 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11167 new_value < tcppa->tcp_param_min || 11168 new_value > tcppa->tcp_param_max) { 11169 return (EINVAL); 11170 } 11171 tcppa->tcp_param_val = new_value; 11172 return (0); 11173 } 11174 11175 /* 11176 * Add a new piece to the tcp reassembly queue. If the gap at the beginning 11177 * is filled, return as much as we can. The message passed in may be 11178 * multi-part, chained using b_cont. "start" is the starting sequence 11179 * number for this piece. 11180 */ 11181 static mblk_t * 11182 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 11183 { 11184 uint32_t end; 11185 mblk_t *mp1; 11186 mblk_t *mp2; 11187 mblk_t *next_mp; 11188 uint32_t u1; 11189 11190 /* Walk through all the new pieces. */ 11191 do { 11192 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 11193 (uintptr_t)INT_MAX); 11194 end = start + (int)(mp->b_wptr - mp->b_rptr); 11195 next_mp = mp->b_cont; 11196 if (start == end) { 11197 /* Empty. Blast it. */ 11198 freeb(mp); 11199 continue; 11200 } 11201 mp->b_cont = NULL; 11202 TCP_REASS_SET_SEQ(mp, start); 11203 TCP_REASS_SET_END(mp, end); 11204 mp1 = tcp->tcp_reass_tail; 11205 if (!mp1) { 11206 tcp->tcp_reass_tail = mp; 11207 tcp->tcp_reass_head = mp; 11208 BUMP_MIB(&tcp_mib, tcpInDataUnorderSegs); 11209 UPDATE_MIB(&tcp_mib, 11210 tcpInDataUnorderBytes, end - start); 11211 continue; 11212 } 11213 /* New stuff completely beyond tail? */ 11214 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 11215 /* Link it on end. */ 11216 mp1->b_cont = mp; 11217 tcp->tcp_reass_tail = mp; 11218 BUMP_MIB(&tcp_mib, tcpInDataUnorderSegs); 11219 UPDATE_MIB(&tcp_mib, 11220 tcpInDataUnorderBytes, end - start); 11221 continue; 11222 } 11223 mp1 = tcp->tcp_reass_head; 11224 u1 = TCP_REASS_SEQ(mp1); 11225 /* New stuff at the front? */ 11226 if (SEQ_LT(start, u1)) { 11227 /* Yes... Check for overlap. */ 11228 mp->b_cont = mp1; 11229 tcp->tcp_reass_head = mp; 11230 tcp_reass_elim_overlap(tcp, mp); 11231 continue; 11232 } 11233 /* 11234 * The new piece fits somewhere between the head and tail. 11235 * We find our slot, where mp1 precedes us and mp2 trails. 
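 * As an illustration (made-up sequence numbers): if the list already
 * holds [100, 200) and [400, 500) and the new piece starts at 250, the
 * scan below stops at the [400, 500) block, so mp1 is [100, 200), mp2
 * is [400, 500), and the new mblk is linked in between them before the
 * overlap trimming runs.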
11236 */ 11237 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 11238 u1 = TCP_REASS_SEQ(mp2); 11239 if (SEQ_LEQ(start, u1)) 11240 break; 11241 } 11242 /* Link ourselves in */ 11243 mp->b_cont = mp2; 11244 mp1->b_cont = mp; 11245 11246 /* Trim overlap with following mblk(s) first */ 11247 tcp_reass_elim_overlap(tcp, mp); 11248 11249 /* Trim overlap with preceding mblk */ 11250 tcp_reass_elim_overlap(tcp, mp1); 11251 11252 } while (start = end, mp = next_mp); 11253 mp1 = tcp->tcp_reass_head; 11254 /* Anything ready to go? */ 11255 if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 11256 return (NULL); 11257 /* Eat what we can off the queue */ 11258 for (;;) { 11259 mp = mp1->b_cont; 11260 end = TCP_REASS_END(mp1); 11261 TCP_REASS_SET_SEQ(mp1, 0); 11262 TCP_REASS_SET_END(mp1, 0); 11263 if (!mp) { 11264 tcp->tcp_reass_tail = NULL; 11265 break; 11266 } 11267 if (end != TCP_REASS_SEQ(mp)) { 11268 mp1->b_cont = NULL; 11269 break; 11270 } 11271 mp1 = mp; 11272 } 11273 mp1 = tcp->tcp_reass_head; 11274 tcp->tcp_reass_head = mp; 11275 return (mp1); 11276 } 11277 11278 /* Eliminate any overlap that mp may have over later mblks */ 11279 static void 11280 tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 11281 { 11282 uint32_t end; 11283 mblk_t *mp1; 11284 uint32_t u1; 11285 11286 end = TCP_REASS_END(mp); 11287 while ((mp1 = mp->b_cont) != NULL) { 11288 u1 = TCP_REASS_SEQ(mp1); 11289 if (!SEQ_GT(end, u1)) 11290 break; 11291 if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 11292 mp->b_wptr -= end - u1; 11293 TCP_REASS_SET_END(mp, u1); 11294 BUMP_MIB(&tcp_mib, tcpInDataPartDupSegs); 11295 UPDATE_MIB(&tcp_mib, tcpInDataPartDupBytes, end - u1); 11296 break; 11297 } 11298 mp->b_cont = mp1->b_cont; 11299 TCP_REASS_SET_SEQ(mp1, 0); 11300 TCP_REASS_SET_END(mp1, 0); 11301 freeb(mp1); 11302 BUMP_MIB(&tcp_mib, tcpInDataDupSegs); 11303 UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, end - u1); 11304 } 11305 if (!mp1) 11306 tcp->tcp_reass_tail = mp; 11307 } 11308 11309 /* 11310 * Send up all messages queued on tcp_rcv_list. 11311 */ 11312 static uint_t 11313 tcp_rcv_drain(queue_t *q, tcp_t *tcp) 11314 { 11315 mblk_t *mp; 11316 uint_t ret = 0; 11317 uint_t thwin; 11318 #ifdef DEBUG 11319 uint_t cnt = 0; 11320 #endif 11321 /* Can't drain on an eager connection */ 11322 if (tcp->tcp_listener != NULL) 11323 return (ret); 11324 11325 /* 11326 * Handle two cases here: we are currently fused or we were 11327 * previously fused and have some urgent data to be delivered 11328 * upstream. The latter happens because we either ran out of 11329 * memory or were detached and therefore sending the SIGURG was 11330 * deferred until this point. In either case we pass control 11331 * over to tcp_fuse_rcv_drain() since it may need to complete 11332 * some work. 11333 */ 11334 if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { 11335 ASSERT(tcp->tcp_fused_sigurg_mp != NULL); 11336 if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : 11337 &tcp->tcp_fused_sigurg_mp)) 11338 return (ret); 11339 } 11340 11341 while ((mp = tcp->tcp_rcv_list) != NULL) { 11342 tcp->tcp_rcv_list = mp->b_next; 11343 mp->b_next = NULL; 11344 #ifdef DEBUG 11345 cnt += msgdsize(mp); 11346 #endif 11347 /* Does this need SSL processing first? 
 */
		if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) {
			tcp_kssl_input(tcp, mp);
			continue;
		}
		putnext(q, mp);
	}
	ASSERT(cnt == tcp->tcp_rcv_cnt);
	tcp->tcp_rcv_last_head = NULL;
	tcp->tcp_rcv_last_tail = NULL;
	tcp->tcp_rcv_cnt = 0;

	/* Learn the latest rwnd information that we sent to the other side. */
	thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win))
	    << tcp->tcp_rcv_ws;
	/* This is peer's calculated send window (our receive window). */
	thwin -= tcp->tcp_rnxt - tcp->tcp_rack;
	/*
	 * Increase the receive window to max. But we need to do receiver
	 * SWS avoidance. This means that we need to check that the increase
	 * of the receive window is at least 1 MSS.
	 */
	if (canputnext(q) && (q->q_hiwat - thwin >= tcp->tcp_mss)) {
		/*
		 * If the window that the other side knows is less than the
		 * maximum number of deferred ACK segments times the MSS,
		 * send an update immediately.
		 */
		if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) {
			BUMP_MIB(&tcp_mib, tcpOutWinUpdate);
			ret = TH_ACK_NEEDED;
		}
		tcp->tcp_rwnd = q->q_hiwat;
	}
	/* No need for the push timer now. */
	if (tcp->tcp_push_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
		tcp->tcp_push_tid = 0;
	}
	return (ret);
}

/*
 * Queue data on tcp_rcv_list which is a b_next chain.
 * tcp_rcv_last_head/tail is the last element of this chain.
 * Each element of the chain is a b_cont chain.
 *
 * M_DATA messages are added to the current element.
 * Other messages are added as new (b_next) elements.
 */
void
tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len)
{
	ASSERT(seg_len == msgdsize(mp));
	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL);

	if (tcp->tcp_rcv_list == NULL) {
		ASSERT(tcp->tcp_rcv_last_head == NULL);
		tcp->tcp_rcv_list = mp;
		tcp->tcp_rcv_last_head = mp;
	} else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) {
		tcp->tcp_rcv_last_tail->b_cont = mp;
	} else {
		tcp->tcp_rcv_last_head->b_next = mp;
		tcp->tcp_rcv_last_head = mp;
	}

	while (mp->b_cont)
		mp = mp->b_cont;

	tcp->tcp_rcv_last_tail = mp;
	tcp->tcp_rcv_cnt += seg_len;
	tcp->tcp_rwnd -= seg_len;
}

/*
 * DEFAULT TCP ENTRY POINT via squeue on READ side.
 *
 * This is the default entry function into TCP on the read side. TCP is
 * always entered via squeue, i.e. using squeues for mutual exclusion.
 * When the classifier does a lookup to find the tcp, it also puts a
 * reference on the associated conn structure so the tcp is guaranteed to
 * exist when we come here. We still need to check the state because it
 * might well have been closed. The squeue processing function, i.e.
 * squeue_enter, squeue_enter_nodrain, or squeue_drain, is responsible for
 * doing the CONN_DEC_REF.
 *
 * Apart from the default entry point, IP also sends packets directly to
 * tcp_rput_data for the AF_INET fast path and to tcp_conn_request for
 * incoming connections.
 */
void
tcp_input(void *arg, mblk_t *mp, void *arg2)
{
	conn_t *connp = (conn_t *)arg;
	tcp_t *tcp = (tcp_t *)connp->conn_tcp;

	/* arg2 is the sqp */
	ASSERT(arg2 != NULL);
	ASSERT(mp != NULL);

	/*
	 * Don't accept any input on a closed tcp as this TCP logically does
	 * not exist on the system. Don't proceed further with this TCP.
	 * For example, this packet could trigger another close of this tcp
	 * which would be disastrous for tcp_refcnt. tcp_close_detached /
	 * tcp_clean_death / tcp_closei_local must be called at most once
	 * on a TCP. In this case we need to refeed the packet into the
	 * classifier and figure out where the packet should go. Need to
	 * preserve the recv_ill somehow. Until we figure that out, for
	 * now just drop the packet if we can't classify the packet.
	 */
	if (tcp->tcp_state == TCPS_CLOSED ||
	    tcp->tcp_state == TCPS_BOUND) {
		conn_t *new_connp;

		new_connp = ipcl_classify(mp, connp->conn_zoneid);
		if (new_connp != NULL) {
			tcp_reinput(new_connp, mp, arg2);
			return;
		}
		/* We failed to classify. For now just drop the packet */
		freemsg(mp);
		return;
	}

	if (DB_TYPE(mp) == M_DATA)
		tcp_rput_data(connp, mp, arg2);
	else
		tcp_rput_common(tcp, mp);
}

/*
 * The read side put procedure.
 * The packets passed up by ip are assumed to be aligned according to
 * OK_32PTR, with the IP+TCP headers fitting in the first mblk.
 */
static void
tcp_rput_common(tcp_t *tcp, mblk_t *mp)
{
	/*
	 * tcp_rput_data() does not expect M_CTL except for the case
	 * where tcp_ipv6_recvancillary is set and we get an IN_PKTINFO
	 * type. Need to make sure that any other M_CTLs don't make
	 * it to tcp_rput_data since it is not expecting any and doesn't
	 * check for it.
	 */
	if (DB_TYPE(mp) == M_CTL) {
		switch (*(uint32_t *)(mp->b_rptr)) {
		case TCP_IOC_ABORT_CONN:
			/*
			 * Handle connection abort request.
			 */
			tcp_ioctl_abort_handler(tcp, mp);
			return;
		case IPSEC_IN:
			/*
			 * Only secure ICMP messages arrive in TCP, and they
			 * don't go through the data path.
			 */
			tcp_icmp_error(tcp, mp);
			return;
		case IN_PKTINFO:
			/*
			 * Handle the IPV6_RECVPKTINFO socket option on
			 * AF_INET6 sockets that are receiving IPv4 traffic.
			 */
			ASSERT(tcp->tcp_family == AF_INET6);
			ASSERT(tcp->tcp_ipv6_recvancillary &
			    TCP_IPV6_RECVPKTINFO);
			tcp_rput_data(tcp->tcp_connp, mp,
			    tcp->tcp_connp->conn_sqp);
			return;
		case MDT_IOC_INFO_UPDATE:
			/*
			 * Handle Multidata information update; the
			 * following routine will free the message.
			 */
			if (tcp->tcp_connp->conn_mdt_ok) {
				tcp_mdt_update(tcp,
				    &((ip_mdt_info_t *)mp->b_rptr)->mdt_capab,
				    B_FALSE);
			}
			freemsg(mp);
			return;
		default:
			break;
		}
	}

	/* No point processing the message if tcp is already closed */
	if (TCP_IS_DETACHED_NONEAGER(tcp)) {
		freemsg(mp);
		return;
	}

	tcp_rput_other(tcp, mp);
}


/* The minimum value of the smoothed mean deviation used in the RTO calculation. */
#define	TCP_SD_MIN	400

/*
 * Set RTO for this connection. The formula is from Jacobson and Karels'
 * "Congestion Avoidance and Control" in SIGCOMM '88.
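 * Both estimators are kept pre-scaled so that the update and the final
 * RTO can be computed with shifts: sa holds 8 times the average and sv
 * holds 4 times the mean deviation. As a rough, made-up illustration,
 * sa == 800 and sv == 400 describe a 100 ms average RTT with a 100 ms
 * deviation, and the formula further down then yields
 * RTO = 100 + 400 + tcp_rexmit_interval_extra + 25 ms.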
 * The variable names
 * are the same as those in Appendix A.2 of that paper.
 *
 * m = new measurement
 * sa = smoothed RTT average (8 * average estimates).
 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
 */
static void
tcp_set_rto(tcp_t *tcp, clock_t rtt)
{
	long m = TICK_TO_MSEC(rtt);
	clock_t sa = tcp->tcp_rtt_sa;
	clock_t sv = tcp->tcp_rtt_sd;
	clock_t rto;

	BUMP_MIB(&tcp_mib, tcpRttUpdate);
	tcp->tcp_rtt_update++;

	/* A non-zero tcp_rtt_sa means this is not the first sample. */
	if (sa != 0) {
		/*
		 * Update average estimator:
		 * new rtt = 7/8 old rtt + 1/8 new measurement
		 */

		/* m is now Error in estimate. */
		m -= sa >> 3;
		if ((sa += m) <= 0) {
			/*
			 * Don't allow the smoothed average to be negative.
			 * We use 0 to denote reinitialization of the
			 * variables.
			 */
			sa = 1;
		}

		/*
		 * Update deviation estimator:
		 * new mdev = 3/4 old mdev + 1/4 abs(Error)
		 */
		if (m < 0)
			m = -m;
		m -= sv >> 2;
		sv += m;
	} else {
		/*
		 * This follows BSD's implementation. So the reinitialized
		 * RTO is 3 * m. We cannot go less than 2 because if the
		 * link is bandwidth dominated, doubling the window size
		 * during slow start means doubling the RTT. We want to be
		 * more conservative when we reinitialize our estimates. 3
		 * is just a convenient number.
		 */
		sa = m << 3;
		sv = m << 1;
	}
	if (sv < TCP_SD_MIN) {
		/*
		 * We do not know whether sa captures the delayed ACK
		 * effect, since in a long train of segments a receiver
		 * does not delay its ACKs. So set the minimum of sv
		 * to TCP_SD_MIN, which defaults to 400 ms, twice the
		 * BSD DATO. That means the minimum mean deviation
		 * is 100 ms.
		 */
		sv = TCP_SD_MIN;
	}
	tcp->tcp_rtt_sa = sa;
	tcp->tcp_rtt_sd = sv;
	/*
	 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
	 *
	 * Add tcp_rexmit_interval_extra in case of an extreme environment
	 * where the algorithm fails to work. The default value of
	 * tcp_rexmit_interval_extra should be 0.
	 *
	 * As we use a finer grained clock than BSD and update
	 * RTO for every ACK, add in another .25 of RTT to the
	 * deviation of RTO to accommodate burstiness of 1/4 of
	 * window size.
	 */
	rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5);

	if (rto > tcp_rexmit_interval_max) {
		tcp->tcp_rto = tcp_rexmit_interval_max;
	} else if (rto < tcp_rexmit_interval_min) {
		tcp->tcp_rto = tcp_rexmit_interval_min;
	} else {
		tcp->tcp_rto = rto;
	}

	/* Now, we can reset tcp_timer_backoff to use the new RTO... */
	tcp->tcp_timer_backoff = 0;
}

/*
 * tcp_get_seg_mp() is called to get the pointer to a segment in the
 * send queue which starts at the given seq. no.
 *
 * Parameters:
 *	tcp_t *tcp: the tcp instance pointer.
 *	uint32_t seq: the starting seq. no of the requested segment.
 *	int32_t *off: after the execution, *off will be the offset to
 *		the returned mblk which points to the requested seq no.
 *		It is the caller's responsibility to send in a non-null off.
 *
 * Return:
 *	An mblk_t pointer pointing to the requested segment in the send queue.
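 *
 * For example (hypothetical numbers): with tcp_suna at 1000, three
 * 300-byte mblks on tcp_xmit_head and seq == 1500, the walk skips the
 * first mblk and returns the second one with *off set to 200.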
11660 */ 11661 static mblk_t * 11662 tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 11663 { 11664 int32_t cnt; 11665 mblk_t *mp; 11666 11667 /* Defensive coding. Make sure we don't send incorrect data. */ 11668 if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt)) 11669 return (NULL); 11670 11671 cnt = seq - tcp->tcp_suna; 11672 mp = tcp->tcp_xmit_head; 11673 while (cnt > 0 && mp != NULL) { 11674 cnt -= mp->b_wptr - mp->b_rptr; 11675 if (cnt < 0) { 11676 cnt += mp->b_wptr - mp->b_rptr; 11677 break; 11678 } 11679 mp = mp->b_cont; 11680 } 11681 ASSERT(mp != NULL); 11682 *off = cnt; 11683 return (mp); 11684 } 11685 11686 /* 11687 * This function handles all retransmissions if SACK is enabled for this 11688 * connection. First it calculates how many segments can be retransmitted 11689 * based on tcp_pipe. Then it goes thru the notsack list to find eligible 11690 * segments. A segment is eligible if sack_cnt for that segment is greater 11691 * than or equal tcp_dupack_fast_retransmit. After it has retransmitted 11692 * all eligible segments, it checks to see if TCP can send some new segments 11693 * (fast recovery). If it can, set the appropriate flag for tcp_rput_data(). 11694 * 11695 * Parameters: 11696 * tcp_t *tcp: the tcp structure of the connection. 11697 * uint_t *flags: in return, appropriate value will be set for 11698 * tcp_rput_data(). 11699 */ 11700 static void 11701 tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) 11702 { 11703 notsack_blk_t *notsack_blk; 11704 int32_t usable_swnd; 11705 int32_t mss; 11706 uint32_t seg_len; 11707 mblk_t *xmit_mp; 11708 11709 ASSERT(tcp->tcp_sack_info != NULL); 11710 ASSERT(tcp->tcp_notsack_list != NULL); 11711 ASSERT(tcp->tcp_rexmit == B_FALSE); 11712 11713 /* Defensive coding in case there is a bug... */ 11714 if (tcp->tcp_notsack_list == NULL) { 11715 return; 11716 } 11717 notsack_blk = tcp->tcp_notsack_list; 11718 mss = tcp->tcp_mss; 11719 11720 /* 11721 * Limit the num of outstanding data in the network to be 11722 * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 11723 */ 11724 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 11725 11726 /* At least retransmit 1 MSS of data. */ 11727 if (usable_swnd <= 0) { 11728 usable_swnd = mss; 11729 } 11730 11731 /* Make sure no new RTT samples will be taken. */ 11732 tcp->tcp_csuna = tcp->tcp_snxt; 11733 11734 notsack_blk = tcp->tcp_notsack_list; 11735 while (usable_swnd > 0) { 11736 mblk_t *snxt_mp, *tmp_mp; 11737 tcp_seq begin = tcp->tcp_sack_snxt; 11738 tcp_seq end; 11739 int32_t off; 11740 11741 for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { 11742 if (SEQ_GT(notsack_blk->end, begin) && 11743 (notsack_blk->sack_cnt >= 11744 tcp_dupack_fast_retransmit)) { 11745 end = notsack_blk->end; 11746 if (SEQ_LT(begin, notsack_blk->begin)) { 11747 begin = notsack_blk->begin; 11748 } 11749 break; 11750 } 11751 } 11752 /* 11753 * All holes are filled. Manipulate tcp_cwnd to send more 11754 * if we can. Note that after the SACK recovery, tcp_cwnd is 11755 * set to tcp_cwnd_ssthresh. 
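 * As a made-up example: with tcp_cwnd_ssthresh at 8760, tcp_pipe at
 * 5840 and an mss of 1460, usable_swnd recomputes to 2920; assuming
 * there is unsent data, tcp_cwnd becomes (tcp_snxt - tcp_suna) + 2 * mss
 * and TH_XMIT_NEEDED is flagged for tcp_rput_data().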
11756 */ 11757 if (notsack_blk == NULL) { 11758 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 11759 if (usable_swnd <= 0 || tcp->tcp_unsent == 0) { 11760 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; 11761 ASSERT(tcp->tcp_cwnd > 0); 11762 return; 11763 } else { 11764 usable_swnd = usable_swnd / mss; 11765 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + 11766 MAX(usable_swnd * mss, mss); 11767 *flags |= TH_XMIT_NEEDED; 11768 return; 11769 } 11770 } 11771 11772 /* 11773 * Note that we may send more than usable_swnd allows here 11774 * because of round off, but no more than 1 MSS of data. 11775 */ 11776 seg_len = end - begin; 11777 if (seg_len > mss) 11778 seg_len = mss; 11779 snxt_mp = tcp_get_seg_mp(tcp, begin, &off); 11780 ASSERT(snxt_mp != NULL); 11781 /* This should not happen. Defensive coding again... */ 11782 if (snxt_mp == NULL) { 11783 return; 11784 } 11785 11786 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, 11787 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); 11788 if (xmit_mp == NULL) 11789 return; 11790 11791 usable_swnd -= seg_len; 11792 tcp->tcp_pipe += seg_len; 11793 tcp->tcp_sack_snxt = begin + seg_len; 11794 TCP_RECORD_TRACE(tcp, xmit_mp, TCP_TRACE_SEND_PKT); 11795 tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); 11796 11797 /* 11798 * Update the send timestamp to avoid false retransmission. 11799 */ 11800 snxt_mp->b_prev = (mblk_t *)lbolt; 11801 11802 BUMP_MIB(&tcp_mib, tcpRetransSegs); 11803 UPDATE_MIB(&tcp_mib, tcpRetransBytes, seg_len); 11804 BUMP_MIB(&tcp_mib, tcpOutSackRetransSegs); 11805 /* 11806 * Update tcp_rexmit_max to extend this SACK recovery phase. 11807 * This happens when new data sent during fast recovery is 11808 * also lost. If TCP retransmits those new data, it needs 11809 * to extend SACK recover phase to avoid starting another 11810 * fast retransmit/recovery unnecessarily. 11811 */ 11812 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { 11813 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; 11814 } 11815 } 11816 } 11817 11818 /* 11819 * This function handles policy checking at TCP level for non-hard_bound/ 11820 * detached connections. 11821 */ 11822 static boolean_t 11823 tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, 11824 boolean_t secure, boolean_t mctl_present) 11825 { 11826 ipsec_latch_t *ipl = NULL; 11827 ipsec_action_t *act = NULL; 11828 mblk_t *data_mp; 11829 ipsec_in_t *ii; 11830 const char *reason; 11831 kstat_named_t *counter; 11832 11833 ASSERT(mctl_present || !secure); 11834 11835 ASSERT((ipha == NULL && ip6h != NULL) || 11836 (ip6h == NULL && ipha != NULL)); 11837 11838 /* 11839 * We don't necessarily have an ipsec_in_act action to verify 11840 * policy because of assymetrical policy where we have only 11841 * outbound policy and no inbound policy (possible with global 11842 * policy). 11843 */ 11844 if (!secure) { 11845 if (act == NULL || act->ipa_act.ipa_type == IPSEC_ACT_BYPASS || 11846 act->ipa_act.ipa_type == IPSEC_ACT_CLEAR) 11847 return (B_TRUE); 11848 ipsec_log_policy_failure(tcp->tcp_wq, IPSEC_POLICY_MISMATCH, 11849 "tcp_check_policy", ipha, ip6h, secure); 11850 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 11851 &ipdrops_tcp_clear, &tcp_dropper); 11852 return (B_FALSE); 11853 } 11854 11855 /* 11856 * We have a secure packet. 
11857 */ 11858 if (act == NULL) { 11859 ipsec_log_policy_failure(tcp->tcp_wq, 11860 IPSEC_POLICY_NOT_NEEDED, "tcp_check_policy", ipha, ip6h, 11861 secure); 11862 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 11863 &ipdrops_tcp_secure, &tcp_dropper); 11864 return (B_FALSE); 11865 } 11866 11867 /* 11868 * XXX This whole routine is currently incorrect. ipl should 11869 * be set to the latch pointer, but is currently not set, so 11870 * we initialize it to NULL to avoid picking up random garbage. 11871 */ 11872 if (ipl == NULL) 11873 return (B_TRUE); 11874 11875 data_mp = first_mp->b_cont; 11876 11877 ii = (ipsec_in_t *)first_mp->b_rptr; 11878 11879 if (ipsec_check_ipsecin_latch(ii, data_mp, ipl, ipha, ip6h, &reason, 11880 &counter)) { 11881 BUMP_MIB(&ip_mib, ipsecInSucceeded); 11882 return (B_TRUE); 11883 } 11884 (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, 11885 "tcp inbound policy mismatch: %s, packet dropped\n", 11886 reason); 11887 BUMP_MIB(&ip_mib, ipsecInFailed); 11888 11889 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, &tcp_dropper); 11890 return (B_FALSE); 11891 } 11892 11893 /* 11894 * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start 11895 * retransmission after a timeout. 11896 * 11897 * To limit the number of duplicate segments, we limit the number of segment 11898 * to be sent in one time to tcp_snd_burst, the burst variable. 11899 */ 11900 static void 11901 tcp_ss_rexmit(tcp_t *tcp) 11902 { 11903 uint32_t snxt; 11904 uint32_t smax; 11905 int32_t win; 11906 int32_t mss; 11907 int32_t off; 11908 int32_t burst = tcp->tcp_snd_burst; 11909 mblk_t *snxt_mp; 11910 11911 /* 11912 * Note that tcp_rexmit can be set even though TCP has retransmitted 11913 * all unack'ed segments. 11914 */ 11915 if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) { 11916 smax = tcp->tcp_rexmit_max; 11917 snxt = tcp->tcp_rexmit_nxt; 11918 if (SEQ_LT(snxt, tcp->tcp_suna)) { 11919 snxt = tcp->tcp_suna; 11920 } 11921 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd); 11922 win -= snxt - tcp->tcp_suna; 11923 mss = tcp->tcp_mss; 11924 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off); 11925 11926 while (SEQ_LT(snxt, smax) && (win > 0) && 11927 (burst > 0) && (snxt_mp != NULL)) { 11928 mblk_t *xmit_mp; 11929 mblk_t *old_snxt_mp = snxt_mp; 11930 uint32_t cnt = mss; 11931 11932 if (win < cnt) { 11933 cnt = win; 11934 } 11935 if (SEQ_GT(snxt + cnt, smax)) { 11936 cnt = smax - snxt; 11937 } 11938 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off, 11939 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE); 11940 if (xmit_mp == NULL) 11941 return; 11942 11943 tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); 11944 11945 snxt += cnt; 11946 win -= cnt; 11947 /* 11948 * Update the send timestamp to avoid false 11949 * retransmission. 11950 */ 11951 old_snxt_mp->b_prev = (mblk_t *)lbolt; 11952 BUMP_MIB(&tcp_mib, tcpRetransSegs); 11953 UPDATE_MIB(&tcp_mib, tcpRetransBytes, cnt); 11954 11955 tcp->tcp_rexmit_nxt = snxt; 11956 burst--; 11957 } 11958 /* 11959 * If we have transmitted all we have at the time 11960 * we started the retranmission, we can leave 11961 * the rest of the job to tcp_wput_data(). But we 11962 * need to check the send window first. If the 11963 * win is not 0, go on with tcp_wput_data(). 11964 */ 11965 if (SEQ_LT(snxt, smax) || win == 0) { 11966 return; 11967 } 11968 } 11969 /* Only call tcp_wput_data() if there is data to be sent. */ 11970 if (tcp->tcp_unsent) { 11971 tcp_wput_data(tcp, NULL, B_FALSE); 11972 } 11973 } 11974 11975 /* 11976 * Process all TCP option in SYN segment. 
Note that this function should 11977 * be called after tcp_adapt_ire() is called so that the necessary info 11978 * from IRE is already set in the tcp structure. 11979 * 11980 * This function sets up the correct tcp_mss value according to the 11981 * MSS option value and our header size. It also sets up the window scale 11982 * and timestamp values, and initialize SACK info blocks. But it does not 11983 * change receive window size after setting the tcp_mss value. The caller 11984 * should do the appropriate change. 11985 */ 11986 void 11987 tcp_process_options(tcp_t *tcp, tcph_t *tcph) 11988 { 11989 int options; 11990 tcp_opt_t tcpopt; 11991 uint32_t mss_max; 11992 char *tmp_tcph; 11993 11994 tcpopt.tcp = NULL; 11995 options = tcp_parse_options(tcph, &tcpopt); 11996 11997 /* 11998 * Process MSS option. Note that MSS option value does not account 11999 * for IP or TCP options. This means that it is equal to MTU - minimum 12000 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 12001 * IPv6. 12002 */ 12003 if (!(options & TCP_OPT_MSS_PRESENT)) { 12004 if (tcp->tcp_ipversion == IPV4_VERSION) 12005 tcpopt.tcp_opt_mss = tcp_mss_def_ipv4; 12006 else 12007 tcpopt.tcp_opt_mss = tcp_mss_def_ipv6; 12008 } else { 12009 if (tcp->tcp_ipversion == IPV4_VERSION) 12010 mss_max = tcp_mss_max_ipv4; 12011 else 12012 mss_max = tcp_mss_max_ipv6; 12013 if (tcpopt.tcp_opt_mss < tcp_mss_min) 12014 tcpopt.tcp_opt_mss = tcp_mss_min; 12015 else if (tcpopt.tcp_opt_mss > mss_max) 12016 tcpopt.tcp_opt_mss = mss_max; 12017 } 12018 12019 /* Process Window Scale option. */ 12020 if (options & TCP_OPT_WSCALE_PRESENT) { 12021 tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 12022 tcp->tcp_snd_ws_ok = B_TRUE; 12023 } else { 12024 tcp->tcp_snd_ws = B_FALSE; 12025 tcp->tcp_snd_ws_ok = B_FALSE; 12026 tcp->tcp_rcv_ws = B_FALSE; 12027 } 12028 12029 /* Process Timestamp option. */ 12030 if ((options & TCP_OPT_TSTAMP_PRESENT) && 12031 (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { 12032 tmp_tcph = (char *)tcp->tcp_tcph; 12033 12034 tcp->tcp_snd_ts_ok = B_TRUE; 12035 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 12036 tcp->tcp_last_rcv_lbolt = lbolt64; 12037 ASSERT(OK_32PTR(tmp_tcph)); 12038 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 12039 12040 /* Fill in our template header with basic timestamp option. */ 12041 tmp_tcph += tcp->tcp_tcp_hdr_len; 12042 tmp_tcph[0] = TCPOPT_NOP; 12043 tmp_tcph[1] = TCPOPT_NOP; 12044 tmp_tcph[2] = TCPOPT_TSTAMP; 12045 tmp_tcph[3] = TCPOPT_TSTAMP_LEN; 12046 tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN; 12047 tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN; 12048 tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4); 12049 } else { 12050 tcp->tcp_snd_ts_ok = B_FALSE; 12051 } 12052 12053 /* 12054 * Process SACK options. If SACK is enabled for this connection, 12055 * then allocate the SACK info structure. Note the following ways 12056 * when tcp_snd_sack_ok is set to true. 12057 * 12058 * For active connection: in tcp_adapt_ire() called in 12059 * tcp_rput_other(), or in tcp_rput_other() when tcp_sack_permitted 12060 * is checked. 12061 * 12062 * For passive connection: in tcp_adapt_ire() called in 12063 * tcp_accept_comm(). 12064 * 12065 * That's the reason why the extra TCP_IS_DETACHED() check is there. 12066 * That check makes sure that if we did not send a SACK OK option, 12067 * we will not enable SACK for this connection even though the other 12068 * side sends us SACK OK option. For active connection, the SACK 12069 * info structure has already been allocated. 
 * So we need to free it if SACK is disabled.
 */
	if ((options & TCP_OPT_SACK_OK_PRESENT) &&
	    (tcp->tcp_snd_sack_ok ||
	    (tcp_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) {
		/* This should be true only in the passive case. */
		if (tcp->tcp_sack_info == NULL) {
			ASSERT(TCP_IS_DETACHED(tcp));
			tcp->tcp_sack_info =
			    kmem_cache_alloc(tcp_sack_info_cache, KM_NOSLEEP);
		}
		if (tcp->tcp_sack_info == NULL) {
			tcp->tcp_snd_sack_ok = B_FALSE;
		} else {
			tcp->tcp_snd_sack_ok = B_TRUE;
			if (tcp->tcp_snd_ts_ok) {
				tcp->tcp_max_sack_blk = 3;
			} else {
				tcp->tcp_max_sack_blk = 4;
			}
		}
	} else {
		/*
		 * Resetting tcp_snd_sack_ok to B_FALSE so that
		 * no SACK info will be used for this
		 * connection. This assumes that SACK usage
		 * permission is negotiated. This may need
		 * to be changed once this is clarified.
		 */
		if (tcp->tcp_sack_info != NULL) {
			ASSERT(tcp->tcp_notsack_list == NULL);
			kmem_cache_free(tcp_sack_info_cache,
			    tcp->tcp_sack_info);
			tcp->tcp_sack_info = NULL;
		}
		tcp->tcp_snd_sack_ok = B_FALSE;
	}

	/*
	 * Now that we know the exact TCP/IP header length, subtract
	 * it from tcp_mss to get our side's MSS.
	 */
	tcp->tcp_mss -= tcp->tcp_hdr_len;
	/*
	 * Here we assume that the other side's header size will be equal to
	 * our header size. We calculate the real MSS accordingly. Need to
	 * take into account the additional stuff IPsec puts in.
	 *
	 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header)
	 */
	tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len + tcp->tcp_ipsec_overhead -
	    ((tcp->tcp_ipversion == IPV4_VERSION ?
	    IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH);

	/*
	 * Set MSS to the smaller of the two ends of the connection.
	 * We should not have called tcp_mss_set() before, but our
	 * side of the MSS should have been set to a proper value
	 * by tcp_adapt_ire(). tcp_mss_set() will also set up the
	 * STREAM head parameters properly.
	 *
	 * If we have a larger-than-16-bit window but the other side
	 * didn't want to do window scale, tcp_rwnd_set() will take
	 * care of that.
	 */
	tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss));
}

/*
 * Sends the T_CONN_IND to the listener. Once the 3-way handshake is
 * done, a T_CONN_IND needs to be sent; the caller calls this function
 * via squeue to get inside the listener's perimeter. As an
 * optimization, the caller can call this directly if the listener's
 * perimeter is the same as the eager's.
 */
/* ARGSUSED */
void
tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
{
	conn_t *lconnp = (conn_t *)arg;
	tcp_t *listener = lconnp->conn_tcp;
	tcp_t *tcp;
	struct T_conn_ind *conn_ind;
	ipaddr_t *addr_cache;
	boolean_t need_send_conn_ind = B_FALSE;

	/* retrieve the eager */
	conn_ind = (struct T_conn_ind *)mp->b_rptr;
	ASSERT(conn_ind->OPT_offset != 0 &&
	    conn_ind->OPT_length == sizeof (intptr_t));
	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
	    conn_ind->OPT_length);

	/*
	 * TLI/XTI applications will get confused by
	 * our sending the eager as an option since it violates
	 * the option semantics. So remove the eager as an
	 * option, since a TLI/XTI app doesn't need it anyway.
	 */
	if (!TCP_IS_SOCKET(listener)) {
		conn_ind->OPT_length = 0;
		conn_ind->OPT_offset = 0;
	}
	if (listener->tcp_state == TCPS_CLOSED ||
	    TCP_IS_DETACHED(listener)) {
		/*
		 * If the listener has closed, it would have caused a
		 * cleanup/blowoff to happen for the eager. We
		 * just need to return.
		 */
		freemsg(mp);
		return;
	}

	/*
	 * If the conn_req_q is full, defer passing up the
	 * T_CONN_IND until space is available after t_accept()
	 * processing.
	 */
	mutex_enter(&listener->tcp_eager_lock);
	if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
		tcp_t *tail;

		/*
		 * The eager already has an extra ref put in tcp_rput_data
		 * so that it stays till accept comes back even though it
		 * might get into TCPS_CLOSED as a result of a TH_RST etc.
		 */
		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
		listener->tcp_conn_req_cnt_q0--;
		listener->tcp_conn_req_cnt_q++;

		/* Move from SYN_RCVD to ESTABLISHED list */
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_prev_q0 = NULL;
		tcp->tcp_eager_next_q0 = NULL;

		/*
		 * Insert at the end of the queue because sockfs
		 * sends down T_CONN_RES in chronological
		 * order. Leaving the older conn indications
		 * at the front of the queue helps reduce search
		 * time.
		 */
		tail = listener->tcp_eager_last_q;
		if (tail != NULL)
			tail->tcp_eager_next_q = tcp;
		else
			listener->tcp_eager_next_q = tcp;
		listener->tcp_eager_last_q = tcp;
		tcp->tcp_eager_next_q = NULL;
		/*
		 * Delay sending up the T_conn_ind until we are
		 * done with the eager. Once we have sent up
		 * the T_conn_ind, the accept can potentially complete
		 * any time and release the refhold we have on the eager.
		 */
		need_send_conn_ind = B_TRUE;
	} else {
		/*
		 * Defer the connection on q0 and set the deferred
		 * connection bit to true.
		 */
		tcp->tcp_conn_def_q0 = B_TRUE;

		/* take tcp out of q0 ... */
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;

		/* ... and place it at the end of q0 */
		tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
		tcp->tcp_eager_next_q0 = listener;
		listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
		listener->tcp_eager_prev_q0 = tcp;
		tcp->tcp_conn.tcp_eager_conn_ind = mp;
	}

	/* We have timed out before. */
	if (tcp->tcp_syn_rcvd_timeout != 0) {
		tcp->tcp_syn_rcvd_timeout = 0;
		listener->tcp_syn_rcvd_timeout--;
		if (listener->tcp_syn_defense &&
		    listener->tcp_syn_rcvd_timeout <=
		    (tcp_conn_req_max_q0 >> 5) &&
		    10*MINUTES < TICK_TO_MSEC(lbolt64 -
		    listener->tcp_last_rcv_lbolt)) {
			/*
			 * Turn off the defense mode if we
			 * believe the SYN attack is over.
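			 * Roughly (thresholds as coded above): the count of
			 * timed-out embryonic connections has fallen to 1/32
			 * of tcp_conn_req_max_q0 or below, and more than ten
			 * minutes have elapsed since tcp_last_rcv_lbolt was
			 * last updated for this listener.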
12265 */ 12266 listener->tcp_syn_defense = B_FALSE; 12267 if (listener->tcp_ip_addr_cache) { 12268 kmem_free((void *)listener->tcp_ip_addr_cache, 12269 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 12270 listener->tcp_ip_addr_cache = NULL; 12271 } 12272 } 12273 } 12274 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 12275 if (addr_cache != NULL) { 12276 /* 12277 * We have finished a 3-way handshake with this 12278 * remote host. This proves the IP addr is good. 12279 * Cache it! 12280 */ 12281 addr_cache[IP_ADDR_CACHE_HASH( 12282 tcp->tcp_remote)] = tcp->tcp_remote; 12283 } 12284 mutex_exit(&listener->tcp_eager_lock); 12285 if (need_send_conn_ind) 12286 putnext(listener->tcp_rq, mp); 12287 } 12288 12289 mblk_t * 12290 tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, 12291 uint_t *ifindexp, ip6_pkt_t *ippp) 12292 { 12293 in_pktinfo_t *pinfo; 12294 ip6_t *ip6h; 12295 uchar_t *rptr; 12296 mblk_t *first_mp = mp; 12297 boolean_t mctl_present = B_FALSE; 12298 uint_t ifindex = 0; 12299 ip6_pkt_t ipp; 12300 uint_t ipvers; 12301 uint_t ip_hdr_len; 12302 12303 rptr = mp->b_rptr; 12304 ASSERT(OK_32PTR(rptr)); 12305 ASSERT(tcp != NULL); 12306 ipp.ipp_fields = 0; 12307 12308 switch DB_TYPE(mp) { 12309 case M_CTL: 12310 mp = mp->b_cont; 12311 if (mp == NULL) { 12312 freemsg(first_mp); 12313 return (NULL); 12314 } 12315 if (DB_TYPE(mp) != M_DATA) { 12316 freemsg(first_mp); 12317 return (NULL); 12318 } 12319 mctl_present = B_TRUE; 12320 break; 12321 case M_DATA: 12322 break; 12323 default: 12324 cmn_err(CE_NOTE, "tcp_find_pktinfo: unknown db_type"); 12325 freemsg(mp); 12326 return (NULL); 12327 } 12328 ipvers = IPH_HDR_VERSION(rptr); 12329 if (ipvers == IPV4_VERSION) { 12330 if (tcp == NULL) { 12331 ip_hdr_len = IPH_HDR_LENGTH(rptr); 12332 goto done; 12333 } 12334 12335 ipp.ipp_fields |= IPPF_HOPLIMIT; 12336 ipp.ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl; 12337 12338 /* 12339 * If we have IN_PKTINFO in an M_CTL and tcp_ipv6_recvancillary 12340 * has TCP_IPV6_RECVPKTINFO set, pass I/F index along in ipp. 
12341 */ 12342 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) && 12343 mctl_present) { 12344 pinfo = (in_pktinfo_t *)first_mp->b_rptr; 12345 if ((MBLKL(first_mp) == sizeof (in_pktinfo_t)) && 12346 (pinfo->in_pkt_ulp_type == IN_PKTINFO) && 12347 (pinfo->in_pkt_flags & IPF_RECVIF)) { 12348 ipp.ipp_fields |= IPPF_IFINDEX; 12349 ipp.ipp_ifindex = pinfo->in_pkt_ifindex; 12350 ifindex = pinfo->in_pkt_ifindex; 12351 } 12352 freeb(first_mp); 12353 mctl_present = B_FALSE; 12354 } 12355 ip_hdr_len = IPH_HDR_LENGTH(rptr); 12356 } else { 12357 ip6h = (ip6_t *)rptr; 12358 12359 ASSERT(ipvers == IPV6_VERSION); 12360 ipp.ipp_fields = IPPF_HOPLIMIT | IPPF_TCLASS; 12361 ipp.ipp_tclass = (ip6h->ip6_flow & 0x0FF00000) >> 20; 12362 ipp.ipp_hoplimit = ip6h->ip6_hops; 12363 12364 if (ip6h->ip6_nxt != IPPROTO_TCP) { 12365 uint8_t nexthdrp; 12366 12367 /* Look for ifindex information */ 12368 if (ip6h->ip6_nxt == IPPROTO_RAW) { 12369 ip6i_t *ip6i = (ip6i_t *)ip6h; 12370 if ((uchar_t *)&ip6i[1] > mp->b_wptr) { 12371 BUMP_MIB(&ip_mib, tcpInErrs); 12372 freemsg(first_mp); 12373 return (NULL); 12374 } 12375 12376 if (ip6i->ip6i_flags & IP6I_IFINDEX) { 12377 ASSERT(ip6i->ip6i_ifindex != 0); 12378 ipp.ipp_fields |= IPPF_IFINDEX; 12379 ipp.ipp_ifindex = ip6i->ip6i_ifindex; 12380 ifindex = ip6i->ip6i_ifindex; 12381 } 12382 rptr = (uchar_t *)&ip6i[1]; 12383 mp->b_rptr = rptr; 12384 if (rptr == mp->b_wptr) { 12385 mblk_t *mp1; 12386 mp1 = mp->b_cont; 12387 freeb(mp); 12388 mp = mp1; 12389 rptr = mp->b_rptr; 12390 } 12391 if (MBLKL(mp) < IPV6_HDR_LEN + 12392 sizeof (tcph_t)) { 12393 BUMP_MIB(&ip_mib, tcpInErrs); 12394 freemsg(first_mp); 12395 return (NULL); 12396 } 12397 ip6h = (ip6_t *)rptr; 12398 } 12399 12400 /* 12401 * Find any potentially interesting extension headers 12402 * as well as the length of the IPv6 + extension 12403 * headers. 12404 */ 12405 ip_hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp); 12406 /* Verify if this is a TCP packet */ 12407 if (nexthdrp != IPPROTO_TCP) { 12408 BUMP_MIB(&ip_mib, tcpInErrs); 12409 freemsg(first_mp); 12410 return (NULL); 12411 } 12412 } else { 12413 ip_hdr_len = IPV6_HDR_LEN; 12414 } 12415 } 12416 12417 done: 12418 if (ipversp != NULL) 12419 *ipversp = ipvers; 12420 if (ip_hdr_lenp != NULL) 12421 *ip_hdr_lenp = ip_hdr_len; 12422 if (ippp != NULL) 12423 *ippp = ipp; 12424 if (ifindexp != NULL) 12425 *ifindexp = ifindex; 12426 if (mctl_present) { 12427 freeb(first_mp); 12428 } 12429 return (mp); 12430 } 12431 12432 /* 12433 * Handle M_DATA messages from IP. Its called directly from IP via 12434 * squeue for AF_INET type sockets fast path. No M_CTL are expected 12435 * in this path. 12436 * 12437 * For everything else (including AF_INET6 sockets with 'tcp_ipversion' 12438 * v4 and v6), we are called through tcp_input() and a M_CTL can 12439 * be present for options but tcp_find_pktinfo() deals with it. We 12440 * only expect M_DATA packets after tcp_find_pktinfo() is done. 12441 * 12442 * The first argument is always the connp/tcp to which the mp belongs. 12443 * There are no exceptions to this rule. The caller has already put 12444 * a reference on this connp/tcp and once tcp_rput_data() returns, 12445 * the squeue will do the refrele. 12446 * 12447 * The TH_SYN for the listener directly go to tcp_conn_request via 12448 * squeue. 
12449 * 12450 * sqp: NULL = recursive, sqp != NULL means called from squeue 12451 */ 12452 void 12453 tcp_rput_data(void *arg, mblk_t *mp, void *arg2) 12454 { 12455 int32_t bytes_acked; 12456 int32_t gap; 12457 mblk_t *mp1; 12458 uint_t flags; 12459 uint32_t new_swnd = 0; 12460 uchar_t *iphdr; 12461 uchar_t *rptr; 12462 int32_t rgap; 12463 uint32_t seg_ack; 12464 int seg_len; 12465 uint_t ip_hdr_len; 12466 uint32_t seg_seq; 12467 tcph_t *tcph; 12468 int urp; 12469 tcp_opt_t tcpopt; 12470 uint_t ipvers; 12471 ip6_pkt_t ipp; 12472 boolean_t ofo_seg = B_FALSE; /* Out of order segment */ 12473 uint32_t cwnd; 12474 uint32_t add; 12475 int npkt; 12476 int mss; 12477 conn_t *connp = (conn_t *)arg; 12478 squeue_t *sqp = (squeue_t *)arg2; 12479 tcp_t *tcp = connp->conn_tcp; 12480 12481 /* 12482 * RST from fused tcp loopback peer should trigger an unfuse. 12483 */ 12484 if (tcp->tcp_fused) { 12485 TCP_STAT(tcp_fusion_aborted); 12486 tcp_unfuse(tcp); 12487 } 12488 12489 iphdr = mp->b_rptr; 12490 rptr = mp->b_rptr; 12491 ASSERT(OK_32PTR(rptr)); 12492 12493 /* 12494 * An AF_INET socket is not capable of receiving any pktinfo. Do inline 12495 * processing here. For rest call tcp_find_pktinfo to fill up the 12496 * necessary information. 12497 */ 12498 if (IPCL_IS_TCP4(connp)) { 12499 ipvers = IPV4_VERSION; 12500 ip_hdr_len = IPH_HDR_LENGTH(rptr); 12501 } else { 12502 mp = tcp_find_pktinfo(tcp, mp, &ipvers, &ip_hdr_len, 12503 NULL, &ipp); 12504 if (mp == NULL) { 12505 TCP_STAT(tcp_rput_v6_error); 12506 return; 12507 } 12508 iphdr = mp->b_rptr; 12509 rptr = mp->b_rptr; 12510 } 12511 ASSERT(DB_TYPE(mp) == M_DATA); 12512 12513 tcph = (tcph_t *)&rptr[ip_hdr_len]; 12514 seg_seq = ABE32_TO_U32(tcph->th_seq); 12515 seg_ack = ABE32_TO_U32(tcph->th_ack); 12516 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 12517 seg_len = (int)(mp->b_wptr - rptr) - 12518 (ip_hdr_len + TCP_HDR_LENGTH(tcph)); 12519 if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { 12520 do { 12521 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 12522 (uintptr_t)INT_MAX); 12523 seg_len += (int)(mp1->b_wptr - mp1->b_rptr); 12524 } while ((mp1 = mp1->b_cont) != NULL && 12525 mp1->b_datap->db_type == M_DATA); 12526 } 12527 12528 if (tcp->tcp_state == TCPS_TIME_WAIT) { 12529 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 12530 seg_len, tcph); 12531 return; 12532 } 12533 12534 if (sqp != NULL) { 12535 /* 12536 * This is the correct place to update tcp_last_recv_time. Note 12537 * that it is also updated for tcp structure that belongs to 12538 * global and listener queues which do not really need updating. 12539 * But that should not cause any harm. And it is updated for 12540 * all kinds of incoming segments, not only for data segments. 12541 */ 12542 tcp->tcp_last_recv_time = lbolt; 12543 } 12544 12545 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 12546 12547 BUMP_LOCAL(tcp->tcp_ibsegs); 12548 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT); 12549 12550 if ((flags & TH_URG) && sqp != NULL) { 12551 /* 12552 * TCP can't handle urgent pointers that arrive before 12553 * the connection has been accept()ed since it can't 12554 * buffer OOB data. Discard segment if this happens. 12555 * 12556 * Nor can it reassemble urgent pointers, so discard 12557 * if it's not the next segment expected. 12558 * 12559 * Otherwise, collapse chain into one mblk (discard if 12560 * that fails). This makes sure the headers, retransmitted 12561 * data, and new data all are in the same mblk. 
12562 */ 12563 ASSERT(mp != NULL); 12564 if (tcp->tcp_listener || !pullupmsg(mp, -1)) { 12565 freemsg(mp); 12566 return; 12567 } 12568 /* Update pointers into message */ 12569 iphdr = rptr = mp->b_rptr; 12570 tcph = (tcph_t *)&rptr[ip_hdr_len]; 12571 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { 12572 /* 12573 * Since we can't handle any data with this urgent 12574 * pointer that is out of sequence, we expunge 12575 * the data. This allows us to still register 12576 * the urgent mark and generate the M_PCSIG, 12577 * which we can do. 12578 */ 12579 mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); 12580 seg_len = 0; 12581 } 12582 } 12583 12584 switch (tcp->tcp_state) { 12585 case TCPS_SYN_SENT: 12586 if (flags & TH_ACK) { 12587 /* 12588 * Note that our stack cannot send data before a 12589 * connection is established, therefore the 12590 * following check is valid. Otherwise, it has 12591 * to be changed. 12592 */ 12593 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 12594 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 12595 freemsg(mp); 12596 if (flags & TH_RST) 12597 return; 12598 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 12599 tcp, seg_ack, 0, TH_RST); 12600 return; 12601 } 12602 ASSERT(tcp->tcp_suna + 1 == seg_ack); 12603 } 12604 if (flags & TH_RST) { 12605 freemsg(mp); 12606 if (flags & TH_ACK) 12607 (void) tcp_clean_death(tcp, 12608 ECONNREFUSED, 13); 12609 return; 12610 } 12611 if (!(flags & TH_SYN)) { 12612 freemsg(mp); 12613 return; 12614 } 12615 12616 /* Process all TCP options. */ 12617 tcp_process_options(tcp, tcph); 12618 /* 12619 * The following changes our rwnd to be a multiple of the 12620 * MIN(peer MSS, our MSS) for performance reason. 12621 */ 12622 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat, 12623 tcp->tcp_mss)); 12624 12625 /* Is the other end ECN capable? */ 12626 if (tcp->tcp_ecn_ok) { 12627 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 12628 tcp->tcp_ecn_ok = B_FALSE; 12629 } 12630 } 12631 /* 12632 * Clear ECN flags because it may interfere with later 12633 * processing. 12634 */ 12635 flags &= ~(TH_ECE|TH_CWR); 12636 12637 tcp->tcp_irs = seg_seq; 12638 tcp->tcp_rack = seg_seq; 12639 tcp->tcp_rnxt = seg_seq + 1; 12640 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 12641 if (!TCP_IS_DETACHED(tcp)) { 12642 /* Allocate room for SACK options if needed. */ 12643 if (tcp->tcp_snd_sack_ok) { 12644 (void) mi_set_sth_wroff(tcp->tcp_rq, 12645 tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + 12646 (tcp->tcp_loopback ? 0 : tcp_wroff_xtra)); 12647 } else { 12648 (void) mi_set_sth_wroff(tcp->tcp_rq, 12649 tcp->tcp_hdr_len + 12650 (tcp->tcp_loopback ? 0 : tcp_wroff_xtra)); 12651 } 12652 } 12653 if (flags & TH_ACK) { 12654 /* 12655 * If we can't get the confirmation upstream, pretend 12656 * we didn't even see this one. 12657 * 12658 * XXX: how can we pretend we didn't see it if we 12659 * have updated rnxt et. al. 12660 * 12661 * For loopback we defer sending up the T_CONN_CON 12662 * until after some checks below. 12663 */ 12664 mp1 = NULL; 12665 if (!tcp_conn_con(tcp, iphdr, tcph, mp, 12666 tcp->tcp_loopback ? &mp1 : NULL)) { 12667 freemsg(mp); 12668 return; 12669 } 12670 /* SYN was acked - making progress */ 12671 if (tcp->tcp_ipversion == IPV6_VERSION) 12672 tcp->tcp_ip_forward_progress = B_TRUE; 12673 12674 /* One for the SYN */ 12675 tcp->tcp_suna = tcp->tcp_iss + 1; 12676 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 12677 tcp->tcp_state = TCPS_ESTABLISHED; 12678 12679 /* 12680 * If SYN was retransmitted, need to reset all 12681 * retransmission info. 
This is because this 12682 * segment will be treated as a dup ACK. 12683 */ 12684 if (tcp->tcp_rexmit) { 12685 tcp->tcp_rexmit = B_FALSE; 12686 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 12687 tcp->tcp_rexmit_max = tcp->tcp_snxt; 12688 tcp->tcp_snd_burst = tcp->tcp_localnet ? 12689 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 12690 tcp->tcp_ms_we_have_waited = 0; 12691 12692 /* 12693 * Set tcp_cwnd back to 1 MSS, per 12694 * recommendation from 12695 * draft-floyd-incr-init-win-01.txt, 12696 * Increasing TCP's Initial Window. 12697 */ 12698 tcp->tcp_cwnd = tcp->tcp_mss; 12699 } 12700 12701 tcp->tcp_swl1 = seg_seq; 12702 tcp->tcp_swl2 = seg_ack; 12703 12704 new_swnd = BE16_TO_U16(tcph->th_win); 12705 tcp->tcp_swnd = new_swnd; 12706 if (new_swnd > tcp->tcp_max_swnd) 12707 tcp->tcp_max_swnd = new_swnd; 12708 12709 /* 12710 * Always send the three-way handshake ack immediately 12711 * in order to make the connection complete as soon as 12712 * possible on the accepting host. 12713 */ 12714 flags |= TH_ACK_NEEDED; 12715 12716 /* 12717 * Special case for loopback. At this point we have 12718 * received SYN-ACK from the remote endpoint. In 12719 * order to ensure that both endpoints reach the 12720 * fused state prior to any data exchange, the final 12721 * ACK needs to be sent before we indicate T_CONN_CON 12722 * to the module upstream. 12723 */ 12724 if (tcp->tcp_loopback) { 12725 mblk_t *ack_mp; 12726 12727 ASSERT(!tcp->tcp_unfusable); 12728 ASSERT(mp1 != NULL); 12729 /* 12730 * For loopback, we always get a pure SYN-ACK 12731 * and only need to send back the final ACK 12732 * with no data (this is because the other 12733 * tcp is ours and we don't do T/TCP). This 12734 * final ACK triggers the passive side to 12735 * perform fusion in ESTABLISHED state. 12736 */ 12737 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) { 12738 if (tcp->tcp_ack_tid != 0) { 12739 (void) TCP_TIMER_CANCEL(tcp, 12740 tcp->tcp_ack_tid); 12741 tcp->tcp_ack_tid = 0; 12742 } 12743 TCP_RECORD_TRACE(tcp, ack_mp, 12744 TCP_TRACE_SEND_PKT); 12745 tcp_send_data(tcp, tcp->tcp_wq, ack_mp); 12746 BUMP_LOCAL(tcp->tcp_obsegs); 12747 BUMP_MIB(&tcp_mib, tcpOutAck); 12748 12749 /* Send up T_CONN_CON */ 12750 putnext(tcp->tcp_rq, mp1); 12751 12752 freemsg(mp); 12753 return; 12754 } 12755 /* 12756 * Forget fusion; we need to handle more 12757 * complex cases below. Send the deferred 12758 * T_CONN_CON message upstream and proceed 12759 * as usual. Mark this tcp as not capable 12760 * of fusion. 12761 */ 12762 TCP_STAT(tcp_fusion_unfusable); 12763 tcp->tcp_unfusable = B_TRUE; 12764 putnext(tcp->tcp_rq, mp1); 12765 } 12766 12767 /* 12768 * Check to see if there is data to be sent. If 12769 * yes, set the transmit flag. Then check to see 12770 * if received data processing needs to be done. 12771 * If not, go straight to xmit_check. This short 12772 * cut is OK as we don't support T/TCP. 
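 * (With T/TCP a SYN-ACK could legitimately carry data that would have to be processed first; since we never negotiate T/TCP, a SYN-ACK with no data and no urgent pointer can simply be freed and we can jump straight to xmit_check.)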
12773 */ 12774 if (tcp->tcp_unsent) 12775 flags |= TH_XMIT_NEEDED; 12776 12777 if (seg_len == 0 && !(flags & TH_URG)) { 12778 freemsg(mp); 12779 goto xmit_check; 12780 } 12781 12782 flags &= ~TH_SYN; 12783 seg_seq++; 12784 break; 12785 } 12786 tcp->tcp_state = TCPS_SYN_RCVD; 12787 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 12788 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 12789 if (mp1) { 12790 DB_CPID(mp1) = tcp->tcp_cpid; 12791 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 12792 tcp_send_data(tcp, tcp->tcp_wq, mp1); 12793 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 12794 } 12795 freemsg(mp); 12796 return; 12797 case TCPS_SYN_RCVD: 12798 if (flags & TH_ACK) { 12799 /* 12800 * In this state, a SYN|ACK packet is either bogus 12801 * because the other side must be ACKing our SYN which 12802 * indicates it has seen the ACK for their SYN and 12803 * shouldn't retransmit it or we're crossing SYNs 12804 * on active open. 12805 */ 12806 if ((flags & TH_SYN) && !tcp->tcp_active_open) { 12807 freemsg(mp); 12808 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn", 12809 tcp, seg_ack, 0, TH_RST); 12810 return; 12811 } 12812 /* 12813 * NOTE: RFC 793 pg. 72 says this should be 12814 * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt 12815 * but that would mean we have an ack that ignored 12816 * our SYN. 12817 */ 12818 if (SEQ_LEQ(seg_ack, tcp->tcp_suna) || 12819 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 12820 freemsg(mp); 12821 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 12822 tcp, seg_ack, 0, TH_RST); 12823 return; 12824 } 12825 } 12826 break; 12827 case TCPS_LISTEN: 12828 /* 12829 * Only a TLI listener can come through this path when a 12830 * acceptor is going back to be a listener and a packet 12831 * for the acceptor hits the classifier. For a socket 12832 * listener, this can never happen because a listener 12833 * can never accept connection on itself and hence a 12834 * socket acceptor can not go back to being a listener. 12835 */ 12836 ASSERT(!TCP_IS_SOCKET(tcp)); 12837 /*FALLTHRU*/ 12838 case TCPS_CLOSED: 12839 case TCPS_BOUND: { 12840 conn_t *new_connp; 12841 12842 new_connp = ipcl_classify(mp, connp->conn_zoneid); 12843 if (new_connp != NULL) { 12844 tcp_reinput(new_connp, mp, connp->conn_sqp); 12845 return; 12846 } 12847 /* We failed to classify. For now just drop the packet */ 12848 freemsg(mp); 12849 return; 12850 } 12851 case TCPS_IDLE: 12852 /* 12853 * Handle the case where the tcp_clean_death() has happened 12854 * on a connection (application hasn't closed yet) but a packet 12855 * was already queued on squeue before tcp_clean_death() 12856 * was processed. Calling tcp_clean_death() twice on same 12857 * connection can result in weird behaviour. 12858 */ 12859 freemsg(mp); 12860 return; 12861 default: 12862 break; 12863 } 12864 12865 /* 12866 * Already on the correct queue/perimeter. 12867 * If this is a detached connection and not an eager 12868 * connection hanging off a listener then new data 12869 * (past the FIN) will cause a reset. 12870 * We do a special check here where it 12871 * is out of the main line, rather than check 12872 * if we are detached every time we see new 12873 * data down below. 12874 */ 12875 if (TCP_IS_DETACHED_NONEAGER(tcp) && 12876 (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) { 12877 BUMP_MIB(&tcp_mib, tcpInClosed); 12878 TCP_RECORD_TRACE(tcp, 12879 mp, TCP_TRACE_RECV_PKT); 12880 12881 freemsg(mp); 12882 /* 12883 * This could be an SSL closure alert. We're detached so just 12884 * acknowledge it this last time. 
12885 */ 12886 if (tcp->tcp_kssl_ctx != NULL) { 12887 kssl_release_ctx(tcp->tcp_kssl_ctx); 12888 tcp->tcp_kssl_ctx = NULL; 12889 12890 tcp->tcp_rnxt += seg_len; 12891 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 12892 flags |= TH_ACK_NEEDED; 12893 goto ack_check; 12894 } 12895 12896 tcp_xmit_ctl("new data when detached", tcp, 12897 tcp->tcp_snxt, 0, TH_RST); 12898 (void) tcp_clean_death(tcp, EPROTO, 12); 12899 return; 12900 } 12901 12902 mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); 12903 urp = BE16_TO_U16(tcph->th_urp) - TCP_OLD_URP_INTERPRETATION; 12904 new_swnd = BE16_TO_U16(tcph->th_win) << 12905 ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); 12906 mss = tcp->tcp_mss; 12907 12908 if (tcp->tcp_snd_ts_ok) { 12909 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 12910 /* 12911 * This segment is not acceptable. 12912 * Drop it and send back an ACK. 12913 */ 12914 freemsg(mp); 12915 flags |= TH_ACK_NEEDED; 12916 goto ack_check; 12917 } 12918 } else if (tcp->tcp_snd_sack_ok) { 12919 ASSERT(tcp->tcp_sack_info != NULL); 12920 tcpopt.tcp = tcp; 12921 /* 12922 * SACK info in already updated in tcp_parse_options. Ignore 12923 * all other TCP options... 12924 */ 12925 (void) tcp_parse_options(tcph, &tcpopt); 12926 } 12927 try_again:; 12928 gap = seg_seq - tcp->tcp_rnxt; 12929 rgap = tcp->tcp_rwnd - (gap + seg_len); 12930 /* 12931 * gap is the amount of sequence space between what we expect to see 12932 * and what we got for seg_seq. A positive value for gap means 12933 * something got lost. A negative value means we got some old stuff. 12934 */ 12935 if (gap < 0) { 12936 /* Old stuff present. Is the SYN in there? */ 12937 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && 12938 (seg_len != 0)) { 12939 flags &= ~TH_SYN; 12940 seg_seq++; 12941 urp--; 12942 /* Recompute the gaps after noting the SYN. */ 12943 goto try_again; 12944 } 12945 BUMP_MIB(&tcp_mib, tcpInDataDupSegs); 12946 UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, 12947 (seg_len > -gap ? -gap : seg_len)); 12948 /* Remove the old stuff from seg_len. */ 12949 seg_len += gap; 12950 /* 12951 * Anything left? 12952 * Make sure to check for unack'd FIN when rest of data 12953 * has been previously ack'd. 12954 */ 12955 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 12956 /* 12957 * Resets are only valid if they lie within our offered 12958 * window. If the RST bit is set, we just ignore this 12959 * segment. 12960 */ 12961 if (flags & TH_RST) { 12962 freemsg(mp); 12963 return; 12964 } 12965 12966 /* 12967 * The arriving of dup data packets indicate that we 12968 * may have postponed an ack for too long, or the other 12969 * side's RTT estimate is out of shape. Start acking 12970 * more often. 12971 */ 12972 if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) && 12973 tcp->tcp_rack_cnt >= 1 && 12974 tcp->tcp_rack_abs_max > 2) { 12975 tcp->tcp_rack_abs_max--; 12976 } 12977 tcp->tcp_rack_cur_max = 1; 12978 12979 /* 12980 * This segment is "unacceptable". None of its 12981 * sequence space lies within our advertized window. 12982 * 12983 * Adjust seg_len to the original value for tracing. 
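 * (Illustrative case: with tcp_rnxt = 1000, a 100-byte segment at seg_seq 900 gives gap = -100; after the duplicate data is trimmed seg_len is 0 and there is no FIN, so nothing in it is new. We end up here, schedule an ACK, and go on to use only its ACK field.)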
12984 */ 12985 seg_len -= gap; 12986 if (tcp->tcp_debug) { 12987 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 12988 "tcp_rput: unacceptable, gap %d, rgap %d, " 12989 "flags 0x%x, seg_seq %u, seg_ack %u, " 12990 "seg_len %d, rnxt %u, snxt %u, %s", 12991 gap, rgap, flags, seg_seq, seg_ack, 12992 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 12993 tcp_display(tcp, NULL, 12994 DISP_ADDR_AND_PORT)); 12995 } 12996 12997 /* 12998 * Arrange to send an ACK in response to the 12999 * unacceptable segment per RFC 793 page 69. There 13000 * is only one small difference between ours and the 13001 * acceptability test in the RFC - we accept ACK-only 13002 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 13003 * will be generated. 13004 * 13005 * Note that we have to ACK an ACK-only packet at least 13006 * for stacks that send 0-length keep-alives with 13007 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 13008 * section 4.2.3.6. As long as we don't ever generate 13009 * an unacceptable packet in response to an incoming 13010 * packet that is unacceptable, it should not cause 13011 * "ACK wars". 13012 */ 13013 flags |= TH_ACK_NEEDED; 13014 13015 /* 13016 * Continue processing this segment in order to use the 13017 * ACK information it contains, but skip all other 13018 * sequence-number processing. Processing the ACK 13019 * information is necessary in order to 13020 * re-synchronize connections that may have lost 13021 * synchronization. 13022 * 13023 * We clear seg_len and flag fields related to 13024 * sequence number processing as they are not 13025 * to be trusted for an unacceptable segment. 13026 */ 13027 seg_len = 0; 13028 flags &= ~(TH_SYN | TH_FIN | TH_URG); 13029 goto process_ack; 13030 } 13031 13032 /* Fix seg_seq, and chew the gap off the front. */ 13033 seg_seq = tcp->tcp_rnxt; 13034 urp += gap; 13035 do { 13036 mblk_t *mp2; 13037 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 13038 (uintptr_t)UINT_MAX); 13039 gap += (uint_t)(mp->b_wptr - mp->b_rptr); 13040 if (gap > 0) { 13041 mp->b_rptr = mp->b_wptr - gap; 13042 break; 13043 } 13044 mp2 = mp; 13045 mp = mp->b_cont; 13046 freeb(mp2); 13047 } while (gap < 0); 13048 /* 13049 * If the urgent data has already been acknowledged, we 13050 * should ignore TH_URG below 13051 */ 13052 if (urp < 0) 13053 flags &= ~TH_URG; 13054 } 13055 /* 13056 * rgap is the amount of stuff received out of window. A negative 13057 * value is the amount out of window. 13058 */ 13059 if (rgap < 0) { 13060 mblk_t *mp2; 13061 13062 if (tcp->tcp_rwnd == 0) { 13063 BUMP_MIB(&tcp_mib, tcpInWinProbe); 13064 } else { 13065 BUMP_MIB(&tcp_mib, tcpInDataPastWinSegs); 13066 UPDATE_MIB(&tcp_mib, tcpInDataPastWinBytes, -rgap); 13067 } 13068 13069 /* 13070 * seg_len does not include the FIN, so if more than 13071 * just the FIN is out of window, we act like we don't 13072 * see it. (If just the FIN is out of window, rgap 13073 * will be zero and we will go ahead and acknowledge 13074 * the FIN.) 13075 */ 13076 flags &= ~TH_FIN; 13077 13078 /* Fix seg_len and make sure there is something left. */ 13079 seg_len += rgap; 13080 if (seg_len <= 0) { 13081 /* 13082 * Resets are only valid if they lie within our offered 13083 * window. If the RST bit is set, we just ignore this 13084 * segment. 13085 */ 13086 if (flags & TH_RST) { 13087 freemsg(mp); 13088 return; 13089 } 13090 13091 /* Per RFC 793, we need to send back an ACK. */ 13092 flags |= TH_ACK_NEEDED; 13093 13094 /* 13095 * Send SIGURG as soon as possible i.e. 
even 13096 * if the TH_URG was delivered in a window probe 13097 * packet (which will be unacceptable). 13098 * 13099 * We generate a signal if none has been generated 13100 * for this connection or if this is a new urgent 13101 * byte. Also send a zero-length "unmarked" message 13102 * to inform SIOCATMARK that this is not the mark. 13103 * 13104 * tcp_urp_last_valid is cleared when the T_exdata_ind 13105 * is sent up. This plus the check for old data 13106 * (gap >= 0) handles the wraparound of the sequence 13107 * number space without having to always track the 13108 * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks 13109 * this max in its rcv_up variable). 13110 * 13111 * This prevents duplicate SIGURGS due to a "late" 13112 * zero-window probe when the T_EXDATA_IND has already 13113 * been sent up. 13114 */ 13115 if ((flags & TH_URG) && 13116 (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, 13117 tcp->tcp_urp_last))) { 13118 mp1 = allocb(0, BPRI_MED); 13119 if (mp1 == NULL) { 13120 freemsg(mp); 13121 return; 13122 } 13123 if (!TCP_IS_DETACHED(tcp) && 13124 !putnextctl1(tcp->tcp_rq, M_PCSIG, 13125 SIGURG)) { 13126 /* Try again on the rexmit. */ 13127 freemsg(mp1); 13128 freemsg(mp); 13129 return; 13130 } 13131 /* 13132 * If the next byte would be the mark 13133 * then mark with MARKNEXT else mark 13134 * with NOTMARKNEXT. 13135 */ 13136 if (gap == 0 && urp == 0) 13137 mp1->b_flag |= MSGMARKNEXT; 13138 else 13139 mp1->b_flag |= MSGNOTMARKNEXT; 13140 freemsg(tcp->tcp_urp_mark_mp); 13141 tcp->tcp_urp_mark_mp = mp1; 13142 flags |= TH_SEND_URP_MARK; 13143 tcp->tcp_urp_last_valid = B_TRUE; 13144 tcp->tcp_urp_last = urp + seg_seq; 13145 } 13146 /* 13147 * If this is a zero window probe, continue to 13148 * process the ACK part. But we need to set seg_len 13149 * to 0 to avoid data processing. Otherwise just 13150 * drop the segment and send back an ACK. 13151 */ 13152 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 13153 flags &= ~(TH_SYN | TH_URG); 13154 seg_len = 0; 13155 goto process_ack; 13156 } else { 13157 freemsg(mp); 13158 goto ack_check; 13159 } 13160 } 13161 /* Pitch out of window stuff off the end. */ 13162 rgap = seg_len; 13163 mp2 = mp; 13164 do { 13165 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 13166 (uintptr_t)INT_MAX); 13167 rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 13168 if (rgap < 0) { 13169 mp2->b_wptr += rgap; 13170 if ((mp1 = mp2->b_cont) != NULL) { 13171 mp2->b_cont = NULL; 13172 freemsg(mp1); 13173 } 13174 break; 13175 } 13176 } while ((mp2 = mp2->b_cont) != NULL); 13177 } 13178 ok:; 13179 /* 13180 * TCP should check ECN info for segments inside the window only. 13181 * Therefore the check should be done here. 13182 */ 13183 if (tcp->tcp_ecn_ok) { 13184 if (flags & TH_CWR) { 13185 tcp->tcp_ecn_echo_on = B_FALSE; 13186 } 13187 /* 13188 * Note that both ECN_CE and CWR can be set in the 13189 * same segment. In this case, we once again turn 13190 * on ECN_ECHO. 13191 */ 13192 if (tcp->tcp_ipversion == IPV4_VERSION) { 13193 uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; 13194 13195 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 13196 tcp->tcp_ecn_echo_on = B_TRUE; 13197 } 13198 } else { 13199 uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf; 13200 13201 if ((vcf & htonl(IPH_ECN_CE << 20)) == 13202 htonl(IPH_ECN_CE << 20)) { 13203 tcp->tcp_ecn_echo_on = B_TRUE; 13204 } 13205 } 13206 } 13207 13208 /* 13209 * Check whether we can update tcp_ts_recent. This test is 13210 * NOT the one in RFC 1323 3.4. 
It is from Braden, 1993, "TCP 13211 * Extensions for High Performance: An Update", Internet Draft. 13212 */ 13213 if (tcp->tcp_snd_ts_ok && 13214 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 13215 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 13216 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 13217 tcp->tcp_last_rcv_lbolt = lbolt64; 13218 } 13219 13220 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 13221 /* 13222 * A FIN may arrive in an out of order segment. We record this in 13223 * tcp_valid_bits and the seq num of the FIN in tcp_ofo_fin_seq. 13224 * Clear the FIN so that any check on the FIN flag will fail. 13225 * Remember that the FIN also counts in the sequence number 13226 * space. So we need to ack out of order FIN-only segments. 13227 */ 13228 if (flags & TH_FIN) { 13229 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 13230 tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 13231 flags &= ~TH_FIN; 13232 flags |= TH_ACK_NEEDED; 13233 } 13234 if (seg_len > 0) { 13235 /* Fill in the SACK blk list. */ 13236 if (tcp->tcp_snd_sack_ok) { 13237 ASSERT(tcp->tcp_sack_info != NULL); 13238 tcp_sack_insert(tcp->tcp_sack_list, 13239 seg_seq, seg_seq + seg_len, 13240 &(tcp->tcp_num_sack_blk)); 13241 } 13242 13243 /* 13244 * Attempt reassembly and see if we have something 13245 * ready to go. 13246 */ 13247 mp = tcp_reass(tcp, mp, seg_seq); 13248 /* Always ack out of order packets */ 13249 flags |= TH_ACK_NEEDED | TH_PUSH; 13250 if (mp) { 13251 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 13252 (uintptr_t)INT_MAX); 13253 seg_len = mp->b_cont ? msgdsize(mp) : 13254 (int)(mp->b_wptr - mp->b_rptr); 13255 seg_seq = tcp->tcp_rnxt; 13256 /* 13257 * If a gap is filled and the seq num and len 13258 * of the gap match that of a previously 13259 * received FIN, put the FIN flag back in. 13260 */ 13261 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 13262 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 13263 flags |= TH_FIN; 13264 tcp->tcp_valid_bits &= 13265 ~TCP_OFO_FIN_VALID; 13266 } 13267 } else { 13268 /* 13269 * Keep going even with NULL mp. 13270 * There may be a useful ACK or something else 13271 * we don't want to miss. 13272 * 13273 * But TCP should not perform fast retransmit 13274 * because of the ack number. TCP uses 13275 * seg_len == 0 to determine if it is a pure 13276 * ACK. And this is not a pure ACK. 13277 */ 13278 seg_len = 0; 13279 ofo_seg = B_TRUE; 13280 } 13281 } 13282 } else if (seg_len > 0) { 13283 BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); 13284 UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, seg_len); 13285 /* 13286 * If an out of order FIN was received before, and the seq 13287 * num and len of the new segment match that of the FIN, 13288 * put the FIN flag back in.
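 * (Illustrative: if a FIN-bearing segment arrived out of order covering data up to sequence 3000, tcp_ofo_fin_seq was set to 3000 and TH_FIN cleared; once reassembly hands back data ending exactly at 3000, TH_FIN is restored here so normal FIN processing runs.)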
13289 */ 13290 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 13291 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 13292 flags |= TH_FIN; 13293 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 13294 } 13295 } 13296 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 13297 if (flags & TH_RST) { 13298 freemsg(mp); 13299 switch (tcp->tcp_state) { 13300 case TCPS_SYN_RCVD: 13301 (void) tcp_clean_death(tcp, ECONNREFUSED, 14); 13302 break; 13303 case TCPS_ESTABLISHED: 13304 case TCPS_FIN_WAIT_1: 13305 case TCPS_FIN_WAIT_2: 13306 case TCPS_CLOSE_WAIT: 13307 (void) tcp_clean_death(tcp, ECONNRESET, 15); 13308 break; 13309 case TCPS_CLOSING: 13310 case TCPS_LAST_ACK: 13311 (void) tcp_clean_death(tcp, 0, 16); 13312 break; 13313 default: 13314 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 13315 (void) tcp_clean_death(tcp, ENXIO, 17); 13316 break; 13317 } 13318 return; 13319 } 13320 if (flags & TH_SYN) { 13321 /* 13322 * See RFC 793, Page 71 13323 * 13324 * The seq number must be in the window as it should 13325 * be "fixed" above. If it is outside window, it should 13326 * be already rejected. Note that we allow seg_seq to be 13327 * rnxt + rwnd because we want to accept 0 window probe. 13328 */ 13329 ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 13330 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 13331 freemsg(mp); 13332 /* 13333 * If the ACK flag is not set, just use our snxt as the 13334 * seq number of the RST segment. 13335 */ 13336 if (!(flags & TH_ACK)) { 13337 seg_ack = tcp->tcp_snxt; 13338 } 13339 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 13340 TH_RST|TH_ACK); 13341 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 13342 (void) tcp_clean_death(tcp, ECONNRESET, 18); 13343 return; 13344 } 13345 /* 13346 * urp could be -1 when the urp field in the packet is 0 13347 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent 13348 * byte was at seg_seq - 1, in which case we ignore the urgent flag. 13349 */ 13350 if (flags & TH_URG && urp >= 0) { 13351 if (!tcp->tcp_urp_last_valid || 13352 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { 13353 /* 13354 * If we haven't generated the signal yet for this 13355 * urgent pointer value, do it now. Also, send up a 13356 * zero-length M_DATA indicating whether or not this is 13357 * the mark. The latter is not needed when a 13358 * T_EXDATA_IND is sent up. However, if there are 13359 * allocation failures this code relies on the sender 13360 * retransmitting and the socket code for determining 13361 * the mark should not block waiting for the peer to 13362 * transmit. Thus, for simplicity we always send up the 13363 * mark indication. 13364 */ 13365 mp1 = allocb(0, BPRI_MED); 13366 if (mp1 == NULL) { 13367 freemsg(mp); 13368 return; 13369 } 13370 if (!TCP_IS_DETACHED(tcp) && 13371 !putnextctl1(tcp->tcp_rq, M_PCSIG, SIGURG)) { 13372 /* Try again on the rexmit. */ 13373 freemsg(mp1); 13374 freemsg(mp); 13375 return; 13376 } 13377 /* 13378 * Mark with NOTMARKNEXT for now. 13379 * The code below will change this to MARKNEXT 13380 * if we are at the mark. 13381 * 13382 * If there are allocation failures (e.g. in dupmsg 13383 * below) the next time tcp_rput_data sees the urgent 13384 * segment it will send up the MSG*MARKNEXT message. 
13385 */ 13386 mp1->b_flag |= MSGNOTMARKNEXT; 13387 freemsg(tcp->tcp_urp_mark_mp); 13388 tcp->tcp_urp_mark_mp = mp1; 13389 flags |= TH_SEND_URP_MARK; 13390 #ifdef DEBUG 13391 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13392 "tcp_rput: sent M_PCSIG 2 seq %x urp %x " 13393 "last %x, %s", 13394 seg_seq, urp, tcp->tcp_urp_last, 13395 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 13396 #endif /* DEBUG */ 13397 tcp->tcp_urp_last_valid = B_TRUE; 13398 tcp->tcp_urp_last = urp + seg_seq; 13399 } else if (tcp->tcp_urp_mark_mp != NULL) { 13400 /* 13401 * An allocation failure prevented the previous 13402 * tcp_rput_data from sending up the allocated 13403 * MSG*MARKNEXT message - send it up this time 13404 * around. 13405 */ 13406 flags |= TH_SEND_URP_MARK; 13407 } 13408 13409 /* 13410 * If the urgent byte is in this segment, make sure that it is 13411 * all by itself. This makes it much easier to deal with the 13412 * possibility of an allocation failure on the T_exdata_ind. 13413 * Note that seg_len is the number of bytes in the segment, and 13414 * urp is the offset into the segment of the urgent byte. 13415 * urp < seg_len means that the urgent byte is in this segment. 13416 */ 13417 if (urp < seg_len) { 13418 if (seg_len != 1) { 13419 uint32_t tmp_rnxt; 13420 /* 13421 * Break it up and feed it back in. 13422 * Re-attach the IP header. 13423 */ 13424 mp->b_rptr = iphdr; 13425 if (urp > 0) { 13426 /* 13427 * There is stuff before the urgent 13428 * byte. 13429 */ 13430 mp1 = dupmsg(mp); 13431 if (!mp1) { 13432 /* 13433 * Trim from urgent byte on. 13434 * The rest will come back. 13435 */ 13436 (void) adjmsg(mp, 13437 urp - seg_len); 13438 tcp_rput_data(connp, 13439 mp, NULL); 13440 return; 13441 } 13442 (void) adjmsg(mp1, urp - seg_len); 13443 /* Feed this piece back in. */ 13444 tmp_rnxt = tcp->tcp_rnxt; 13445 tcp_rput_data(connp, mp1, NULL); 13446 /* 13447 * If the data passed back in was not 13448 * processed (ie: bad ACK) sending 13449 * the remainder back in will cause a 13450 * loop. In this case, drop the 13451 * packet and let the sender try 13452 * sending a good packet. 13453 */ 13454 if (tmp_rnxt == tcp->tcp_rnxt) { 13455 freemsg(mp); 13456 return; 13457 } 13458 } 13459 if (urp != seg_len - 1) { 13460 uint32_t tmp_rnxt; 13461 /* 13462 * There is stuff after the urgent 13463 * byte. 13464 */ 13465 mp1 = dupmsg(mp); 13466 if (!mp1) { 13467 /* 13468 * Trim everything beyond the 13469 * urgent byte. The rest will 13470 * come back. 13471 */ 13472 (void) adjmsg(mp, 13473 urp + 1 - seg_len); 13474 tcp_rput_data(connp, 13475 mp, NULL); 13476 return; 13477 } 13478 (void) adjmsg(mp1, urp + 1 - seg_len); 13479 tmp_rnxt = tcp->tcp_rnxt; 13480 tcp_rput_data(connp, mp1, NULL); 13481 /* 13482 * If the data passed back in was not 13483 * processed (ie: bad ACK) sending 13484 * the remainder back in will cause a 13485 * loop. In this case, drop the 13486 * packet and let the sender try 13487 * sending a good packet. 13488 */ 13489 if (tmp_rnxt == tcp->tcp_rnxt) { 13490 freemsg(mp); 13491 return; 13492 } 13493 } 13494 tcp_rput_data(connp, mp, NULL); 13495 return; 13496 } 13497 /* 13498 * This segment contains only the urgent byte. We 13499 * have to allocate the T_exdata_ind, if we can. 13500 */ 13501 if (!tcp->tcp_urp_mp) { 13502 struct T_exdata_ind *tei; 13503 mp1 = allocb(sizeof (struct T_exdata_ind), 13504 BPRI_MED); 13505 if (!mp1) { 13506 /* 13507 * Sigh... It'll be back. 13508 * Generate any MSG*MARK message now. 
13509 */ 13510 freemsg(mp); 13511 seg_len = 0; 13512 if (flags & TH_SEND_URP_MARK) { 13513 13514 13515 ASSERT(tcp->tcp_urp_mark_mp); 13516 tcp->tcp_urp_mark_mp->b_flag &= 13517 ~MSGNOTMARKNEXT; 13518 tcp->tcp_urp_mark_mp->b_flag |= 13519 MSGMARKNEXT; 13520 } 13521 goto ack_check; 13522 } 13523 mp1->b_datap->db_type = M_PROTO; 13524 tei = (struct T_exdata_ind *)mp1->b_rptr; 13525 tei->PRIM_type = T_EXDATA_IND; 13526 tei->MORE_flag = 0; 13527 mp1->b_wptr = (uchar_t *)&tei[1]; 13528 tcp->tcp_urp_mp = mp1; 13529 #ifdef DEBUG 13530 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13531 "tcp_rput: allocated exdata_ind %s", 13532 tcp_display(tcp, NULL, 13533 DISP_PORT_ONLY)); 13534 #endif /* DEBUG */ 13535 /* 13536 * There is no need to send a separate MSG*MARK 13537 * message since the T_EXDATA_IND will be sent 13538 * now. 13539 */ 13540 flags &= ~TH_SEND_URP_MARK; 13541 freemsg(tcp->tcp_urp_mark_mp); 13542 tcp->tcp_urp_mark_mp = NULL; 13543 } 13544 /* 13545 * Now we are all set. On the next putnext upstream, 13546 * tcp_urp_mp will be non-NULL and will get prepended 13547 * to what has to be this piece containing the urgent 13548 * byte. If for any reason we abort this segment below, 13549 * if it comes back, we will have this ready, or it 13550 * will get blown off in close. 13551 */ 13552 } else if (urp == seg_len) { 13553 /* 13554 * The urgent byte is the next byte after this sequence 13555 * number. If there is data it is marked with 13556 * MSGMARKNEXT and any tcp_urp_mark_mp is discarded 13557 * since it is not needed. Otherwise, if the code 13558 * above just allocated a zero-length tcp_urp_mark_mp 13559 * message, that message is tagged with MSGMARKNEXT. 13560 * Sending up these MSGMARKNEXT messages makes 13561 * SIOCATMARK work correctly even though 13562 * the T_EXDATA_IND will not be sent up until the 13563 * urgent byte arrives. 13564 */ 13565 if (seg_len != 0) { 13566 flags |= TH_MARKNEXT_NEEDED; 13567 freemsg(tcp->tcp_urp_mark_mp); 13568 tcp->tcp_urp_mark_mp = NULL; 13569 flags &= ~TH_SEND_URP_MARK; 13570 } else if (tcp->tcp_urp_mark_mp != NULL) { 13571 flags |= TH_SEND_URP_MARK; 13572 tcp->tcp_urp_mark_mp->b_flag &= 13573 ~MSGNOTMARKNEXT; 13574 tcp->tcp_urp_mark_mp->b_flag |= MSGMARKNEXT; 13575 } 13576 #ifdef DEBUG 13577 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13578 "tcp_rput: AT MARK, len %d, flags 0x%x, %s", 13579 seg_len, flags, 13580 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 13581 #endif /* DEBUG */ 13582 } else { 13583 /* Data left until we hit mark */ 13584 #ifdef DEBUG 13585 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13586 "tcp_rput: URP %d bytes left, %s", 13587 urp - seg_len, tcp_display(tcp, NULL, 13588 DISP_PORT_ONLY)); 13589 #endif /* DEBUG */ 13590 } 13591 } 13592 13593 process_ack: 13594 if (!(flags & TH_ACK)) { 13595 freemsg(mp); 13596 goto xmit_check; 13597 } 13598 } 13599 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 13600 13601 if (tcp->tcp_ipversion == IPV6_VERSION && bytes_acked > 0) 13602 tcp->tcp_ip_forward_progress = B_TRUE; 13603 if (tcp->tcp_state == TCPS_SYN_RCVD) { 13604 if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) && 13605 ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) { 13606 /* 3-way handshake complete - pass up the T_CONN_IND */ 13607 tcp_t *listener = tcp->tcp_listener; 13608 mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind; 13609 13610 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 13611 /* 13612 * We are here means eager is fine but it can 13613 * get a TH_RST at any point between now and till 13614 * accept completes and disappear. 
We need to 13615 * ensure that reference to eager is valid after 13616 * we get out of eager's perimeter. So we do 13617 * an extra refhold. 13618 */ 13619 CONN_INC_REF(connp); 13620 13621 /* 13622 * The listener also exists because of the refhold 13623 * done in tcp_conn_request. Its possible that it 13624 * might have closed. We will check that once we 13625 * get inside listeners context. 13626 */ 13627 CONN_INC_REF(listener->tcp_connp); 13628 if (listener->tcp_connp->conn_sqp == 13629 connp->conn_sqp) { 13630 tcp_send_conn_ind(listener->tcp_connp, mp, 13631 listener->tcp_connp->conn_sqp); 13632 CONN_DEC_REF(listener->tcp_connp); 13633 } else if (!tcp->tcp_loopback) { 13634 squeue_fill(listener->tcp_connp->conn_sqp, mp, 13635 tcp_send_conn_ind, 13636 listener->tcp_connp, SQTAG_TCP_CONN_IND); 13637 } else { 13638 squeue_enter(listener->tcp_connp->conn_sqp, mp, 13639 tcp_send_conn_ind, listener->tcp_connp, 13640 SQTAG_TCP_CONN_IND); 13641 } 13642 } 13643 13644 if (tcp->tcp_active_open) { 13645 /* 13646 * We are seeing the final ack in the three way 13647 * hand shake of a active open'ed connection 13648 * so we must send up a T_CONN_CON 13649 */ 13650 if (!tcp_conn_con(tcp, iphdr, tcph, mp, NULL)) { 13651 freemsg(mp); 13652 return; 13653 } 13654 /* 13655 * Don't fuse the loopback endpoints for 13656 * simultaneous active opens. 13657 */ 13658 if (tcp->tcp_loopback) { 13659 TCP_STAT(tcp_fusion_unfusable); 13660 tcp->tcp_unfusable = B_TRUE; 13661 } 13662 } 13663 13664 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 13665 bytes_acked--; 13666 /* SYN was acked - making progress */ 13667 if (tcp->tcp_ipversion == IPV6_VERSION) 13668 tcp->tcp_ip_forward_progress = B_TRUE; 13669 13670 /* 13671 * If SYN was retransmitted, need to reset all 13672 * retransmission info as this segment will be 13673 * treated as a dup ACK. 13674 */ 13675 if (tcp->tcp_rexmit) { 13676 tcp->tcp_rexmit = B_FALSE; 13677 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 13678 tcp->tcp_rexmit_max = tcp->tcp_snxt; 13679 tcp->tcp_snd_burst = tcp->tcp_localnet ? 13680 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 13681 tcp->tcp_ms_we_have_waited = 0; 13682 tcp->tcp_cwnd = mss; 13683 } 13684 13685 /* 13686 * We set the send window to zero here. 13687 * This is needed if there is data to be 13688 * processed already on the queue. 13689 * Later (at swnd_update label), the 13690 * "new_swnd > tcp_swnd" condition is satisfied 13691 * the XMIT_NEEDED flag is set in the current 13692 * (SYN_RCVD) state. This ensures tcp_wput_data() is 13693 * called if there is already data on queue in 13694 * this state. 13695 */ 13696 tcp->tcp_swnd = 0; 13697 13698 if (new_swnd > tcp->tcp_max_swnd) 13699 tcp->tcp_max_swnd = new_swnd; 13700 tcp->tcp_swl1 = seg_seq; 13701 tcp->tcp_swl2 = seg_ack; 13702 tcp->tcp_state = TCPS_ESTABLISHED; 13703 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 13704 13705 /* Fuse when both sides are in ESTABLISHED state */ 13706 if (tcp->tcp_loopback && do_tcp_fusion) 13707 tcp_fuse(tcp, iphdr, tcph); 13708 13709 } 13710 /* This code follows 4.4BSD-Lite2 mostly. */ 13711 if (bytes_acked < 0) 13712 goto est; 13713 13714 /* 13715 * If TCP is ECN capable and the congestion experience bit is 13716 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 13717 * done once per window (or more loosely, per RTT). 
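 * (Illustrative arithmetic, assuming mss = 1460 and 10 * mss of unacked data: npkt = ((snxt - suna) >> 1) / mss = 5, so both tcp_cwnd_ssthresh and tcp_cwnd become 5 * mss; tcp_cwr then stays set until seg_ack passes tcp_cwr_snd_max, which is what limits the reduction to once per window.)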
13718 */ 13719 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 13720 tcp->tcp_cwr = B_FALSE; 13721 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 13722 if (!tcp->tcp_cwr) { 13723 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss; 13724 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 13725 tcp->tcp_cwnd = npkt * mss; 13726 /* 13727 * If the cwnd is 0, use the timer to clock out 13728 * new segments. This is required by the ECN spec. 13729 */ 13730 if (npkt == 0) { 13731 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 13732 /* 13733 * This makes sure that when the ACK comes 13734 * back, we will increase tcp_cwnd by 1 MSS. 13735 */ 13736 tcp->tcp_cwnd_cnt = 0; 13737 } 13738 tcp->tcp_cwr = B_TRUE; 13739 /* 13740 * This marks the end of the current window of in 13741 * flight data. That is why we don't use 13742 * tcp_suna + tcp_swnd. Only data in flight can 13743 * provide ECN info. 13744 */ 13745 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 13746 tcp->tcp_ecn_cwr_sent = B_FALSE; 13747 } 13748 } 13749 13750 mp1 = tcp->tcp_xmit_head; 13751 if (bytes_acked == 0) { 13752 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 13753 int dupack_cnt; 13754 13755 BUMP_MIB(&tcp_mib, tcpInDupAck); 13756 /* 13757 * Fast retransmit. When we have seen exactly three 13758 * identical ACKs while we have unacked data 13759 * outstanding we take it as a hint that our peer 13760 * dropped something. 13761 * 13762 * If TCP is retransmitting, don't do fast retransmit. 13763 */ 13764 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt && 13765 ! tcp->tcp_rexmit) { 13766 /* Do Limited Transmit */ 13767 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 13768 tcp_dupack_fast_retransmit) { 13769 /* 13770 * RFC 3042 13771 * 13772 * What we need to do is temporarily 13773 * increase tcp_cwnd so that new 13774 * data can be sent if it is allowed 13775 * by the receive window (tcp_rwnd). 13776 * tcp_wput_data() will take care of 13777 * the rest. 13778 * 13779 * If the connection is SACK capable, 13780 * only do limited xmit when there 13781 * is SACK info. 13782 * 13783 * Note how tcp_cwnd is incremented. 13784 * The first dup ACK will increase 13785 * it by 1 MSS. The second dup ACK 13786 * will increase it by 2 MSS. This 13787 * means that only 1 new segment will 13788 * be sent for each dup ACK. 13789 */ 13790 if (tcp->tcp_unsent > 0 && 13791 (!tcp->tcp_snd_sack_ok || 13792 (tcp->tcp_snd_sack_ok && 13793 tcp->tcp_notsack_list != NULL))) { 13794 tcp->tcp_cwnd += mss << 13795 (tcp->tcp_dupack_cnt - 1); 13796 flags |= TH_LIMIT_XMIT; 13797 } 13798 } else if (dupack_cnt == 13799 tcp_dupack_fast_retransmit) { 13800 13801 /* 13802 * If we have reduced tcp_ssthresh 13803 * because of ECN, do not reduce it again 13804 * unless it is already one window of data 13805 * away. After one window of data, tcp_cwr 13806 * should then be cleared. Note that 13807 * for non ECN capable connection, tcp_cwr 13808 * should always be false. 13809 * 13810 * Adjust cwnd since the duplicate 13811 * ack indicates that a packet was 13812 * dropped (due to congestion.) 13813 */ 13814 if (!tcp->tcp_cwr) { 13815 npkt = ((tcp->tcp_snxt - 13816 tcp->tcp_suna) >> 1) / mss; 13817 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * 13818 mss; 13819 tcp->tcp_cwnd = (npkt + 13820 tcp->tcp_dupack_cnt) * mss; 13821 } 13822 if (tcp->tcp_ecn_ok) { 13823 tcp->tcp_cwr = B_TRUE; 13824 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 13825 tcp->tcp_ecn_cwr_sent = B_FALSE; 13826 } 13827 13828 /* 13829 * We do Hoe's algorithm. 
Refer to her 13830 * paper "Improving the Start-up Behavior 13831 * of a Congestion Control Scheme for TCP," 13832 * which appeared in SIGCOMM '96. 13833 * 13834 * Save highest seq no we have sent so far. 13835 * Be careful about the invisible FIN byte. 13836 */ 13837 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 13838 (tcp->tcp_unsent == 0)) { 13839 tcp->tcp_rexmit_max = tcp->tcp_fss; 13840 } else { 13841 tcp->tcp_rexmit_max = tcp->tcp_snxt; 13842 } 13843 13844 /* 13845 * Do not allow bursty traffic during 13846 * fast recovery. Refer to Fall and Floyd's 13847 * paper "Simulation-based Comparisons of 13848 * Tahoe, Reno and SACK TCP" (in CCR?) 13849 * This is a best current practice. 13850 */ 13851 tcp->tcp_snd_burst = TCP_CWND_SS; 13852 13853 /* 13854 * For SACK: 13855 * Calculate tcp_pipe, which is the 13856 * estimated number of bytes in the 13857 * network. 13858 * 13859 * tcp_fack is the highest sack'ed seq num 13860 * TCP has received. 13861 * 13862 * tcp_pipe is explained in the above quoted 13863 * Fall and Floyd's paper. tcp_fack is 13864 * explained in Mathis and Mahdavi's 13865 * "Forward Acknowledgment: Refining TCP 13866 * Congestion Control" in SIGCOMM '96. 13867 */ 13868 if (tcp->tcp_snd_sack_ok) { 13869 ASSERT(tcp->tcp_sack_info != NULL); 13870 if (tcp->tcp_notsack_list != NULL) { 13871 tcp->tcp_pipe = tcp->tcp_snxt - 13872 tcp->tcp_fack; 13873 tcp->tcp_sack_snxt = seg_ack; 13874 flags |= TH_NEED_SACK_REXMIT; 13875 } else { 13876 /* 13877 * Always initialize tcp_pipe 13878 * even though we don't have 13879 * any SACK info. If later 13880 * we get SACK info and 13881 * tcp_pipe is not initialized, 13882 * funny things will happen. 13883 */ 13884 tcp->tcp_pipe = 13885 tcp->tcp_cwnd_ssthresh; 13886 } 13887 } else { 13888 flags |= TH_REXMIT_NEEDED; 13889 } /* tcp_snd_sack_ok */ 13890 13891 } else { 13892 /* 13893 * Here we perform congestion 13894 * avoidance, but NOT slow start. 13895 * This is known as the Fast 13896 * Recovery Algorithm. 13897 */ 13898 if (tcp->tcp_snd_sack_ok && 13899 tcp->tcp_notsack_list != NULL) { 13900 flags |= TH_NEED_SACK_REXMIT; 13901 tcp->tcp_pipe -= mss; 13902 if (tcp->tcp_pipe < 0) 13903 tcp->tcp_pipe = 0; 13904 } else { 13905 /* 13906 * We know that one more packet has 13907 * left the pipe thus we can update 13908 * cwnd. 13909 */ 13910 cwnd = tcp->tcp_cwnd + mss; 13911 if (cwnd > tcp->tcp_cwnd_max) 13912 cwnd = tcp->tcp_cwnd_max; 13913 tcp->tcp_cwnd = cwnd; 13914 if (tcp->tcp_unsent > 0) 13915 flags |= TH_XMIT_NEEDED; 13916 } 13917 } 13918 } 13919 } else if (tcp->tcp_zero_win_probe) { 13920 /* 13921 * If the window has opened, need to arrange 13922 * to send additional data. 13923 */ 13924 if (new_swnd != 0) { 13925 /* tcp_suna != tcp_snxt */ 13926 /* Packet contains a window update */ 13927 BUMP_MIB(&tcp_mib, tcpInWinUpdate); 13928 tcp->tcp_zero_win_probe = 0; 13929 tcp->tcp_timer_backoff = 0; 13930 tcp->tcp_ms_we_have_waited = 0; 13931 13932 /* 13933 * Transmit starting with tcp_suna since 13934 * the one byte probe is not ack'ed. 13935 * If TCP has sent more than one identical 13936 * probe, tcp_rexmit will be set. That means 13937 * tcp_ss_rexmit() will send out the one 13938 * byte along with new data. Otherwise, 13939 * fake the retransmission.
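 * (Roughly: if the probe byte at tcp_suna is still unacked when the window opens, the code below sets tcp_rexmit_nxt to tcp_suna and tcp_rexmit_max to tcp_suna + 1, so only that single probe byte is re-sent before new data goes out.)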
13940 */ 13941 flags |= TH_XMIT_NEEDED; 13942 if (!tcp->tcp_rexmit) { 13943 tcp->tcp_rexmit = B_TRUE; 13944 tcp->tcp_dupack_cnt = 0; 13945 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 13946 tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 13947 } 13948 } 13949 } 13950 goto swnd_update; 13951 } 13952 13953 /* 13954 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. 13955 * If the ACK value acks something that we have not yet sent, it might 13956 * be an old duplicate segment. Send an ACK to re-synchronize the 13957 * other side. 13958 * Note: reset in response to unacceptable ACK in SYN_RECEIVE 13959 * state is handled above, so we can always just drop the segment and 13960 * send an ACK here. 13961 * 13962 * Should we send ACKs in response to ACK only segments? 13963 */ 13964 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 13965 BUMP_MIB(&tcp_mib, tcpInAckUnsent); 13966 /* drop the received segment */ 13967 freemsg(mp); 13968 13969 /* 13970 * Send back an ACK. If tcp_drop_ack_unsent_cnt is 13971 * greater than 0, check if the number of such 13972 * bogus ACks is greater than that count. If yes, 13973 * don't send back any ACK. This prevents TCP from 13974 * getting into an ACK storm if somehow an attacker 13975 * successfully spoofs an acceptable segment to our 13976 * peer. 13977 */ 13978 if (tcp_drop_ack_unsent_cnt > 0 && 13979 ++tcp->tcp_in_ack_unsent > tcp_drop_ack_unsent_cnt) { 13980 TCP_STAT(tcp_in_ack_unsent_drop); 13981 return; 13982 } 13983 mp = tcp_ack_mp(tcp); 13984 if (mp != NULL) { 13985 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 13986 BUMP_LOCAL(tcp->tcp_obsegs); 13987 BUMP_MIB(&tcp_mib, tcpOutAck); 13988 tcp_send_data(tcp, tcp->tcp_wq, mp); 13989 } 13990 return; 13991 } 13992 13993 /* 13994 * TCP gets a new ACK, update the notsack'ed list to delete those 13995 * blocks that are covered by this ACK. 13996 */ 13997 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 13998 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, 13999 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); 14000 } 14001 14002 /* 14003 * If we got an ACK after fast retransmit, check to see 14004 * if it is a partial ACK. If it is not and the congestion 14005 * window was inflated to account for the other side's 14006 * cached packets, retract it. If it is, do Hoe's algorithm. 14007 */ 14008 if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) { 14009 ASSERT(tcp->tcp_rexmit == B_FALSE); 14010 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { 14011 tcp->tcp_dupack_cnt = 0; 14012 /* 14013 * Restore the orig tcp_cwnd_ssthresh after 14014 * fast retransmit phase. 14015 */ 14016 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { 14017 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; 14018 } 14019 tcp->tcp_rexmit_max = seg_ack; 14020 tcp->tcp_cwnd_cnt = 0; 14021 tcp->tcp_snd_burst = tcp->tcp_localnet ? 14022 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 14023 14024 /* 14025 * Remove all notsack info to avoid confusion with 14026 * the next fast retrasnmit/recovery phase. 14027 */ 14028 if (tcp->tcp_snd_sack_ok && 14029 tcp->tcp_notsack_list != NULL) { 14030 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 14031 } 14032 } else { 14033 if (tcp->tcp_snd_sack_ok && 14034 tcp->tcp_notsack_list != NULL) { 14035 flags |= TH_NEED_SACK_REXMIT; 14036 tcp->tcp_pipe -= mss; 14037 if (tcp->tcp_pipe < 0) 14038 tcp->tcp_pipe = 0; 14039 } else { 14040 /* 14041 * Hoe's algorithm: 14042 * 14043 * Retransmit the unack'ed segment and 14044 * restart fast recovery. 
Note that we 14045 * need to scale back tcp_cwnd to the 14046 * original value when we started fast 14047 * recovery. This is to prevent overly 14048 * aggressive behaviour in sending new 14049 * segments. 14050 */ 14051 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + 14052 tcp_dupack_fast_retransmit * mss; 14053 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; 14054 flags |= TH_REXMIT_NEEDED; 14055 } 14056 } 14057 } else { 14058 tcp->tcp_dupack_cnt = 0; 14059 if (tcp->tcp_rexmit) { 14060 /* 14061 * TCP is retranmitting. If the ACK ack's all 14062 * outstanding data, update tcp_rexmit_max and 14063 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt 14064 * to the correct value. 14065 * 14066 * Note that SEQ_LEQ() is used. This is to avoid 14067 * unnecessary fast retransmit caused by dup ACKs 14068 * received when TCP does slow start retransmission 14069 * after a time out. During this phase, TCP may 14070 * send out segments which are already received. 14071 * This causes dup ACKs to be sent back. 14072 */ 14073 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 14074 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 14075 tcp->tcp_rexmit_nxt = seg_ack; 14076 } 14077 if (seg_ack != tcp->tcp_rexmit_max) { 14078 flags |= TH_XMIT_NEEDED; 14079 } 14080 } else { 14081 tcp->tcp_rexmit = B_FALSE; 14082 tcp->tcp_xmit_zc_clean = B_FALSE; 14083 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 14084 tcp->tcp_snd_burst = tcp->tcp_localnet ? 14085 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 14086 } 14087 tcp->tcp_ms_we_have_waited = 0; 14088 } 14089 } 14090 14091 BUMP_MIB(&tcp_mib, tcpInAckSegs); 14092 UPDATE_MIB(&tcp_mib, tcpInAckBytes, bytes_acked); 14093 tcp->tcp_suna = seg_ack; 14094 if (tcp->tcp_zero_win_probe != 0) { 14095 tcp->tcp_zero_win_probe = 0; 14096 tcp->tcp_timer_backoff = 0; 14097 } 14098 14099 /* 14100 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 14101 * Note that it cannot be the SYN being ack'ed. The code flow 14102 * will not reach here. 14103 */ 14104 if (mp1 == NULL) { 14105 goto fin_acked; 14106 } 14107 14108 /* 14109 * Update the congestion window. 14110 * 14111 * If TCP is not ECN capable or TCP is ECN capable but the 14112 * congestion experience bit is not set, increase the tcp_cwnd as 14113 * usual. 14114 */ 14115 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 14116 cwnd = tcp->tcp_cwnd; 14117 add = mss; 14118 14119 if (cwnd >= tcp->tcp_cwnd_ssthresh) { 14120 /* 14121 * This is to prevent an increase of less than 1 MSS of 14122 * tcp_cwnd. With partial increase, tcp_wput_data() 14123 * may send out tinygrams in order to preserve mblk 14124 * boundaries. 14125 * 14126 * By initializing tcp_cwnd_cnt to new tcp_cwnd and 14127 * decrementing it by 1 MSS for every ACKs, tcp_cwnd is 14128 * increased by 1 MSS for every RTTs. 14129 */ 14130 if (tcp->tcp_cwnd_cnt <= 0) { 14131 tcp->tcp_cwnd_cnt = cwnd + add; 14132 } else { 14133 tcp->tcp_cwnd_cnt -= add; 14134 add = 0; 14135 } 14136 } 14137 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); 14138 } 14139 14140 /* See if the latest urgent data has been acknowledged */ 14141 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && 14142 SEQ_GT(seg_ack, tcp->tcp_urg)) 14143 tcp->tcp_valid_bits &= ~TCP_URG_VALID; 14144 14145 /* Can we update the RTT estimates? */ 14146 if (tcp->tcp_snd_ts_ok) { 14147 /* Ignore zero timestamp echo-reply. */ 14148 if (tcpopt.tcp_opt_ts_ecr != 0) { 14149 tcp_set_rto(tcp, (int32_t)lbolt - 14150 (int32_t)tcpopt.tcp_opt_ts_ecr); 14151 } 14152 14153 /* If needed, restart the timer. 
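 * (Illustrative: the RTT sample fed to tcp_set_rto() above is simply lbolt minus the echoed tcp_opt_ts_ecr, e.g. 5200 - 5000 = 200 ticks; the tcp_set_timer check below then restarts the retransmit timer with the freshly computed tcp_rto.)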
*/ 14154 if (tcp->tcp_set_timer == 1) { 14155 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 14156 tcp->tcp_set_timer = 0; 14157 } 14158 /* 14159 * Update tcp_csuna in case the other side stops sending 14160 * us timestamps. 14161 */ 14162 tcp->tcp_csuna = tcp->tcp_snxt; 14163 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 14164 /* 14165 * An ACK sequence we haven't seen before, so get the RTT 14166 * and update the RTO. But first check if the timestamp is 14167 * valid to use. 14168 */ 14169 if ((mp1->b_next != NULL) && 14170 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) 14171 tcp_set_rto(tcp, (int32_t)lbolt - 14172 (int32_t)(intptr_t)mp1->b_prev); 14173 else 14174 BUMP_MIB(&tcp_mib, tcpRttNoUpdate); 14175 14176 /* Remeber the last sequence to be ACKed */ 14177 tcp->tcp_csuna = seg_ack; 14178 if (tcp->tcp_set_timer == 1) { 14179 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 14180 tcp->tcp_set_timer = 0; 14181 } 14182 } else { 14183 BUMP_MIB(&tcp_mib, tcpRttNoUpdate); 14184 } 14185 14186 /* Eat acknowledged bytes off the xmit queue. */ 14187 for (;;) { 14188 mblk_t *mp2; 14189 uchar_t *wptr; 14190 14191 wptr = mp1->b_wptr; 14192 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); 14193 bytes_acked -= (int)(wptr - mp1->b_rptr); 14194 if (bytes_acked < 0) { 14195 mp1->b_rptr = wptr + bytes_acked; 14196 /* 14197 * Set a new timestamp if all the bytes timed by the 14198 * old timestamp have been ack'ed. 14199 */ 14200 if (SEQ_GT(seg_ack, 14201 (uint32_t)(uintptr_t)(mp1->b_next))) { 14202 mp1->b_prev = (mblk_t *)(uintptr_t)lbolt; 14203 mp1->b_next = NULL; 14204 } 14205 break; 14206 } 14207 mp1->b_next = NULL; 14208 mp1->b_prev = NULL; 14209 mp2 = mp1; 14210 mp1 = mp1->b_cont; 14211 14212 /* 14213 * This notification is required for some zero-copy 14214 * clients to maintain a copy semantic. After the data 14215 * is ack'ed, client is safe to modify or reuse the buffer. 14216 */ 14217 if (tcp->tcp_snd_zcopy_aware && 14218 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 14219 tcp_zcopy_notify(tcp); 14220 freeb(mp2); 14221 if (bytes_acked == 0) { 14222 if (mp1 == NULL) { 14223 /* Everything is ack'ed, clear the tail. */ 14224 tcp->tcp_xmit_tail = NULL; 14225 /* 14226 * Cancel the timer unless we are still 14227 * waiting for an ACK for the FIN packet. 14228 */ 14229 if (tcp->tcp_timer_tid != 0 && 14230 tcp->tcp_snxt == tcp->tcp_suna) { 14231 (void) TCP_TIMER_CANCEL(tcp, 14232 tcp->tcp_timer_tid); 14233 tcp->tcp_timer_tid = 0; 14234 } 14235 goto pre_swnd_update; 14236 } 14237 if (mp2 != tcp->tcp_xmit_tail) 14238 break; 14239 tcp->tcp_xmit_tail = mp1; 14240 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 14241 (uintptr_t)INT_MAX); 14242 tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr - 14243 mp1->b_rptr); 14244 break; 14245 } 14246 if (mp1 == NULL) { 14247 /* 14248 * More was acked but there is nothing more 14249 * outstanding. This means that the FIN was 14250 * just acked or that we're talking to a clown. 
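 * (Reaching here with the transmit list exhausted means every byte we ever queued is covered by seg_ack; if no FIN was ever sent, more was acked than was ever queued, so the else arm below treats the send queue as corrupted and panics rather than limp along.)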
14251 */ 14252 fin_acked: 14253 ASSERT(tcp->tcp_fin_sent); 14254 tcp->tcp_xmit_tail = NULL; 14255 if (tcp->tcp_fin_sent) { 14256 /* FIN was acked - making progress */ 14257 if (tcp->tcp_ipversion == IPV6_VERSION && 14258 !tcp->tcp_fin_acked) 14259 tcp->tcp_ip_forward_progress = B_TRUE; 14260 tcp->tcp_fin_acked = B_TRUE; 14261 if (tcp->tcp_linger_tid != 0 && 14262 TCP_TIMER_CANCEL(tcp, 14263 tcp->tcp_linger_tid) >= 0) { 14264 tcp_stop_lingering(tcp); 14265 } 14266 } else { 14267 /* 14268 * We should never get here because 14269 * we have already checked that the 14270 * number of bytes ack'ed should be 14271 * smaller than or equal to what we 14272 * have sent so far (it is the 14273 * acceptability check of the ACK). 14274 * We can only get here if the send 14275 * queue is corrupted. 14276 * 14277 * Terminate the connection and 14278 * panic the system. It is better 14279 * for us to panic instead of 14280 * continuing and risking further disaster. 14281 */ 14282 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 14283 tcp->tcp_rnxt, TH_RST|TH_ACK); 14284 panic("Memory corruption " 14285 "detected for connection %s.", 14286 tcp_display(tcp, NULL, 14287 DISP_ADDR_AND_PORT)); 14288 /*NOTREACHED*/ 14289 } 14290 goto pre_swnd_update; 14291 } 14292 ASSERT(mp2 != tcp->tcp_xmit_tail); 14293 } 14294 if (tcp->tcp_unsent) { 14295 flags |= TH_XMIT_NEEDED; 14296 } 14297 pre_swnd_update: 14298 tcp->tcp_xmit_head = mp1; 14299 swnd_update: 14300 /* 14301 * The following check is different from most other implementations. 14302 * For bi-directional transfer, when segments are dropped, the 14303 * "normal" check will not accept a window update in those 14304 * retransmitted segments. If we fail to do so, TCP may send out 14305 * segments which are outside the receiver's window. As TCP accepts 14306 * the ack in those retransmitted segments, if the window update in 14307 * the same segment is not accepted, TCP will incorrectly calculate 14308 * that it can send more segments. This can create a deadlock 14309 * with the receiver if its window becomes zero. 14310 */ 14311 if (SEQ_LT(tcp->tcp_swl2, seg_ack) || 14312 SEQ_LT(tcp->tcp_swl1, seg_seq) || 14313 (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) { 14314 /* 14315 * The criteria for an update are: 14316 * 14317 * 1. the segment acknowledges some data. Or 14318 * 2. the segment is new, i.e. it has a higher seq num. Or 14319 * 3. the segment is not old and the advertised window is 14320 * larger than the previous advertised window. 14321 */ 14322 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd) 14323 flags |= TH_XMIT_NEEDED; 14324 tcp->tcp_swnd = new_swnd; 14325 if (new_swnd > tcp->tcp_max_swnd) 14326 tcp->tcp_max_swnd = new_swnd; 14327 tcp->tcp_swl1 = seg_seq; 14328 tcp->tcp_swl2 = seg_ack; 14329 } 14330 est: 14331 if (tcp->tcp_state > TCPS_ESTABLISHED) { 14332 14333 switch (tcp->tcp_state) { 14334 case TCPS_FIN_WAIT_1: 14335 if (tcp->tcp_fin_acked) { 14336 tcp->tcp_state = TCPS_FIN_WAIT_2; 14337 /* 14338 * We implement the non-standard BSD/SunOS 14339 * FIN_WAIT_2 flushing algorithm. 14340 * If there is no user attached to this 14341 * TCP endpoint, then this TCP struct 14342 * could hang around forever in FIN_WAIT_2 14343 * state if the peer forgets to send us 14344 * a FIN. To prevent this, we wait only 14345 * 2*MSL (a convenient time value) for 14346 * the FIN to arrive. If it doesn't show up, 14347 * we flush the TCP endpoint. This algorithm, 14348 * though a violation of RFC-793, has worked 14349 * for over 10 years in BSD systems.
14350 * Note: SunOS 4.x waits 675 seconds before 14351 * flushing the FIN_WAIT_2 connection. 14352 */ 14353 TCP_TIMER_RESTART(tcp, 14354 tcp_fin_wait_2_flush_interval); 14355 } 14356 break; 14357 case TCPS_FIN_WAIT_2: 14358 break; /* Shutdown hook? */ 14359 case TCPS_LAST_ACK: 14360 freemsg(mp); 14361 if (tcp->tcp_fin_acked) { 14362 (void) tcp_clean_death(tcp, 0, 19); 14363 return; 14364 } 14365 goto xmit_check; 14366 case TCPS_CLOSING: 14367 if (tcp->tcp_fin_acked) { 14368 tcp->tcp_state = TCPS_TIME_WAIT; 14369 /* 14370 * Unconditionally clear the exclusive binding 14371 * bit so this TIME-WAIT connection won't 14372 * interfere with new ones. 14373 */ 14374 tcp->tcp_exclbind = 0; 14375 if (!TCP_IS_DETACHED(tcp)) { 14376 TCP_TIMER_RESTART(tcp, 14377 tcp_time_wait_interval); 14378 } else { 14379 tcp_time_wait_append(tcp); 14380 TCP_DBGSTAT(tcp_rput_time_wait); 14381 } 14382 } 14383 /*FALLTHRU*/ 14384 case TCPS_CLOSE_WAIT: 14385 freemsg(mp); 14386 goto xmit_check; 14387 default: 14388 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 14389 break; 14390 } 14391 } 14392 if (flags & TH_FIN) { 14393 /* Make sure we ack the fin */ 14394 flags |= TH_ACK_NEEDED; 14395 if (!tcp->tcp_fin_rcvd) { 14396 tcp->tcp_fin_rcvd = B_TRUE; 14397 tcp->tcp_rnxt++; 14398 tcph = tcp->tcp_tcph; 14399 U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); 14400 14401 /* 14402 * Generate the ordrel_ind at the end unless we 14403 * are an eager guy. 14404 * In the eager case tcp_rsrv will do this when run 14405 * after tcp_accept is done. 14406 */ 14407 if (tcp->tcp_listener == NULL && 14408 !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding)) 14409 flags |= TH_ORDREL_NEEDED; 14410 switch (tcp->tcp_state) { 14411 case TCPS_SYN_RCVD: 14412 case TCPS_ESTABLISHED: 14413 tcp->tcp_state = TCPS_CLOSE_WAIT; 14414 /* Keepalive? */ 14415 break; 14416 case TCPS_FIN_WAIT_1: 14417 if (!tcp->tcp_fin_acked) { 14418 tcp->tcp_state = TCPS_CLOSING; 14419 break; 14420 } 14421 /* FALLTHRU */ 14422 case TCPS_FIN_WAIT_2: 14423 tcp->tcp_state = TCPS_TIME_WAIT; 14424 /* 14425 * Unconditionally clear the exclusive binding 14426 * bit so this TIME-WAIT connection won't 14427 * interfere with new ones. 14428 */ 14429 tcp->tcp_exclbind = 0; 14430 if (!TCP_IS_DETACHED(tcp)) { 14431 TCP_TIMER_RESTART(tcp, 14432 tcp_time_wait_interval); 14433 } else { 14434 tcp_time_wait_append(tcp); 14435 TCP_DBGSTAT(tcp_rput_time_wait); 14436 } 14437 if (seg_len) { 14438 /* 14439 * implies data piggybacked on FIN. 14440 * break to handle data. 14441 */ 14442 break; 14443 } 14444 freemsg(mp); 14445 goto ack_check; 14446 } 14447 } 14448 } 14449 if (mp == NULL) 14450 goto xmit_check; 14451 if (seg_len == 0) { 14452 freemsg(mp); 14453 goto xmit_check; 14454 } 14455 if (mp->b_rptr == mp->b_wptr) { 14456 /* 14457 * The header has been consumed, so we remove the 14458 * zero-length mblk here. 14459 */ 14460 mp1 = mp; 14461 mp = mp->b_cont; 14462 freeb(mp1); 14463 } 14464 tcph = tcp->tcp_tcph; 14465 tcp->tcp_rack_cnt++; 14466 { 14467 uint32_t cur_max; 14468 14469 cur_max = tcp->tcp_rack_cur_max; 14470 if (tcp->tcp_rack_cnt >= cur_max) { 14471 /* 14472 * We have more unacked data than we should - send 14473 * an ACK now. 14474 */ 14475 flags |= TH_ACK_NEEDED; 14476 cur_max++; 14477 if (cur_max > tcp->tcp_rack_abs_max) 14478 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 14479 else 14480 tcp->tcp_rack_cur_max = cur_max; 14481 } else if (TCP_IS_DETACHED(tcp)) { 14482 /* We don't have an ACK timer for detached TCP. 
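 * (Illustrative: a detached endpoint cannot arm a delayed-ACK timer, so it ACKs at once; an attached endpoint instead relies on the tcp_rack_cnt/tcp_rack_cur_max check above or on the delayed-ACK timer armed further down.)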
*/ 14483 flags |= TH_ACK_NEEDED; 14484 } else if (seg_len < mss) { 14485 /* 14486 * If we get a segment that is less than an mss, and we 14487 * already have unacknowledged data, and the amount 14488 * unacknowledged is not a multiple of mss, then we 14489 * better generate an ACK now. Otherwise, this may be 14490 * the tail piece of a transaction, and we would rather 14491 * wait for the response. 14492 */ 14493 uint32_t udif; 14494 ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <= 14495 (uintptr_t)INT_MAX); 14496 udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack); 14497 if (udif && (udif % mss)) 14498 flags |= TH_ACK_NEEDED; 14499 else 14500 flags |= TH_ACK_TIMER_NEEDED; 14501 } else { 14502 /* Start delayed ack timer */ 14503 flags |= TH_ACK_TIMER_NEEDED; 14504 } 14505 } 14506 tcp->tcp_rnxt += seg_len; 14507 U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); 14508 14509 /* Update SACK list */ 14510 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 14511 tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, 14512 &(tcp->tcp_num_sack_blk)); 14513 } 14514 14515 if (tcp->tcp_urp_mp) { 14516 tcp->tcp_urp_mp->b_cont = mp; 14517 mp = tcp->tcp_urp_mp; 14518 tcp->tcp_urp_mp = NULL; 14519 /* Ready for a new signal. */ 14520 tcp->tcp_urp_last_valid = B_FALSE; 14521 #ifdef DEBUG 14522 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14523 "tcp_rput: sending exdata_ind %s", 14524 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 14525 #endif /* DEBUG */ 14526 } 14527 14528 /* 14529 * Check for ancillary data changes compared to last segment. 14530 */ 14531 if (tcp->tcp_ipv6_recvancillary != 0) { 14532 mp = tcp_rput_add_ancillary(tcp, mp, &ipp); 14533 if (mp == NULL) 14534 return; 14535 } 14536 14537 if (tcp->tcp_listener || tcp->tcp_hard_binding) { 14538 /* 14539 * Side queue inbound data until the accept happens. 14540 * tcp_accept/tcp_rput drains this when the accept happens. 14541 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or 14542 * T_EXDATA_IND) it is queued on b_next. 14543 * XXX Make urgent data use this. Requires: 14544 * Removing tcp_listener check for TH_URG 14545 * Making M_PCPROTO and MARK messages skip the eager case 14546 */ 14547 14548 if (tcp->tcp_kssl_pending) { 14549 tcp_kssl_input(tcp, mp); 14550 } else { 14551 tcp_rcv_enqueue(tcp, mp, seg_len); 14552 } 14553 } else { 14554 if (mp->b_datap->db_type != M_DATA || 14555 (flags & TH_MARKNEXT_NEEDED)) { 14556 if (tcp->tcp_rcv_list != NULL) { 14557 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14558 } 14559 ASSERT(tcp->tcp_rcv_list == NULL || 14560 tcp->tcp_fused_sigurg); 14561 if (flags & TH_MARKNEXT_NEEDED) { 14562 #ifdef DEBUG 14563 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14564 "tcp_rput: sending MSGMARKNEXT %s", 14565 tcp_display(tcp, NULL, 14566 DISP_PORT_ONLY)); 14567 #endif /* DEBUG */ 14568 mp->b_flag |= MSGMARKNEXT; 14569 flags &= ~TH_MARKNEXT_NEEDED; 14570 } 14571 14572 /* Does this need SSL processing first? */ 14573 if ((tcp->tcp_kssl_ctx != NULL) && 14574 (DB_TYPE(mp) == M_DATA)) { 14575 tcp_kssl_input(tcp, mp); 14576 } else { 14577 putnext(tcp->tcp_rq, mp); 14578 if (!canputnext(tcp->tcp_rq)) 14579 tcp->tcp_rwnd -= seg_len; 14580 } 14581 } else if ((flags & (TH_PUSH|TH_FIN)) || 14582 tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_rq->q_hiwat >> 3) { 14583 if (tcp->tcp_rcv_list != NULL) { 14584 /* 14585 * Enqueue the new segment first and then 14586 * call tcp_rcv_drain() to send all data 14587 * up. The other way to do this is to 14588 * send all queued data up and then call 14589 * putnext() to send the new segment up. 
14590 * Sending queued data first and then calling putnext() would 14591 * allow the else part below to be removed later on. 14592 * 14593 * We don't do this, to avoid one more call to 14594 * canputnext() as tcp_rcv_drain() needs to 14595 * call canputnext(). 14596 */ 14597 tcp_rcv_enqueue(tcp, mp, seg_len); 14598 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14599 } else { 14600 /* Does this need SSL processing first? */ 14601 if ((tcp->tcp_kssl_ctx != NULL) && 14602 (DB_TYPE(mp) == M_DATA)) { 14603 tcp_kssl_input(tcp, mp); 14604 } else { 14605 putnext(tcp->tcp_rq, mp); 14606 if (!canputnext(tcp->tcp_rq)) 14607 tcp->tcp_rwnd -= seg_len; 14608 } 14609 } 14610 } else { 14611 /* 14612 * Enqueue all packets when processing an mblk 14613 * from the co queue and also enqueue normal packets. 14614 */ 14615 tcp_rcv_enqueue(tcp, mp, seg_len); 14616 } 14617 /* 14618 * Make sure the timer is running if we have data waiting 14619 * for a push bit. This provides resiliency against 14620 * implementations that do not correctly generate push bits. 14621 */ 14622 if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) { 14623 /* 14624 * The connection may be closed at this point, so don't 14625 * do anything for a detached tcp. 14626 */ 14627 if (!TCP_IS_DETACHED(tcp)) 14628 tcp->tcp_push_tid = TCP_TIMER(tcp, 14629 tcp_push_timer, 14630 MSEC_TO_TICK(tcp_push_timer_interval)); 14631 } 14632 } 14633 xmit_check: 14634 /* Is there anything left to do? */ 14635 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 14636 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| 14637 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED| 14638 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 14639 goto done; 14640 14641 /* Any transmit work to do and a non-zero window? */ 14642 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| 14643 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { 14644 if (flags & TH_REXMIT_NEEDED) { 14645 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; 14646 14647 BUMP_MIB(&tcp_mib, tcpOutFastRetrans); 14648 if (snd_size > mss) 14649 snd_size = mss; 14650 if (snd_size > tcp->tcp_swnd) 14651 snd_size = tcp->tcp_swnd; 14652 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, 14653 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, 14654 B_TRUE); 14655 14656 if (mp1 != NULL) { 14657 tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt; 14658 tcp->tcp_csuna = tcp->tcp_snxt; 14659 BUMP_MIB(&tcp_mib, tcpRetransSegs); 14660 UPDATE_MIB(&tcp_mib, tcpRetransBytes, snd_size); 14661 TCP_RECORD_TRACE(tcp, mp1, 14662 TCP_TRACE_SEND_PKT); 14663 tcp_send_data(tcp, tcp->tcp_wq, mp1); 14664 } 14665 } 14666 if (flags & TH_NEED_SACK_REXMIT) { 14667 tcp_sack_rxmit(tcp, &flags); 14668 } 14669 /* 14670 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send 14671 * out a new segment. Note that tcp_rexmit should not be 14672 * set here; otherwise TH_LIMIT_XMIT would not have been set. 14673 */ 14674 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { 14675 if (!tcp->tcp_rexmit) { 14676 tcp_wput_data(tcp, NULL, B_FALSE); 14677 } else { 14678 tcp_ss_rexmit(tcp); 14679 } 14680 } 14681 /* 14682 * Adjust tcp_cwnd back to its normal value after sending 14683 * new data segments. 14684 */ 14685 if (flags & TH_LIMIT_XMIT) { 14686 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); 14687 /* 14688 * This will restart the timer. Restarting the 14689 * timer is used to avoid a timeout before the 14690 * limited transmitted segment's ACK gets back. 14691 */ 14692 if (tcp->tcp_xmit_head != NULL) 14693 tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt; 14694 } 14695 14696 /* Anything more to do?
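 *
 * As an illustrative aside, the fast-retransmit size chosen above is
 * simply the outstanding data clamped to one MSS and to the send window.
 * A standalone sketch of that computation (hypothetical, parameter names
 * invented) would be:
 *
 *	static uint32_t
 *	fast_rexmit_size(uint32_t snxt, uint32_t suna, uint32_t mss,
 *	    uint32_t swnd)
 *	{
 *		uint32_t snd_size = snxt - suna;	// bytes outstanding
 *
 *		if (snd_size > mss)		// retransmit at most one MSS
 *			snd_size = mss;
 *		if (snd_size > swnd)		// and stay within the window
 *			snd_size = swnd;
 *		return (snd_size);
 *	}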
*/ 14697 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| 14698 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 14699 goto done; 14700 } 14701 ack_check: 14702 if (flags & TH_SEND_URP_MARK) { 14703 ASSERT(tcp->tcp_urp_mark_mp); 14704 /* 14705 * Send up any queued data and then send the mark message 14706 */ 14707 if (tcp->tcp_rcv_list != NULL) { 14708 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14709 } 14710 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 14711 14712 mp1 = tcp->tcp_urp_mark_mp; 14713 tcp->tcp_urp_mark_mp = NULL; 14714 #ifdef DEBUG 14715 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14716 "tcp_rput: sending zero-length %s %s", 14717 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" : 14718 "MSGNOTMARKNEXT"), 14719 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 14720 #endif /* DEBUG */ 14721 putnext(tcp->tcp_rq, mp1); 14722 flags &= ~TH_SEND_URP_MARK; 14723 } 14724 if (flags & TH_ACK_NEEDED) { 14725 /* 14726 * Time to send an ack for some reason. 14727 */ 14728 mp1 = tcp_ack_mp(tcp); 14729 14730 if (mp1 != NULL) { 14731 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 14732 tcp_send_data(tcp, tcp->tcp_wq, mp1); 14733 BUMP_LOCAL(tcp->tcp_obsegs); 14734 BUMP_MIB(&tcp_mib, tcpOutAck); 14735 } 14736 if (tcp->tcp_ack_tid != 0) { 14737 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 14738 tcp->tcp_ack_tid = 0; 14739 } 14740 } 14741 if (flags & TH_ACK_TIMER_NEEDED) { 14742 /* 14743 * Arrange for deferred ACK or push wait timeout. 14744 * Start timer if it is not already running. 14745 */ 14746 if (tcp->tcp_ack_tid == 0) { 14747 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer, 14748 MSEC_TO_TICK(tcp->tcp_localnet ? 14749 (clock_t)tcp_local_dack_interval : 14750 (clock_t)tcp_deferred_ack_interval)); 14751 } 14752 } 14753 if (flags & TH_ORDREL_NEEDED) { 14754 /* 14755 * Send up the ordrel_ind unless we are an eager guy. 14756 * In the eager case tcp_rsrv will do this when run 14757 * after tcp_accept is done. 14758 */ 14759 ASSERT(tcp->tcp_listener == NULL); 14760 if (tcp->tcp_rcv_list != NULL) { 14761 /* 14762 * Push any mblk(s) enqueued from co processing. 14763 */ 14764 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14765 } 14766 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 14767 if ((mp1 = mi_tpi_ordrel_ind()) != NULL) { 14768 tcp->tcp_ordrel_done = B_TRUE; 14769 putnext(tcp->tcp_rq, mp1); 14770 if (tcp->tcp_deferred_clean_death) { 14771 /* 14772 * tcp_clean_death was deferred 14773 * for T_ORDREL_IND - do it now 14774 */ 14775 (void) tcp_clean_death(tcp, 14776 tcp->tcp_client_errno, 20); 14777 tcp->tcp_deferred_clean_death = B_FALSE; 14778 } 14779 } else { 14780 /* 14781 * Run the orderly release in the 14782 * service routine. 14783 */ 14784 qenable(tcp->tcp_rq); 14785 /* 14786 * Caveat(XXX): The machine may be so 14787 * overloaded that tcp_rsrv() is not scheduled 14788 * until after the endpoint has transitioned 14789 * to TCPS_TIME_WAIT 14790 * and tcp_time_wait_interval expires. Then 14791 * tcp_timer() will blow away state in tcp_t 14792 * and T_ORDREL_IND will never be delivered 14793 * upstream. Unlikely but potentially 14794 * a problem. 14795 */ 14796 } 14797 } 14798 done: 14799 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 14800 } 14801 14802 /* 14803 * This function does PAWS protection check. Returns B_TRUE if the 14804 * segment passes the PAWS test, else returns B_FALSE. 
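 *
 * As an illustrative aside, the heart of the PAWS test from RFC 1323 is
 * "reject a non-RST segment whose timestamp is older than the most
 * recently accepted one, unless the connection has been idle longer than
 * the PAWS window".  A simplified, hypothetical sketch (names invented,
 * idle time supplied by the caller) would be:
 *
 *	static boolean_t
 *	paws_ok(boolean_t is_rst, uint32_t ts_val, uint32_t *ts_recent,
 *	    int64_t idle_ms, int64_t paws_timeout_ms)
 *	{
 *		// RST segments are accepted regardless of the timestamp
 *		if (!is_rst && (int32_t)(ts_val - *ts_recent) < 0) {
 *			if (idle_ms < paws_timeout_ms)
 *				return (B_FALSE);	// stale: drop it
 *			// idle too long: trust the new timestamp instead
 *			*ts_recent = ts_val;
 *		}
 *		return (B_TRUE);
 *	}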
14805 */ 14806 boolean_t 14807 tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) 14808 { 14809 uint8_t flags; 14810 int options; 14811 uint8_t *up; 14812 14813 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 14814 /* 14815 * If timestamp option is aligned nicely, get values inline, 14816 * otherwise call general routine to parse. Only do that 14817 * if timestamp is the only option. 14818 */ 14819 if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH + 14820 TCPOPT_REAL_TS_LEN && 14821 OK_32PTR((up = ((uint8_t *)tcph) + 14822 TCP_MIN_HEADER_LENGTH)) && 14823 *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { 14824 tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); 14825 tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); 14826 14827 options = TCP_OPT_TSTAMP_PRESENT; 14828 } else { 14829 if (tcp->tcp_snd_sack_ok) { 14830 tcpoptp->tcp = tcp; 14831 } else { 14832 tcpoptp->tcp = NULL; 14833 } 14834 options = tcp_parse_options(tcph, tcpoptp); 14835 } 14836 14837 if (options & TCP_OPT_TSTAMP_PRESENT) { 14838 /* 14839 * Do PAWS per RFC 1323 section 4.2. Accept RST 14840 * regardless of the timestamp, page 18 RFC 1323.bis. 14841 */ 14842 if ((flags & TH_RST) == 0 && 14843 TSTMP_LT(tcpoptp->tcp_opt_ts_val, 14844 tcp->tcp_ts_recent)) { 14845 if (TSTMP_LT(lbolt64, tcp->tcp_last_rcv_lbolt + 14846 PAWS_TIMEOUT)) { 14847 /* This segment is not acceptable. */ 14848 return (B_FALSE); 14849 } else { 14850 /* 14851 * Connection has been idle for 14852 * too long. Reset the timestamp 14853 * and assume the segment is valid. 14854 */ 14855 tcp->tcp_ts_recent = 14856 tcpoptp->tcp_opt_ts_val; 14857 } 14858 } 14859 } else { 14860 /* 14861 * If we don't get a timestamp on every packet, we 14862 * figure we can't really trust 'em, so we stop sending 14863 * and parsing them. 14864 */ 14865 tcp->tcp_snd_ts_ok = B_FALSE; 14866 14867 tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 14868 tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 14869 tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); 14870 tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); 14871 if (tcp->tcp_snd_sack_ok) { 14872 ASSERT(tcp->tcp_sack_info != NULL); 14873 tcp->tcp_max_sack_blk = 4; 14874 } 14875 } 14876 return (B_TRUE); 14877 } 14878 14879 /* 14880 * Attach ancillary data to a received TCP segments for the 14881 * ancillary pieces requested by the application that are 14882 * different than they were in the previous data segment. 14883 * 14884 * Save the "current" values once memory allocation is ok so that 14885 * when memory allocation fails we can just wait for the next data segment. 14886 */ 14887 static mblk_t * 14888 tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) 14889 { 14890 struct T_optdata_ind *todi; 14891 int optlen; 14892 uchar_t *optptr; 14893 struct T_opthdr *toh; 14894 uint_t addflag; /* Which pieces to add */ 14895 mblk_t *mp1; 14896 14897 optlen = 0; 14898 addflag = 0; 14899 /* If app asked for pktinfo and the index has changed ... */ 14900 if ((ipp->ipp_fields & IPPF_IFINDEX) && 14901 ipp->ipp_ifindex != tcp->tcp_recvifindex && 14902 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)) { 14903 optlen += sizeof (struct T_opthdr) + 14904 sizeof (struct in6_pktinfo); 14905 addflag |= TCP_IPV6_RECVPKTINFO; 14906 } 14907 /* If app asked for hoplimit and it has changed ... 
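 *
 * As an illustrative aside, every check in this function follows the same
 * "requested by the application, present in this packet, and different
 * from what was last delivered" pattern before any space is reserved.
 * A generic sketch of that test (hypothetical names, reusing this file's
 * types) would be:
 *
 *	static boolean_t
 *	want_option(uint_t requested_mask, uint_t option_bit,
 *	    boolean_t present_now, uint_t new_val, uint_t last_delivered)
 *	{
 *		return (((requested_mask & option_bit) != 0 && present_now &&
 *		    new_val != last_delivered) ? B_TRUE : B_FALSE);
 *	}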
*/ 14908 if ((ipp->ipp_fields & IPPF_HOPLIMIT) && 14909 ipp->ipp_hoplimit != tcp->tcp_recvhops && 14910 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPLIMIT)) { 14911 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 14912 addflag |= TCP_IPV6_RECVHOPLIMIT; 14913 } 14914 /* If app asked for tclass and it has changed ... */ 14915 if ((ipp->ipp_fields & IPPF_TCLASS) && 14916 ipp->ipp_tclass != tcp->tcp_recvtclass && 14917 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)) { 14918 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 14919 addflag |= TCP_IPV6_RECVTCLASS; 14920 } 14921 /* 14922 * If app asked for hopbyhop headers and it has changed ... 14923 * For security labels, note that (1) security labels can't change on 14924 * a connected socket at all, (2) we're connected to at most one peer, 14925 * (3) if anything changes, then it must be some other extra option. 14926 */ 14927 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) && 14928 ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, 14929 (ipp->ipp_fields & IPPF_HOPOPTS), 14930 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { 14931 optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen - 14932 tcp->tcp_label_len; 14933 addflag |= TCP_IPV6_RECVHOPOPTS; 14934 if (!ip_allocbuf((void **)&tcp->tcp_hopopts, 14935 &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), 14936 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) 14937 return (mp); 14938 } 14939 /* If app asked for dst headers before routing headers ... */ 14940 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTDSTOPTS) && 14941 ip_cmpbuf(tcp->tcp_rtdstopts, tcp->tcp_rtdstoptslen, 14942 (ipp->ipp_fields & IPPF_RTDSTOPTS), 14943 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) { 14944 optlen += sizeof (struct T_opthdr) + 14945 ipp->ipp_rtdstoptslen; 14946 addflag |= TCP_IPV6_RECVRTDSTOPTS; 14947 if (!ip_allocbuf((void **)&tcp->tcp_rtdstopts, 14948 &tcp->tcp_rtdstoptslen, (ipp->ipp_fields & IPPF_RTDSTOPTS), 14949 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) 14950 return (mp); 14951 } 14952 /* If app asked for routing headers and it has changed ... */ 14953 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) && 14954 ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, 14955 (ipp->ipp_fields & IPPF_RTHDR), 14956 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { 14957 optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; 14958 addflag |= TCP_IPV6_RECVRTHDR; 14959 if (!ip_allocbuf((void **)&tcp->tcp_rthdr, 14960 &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), 14961 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) 14962 return (mp); 14963 } 14964 /* If app asked for dest headers and it has changed ... */ 14965 if ((tcp->tcp_ipv6_recvancillary & 14966 (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) && 14967 ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, 14968 (ipp->ipp_fields & IPPF_DSTOPTS), 14969 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { 14970 optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; 14971 addflag |= TCP_IPV6_RECVDSTOPTS; 14972 if (!ip_allocbuf((void **)&tcp->tcp_dstopts, 14973 &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), 14974 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) 14975 return (mp); 14976 } 14977 14978 if (optlen == 0) { 14979 /* Nothing to add */ 14980 return (mp); 14981 } 14982 mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED); 14983 if (mp1 == NULL) { 14984 /* 14985 * Defer sending ancillary data until the next TCP segment 14986 * arrives. 
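 *
 * As an illustrative aside, when the allocation does succeed each
 * requested item is appended below as a struct T_opthdr followed by its
 * payload, with optptr advanced past both.  A hypothetical standalone
 * sketch of appending one such option (level/name/value supplied by the
 * caller) would be:
 *
 *	static uchar_t *
 *	append_opt(uchar_t *optptr, t_uscalar_t level, t_uscalar_t name,
 *	    const void *val, t_uscalar_t vallen)
 *	{
 *		struct T_opthdr *toh = (struct T_opthdr *)optptr;
 *
 *		toh->level = level;		// e.g. IPPROTO_IPV6
 *		toh->name = name;		// e.g. IPV6_HOPLIMIT
 *		toh->len = sizeof (*toh) + vallen;
 *		toh->status = 0;
 *		optptr += sizeof (*toh);
 *		bcopy(val, optptr, vallen);	// payload follows the header
 *		return (optptr + vallen);	// next option starts here
 *	}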
14987 */ 14988 return (mp); 14989 } 14990 mp1->b_cont = mp; 14991 mp = mp1; 14992 mp->b_wptr += sizeof (*todi) + optlen; 14993 mp->b_datap->db_type = M_PROTO; 14994 todi = (struct T_optdata_ind *)mp->b_rptr; 14995 todi->PRIM_type = T_OPTDATA_IND; 14996 todi->DATA_flag = 1; /* MORE data */ 14997 todi->OPT_length = optlen; 14998 todi->OPT_offset = sizeof (*todi); 14999 optptr = (uchar_t *)&todi[1]; 15000 /* 15001 * If app asked for pktinfo and the index has changed ... 15002 * Note that the local address never changes for the connection. 15003 */ 15004 if (addflag & TCP_IPV6_RECVPKTINFO) { 15005 struct in6_pktinfo *pkti; 15006 15007 toh = (struct T_opthdr *)optptr; 15008 toh->level = IPPROTO_IPV6; 15009 toh->name = IPV6_PKTINFO; 15010 toh->len = sizeof (*toh) + sizeof (*pkti); 15011 toh->status = 0; 15012 optptr += sizeof (*toh); 15013 pkti = (struct in6_pktinfo *)optptr; 15014 if (tcp->tcp_ipversion == IPV6_VERSION) 15015 pkti->ipi6_addr = tcp->tcp_ip6h->ip6_src; 15016 else 15017 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 15018 &pkti->ipi6_addr); 15019 pkti->ipi6_ifindex = ipp->ipp_ifindex; 15020 optptr += sizeof (*pkti); 15021 ASSERT(OK_32PTR(optptr)); 15022 /* Save as "last" value */ 15023 tcp->tcp_recvifindex = ipp->ipp_ifindex; 15024 } 15025 /* If app asked for hoplimit and it has changed ... */ 15026 if (addflag & TCP_IPV6_RECVHOPLIMIT) { 15027 toh = (struct T_opthdr *)optptr; 15028 toh->level = IPPROTO_IPV6; 15029 toh->name = IPV6_HOPLIMIT; 15030 toh->len = sizeof (*toh) + sizeof (uint_t); 15031 toh->status = 0; 15032 optptr += sizeof (*toh); 15033 *(uint_t *)optptr = ipp->ipp_hoplimit; 15034 optptr += sizeof (uint_t); 15035 ASSERT(OK_32PTR(optptr)); 15036 /* Save as "last" value */ 15037 tcp->tcp_recvhops = ipp->ipp_hoplimit; 15038 } 15039 /* If app asked for tclass and it has changed ... 
*/ 15040 if (addflag & TCP_IPV6_RECVTCLASS) { 15041 toh = (struct T_opthdr *)optptr; 15042 toh->level = IPPROTO_IPV6; 15043 toh->name = IPV6_TCLASS; 15044 toh->len = sizeof (*toh) + sizeof (uint_t); 15045 toh->status = 0; 15046 optptr += sizeof (*toh); 15047 *(uint_t *)optptr = ipp->ipp_tclass; 15048 optptr += sizeof (uint_t); 15049 ASSERT(OK_32PTR(optptr)); 15050 /* Save as "last" value */ 15051 tcp->tcp_recvtclass = ipp->ipp_tclass; 15052 } 15053 if (addflag & TCP_IPV6_RECVHOPOPTS) { 15054 toh = (struct T_opthdr *)optptr; 15055 toh->level = IPPROTO_IPV6; 15056 toh->name = IPV6_HOPOPTS; 15057 toh->len = sizeof (*toh) + ipp->ipp_hopoptslen - 15058 tcp->tcp_label_len; 15059 toh->status = 0; 15060 optptr += sizeof (*toh); 15061 bcopy((uchar_t *)ipp->ipp_hopopts + tcp->tcp_label_len, optptr, 15062 ipp->ipp_hopoptslen - tcp->tcp_label_len); 15063 optptr += ipp->ipp_hopoptslen - tcp->tcp_label_len; 15064 ASSERT(OK_32PTR(optptr)); 15065 /* Save as last value */ 15066 ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, 15067 (ipp->ipp_fields & IPPF_HOPOPTS), 15068 ipp->ipp_hopopts, ipp->ipp_hopoptslen); 15069 } 15070 if (addflag & TCP_IPV6_RECVRTDSTOPTS) { 15071 toh = (struct T_opthdr *)optptr; 15072 toh->level = IPPROTO_IPV6; 15073 toh->name = IPV6_RTHDRDSTOPTS; 15074 toh->len = sizeof (*toh) + ipp->ipp_rtdstoptslen; 15075 toh->status = 0; 15076 optptr += sizeof (*toh); 15077 bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen); 15078 optptr += ipp->ipp_rtdstoptslen; 15079 ASSERT(OK_32PTR(optptr)); 15080 /* Save as last value */ 15081 ip_savebuf((void **)&tcp->tcp_rtdstopts, 15082 &tcp->tcp_rtdstoptslen, 15083 (ipp->ipp_fields & IPPF_RTDSTOPTS), 15084 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); 15085 } 15086 if (addflag & TCP_IPV6_RECVRTHDR) { 15087 toh = (struct T_opthdr *)optptr; 15088 toh->level = IPPROTO_IPV6; 15089 toh->name = IPV6_RTHDR; 15090 toh->len = sizeof (*toh) + ipp->ipp_rthdrlen; 15091 toh->status = 0; 15092 optptr += sizeof (*toh); 15093 bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen); 15094 optptr += ipp->ipp_rthdrlen; 15095 ASSERT(OK_32PTR(optptr)); 15096 /* Save as last value */ 15097 ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, 15098 (ipp->ipp_fields & IPPF_RTHDR), 15099 ipp->ipp_rthdr, ipp->ipp_rthdrlen); 15100 } 15101 if (addflag & (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) { 15102 toh = (struct T_opthdr *)optptr; 15103 toh->level = IPPROTO_IPV6; 15104 toh->name = IPV6_DSTOPTS; 15105 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen; 15106 toh->status = 0; 15107 optptr += sizeof (*toh); 15108 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen); 15109 optptr += ipp->ipp_dstoptslen; 15110 ASSERT(OK_32PTR(optptr)); 15111 /* Save as last value */ 15112 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, 15113 (ipp->ipp_fields & IPPF_DSTOPTS), 15114 ipp->ipp_dstopts, ipp->ipp_dstoptslen); 15115 } 15116 ASSERT(optptr == mp->b_wptr); 15117 return (mp); 15118 } 15119 15120 15121 /* 15122 * Handle a *T_BIND_REQ that has failed either due to a T_ERROR_ACK 15123 * or a "bad" IRE detected by tcp_adapt_ire. 15124 * We can't tell if the failure was due to the laddr or the faddr 15125 * thus we clear out all addresses and ports. 
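 *
 * As an illustrative aside, the conversion done below reuses the incoming
 * mblk and overwrites it with a TPI T_error_ack.  A hypothetical
 * standalone sketch of filling in such an ack (argument names invented)
 * would be:
 *
 *	static void
 *	fill_error_ack(mblk_t *mp, t_scalar_t failed_prim,
 *	    t_scalar_t tli_err, t_scalar_t unix_err)
 *	{
 *		struct T_error_ack *tea;
 *
 *		mp->b_rptr = mp->b_datap->db_base;	// rewind the mblk
 *		mp->b_wptr = mp->b_rptr + sizeof (*tea);
 *		tea = (struct T_error_ack *)mp->b_rptr;
 *		tea->PRIM_type = T_ERROR_ACK;
 *		tea->ERROR_prim = failed_prim;		// e.g. O_T_BIND_REQ
 *		tea->TLI_error = tli_err;		// e.g. TSYSERR
 *		tea->UNIX_error = unix_err;		// e.g. EADDRNOTAVAIL
 *	}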
15126 */ 15127 static void 15128 tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error) 15129 { 15130 queue_t *q = tcp->tcp_rq; 15131 tcph_t *tcph; 15132 struct T_error_ack *tea; 15133 conn_t *connp = tcp->tcp_connp; 15134 15135 15136 ASSERT(mp->b_datap->db_type == M_PCPROTO); 15137 15138 if (mp->b_cont) { 15139 freemsg(mp->b_cont); 15140 mp->b_cont = NULL; 15141 } 15142 tea = (struct T_error_ack *)mp->b_rptr; 15143 switch (tea->PRIM_type) { 15144 case T_BIND_ACK: 15145 /* 15146 * Need to unbind with classifier since we were just told that 15147 * our bind succeeded. 15148 */ 15149 tcp->tcp_hard_bound = B_FALSE; 15150 tcp->tcp_hard_binding = B_FALSE; 15151 15152 ipcl_hash_remove(connp); 15153 /* Reuse the mblk if possible */ 15154 ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >= 15155 sizeof (*tea)); 15156 mp->b_rptr = mp->b_datap->db_base; 15157 mp->b_wptr = mp->b_rptr + sizeof (*tea); 15158 tea = (struct T_error_ack *)mp->b_rptr; 15159 tea->PRIM_type = T_ERROR_ACK; 15160 tea->TLI_error = TSYSERR; 15161 tea->UNIX_error = error; 15162 if (tcp->tcp_state >= TCPS_SYN_SENT) { 15163 tea->ERROR_prim = T_CONN_REQ; 15164 } else { 15165 tea->ERROR_prim = O_T_BIND_REQ; 15166 } 15167 break; 15168 15169 case T_ERROR_ACK: 15170 if (tcp->tcp_state >= TCPS_SYN_SENT) 15171 tea->ERROR_prim = T_CONN_REQ; 15172 break; 15173 default: 15174 panic("tcp_bind_failed: unexpected TPI type"); 15175 /*NOTREACHED*/ 15176 } 15177 15178 tcp->tcp_state = TCPS_IDLE; 15179 if (tcp->tcp_ipversion == IPV4_VERSION) 15180 tcp->tcp_ipha->ipha_src = 0; 15181 else 15182 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); 15183 /* 15184 * Copy of the src addr. in tcp_t is needed since 15185 * the lookup funcs. can only look at tcp_t 15186 */ 15187 V6_SET_ZERO(tcp->tcp_ip_src_v6); 15188 15189 tcph = tcp->tcp_tcph; 15190 tcph->th_lport[0] = 0; 15191 tcph->th_lport[1] = 0; 15192 tcp_bind_hash_remove(tcp); 15193 bzero(&connp->u_port, sizeof (connp->u_port)); 15194 /* blow away saved option results if any */ 15195 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 15196 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 15197 15198 conn_delete_ire(tcp->tcp_connp, NULL); 15199 putnext(q, mp); 15200 } 15201 15202 /* 15203 * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA 15204 * messages. 15205 */ 15206 void 15207 tcp_rput_other(tcp_t *tcp, mblk_t *mp) 15208 { 15209 mblk_t *mp1; 15210 uchar_t *rptr = mp->b_rptr; 15211 queue_t *q = tcp->tcp_rq; 15212 struct T_error_ack *tea; 15213 uint32_t mss; 15214 mblk_t *syn_mp; 15215 mblk_t *mdti; 15216 int retval; 15217 mblk_t *ire_mp; 15218 15219 switch (mp->b_datap->db_type) { 15220 case M_PROTO: 15221 case M_PCPROTO: 15222 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 15223 if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) 15224 break; 15225 tea = (struct T_error_ack *)rptr; 15226 switch (tea->PRIM_type) { 15227 case T_BIND_ACK: 15228 /* 15229 * Adapt Multidata information, if any. The 15230 * following tcp_mdt_update routine will free 15231 * the message. 
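 *
 * As an illustrative aside, further down in this T_BIND_ACK case the
 * receive window is rounded up to a multiple of the MSS (and to a minimum
 * number of MSS) before the scaled value is advertised in the TCP header.
 * A hypothetical standalone sketch of that rounding (names invented,
 * mirroring what MSS_ROUNDUP() does) would be:
 *
 *	static uint32_t
 *	round_rwnd(uint32_t rwnd, uint32_t mss, uint32_t min_mss_count)
 *	{
 *		uint32_t rounded = ((rwnd + mss - 1) / mss) * mss;
 *		uint32_t min_rwnd = min_mss_count * mss;	// e.g. 4 * mss
 *
 *		return (rounded > min_rwnd ? rounded : min_rwnd);
 *	}
 *
 *	// the unscaled value placed in th_win is then (rwnd >> rcv_ws)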
15232 */ 15233 if ((mdti = tcp_mdt_info_mp(mp)) != NULL) { 15234 tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti-> 15235 b_rptr)->mdt_capab, B_TRUE); 15236 freemsg(mdti); 15237 } 15238 15239 /* Get the IRE, if we had requested for it */ 15240 ire_mp = tcp_ire_mp(mp); 15241 15242 if (tcp->tcp_hard_binding) { 15243 tcp->tcp_hard_binding = B_FALSE; 15244 tcp->tcp_hard_bound = B_TRUE; 15245 CL_INET_CONNECT(tcp); 15246 } else { 15247 if (ire_mp != NULL) 15248 freeb(ire_mp); 15249 goto after_syn_sent; 15250 } 15251 15252 retval = tcp_adapt_ire(tcp, ire_mp); 15253 if (ire_mp != NULL) 15254 freeb(ire_mp); 15255 if (retval == 0) { 15256 tcp_bind_failed(tcp, mp, 15257 (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? 15258 ENETUNREACH : EADDRNOTAVAIL)); 15259 return; 15260 } 15261 /* 15262 * Don't let an endpoint connect to itself. 15263 * Also checked in tcp_connect() but that 15264 * check can't handle the case when the 15265 * local IP address is INADDR_ANY. 15266 */ 15267 if (tcp->tcp_ipversion == IPV4_VERSION) { 15268 if ((tcp->tcp_ipha->ipha_dst == 15269 tcp->tcp_ipha->ipha_src) && 15270 (BE16_EQL(tcp->tcp_tcph->th_lport, 15271 tcp->tcp_tcph->th_fport))) { 15272 tcp_bind_failed(tcp, mp, EADDRNOTAVAIL); 15273 return; 15274 } 15275 } else { 15276 if (IN6_ARE_ADDR_EQUAL( 15277 &tcp->tcp_ip6h->ip6_dst, 15278 &tcp->tcp_ip6h->ip6_src) && 15279 (BE16_EQL(tcp->tcp_tcph->th_lport, 15280 tcp->tcp_tcph->th_fport))) { 15281 tcp_bind_failed(tcp, mp, EADDRNOTAVAIL); 15282 return; 15283 } 15284 } 15285 ASSERT(tcp->tcp_state == TCPS_SYN_SENT); 15286 /* 15287 * This should not be possible! Just for 15288 * defensive coding... 15289 */ 15290 if (tcp->tcp_state != TCPS_SYN_SENT) 15291 goto after_syn_sent; 15292 15293 if (is_system_labeled() && 15294 !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) { 15295 tcp_bind_failed(tcp, mp, EHOSTUNREACH); 15296 return; 15297 } 15298 15299 ASSERT(q == tcp->tcp_rq); 15300 /* 15301 * tcp_adapt_ire() does not adjust 15302 * for TCP/IP header length. 15303 */ 15304 mss = tcp->tcp_mss - tcp->tcp_hdr_len; 15305 15306 /* 15307 * Just make sure our rwnd is at 15308 * least tcp_recv_hiwat_mss * MSS 15309 * large, and round up to the nearest 15310 * MSS. 15311 * 15312 * We do the round up here because 15313 * we need to get the interface 15314 * MTU first before we can do the 15315 * round up. 15316 */ 15317 tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 15318 tcp_recv_hiwat_minmss * mss); 15319 q->q_hiwat = tcp->tcp_rwnd; 15320 tcp_set_ws_value(tcp); 15321 U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), 15322 tcp->tcp_tcph->th_win); 15323 if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always) 15324 tcp->tcp_snd_ws_ok = B_TRUE; 15325 15326 /* 15327 * Set tcp_snd_ts_ok to true 15328 * so that tcp_xmit_mp will 15329 * include the timestamp 15330 * option in the SYN segment. 15331 */ 15332 if (tcp_tstamp_always || 15333 (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) { 15334 tcp->tcp_snd_ts_ok = B_TRUE; 15335 } 15336 15337 /* 15338 * tcp_snd_sack_ok can be set in 15339 * tcp_adapt_ire() if the sack metric 15340 * is set. So check it here also. 15341 */ 15342 if (tcp_sack_permitted == 2 || 15343 tcp->tcp_snd_sack_ok) { 15344 if (tcp->tcp_sack_info == NULL) { 15345 tcp->tcp_sack_info = 15346 kmem_cache_alloc(tcp_sack_info_cache, 15347 KM_SLEEP); 15348 } 15349 tcp->tcp_snd_sack_ok = B_TRUE; 15350 } 15351 15352 /* 15353 * Should we use ECN? Note that the current 15354 * default value (SunOS 5.9) of tcp_ecn_permitted 15355 * is 1. 
The reason for doing this is that there 15356 * are equipments out there that will drop ECN 15357 * enabled IP packets. Setting it to 1 avoids 15358 * compatibility problems. 15359 */ 15360 if (tcp_ecn_permitted == 2) 15361 tcp->tcp_ecn_ok = B_TRUE; 15362 15363 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 15364 syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 15365 tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 15366 if (syn_mp) { 15367 cred_t *cr; 15368 pid_t pid; 15369 15370 /* 15371 * Obtain the credential from the 15372 * thread calling connect(); the credential 15373 * lives on in the second mblk which 15374 * originated from T_CONN_REQ and is echoed 15375 * with the T_BIND_ACK from ip. If none 15376 * can be found, default to the creator 15377 * of the socket. 15378 */ 15379 if (mp->b_cont == NULL || 15380 (cr = DB_CRED(mp->b_cont)) == NULL) { 15381 cr = tcp->tcp_cred; 15382 pid = tcp->tcp_cpid; 15383 } else { 15384 pid = DB_CPID(mp->b_cont); 15385 } 15386 15387 TCP_RECORD_TRACE(tcp, syn_mp, 15388 TCP_TRACE_SEND_PKT); 15389 mblk_setcred(syn_mp, cr); 15390 DB_CPID(syn_mp) = pid; 15391 tcp_send_data(tcp, tcp->tcp_wq, syn_mp); 15392 } 15393 after_syn_sent: 15394 /* 15395 * A trailer mblk indicates a waiting client upstream. 15396 * We complete here the processing begun in 15397 * either tcp_bind() or tcp_connect() by passing 15398 * upstream the reply message they supplied. 15399 */ 15400 mp1 = mp; 15401 mp = mp->b_cont; 15402 freeb(mp1); 15403 if (mp) 15404 break; 15405 return; 15406 case T_ERROR_ACK: 15407 if (tcp->tcp_debug) { 15408 (void) strlog(TCP_MOD_ID, 0, 1, 15409 SL_TRACE|SL_ERROR, 15410 "tcp_rput_other: case T_ERROR_ACK, " 15411 "ERROR_prim == %d", 15412 tea->ERROR_prim); 15413 } 15414 switch (tea->ERROR_prim) { 15415 case O_T_BIND_REQ: 15416 case T_BIND_REQ: 15417 tcp_bind_failed(tcp, mp, 15418 (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? 15419 ENETUNREACH : EADDRNOTAVAIL)); 15420 return; 15421 case T_UNBIND_REQ: 15422 tcp->tcp_hard_binding = B_FALSE; 15423 tcp->tcp_hard_bound = B_FALSE; 15424 if (mp->b_cont) { 15425 freemsg(mp->b_cont); 15426 mp->b_cont = NULL; 15427 } 15428 if (tcp->tcp_unbind_pending) 15429 tcp->tcp_unbind_pending = 0; 15430 else { 15431 /* From tcp_ip_unbind() - free */ 15432 freemsg(mp); 15433 return; 15434 } 15435 break; 15436 case T_SVR4_OPTMGMT_REQ: 15437 if (tcp->tcp_drop_opt_ack_cnt > 0) { 15438 /* T_OPTMGMT_REQ generated by TCP */ 15439 printf("T_SVR4_OPTMGMT_REQ failed " 15440 "%d/%d - dropped (cnt %d)\n", 15441 tea->TLI_error, tea->UNIX_error, 15442 tcp->tcp_drop_opt_ack_cnt); 15443 freemsg(mp); 15444 tcp->tcp_drop_opt_ack_cnt--; 15445 return; 15446 } 15447 break; 15448 } 15449 if (tea->ERROR_prim == T_SVR4_OPTMGMT_REQ && 15450 tcp->tcp_drop_opt_ack_cnt > 0) { 15451 printf("T_SVR4_OPTMGMT_REQ failed %d/%d " 15452 "- dropped (cnt %d)\n", 15453 tea->TLI_error, tea->UNIX_error, 15454 tcp->tcp_drop_opt_ack_cnt); 15455 freemsg(mp); 15456 tcp->tcp_drop_opt_ack_cnt--; 15457 return; 15458 } 15459 break; 15460 case T_OPTMGMT_ACK: 15461 if (tcp->tcp_drop_opt_ack_cnt > 0) { 15462 /* T_OPTMGMT_REQ generated by TCP */ 15463 freemsg(mp); 15464 tcp->tcp_drop_opt_ack_cnt--; 15465 return; 15466 } 15467 break; 15468 default: 15469 break; 15470 } 15471 break; 15472 case M_CTL: 15473 /* 15474 * ICMP messages. 15475 */ 15476 tcp_icmp_error(tcp, mp); 15477 return; 15478 case M_FLUSH: 15479 if (*rptr & FLUSHR) 15480 flushq(q, FLUSHDATA); 15481 break; 15482 default: 15483 break; 15484 } 15485 /* 15486 * Make sure we set this bit before sending the ACK for 15487 * bind. 
Otherwise accept could possibly run and free 15488 * this tcp struct. 15489 */ 15490 putnext(q, mp); 15491 } 15492 15493 /* 15494 * Called as the result of a qbufcall or a qtimeout to remedy a failure 15495 * to allocate a T_ordrel_ind in tcp_rsrv(). qenable(q) will make 15496 * tcp_rsrv() try again. 15497 */ 15498 static void 15499 tcp_ordrel_kick(void *arg) 15500 { 15501 conn_t *connp = (conn_t *)arg; 15502 tcp_t *tcp = connp->conn_tcp; 15503 15504 tcp->tcp_ordrelid = 0; 15505 tcp->tcp_timeout = B_FALSE; 15506 if (!TCP_IS_DETACHED(tcp) && tcp->tcp_rq != NULL && 15507 tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 15508 qenable(tcp->tcp_rq); 15509 } 15510 } 15511 15512 /* ARGSUSED */ 15513 static void 15514 tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) 15515 { 15516 conn_t *connp = (conn_t *)arg; 15517 tcp_t *tcp = connp->conn_tcp; 15518 queue_t *q = tcp->tcp_rq; 15519 uint_t thwin; 15520 15521 freeb(mp); 15522 15523 TCP_STAT(tcp_rsrv_calls); 15524 15525 if (TCP_IS_DETACHED(tcp) || q == NULL) { 15526 return; 15527 } 15528 15529 if (tcp->tcp_fused) { 15530 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 15531 15532 ASSERT(tcp->tcp_fused); 15533 ASSERT(peer_tcp != NULL && peer_tcp->tcp_fused); 15534 ASSERT(peer_tcp->tcp_loopback_peer == tcp); 15535 ASSERT(!TCP_IS_DETACHED(tcp)); 15536 ASSERT(tcp->tcp_connp->conn_sqp == 15537 peer_tcp->tcp_connp->conn_sqp); 15538 15539 /* 15540 * Normally we would not get backenabled in synchronous 15541 * streams mode, but in case this happens, we need to plug 15542 * synchronous streams during our drain to prevent a race 15543 * with tcp_fuse_rrw() or tcp_fuse_rinfop(). 15544 */ 15545 TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); 15546 if (tcp->tcp_rcv_list != NULL) 15547 (void) tcp_rcv_drain(tcp->tcp_rq, tcp); 15548 15549 tcp_clrqfull(peer_tcp); 15550 TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); 15551 TCP_STAT(tcp_fusion_backenabled); 15552 return; 15553 } 15554 15555 if (canputnext(q)) { 15556 tcp->tcp_rwnd = q->q_hiwat; 15557 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 15558 << tcp->tcp_rcv_ws; 15559 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 15560 /* 15561 * Send back a window update immediately if TCP is above 15562 * ESTABLISHED state and the increase of the rcv window 15563 * that the other side knows is at least 1 MSS after flow 15564 * control is lifted. 15565 */ 15566 if (tcp->tcp_state >= TCPS_ESTABLISHED && 15567 (q->q_hiwat - thwin >= tcp->tcp_mss)) { 15568 tcp_xmit_ctl(NULL, tcp, 15569 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 15570 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 15571 BUMP_MIB(&tcp_mib, tcpOutWinUpdate); 15572 } 15573 } 15574 /* Handle a failure to allocate a T_ORDREL_IND here */ 15575 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 15576 ASSERT(tcp->tcp_listener == NULL); 15577 if (tcp->tcp_rcv_list != NULL) { 15578 (void) tcp_rcv_drain(q, tcp); 15579 } 15580 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 15581 mp = mi_tpi_ordrel_ind(); 15582 if (mp) { 15583 tcp->tcp_ordrel_done = B_TRUE; 15584 putnext(q, mp); 15585 if (tcp->tcp_deferred_clean_death) { 15586 /* 15587 * tcp_clean_death was deferred for 15588 * T_ORDREL_IND - do it now 15589 */ 15590 tcp->tcp_deferred_clean_death = B_FALSE; 15591 (void) tcp_clean_death(tcp, 15592 tcp->tcp_client_errno, 22); 15593 } 15594 } else if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) { 15595 /* 15596 * If there isn't already a timer running 15597 * start one. Use a 4 second 15598 * timer as a fallback since it can't fail. 
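 *
 * As an illustrative aside, the back-enable path above sends a window
 * update only when the window the peer currently knows about is at least
 * one MSS smaller than the newly reopened window.  A hypothetical
 * standalone sketch of that test (names invented) would be:
 *
 *	static boolean_t
 *	send_window_update(uint16_t advertised, int rcv_ws, uint32_t rnxt,
 *	    uint32_t rack, uint32_t new_rwnd, uint32_t mss)
 *	{
 *		// the window the peer still believes it has
 *		uint32_t peer_view = ((uint32_t)advertised << rcv_ws) -
 *		    (rnxt - rack);
 *
 *		return ((new_rwnd - peer_view >= mss) ? B_TRUE : B_FALSE);
 *	}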
15599 */ 15600 tcp->tcp_timeout = B_TRUE; 15601 tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick, 15602 MSEC_TO_TICK(4000)); 15603 } 15604 } 15605 } 15606 15607 /* 15608 * The read side service routine is called mostly when we get back-enabled as a 15609 * result of flow control relief. Since we don't actually queue anything in 15610 * TCP, we have no data to send out of here. What we do is clear the receive 15611 * window, and send out a window update. 15612 * This routine is also called to drive an orderly release message upstream 15613 * if the attempt in tcp_rput failed. 15614 */ 15615 static void 15616 tcp_rsrv(queue_t *q) 15617 { 15618 conn_t *connp = Q_TO_CONN(q); 15619 tcp_t *tcp = connp->conn_tcp; 15620 mblk_t *mp; 15621 15622 /* No code does a putq on the read side */ 15623 ASSERT(q->q_first == NULL); 15624 15625 /* Nothing to do for the default queue */ 15626 if (q == tcp_g_q) { 15627 return; 15628 } 15629 15630 mp = allocb(0, BPRI_HI); 15631 if (mp == NULL) { 15632 /* 15633 * We are under memory pressure. Return for now and 15634 * we will be called again later. 15635 */ 15636 if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) { 15637 /* 15638 * If there isn't already a timer running 15639 * start one. Use a 4 second 15640 * timer as a fallback since it can't fail. 15641 */ 15642 tcp->tcp_timeout = B_TRUE; 15643 tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick, 15644 MSEC_TO_TICK(4000)); 15645 } 15646 return; 15647 } 15648 CONN_INC_REF(connp); 15649 squeue_enter(connp->conn_sqp, mp, tcp_rsrv_input, connp, 15650 SQTAG_TCP_RSRV); 15651 } 15652 15653 /* 15654 * tcp_rwnd_set() is called to adjust the receive window to a desired value. 15655 * We do not allow the receive window to shrink. After setting rwnd, 15656 * set the flow control hiwat of the stream. 15657 * 15658 * This function is called in 2 cases: 15659 * 15660 * 1) Before data transfer begins, in tcp_accept_comm() for accepting a 15661 * connection (passive open) and in tcp_rput_data() for active connect. 15662 * This is called after tcp_mss_set() when the desired MSS value is known. 15663 * This makes sure that our window size is a multiple of the other side's 15664 * MSS. 15665 * 2) Handling SO_RCVBUF option. 15666 * 15667 * It is ASSUMED that the requested size is a multiple of the current MSS. 15668 * 15669 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the 15670 * user so requests. 15671 */ 15672 static int 15673 tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) 15674 { 15675 uint32_t mss = tcp->tcp_mss; 15676 uint32_t old_max_rwnd; 15677 uint32_t max_transmittable_rwnd; 15678 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 15679 15680 if (tcp->tcp_fused) { 15681 size_t sth_hiwat; 15682 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 15683 15684 ASSERT(peer_tcp != NULL); 15685 /* 15686 * Record the stream head's high water mark for 15687 * this endpoint; this is used for flow-control 15688 * purposes in tcp_fuse_output(). 15689 */ 15690 sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); 15691 if (!tcp_detached) 15692 (void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat); 15693 15694 /* 15695 * In the fusion case, the maxpsz stream head value of 15696 * our peer is set according to its send buffer size 15697 * and our receive buffer size; since the latter may 15698 * have changed we need to update the peer's maxpsz.
15699 */ 15700 (void) tcp_maxpsz_set(peer_tcp, B_TRUE); 15701 return (rwnd); 15702 } 15703 15704 if (tcp_detached) 15705 old_max_rwnd = tcp->tcp_rwnd; 15706 else 15707 old_max_rwnd = tcp->tcp_rq->q_hiwat; 15708 15709 /* 15710 * Insist on a receive window that is at least 15711 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid 15712 * funny TCP interactions of Nagle algorithm, SWS avoidance 15713 * and delayed acknowledgement. 15714 */ 15715 rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss); 15716 15717 /* 15718 * If window size info has already been exchanged, TCP should not 15719 * shrink the window. Shrinking window is doable if done carefully. 15720 * We may add that support later. But so far there is not a real 15721 * need to do that. 15722 */ 15723 if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) { 15724 /* MSS may have changed, do a round up again. */ 15725 rwnd = MSS_ROUNDUP(old_max_rwnd, mss); 15726 } 15727 15728 /* 15729 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check 15730 * can be applied even before the window scale option is decided. 15731 */ 15732 max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws; 15733 if (rwnd > max_transmittable_rwnd) { 15734 rwnd = max_transmittable_rwnd - 15735 (max_transmittable_rwnd % mss); 15736 if (rwnd < mss) 15737 rwnd = max_transmittable_rwnd; 15738 /* 15739 * If we're over the limit we may have to back down tcp_rwnd. 15740 * The increment below won't work for us. So we set all three 15741 * here and the increment below will have no effect. 15742 */ 15743 tcp->tcp_rwnd = old_max_rwnd = rwnd; 15744 } 15745 if (tcp->tcp_localnet) { 15746 tcp->tcp_rack_abs_max = 15747 MIN(tcp_local_dacks_max, rwnd / mss / 2); 15748 } else { 15749 /* 15750 * For a remote host on a different subnet (through a router), 15751 * we ack every other packet to be conforming to RFC1122. 15752 * tcp_deferred_acks_max is default to 2. 15753 */ 15754 tcp->tcp_rack_abs_max = 15755 MIN(tcp_deferred_acks_max, rwnd / mss / 2); 15756 } 15757 if (tcp->tcp_rack_cur_max > tcp->tcp_rack_abs_max) 15758 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 15759 else 15760 tcp->tcp_rack_cur_max = 0; 15761 /* 15762 * Increment the current rwnd by the amount the maximum grew (we 15763 * can not overwrite it since we might be in the middle of a 15764 * connection.) 15765 */ 15766 tcp->tcp_rwnd += rwnd - old_max_rwnd; 15767 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 15768 if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 15769 tcp->tcp_cwnd_max = rwnd; 15770 15771 if (tcp_detached) 15772 return (rwnd); 15773 /* 15774 * We set the maximum receive window into rq->q_hiwat. 15775 * This is not actually used for flow control. 15776 */ 15777 tcp->tcp_rq->q_hiwat = rwnd; 15778 /* 15779 * Set the Stream head high water mark. This doesn't have to be 15780 * here, since we are simply using default values, but we would 15781 * prefer to choose these values algorithmically, with a likely 15782 * relationship to rwnd. 15783 */ 15784 (void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd, tcp_sth_rcv_hiwat)); 15785 return (rwnd); 15786 } 15787 15788 /* 15789 * Return SNMP stuff in buffer in mpdata. 
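 *
 * As an illustrative aside, the clamping just performed by tcp_rwnd_set()
 * reduces to "no smaller than a few MSS, never shrink an established
 * window, and never exceed what the window-scale option can express".
 * A hypothetical standalone sketch (names invented) would be:
 *
 *	static uint32_t
 *	clamp_rwnd(uint32_t req, uint32_t old_max, uint32_t mss, int rcv_ws,
 *	    uint32_t min_mss_count, boolean_t win_already_advertised)
 *	{
 *		uint32_t max_expr = (uint32_t)TCP_MAXWIN << rcv_ws;
 *
 *		if (req < min_mss_count * mss)		// enforce the floor
 *			req = min_mss_count * mss;
 *		if (win_already_advertised && req < old_max)	// never shrink
 *			req = ((old_max + mss - 1) / mss) * mss;
 *		if (req > max_expr) {		// window-scale limit
 *			req = max_expr - (max_expr % mss);
 *			if (req < mss)
 *				req = max_expr;
 *		}
 *		return (req);
 *	}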
15790 */ 15791 int 15792 tcp_snmp_get(queue_t *q, mblk_t *mpctl) 15793 { 15794 mblk_t *mpdata; 15795 mblk_t *mp_conn_ctl = NULL; 15796 mblk_t *mp_conn_tail; 15797 mblk_t *mp_attr_ctl = NULL; 15798 mblk_t *mp_attr_tail; 15799 mblk_t *mp6_conn_ctl = NULL; 15800 mblk_t *mp6_conn_tail; 15801 mblk_t *mp6_attr_ctl = NULL; 15802 mblk_t *mp6_attr_tail; 15803 struct opthdr *optp; 15804 mib2_tcpConnEntry_t tce; 15805 mib2_tcp6ConnEntry_t tce6; 15806 mib2_transportMLPEntry_t mlp; 15807 connf_t *connfp; 15808 conn_t *connp; 15809 int i; 15810 boolean_t ispriv; 15811 zoneid_t zoneid; 15812 int v4_conn_idx; 15813 int v6_conn_idx; 15814 15815 if (mpctl == NULL || 15816 (mpdata = mpctl->b_cont) == NULL || 15817 (mp_conn_ctl = copymsg(mpctl)) == NULL || 15818 (mp_attr_ctl = copymsg(mpctl)) == NULL || 15819 (mp6_conn_ctl = copymsg(mpctl)) == NULL || 15820 (mp6_attr_ctl = copymsg(mpctl)) == NULL) { 15821 freemsg(mp_conn_ctl); 15822 freemsg(mp_attr_ctl); 15823 freemsg(mp6_conn_ctl); 15824 freemsg(mp6_attr_ctl); 15825 return (0); 15826 } 15827 15828 /* build table of connections -- need count in fixed part */ 15829 SET_MIB(tcp_mib.tcpRtoAlgorithm, 4); /* vanj */ 15830 SET_MIB(tcp_mib.tcpRtoMin, tcp_rexmit_interval_min); 15831 SET_MIB(tcp_mib.tcpRtoMax, tcp_rexmit_interval_max); 15832 SET_MIB(tcp_mib.tcpMaxConn, -1); 15833 SET_MIB(tcp_mib.tcpCurrEstab, 0); 15834 15835 ispriv = 15836 secpolicy_net_config((Q_TO_CONN(q))->conn_cred, B_TRUE) == 0; 15837 zoneid = Q_TO_CONN(q)->conn_zoneid; 15838 15839 v4_conn_idx = v6_conn_idx = 0; 15840 mp_conn_tail = mp_attr_tail = mp6_conn_tail = mp6_attr_tail = NULL; 15841 15842 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 15843 15844 connfp = &ipcl_globalhash_fanout[i]; 15845 15846 connp = NULL; 15847 15848 while ((connp = 15849 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 15850 tcp_t *tcp; 15851 boolean_t needattr; 15852 15853 if (connp->conn_zoneid != zoneid) 15854 continue; /* not in this zone */ 15855 15856 tcp = connp->conn_tcp; 15857 UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs); 15858 tcp->tcp_ibsegs = 0; 15859 UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs); 15860 tcp->tcp_obsegs = 0; 15861 15862 tce6.tcp6ConnState = tce.tcpConnState = 15863 tcp_snmp_state(tcp); 15864 if (tce.tcpConnState == MIB2_TCP_established || 15865 tce.tcpConnState == MIB2_TCP_closeWait) 15866 BUMP_MIB(&tcp_mib, tcpCurrEstab); 15867 15868 needattr = B_FALSE; 15869 bzero(&mlp, sizeof (mlp)); 15870 if (connp->conn_mlp_type != mlptSingle) { 15871 if (connp->conn_mlp_type == mlptShared || 15872 connp->conn_mlp_type == mlptBoth) 15873 mlp.tme_flags |= MIB2_TMEF_SHARED; 15874 if (connp->conn_mlp_type == mlptPrivate || 15875 connp->conn_mlp_type == mlptBoth) 15876 mlp.tme_flags |= MIB2_TMEF_PRIVATE; 15877 needattr = B_TRUE; 15878 } 15879 if (connp->conn_peercred != NULL) { 15880 ts_label_t *tsl; 15881 15882 tsl = crgetlabel(connp->conn_peercred); 15883 mlp.tme_doi = label2doi(tsl); 15884 mlp.tme_label = *label2bslabel(tsl); 15885 needattr = B_TRUE; 15886 } 15887 15888 /* Create a message to report on IPv6 entries */ 15889 if (tcp->tcp_ipversion == IPV6_VERSION) { 15890 tce6.tcp6ConnLocalAddress = tcp->tcp_ip_src_v6; 15891 tce6.tcp6ConnRemAddress = tcp->tcp_remote_v6; 15892 tce6.tcp6ConnLocalPort = ntohs(tcp->tcp_lport); 15893 tce6.tcp6ConnRemPort = ntohs(tcp->tcp_fport); 15894 tce6.tcp6ConnIfIndex = tcp->tcp_bound_if; 15895 /* Don't want just anybody seeing these... 
*/ 15896 if (ispriv) { 15897 tce6.tcp6ConnEntryInfo.ce_snxt = 15898 tcp->tcp_snxt; 15899 tce6.tcp6ConnEntryInfo.ce_suna = 15900 tcp->tcp_suna; 15901 tce6.tcp6ConnEntryInfo.ce_rnxt = 15902 tcp->tcp_rnxt; 15903 tce6.tcp6ConnEntryInfo.ce_rack = 15904 tcp->tcp_rack; 15905 } else { 15906 /* 15907 * Netstat, unfortunately, uses this to 15908 * get send/receive queue sizes. How to fix? 15909 * Why not compute the difference only? 15910 */ 15911 tce6.tcp6ConnEntryInfo.ce_snxt = 15912 tcp->tcp_snxt - tcp->tcp_suna; 15913 tce6.tcp6ConnEntryInfo.ce_suna = 0; 15914 tce6.tcp6ConnEntryInfo.ce_rnxt = 15915 tcp->tcp_rnxt - tcp->tcp_rack; 15916 tce6.tcp6ConnEntryInfo.ce_rack = 0; 15917 } 15918 15919 tce6.tcp6ConnEntryInfo.ce_swnd = tcp->tcp_swnd; 15920 tce6.tcp6ConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; 15921 tce6.tcp6ConnEntryInfo.ce_rto = tcp->tcp_rto; 15922 tce6.tcp6ConnEntryInfo.ce_mss = tcp->tcp_mss; 15923 tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; 15924 15925 (void) snmp_append_data2(mp6_conn_ctl->b_cont, 15926 &mp6_conn_tail, (char *)&tce6, sizeof (tce6)); 15927 15928 mlp.tme_connidx = v6_conn_idx++; 15929 if (needattr) 15930 (void) snmp_append_data2(mp6_attr_ctl->b_cont, 15931 &mp6_attr_tail, (char *)&mlp, sizeof (mlp)); 15932 } 15933 /* 15934 * Create an IPv4 table entry for IPv4 entries and also 15935 * for IPv6 entries which are bound to in6addr_any 15936 * but don't have IPV6_V6ONLY set. 15937 * (i.e. anything an IPv4 peer could connect to) 15938 */ 15939 if (tcp->tcp_ipversion == IPV4_VERSION || 15940 (tcp->tcp_state <= TCPS_LISTEN && 15941 !tcp->tcp_connp->conn_ipv6_v6only && 15942 IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6))) { 15943 if (tcp->tcp_ipversion == IPV6_VERSION) { 15944 tce.tcpConnRemAddress = INADDR_ANY; 15945 tce.tcpConnLocalAddress = INADDR_ANY; 15946 } else { 15947 tce.tcpConnRemAddress = 15948 tcp->tcp_remote; 15949 tce.tcpConnLocalAddress = 15950 tcp->tcp_ip_src; 15951 } 15952 tce.tcpConnLocalPort = ntohs(tcp->tcp_lport); 15953 tce.tcpConnRemPort = ntohs(tcp->tcp_fport); 15954 /* Don't want just anybody seeing these... */ 15955 if (ispriv) { 15956 tce.tcpConnEntryInfo.ce_snxt = 15957 tcp->tcp_snxt; 15958 tce.tcpConnEntryInfo.ce_suna = 15959 tcp->tcp_suna; 15960 tce.tcpConnEntryInfo.ce_rnxt = 15961 tcp->tcp_rnxt; 15962 tce.tcpConnEntryInfo.ce_rack = 15963 tcp->tcp_rack; 15964 } else { 15965 /* 15966 * Netstat, unfortunately, uses this to 15967 * get send/receive queue sizes. How 15968 * to fix? 15969 * Why not compute the difference only? 
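 *
 * As an illustrative aside, the pattern here is "raw sequence numbers for
 * privileged observers only; everyone else sees just the queue depths".
 * A hypothetical standalone sketch (names invented) would be:
 *
 *	static void
 *	fill_seq_info(boolean_t privileged, uint32_t snxt, uint32_t suna,
 *	    uint32_t rnxt, uint32_t rack, uint32_t *o_snxt, uint32_t *o_suna,
 *	    uint32_t *o_rnxt, uint32_t *o_rack)
 *	{
 *		if (privileged) {
 *			*o_snxt = snxt;
 *			*o_suna = suna;
 *			*o_rnxt = rnxt;
 *			*o_rack = rack;
 *		} else {
 *			*o_snxt = snxt - suna;	// unacked bytes queued to send
 *			*o_suna = 0;
 *			*o_rnxt = rnxt - rack;	// received bytes not yet acked
 *			*o_rack = 0;
 *		}
 *	}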
15970 */ 15971 tce.tcpConnEntryInfo.ce_snxt = 15972 tcp->tcp_snxt - tcp->tcp_suna; 15973 tce.tcpConnEntryInfo.ce_suna = 0; 15974 tce.tcpConnEntryInfo.ce_rnxt = 15975 tcp->tcp_rnxt - tcp->tcp_rack; 15976 tce.tcpConnEntryInfo.ce_rack = 0; 15977 } 15978 15979 tce.tcpConnEntryInfo.ce_swnd = tcp->tcp_swnd; 15980 tce.tcpConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; 15981 tce.tcpConnEntryInfo.ce_rto = tcp->tcp_rto; 15982 tce.tcpConnEntryInfo.ce_mss = tcp->tcp_mss; 15983 tce.tcpConnEntryInfo.ce_state = 15984 tcp->tcp_state; 15985 15986 (void) snmp_append_data2(mp_conn_ctl->b_cont, 15987 &mp_conn_tail, (char *)&tce, sizeof (tce)); 15988 15989 mlp.tme_connidx = v4_conn_idx++; 15990 if (needattr) 15991 (void) snmp_append_data2( 15992 mp_attr_ctl->b_cont, 15993 &mp_attr_tail, (char *)&mlp, 15994 sizeof (mlp)); 15995 } 15996 } 15997 } 15998 15999 /* fixed length structure for IPv4 and IPv6 counters */ 16000 SET_MIB(tcp_mib.tcpConnTableSize, sizeof (mib2_tcpConnEntry_t)); 16001 SET_MIB(tcp_mib.tcp6ConnTableSize, sizeof (mib2_tcp6ConnEntry_t)); 16002 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 16003 optp->level = MIB2_TCP; 16004 optp->name = 0; 16005 (void) snmp_append_data(mpdata, (char *)&tcp_mib, sizeof (tcp_mib)); 16006 optp->len = msgdsize(mpdata); 16007 qreply(q, mpctl); 16008 16009 /* table of connections... */ 16010 optp = (struct opthdr *)&mp_conn_ctl->b_rptr[ 16011 sizeof (struct T_optmgmt_ack)]; 16012 optp->level = MIB2_TCP; 16013 optp->name = MIB2_TCP_CONN; 16014 optp->len = msgdsize(mp_conn_ctl->b_cont); 16015 qreply(q, mp_conn_ctl); 16016 16017 /* table of MLP attributes... */ 16018 optp = (struct opthdr *)&mp_attr_ctl->b_rptr[ 16019 sizeof (struct T_optmgmt_ack)]; 16020 optp->level = MIB2_TCP; 16021 optp->name = EXPER_XPORT_MLP; 16022 optp->len = msgdsize(mp_attr_ctl->b_cont); 16023 if (optp->len == 0) 16024 freemsg(mp_attr_ctl); 16025 else 16026 qreply(q, mp_attr_ctl); 16027 16028 /* table of IPv6 connections... */ 16029 optp = (struct opthdr *)&mp6_conn_ctl->b_rptr[ 16030 sizeof (struct T_optmgmt_ack)]; 16031 optp->level = MIB2_TCP6; 16032 optp->name = MIB2_TCP6_CONN; 16033 optp->len = msgdsize(mp6_conn_ctl->b_cont); 16034 qreply(q, mp6_conn_ctl); 16035 16036 /* table of IPv6 MLP attributes... */ 16037 optp = (struct opthdr *)&mp6_attr_ctl->b_rptr[ 16038 sizeof (struct T_optmgmt_ack)]; 16039 optp->level = MIB2_TCP6; 16040 optp->name = EXPER_XPORT_MLP; 16041 optp->len = msgdsize(mp6_attr_ctl->b_cont); 16042 if (optp->len == 0) 16043 freemsg(mp6_attr_ctl); 16044 else 16045 qreply(q, mp6_attr_ctl); 16046 return (1); 16047 } 16048 16049 /* Return 0 if invalid set request, 1 otherwise, including non-tcp requests */ 16050 /* ARGSUSED */ 16051 int 16052 tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) 16053 { 16054 mib2_tcpConnEntry_t *tce = (mib2_tcpConnEntry_t *)ptr; 16055 16056 switch (level) { 16057 case MIB2_TCP: 16058 switch (name) { 16059 case 13: 16060 if (tce->tcpConnState != MIB2_TCP_deleteTCB) 16061 return (0); 16062 /* TODO: delete entry defined by tce */ 16063 return (1); 16064 default: 16065 return (0); 16066 } 16067 default: 16068 return (1); 16069 } 16070 } 16071 16072 /* Translate TCP state to MIB2 TCP state. 
*/ 16073 static int 16074 tcp_snmp_state(tcp_t *tcp) 16075 { 16076 if (tcp == NULL) 16077 return (0); 16078 16079 switch (tcp->tcp_state) { 16080 case TCPS_CLOSED: 16081 case TCPS_IDLE: /* RFC1213 doesn't have analogue for IDLE & BOUND */ 16082 case TCPS_BOUND: 16083 return (MIB2_TCP_closed); 16084 case TCPS_LISTEN: 16085 return (MIB2_TCP_listen); 16086 case TCPS_SYN_SENT: 16087 return (MIB2_TCP_synSent); 16088 case TCPS_SYN_RCVD: 16089 return (MIB2_TCP_synReceived); 16090 case TCPS_ESTABLISHED: 16091 return (MIB2_TCP_established); 16092 case TCPS_CLOSE_WAIT: 16093 return (MIB2_TCP_closeWait); 16094 case TCPS_FIN_WAIT_1: 16095 return (MIB2_TCP_finWait1); 16096 case TCPS_CLOSING: 16097 return (MIB2_TCP_closing); 16098 case TCPS_LAST_ACK: 16099 return (MIB2_TCP_lastAck); 16100 case TCPS_FIN_WAIT_2: 16101 return (MIB2_TCP_finWait2); 16102 case TCPS_TIME_WAIT: 16103 return (MIB2_TCP_timeWait); 16104 default: 16105 return (0); 16106 } 16107 } 16108 16109 static char tcp_report_header[] = 16110 "TCP " MI_COL_HDRPAD_STR 16111 "zone dest snxt suna " 16112 "swnd rnxt rack rwnd rto mss w sw rw t " 16113 "recent [lport,fport] state"; 16114 16115 /* 16116 * TCP status report triggered via the Named Dispatch mechanism. 16117 */ 16118 /* ARGSUSED */ 16119 static void 16120 tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, tcp_t *thisstream, 16121 cred_t *cr) 16122 { 16123 char hash[10], addrbuf[INET6_ADDRSTRLEN]; 16124 boolean_t ispriv = secpolicy_net_config(cr, B_TRUE) == 0; 16125 char cflag; 16126 in6_addr_t v6dst; 16127 char buf[80]; 16128 uint_t print_len, buf_len; 16129 16130 buf_len = mp->b_datap->db_lim - mp->b_wptr; 16131 if (buf_len <= 0) 16132 return; 16133 16134 if (hashval >= 0) 16135 (void) sprintf(hash, "%03d ", hashval); 16136 else 16137 hash[0] = '\0'; 16138 16139 /* 16140 * Note that we use the remote address in the tcp_b structure. 16141 * This means that it will print out the real destination address, 16142 * not the next hop's address if source routing is used. This 16143 * avoids confusion in the output because the user may not 16144 * know that source routing is used for the connection. 16145 */ 16146 if (tcp->tcp_ipversion == IPV4_VERSION) { 16147 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &v6dst); 16148 } else { 16149 v6dst = tcp->tcp_remote_v6; 16150 } 16151 (void) inet_ntop(AF_INET6, &v6dst, addrbuf, sizeof (addrbuf)); 16152 /* 16153 * The ispriv checks are so that normal users cannot determine 16154 * sequence number information using NDD. 16155 */ 16156 16157 if (TCP_IS_DETACHED(tcp)) 16158 cflag = '*'; 16159 else 16160 cflag = ' '; 16161 print_len = snprintf((char *)mp->b_wptr, buf_len, 16162 "%s " MI_COL_PTRFMT_STR "%d %s %08x %08x %010d %08x %08x " 16163 "%010d %05ld %05d %1d %02d %02d %1d %08x %s%c\n", 16164 hash, 16165 (void *)tcp, 16166 tcp->tcp_connp->conn_zoneid, 16167 addrbuf, 16168 (ispriv) ? tcp->tcp_snxt : 0, 16169 (ispriv) ? tcp->tcp_suna : 0, 16170 tcp->tcp_swnd, 16171 (ispriv) ? tcp->tcp_rnxt : 0, 16172 (ispriv) ? tcp->tcp_rack : 0, 16173 tcp->tcp_rwnd, 16174 tcp->tcp_rto, 16175 tcp->tcp_mss, 16176 tcp->tcp_snd_ws_ok, 16177 tcp->tcp_snd_ws, 16178 tcp->tcp_rcv_ws, 16179 tcp->tcp_snd_ts_ok, 16180 tcp->tcp_ts_recent, 16181 tcp_display(tcp, buf, DISP_PORT_ONLY), cflag); 16182 if (print_len < buf_len) { 16183 ((mblk_t *)mp)->b_wptr += print_len; 16184 } else { 16185 ((mblk_t *)mp)->b_wptr += buf_len; 16186 } 16187 } 16188 16189 /* 16190 * TCP status report (for listeners only) triggered via the Named Dispatch 16191 * mechanism.
16192 */ 16193 /* ARGSUSED */ 16194 static void 16195 tcp_report_listener(mblk_t *mp, tcp_t *tcp, int hashval) 16196 { 16197 char addrbuf[INET6_ADDRSTRLEN]; 16198 in6_addr_t v6dst; 16199 uint_t print_len, buf_len; 16200 16201 buf_len = mp->b_datap->db_lim - mp->b_wptr; 16202 if (buf_len <= 0) 16203 return; 16204 16205 if (tcp->tcp_ipversion == IPV4_VERSION) { 16206 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6dst); 16207 (void) inet_ntop(AF_INET6, &v6dst, addrbuf, sizeof (addrbuf)); 16208 } else { 16209 (void) inet_ntop(AF_INET6, &tcp->tcp_ip6h->ip6_src, 16210 addrbuf, sizeof (addrbuf)); 16211 } 16212 print_len = snprintf((char *)mp->b_wptr, buf_len, 16213 "%03d " 16214 MI_COL_PTRFMT_STR 16215 "%d %s %05u %08u %d/%d/%d%c\n", 16216 hashval, (void *)tcp, 16217 tcp->tcp_connp->conn_zoneid, 16218 addrbuf, 16219 (uint_t)BE16_TO_U16(tcp->tcp_tcph->th_lport), 16220 tcp->tcp_conn_req_seqnum, 16221 tcp->tcp_conn_req_cnt_q0, tcp->tcp_conn_req_cnt_q, 16222 tcp->tcp_conn_req_max, 16223 tcp->tcp_syn_defense ? '*' : ' '); 16224 if (print_len < buf_len) { 16225 ((mblk_t *)mp)->b_wptr += print_len; 16226 } else { 16227 ((mblk_t *)mp)->b_wptr += buf_len; 16228 } 16229 } 16230 16231 /* TCP status report triggered via the Named Dispatch mechanism. */ 16232 /* ARGSUSED */ 16233 static int 16234 tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16235 { 16236 tcp_t *tcp; 16237 int i; 16238 conn_t *connp; 16239 connf_t *connfp; 16240 zoneid_t zoneid; 16241 16242 /* 16243 * Because of the ndd constraint, at most we can have 64K buffer 16244 * to put in all TCP info. So to be more efficient, just 16245 * allocate a 64K buffer here, assuming we need that large buffer. 16246 * This may be a problem as any user can read tcp_status. Therefore 16247 * we limit the rate of doing this using tcp_ndd_get_info_interval. 16248 * This should be OK as normal users should not do this too often. 16249 */ 16250 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16251 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16252 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16253 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16254 return (0); 16255 } 16256 } 16257 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16258 /* The following may work even if we cannot get a large buf. */ 16259 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16260 return (0); 16261 } 16262 16263 (void) mi_mpprintf(mp, "%s", tcp_report_header); 16264 16265 zoneid = Q_TO_CONN(q)->conn_zoneid; 16266 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 16267 16268 connfp = &ipcl_globalhash_fanout[i]; 16269 16270 connp = NULL; 16271 16272 while ((connp = 16273 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 16274 tcp = connp->conn_tcp; 16275 if (zoneid != GLOBAL_ZONEID && 16276 zoneid != connp->conn_zoneid) 16277 continue; 16278 tcp_report_item(mp->b_cont, tcp, -1, tcp, 16279 cr); 16280 } 16281 16282 } 16283 16284 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16285 return (0); 16286 } 16287 16288 /* TCP status report triggered via the Named Dispatch mechanism. */ 16289 /* ARGSUSED */ 16290 static int 16291 tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16292 { 16293 tf_t *tbf; 16294 tcp_t *tcp; 16295 int i; 16296 zoneid_t zoneid; 16297 16298 /* Refer to comments in tcp_status_report(). 
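 *
 * As an illustrative aside, the unprivileged-access rate limit shared by
 * these report functions is just an "enough ticks since the last report"
 * check against lbolt.  A hypothetical standalone sketch (names invented;
 * the real code updates the timestamp after a successful report) would be:
 *
 *	static boolean_t
 *	report_rate_ok(clock_t now, clock_t *last_report,
 *	    clock_t min_interval_ticks)
 *	{
 *		if (now - *last_report < min_interval_ticks)
 *			return (B_FALSE);	// too soon, refuse this one
 *		*last_report = now;
 *		return (B_TRUE);
 *	}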
*/ 16299 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16300 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16301 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16302 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16303 return (0); 16304 } 16305 } 16306 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16307 /* The following may work even if we cannot get a large buf. */ 16308 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16309 return (0); 16310 } 16311 16312 (void) mi_mpprintf(mp, " %s", tcp_report_header); 16313 16314 zoneid = Q_TO_CONN(q)->conn_zoneid; 16315 16316 for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { 16317 tbf = &tcp_bind_fanout[i]; 16318 mutex_enter(&tbf->tf_lock); 16319 for (tcp = tbf->tf_tcp; tcp != NULL; 16320 tcp = tcp->tcp_bind_hash) { 16321 if (zoneid != GLOBAL_ZONEID && 16322 zoneid != tcp->tcp_connp->conn_zoneid) 16323 continue; 16324 CONN_INC_REF(tcp->tcp_connp); 16325 tcp_report_item(mp->b_cont, tcp, i, 16326 Q_TO_TCP(q), cr); 16327 CONN_DEC_REF(tcp->tcp_connp); 16328 } 16329 mutex_exit(&tbf->tf_lock); 16330 } 16331 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16332 return (0); 16333 } 16334 16335 /* TCP status report triggered via the Named Dispatch mechanism. */ 16336 /* ARGSUSED */ 16337 static int 16338 tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16339 { 16340 connf_t *connfp; 16341 conn_t *connp; 16342 tcp_t *tcp; 16343 int i; 16344 zoneid_t zoneid; 16345 16346 /* Refer to comments in tcp_status_report(). */ 16347 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16348 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16349 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16350 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16351 return (0); 16352 } 16353 } 16354 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16355 /* The following may work even if we cannot get a large buf. */ 16356 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16357 return (0); 16358 } 16359 16360 (void) mi_mpprintf(mp, 16361 " TCP " MI_COL_HDRPAD_STR 16362 "zone IP addr port seqnum backlog (q0/q/max)"); 16363 16364 zoneid = Q_TO_CONN(q)->conn_zoneid; 16365 16366 for (i = 0; i < ipcl_bind_fanout_size; i++) { 16367 connfp = &ipcl_bind_fanout[i]; 16368 connp = NULL; 16369 while ((connp = 16370 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 16371 tcp = connp->conn_tcp; 16372 if (zoneid != GLOBAL_ZONEID && 16373 zoneid != connp->conn_zoneid) 16374 continue; 16375 tcp_report_listener(mp->b_cont, tcp, i); 16376 } 16377 } 16378 16379 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16380 return (0); 16381 } 16382 16383 /* TCP status report triggered via the Named Dispatch mechanism. */ 16384 /* ARGSUSED */ 16385 static int 16386 tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16387 { 16388 connf_t *connfp; 16389 conn_t *connp; 16390 tcp_t *tcp; 16391 int i; 16392 zoneid_t zoneid; 16393 16394 /* Refer to comments in tcp_status_report(). */ 16395 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16396 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16397 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16398 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16399 return (0); 16400 } 16401 } 16402 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16403 /* The following may work even if we cannot get a large buf. 
*/ 16404 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16405 return (0); 16406 } 16407 16408 (void) mi_mpprintf(mp, "tcp_conn_hash_size = %d", 16409 ipcl_conn_fanout_size); 16410 (void) mi_mpprintf(mp, " %s", tcp_report_header); 16411 16412 zoneid = Q_TO_CONN(q)->conn_zoneid; 16413 16414 for (i = 0; i < ipcl_conn_fanout_size; i++) { 16415 connfp = &ipcl_conn_fanout[i]; 16416 connp = NULL; 16417 while ((connp = 16418 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 16419 tcp = connp->conn_tcp; 16420 if (zoneid != GLOBAL_ZONEID && 16421 zoneid != connp->conn_zoneid) 16422 continue; 16423 tcp_report_item(mp->b_cont, tcp, i, 16424 Q_TO_TCP(q), cr); 16425 } 16426 } 16427 16428 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16429 return (0); 16430 } 16431 16432 /* TCP status report triggered via the Named Dispatch mechanism. */ 16433 /* ARGSUSED */ 16434 static int 16435 tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16436 { 16437 tf_t *tf; 16438 tcp_t *tcp; 16439 int i; 16440 zoneid_t zoneid; 16441 16442 /* Refer to comments in tcp_status_report(). */ 16443 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16444 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16445 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16446 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16447 return (0); 16448 } 16449 } 16450 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16451 /* The following may work even if we cannot get a large buf. */ 16452 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16453 return (0); 16454 } 16455 16456 (void) mi_mpprintf(mp, " %s", tcp_report_header); 16457 16458 zoneid = Q_TO_CONN(q)->conn_zoneid; 16459 16460 for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { 16461 tf = &tcp_acceptor_fanout[i]; 16462 mutex_enter(&tf->tf_lock); 16463 for (tcp = tf->tf_tcp; tcp != NULL; 16464 tcp = tcp->tcp_acceptor_hash) { 16465 if (zoneid != GLOBAL_ZONEID && 16466 zoneid != tcp->tcp_connp->conn_zoneid) 16467 continue; 16468 tcp_report_item(mp->b_cont, tcp, i, 16469 Q_TO_TCP(q), cr); 16470 } 16471 mutex_exit(&tf->tf_lock); 16472 } 16473 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16474 return (0); 16475 } 16476 16477 /* 16478 * tcp_timer is the timer service routine. It handles the retransmission, 16479 * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out 16480 * from the state of the tcp instance what kind of action needs to be done 16481 * at the time it is called. 
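 *
 * For the retransmission case handled below, the routine first checks
 * whether it simply fired early: the time the oldest unacknowledged
 * segment has been waiting is compared against the current RTO and, if
 * more than one tick of RTO is still left, the timer is just re-armed.
 * A simplified, msec-based sketch of that check (hypothetical helper;
 * the real code works in lbolt ticks via TICK_TO_MSEC()):
 *
 *     // Returns the msec still to wait (re-arm with this), 0 if expired.
 *     static unsigned int
 *     rexmit_time_left(unsigned int now, unsigned int sent_time,
 *         unsigned int rto, unsigned int msec_per_tick)
 *     {
 *             unsigned int waited = now - sent_time;
 *             unsigned int left = (waited < rto) ? rto - waited : 0;
 *
 *             return ((left > msec_per_tick) ? left : 0);
 *     }
 *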
16482 */ 16483 static void 16484 tcp_timer(void *arg) 16485 { 16486 mblk_t *mp; 16487 clock_t first_threshold; 16488 clock_t second_threshold; 16489 clock_t ms; 16490 uint32_t mss; 16491 conn_t *connp = (conn_t *)arg; 16492 tcp_t *tcp = connp->conn_tcp; 16493 16494 tcp->tcp_timer_tid = 0; 16495 16496 if (tcp->tcp_fused) 16497 return; 16498 16499 first_threshold = tcp->tcp_first_timer_threshold; 16500 second_threshold = tcp->tcp_second_timer_threshold; 16501 switch (tcp->tcp_state) { 16502 case TCPS_IDLE: 16503 case TCPS_BOUND: 16504 case TCPS_LISTEN: 16505 return; 16506 case TCPS_SYN_RCVD: { 16507 tcp_t *listener = tcp->tcp_listener; 16508 16509 if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) { 16510 ASSERT(tcp->tcp_rq == listener->tcp_rq); 16511 /* it's our first timeout */ 16512 tcp->tcp_syn_rcvd_timeout = 1; 16513 mutex_enter(&listener->tcp_eager_lock); 16514 listener->tcp_syn_rcvd_timeout++; 16515 if (!listener->tcp_syn_defense && 16516 (listener->tcp_syn_rcvd_timeout > 16517 (tcp_conn_req_max_q0 >> 2)) && 16518 (tcp_conn_req_max_q0 > 200)) { 16519 /* We may be under attack. Put on a defense. */ 16520 listener->tcp_syn_defense = B_TRUE; 16521 cmn_err(CE_WARN, "High TCP connect timeout " 16522 "rate! System (port %d) may be under a " 16523 "SYN flood attack!", 16524 BE16_TO_U16(listener->tcp_tcph->th_lport)); 16525 16526 listener->tcp_ip_addr_cache = kmem_zalloc( 16527 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t), 16528 KM_NOSLEEP); 16529 } 16530 mutex_exit(&listener->tcp_eager_lock); 16531 } 16532 } 16533 /* FALLTHRU */ 16534 case TCPS_SYN_SENT: 16535 first_threshold = tcp->tcp_first_ctimer_threshold; 16536 second_threshold = tcp->tcp_second_ctimer_threshold; 16537 break; 16538 case TCPS_ESTABLISHED: 16539 case TCPS_FIN_WAIT_1: 16540 case TCPS_CLOSING: 16541 case TCPS_CLOSE_WAIT: 16542 case TCPS_LAST_ACK: 16543 /* If we have data to rexmit */ 16544 if (tcp->tcp_suna != tcp->tcp_snxt) { 16545 clock_t time_to_wait; 16546 16547 BUMP_MIB(&tcp_mib, tcpTimRetrans); 16548 if (!tcp->tcp_xmit_head) 16549 break; 16550 time_to_wait = lbolt - 16551 (clock_t)tcp->tcp_xmit_head->b_prev; 16552 time_to_wait = tcp->tcp_rto - 16553 TICK_TO_MSEC(time_to_wait); 16554 /* 16555 * If the timer fires too early, 1 clock tick earlier, 16556 * restart the timer. 16557 */ 16558 if (time_to_wait > msec_per_tick) { 16559 TCP_STAT(tcp_timer_fire_early); 16560 TCP_TIMER_RESTART(tcp, time_to_wait); 16561 return; 16562 } 16563 /* 16564 * When we probe zero windows, we force the swnd open. 16565 * If our peer acks with a closed window swnd will be 16566 * set to zero by tcp_rput(). As long as we are 16567 * receiving acks tcp_rput will 16568 * reset 'tcp_ms_we_have_waited' so as not to trip the 16569 * first and second interval actions. NOTE: the timer 16570 * interval is allowed to continue its exponential 16571 * backoff. 16572 */ 16573 if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { 16574 if (tcp->tcp_debug) { 16575 (void) strlog(TCP_MOD_ID, 0, 1, 16576 SL_TRACE, "tcp_timer: zero win"); 16577 } 16578 } else { 16579 /* 16580 * After retransmission, we need to do 16581 * slow start. Set the ssthresh to one 16582 * half of current effective window and 16583 * cwnd to one MSS. Also reset 16584 * tcp_cwnd_cnt. 16585 * 16586 * Note that if tcp_ssthresh is reduced because 16587 * of ECN, do not reduce it again unless it is 16588 * already one window of data away (tcp_cwr 16589 * should then be cleared) or this is a 16590 * timeout for a retransmitted segment. 
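 *
 * Ignoring the repeated-backoff and ECN special cases, the reset that
 * follows amounts to the classic "ssthresh = max(flightsize / 2, 2 MSS),
 * cwnd = 1 MSS" rule. A minimal stand-alone rendition (hypothetical
 * helper, mss assumed non-zero):
 *
 *     static void
 *     rto_slow_start(uint32_t snxt, uint32_t suna, uint32_t mss,
 *         uint32_t *ssthresh, uint32_t *cwnd)
 *     {
 *             uint32_t npkt = ((snxt - suna) >> 1) / mss;
 *
 *             if (npkt < 2)
 *                     npkt = 2;
 *             *ssthresh = npkt * mss;
 *             *cwnd = mss;
 *     }
 *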
16591 */ 16592 uint32_t npkt; 16593 16594 if (!tcp->tcp_cwr || tcp->tcp_rexmit) { 16595 npkt = ((tcp->tcp_timer_backoff ? 16596 tcp->tcp_cwnd_ssthresh : 16597 tcp->tcp_snxt - 16598 tcp->tcp_suna) >> 1) / tcp->tcp_mss; 16599 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * 16600 tcp->tcp_mss; 16601 } 16602 tcp->tcp_cwnd = tcp->tcp_mss; 16603 tcp->tcp_cwnd_cnt = 0; 16604 if (tcp->tcp_ecn_ok) { 16605 tcp->tcp_cwr = B_TRUE; 16606 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 16607 tcp->tcp_ecn_cwr_sent = B_FALSE; 16608 } 16609 } 16610 break; 16611 } 16612 /* 16613 * We have something to send yet we cannot send. The 16614 * reason can be: 16615 * 16616 * 1. Zero send window: we need to do zero window probe. 16617 * 2. Zero cwnd: because of ECN, we need to "clock out 16618 * segments. 16619 * 3. SWS avoidance: receiver may have shrunk window, 16620 * reset our knowledge. 16621 * 16622 * Note that condition 2 can happen with either 1 or 16623 * 3. But 1 and 3 are exclusive. 16624 */ 16625 if (tcp->tcp_unsent != 0) { 16626 if (tcp->tcp_cwnd == 0) { 16627 /* 16628 * Set tcp_cwnd to 1 MSS so that a 16629 * new segment can be sent out. We 16630 * are "clocking out" new data when 16631 * the network is really congested. 16632 */ 16633 ASSERT(tcp->tcp_ecn_ok); 16634 tcp->tcp_cwnd = tcp->tcp_mss; 16635 } 16636 if (tcp->tcp_swnd == 0) { 16637 /* Extend window for zero window probe */ 16638 tcp->tcp_swnd++; 16639 tcp->tcp_zero_win_probe = B_TRUE; 16640 BUMP_MIB(&tcp_mib, tcpOutWinProbe); 16641 } else { 16642 /* 16643 * Handle timeout from sender SWS avoidance. 16644 * Reset our knowledge of the max send window 16645 * since the receiver might have reduced its 16646 * receive buffer. Avoid setting tcp_max_swnd 16647 * to one since that will essentially disable 16648 * the SWS checks. 16649 * 16650 * Note that since we don't have a SWS 16651 * state variable, if the timeout is set 16652 * for ECN but not for SWS, this 16653 * code will also be executed. This is 16654 * fine as tcp_max_swnd is updated 16655 * constantly and it will not affect 16656 * anything. 16657 */ 16658 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2); 16659 } 16660 tcp_wput_data(tcp, NULL, B_FALSE); 16661 return; 16662 } 16663 /* Is there a FIN that needs to be to re retransmitted? */ 16664 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 16665 !tcp->tcp_fin_acked) 16666 break; 16667 /* Nothing to do, return without restarting timer. */ 16668 TCP_STAT(tcp_timer_fire_miss); 16669 return; 16670 case TCPS_FIN_WAIT_2: 16671 /* 16672 * User closed the TCP endpoint and peer ACK'ed our FIN. 16673 * We waited some time for for peer's FIN, but it hasn't 16674 * arrived. We flush the connection now to avoid 16675 * case where the peer has rebooted. 16676 */ 16677 if (TCP_IS_DETACHED(tcp)) { 16678 (void) tcp_clean_death(tcp, 0, 23); 16679 } else { 16680 TCP_TIMER_RESTART(tcp, tcp_fin_wait_2_flush_interval); 16681 } 16682 return; 16683 case TCPS_TIME_WAIT: 16684 (void) tcp_clean_death(tcp, 0, 24); 16685 return; 16686 default: 16687 if (tcp->tcp_debug) { 16688 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 16689 "tcp_timer: strange state (%d) %s", 16690 tcp->tcp_state, tcp_display(tcp, NULL, 16691 DISP_PORT_ONLY)); 16692 } 16693 return; 16694 } 16695 if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { 16696 /* 16697 * For zero window probe, we need to send indefinitely, 16698 * unless we have not heard from the other side for some 16699 * time... 
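 *
 * That is, once the total wait has crossed the second threshold the
 * connection is aborted unless it is a zero window probe and the peer
 * has been heard from recently enough. Condensed (hypothetical helper,
 * msec units):
 *
 *     // 1 = give up and reset the connection, 0 = keep probing.
 *     static int
 *     rexmit_gave_up(int zero_win_probe, unsigned int msec_since_recv,
 *         unsigned int second_threshold)
 *     {
 *             if (!zero_win_probe)
 *                     return (1);
 *             return (msec_since_recv > second_threshold);
 *     }
 *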
16700 */ 16701 if ((tcp->tcp_zero_win_probe == 0) || 16702 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) > 16703 second_threshold)) { 16704 BUMP_MIB(&tcp_mib, tcpTimRetransDrop); 16705 /* 16706 * If TCP is in SYN_RCVD state, send back a 16707 * RST|ACK as BSD does. Note that tcp_zero_win_probe 16708 * should be zero in TCPS_SYN_RCVD state. 16709 */ 16710 if (tcp->tcp_state == TCPS_SYN_RCVD) { 16711 tcp_xmit_ctl("tcp_timer: RST sent on timeout " 16712 "in SYN_RCVD", 16713 tcp, tcp->tcp_snxt, 16714 tcp->tcp_rnxt, TH_RST | TH_ACK); 16715 } 16716 (void) tcp_clean_death(tcp, 16717 tcp->tcp_client_errno ? 16718 tcp->tcp_client_errno : ETIMEDOUT, 25); 16719 return; 16720 } else { 16721 /* 16722 * Set tcp_ms_we_have_waited to second_threshold 16723 * so that in next timeout, we will do the above 16724 * check (lbolt - tcp_last_recv_time). This is 16725 * also to avoid overflow. 16726 * 16727 * We don't need to decrement tcp_timer_backoff 16728 * to avoid overflow because it will be decremented 16729 * later if new timeout value is greater than 16730 * tcp_rexmit_interval_max. In the case when 16731 * tcp_rexmit_interval_max is greater than 16732 * second_threshold, it means that we will wait 16733 * longer than second_threshold to send the next 16734 * window probe. 16735 */ 16736 tcp->tcp_ms_we_have_waited = second_threshold; 16737 } 16738 } else if (ms > first_threshold) { 16739 if (tcp->tcp_snd_zcopy_aware && (!tcp->tcp_xmit_zc_clean) && 16740 tcp->tcp_xmit_head != NULL) { 16741 tcp->tcp_xmit_head = 16742 tcp_zcopy_backoff(tcp, tcp->tcp_xmit_head, 1); 16743 } 16744 /* 16745 * We have been retransmitting for too long... The RTT 16746 * we calculated is probably incorrect. Reinitialize it. 16747 * Need to compensate for 0 tcp_rtt_sa. Reset 16748 * tcp_rtt_update so that we won't accidentally cache a 16749 * bad value. But only do this if this is not a zero 16750 * window probe. 16751 */ 16752 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { 16753 tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + 16754 (tcp->tcp_rtt_sa >> 5); 16755 tcp->tcp_rtt_sa = 0; 16756 tcp_ip_notify(tcp); 16757 tcp->tcp_rtt_update = 0; 16758 } 16759 } 16760 tcp->tcp_timer_backoff++; 16761 if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 16762 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < 16763 tcp_rexmit_interval_min) { 16764 /* 16765 * This means the original RTO is tcp_rexmit_interval_min. 16766 * So we will use tcp_rexmit_interval_min as the RTO value 16767 * and do the backoff. 16768 */ 16769 ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff; 16770 } else { 16771 ms <<= tcp->tcp_timer_backoff; 16772 } 16773 if (ms > tcp_rexmit_interval_max) { 16774 ms = tcp_rexmit_interval_max; 16775 /* 16776 * ms is at max, decrement tcp_timer_backoff to avoid 16777 * overflow. 16778 */ 16779 tcp->tcp_timer_backoff--; 16780 } 16781 tcp->tcp_ms_we_have_waited += ms; 16782 if (tcp->tcp_zero_win_probe == 0) { 16783 tcp->tcp_rto = ms; 16784 } 16785 TCP_TIMER_RESTART(tcp, ms); 16786 /* 16787 * This is after a timeout and tcp_rto is backed off. Set 16788 * tcp_set_timer to 1 so that next time RTO is updated, we will 16789 * restart the timer with a correct value. 
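 *
 * The backed-off interval computed just above follows the usual pattern:
 * derive an RTO from the smoothed RTT and its deviation, double it once
 * per consecutive timeout, and clamp the result. A compact approximation
 * (hypothetical helper; the >>3 and >>5 shifts mirror the scaled
 * tcp_rtt_sa encoding used in this file):
 *
 *     static uint32_t
 *     backed_off_rto(uint32_t rtt_sa, uint32_t rtt_sd, uint32_t extra,
 *         uint32_t rto_min, uint32_t rto_max, uint32_t backoff)
 *     {
 *             uint32_t ms = (rtt_sa >> 3) + rtt_sd + extra + (rtt_sa >> 5);
 *
 *             if (ms < rto_min)
 *                     ms = rto_min;
 *             ms <<= backoff;
 *             if (ms > rto_max)
 *                     ms = rto_max;
 *             return (ms);
 *     }
 *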
16790 */ 16791 tcp->tcp_set_timer = 1; 16792 mss = tcp->tcp_snxt - tcp->tcp_suna; 16793 if (mss > tcp->tcp_mss) 16794 mss = tcp->tcp_mss; 16795 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) 16796 mss = tcp->tcp_swnd; 16797 16798 if ((mp = tcp->tcp_xmit_head) != NULL) 16799 mp->b_prev = (mblk_t *)lbolt; 16800 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, 16801 B_TRUE); 16802 16803 /* 16804 * When slow start after retransmission begins, start with 16805 * this seq no. tcp_rexmit_max marks the end of special slow 16806 * start phase. tcp_snd_burst controls how many segments 16807 * can be sent because of an ack. 16808 */ 16809 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 16810 tcp->tcp_snd_burst = TCP_CWND_SS; 16811 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 16812 (tcp->tcp_unsent == 0)) { 16813 tcp->tcp_rexmit_max = tcp->tcp_fss; 16814 } else { 16815 tcp->tcp_rexmit_max = tcp->tcp_snxt; 16816 } 16817 tcp->tcp_rexmit = B_TRUE; 16818 tcp->tcp_dupack_cnt = 0; 16819 16820 /* 16821 * Remove all rexmit SACK blk to start from fresh. 16822 */ 16823 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 16824 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 16825 tcp->tcp_num_notsack_blk = 0; 16826 tcp->tcp_cnt_notsack_list = 0; 16827 } 16828 if (mp == NULL) { 16829 return; 16830 } 16831 /* Attach credentials to retransmitted initial SYNs. */ 16832 if (tcp->tcp_state == TCPS_SYN_SENT) { 16833 mblk_setcred(mp, tcp->tcp_cred); 16834 DB_CPID(mp) = tcp->tcp_cpid; 16835 } 16836 16837 tcp->tcp_csuna = tcp->tcp_snxt; 16838 BUMP_MIB(&tcp_mib, tcpRetransSegs); 16839 UPDATE_MIB(&tcp_mib, tcpRetransBytes, mss); 16840 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 16841 tcp_send_data(tcp, tcp->tcp_wq, mp); 16842 16843 } 16844 16845 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ 16846 static void 16847 tcp_unbind(tcp_t *tcp, mblk_t *mp) 16848 { 16849 conn_t *connp; 16850 16851 switch (tcp->tcp_state) { 16852 case TCPS_BOUND: 16853 case TCPS_LISTEN: 16854 break; 16855 default: 16856 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 16857 return; 16858 } 16859 16860 /* 16861 * Need to clean up all the eagers since after the unbind, segments 16862 * will no longer be delivered to this listener stream. 16863 */ 16864 mutex_enter(&tcp->tcp_eager_lock); 16865 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 16866 tcp_eager_cleanup(tcp, 0); 16867 } 16868 mutex_exit(&tcp->tcp_eager_lock); 16869 16870 if (tcp->tcp_ipversion == IPV4_VERSION) { 16871 tcp->tcp_ipha->ipha_src = 0; 16872 } else { 16873 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); 16874 } 16875 V6_SET_ZERO(tcp->tcp_ip_src_v6); 16876 bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport)); 16877 tcp_bind_hash_remove(tcp); 16878 tcp->tcp_state = TCPS_IDLE; 16879 tcp->tcp_mdt = B_FALSE; 16880 /* Send M_FLUSH according to TPI */ 16881 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 16882 connp = tcp->tcp_connp; 16883 connp->conn_mdt_ok = B_FALSE; 16884 ipcl_hash_remove(connp); 16885 bzero(&connp->conn_ports, sizeof (connp->conn_ports)); 16886 mp = mi_tpi_ok_ack_alloc(mp); 16887 putnext(tcp->tcp_rq, mp); 16888 } 16889 16890 /* 16891 * Don't let port fall into the privileged range. 16892 * Since the extra privileged ports can be arbitrary we also 16893 * ensure that we exclude those from consideration. 16894 * tcp_g_epriv_ports is not sorted thus we loop over it until 16895 * there are no changes. 
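 *
 * Stripped of the TS/label and non-privileged-port details, the retry
 * loop below behaves like this stand-alone sketch (hypothetical helper;
 * 32-bit arithmetic so the increment cannot wrap):
 *
 *     static uint16_t
 *     next_anon_port(uint16_t hint, uint16_t anon_lo, uint16_t anon_hi,
 *         const uint16_t *excluded, int nexcluded)
 *     {
 *             uint32_t port = hint;
 *             int i, restarted = 0;
 *
 *     retry:
 *             if (port < anon_lo)
 *                     port = anon_lo;
 *             if (port > anon_hi) {
 *                     if (restarted)
 *                             return (0);
 *                     restarted = 1;
 *                     port = anon_lo;
 *             }
 *             for (i = 0; i < nexcluded; i++) {
 *                     if (port == excluded[i]) {
 *                             port++;
 *                             goto retry;
 *                     }
 *             }
 *             return ((uint16_t)port);
 *     }
 *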
16896 * 16897 * Note: No locks are held when inspecting tcp_g_*epriv_ports 16898 * but instead the code relies on: 16899 * - the fact that the address of the array and its size never changes 16900 * - the atomic assignment of the elements of the array 16901 * 16902 * Returns 0 if there are no more ports available. 16903 * 16904 * TS note: skip multilevel ports. 16905 */ 16906 static in_port_t 16907 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) 16908 { 16909 int i; 16910 boolean_t restart = B_FALSE; 16911 16912 if (random && tcp_random_anon_port != 0) { 16913 (void) random_get_pseudo_bytes((uint8_t *)&port, 16914 sizeof (in_port_t)); 16915 /* 16916 * Unless changed by a sys admin, the smallest anon port 16917 * is 32768 and the largest anon port is 65535. It is 16918 * very likely (50%) for the random port to be smaller 16919 * than the smallest anon port. When that happens, 16920 * add port % (anon port range) to the smallest anon 16921 * port to get the random port. It should fall into the 16922 * valid anon port range. 16923 */ 16924 if (port < tcp_smallest_anon_port) { 16925 port = tcp_smallest_anon_port + 16926 port % (tcp_largest_anon_port - 16927 tcp_smallest_anon_port); 16928 } 16929 } 16930 16931 retry: 16932 if (port < tcp_smallest_anon_port) 16933 port = (in_port_t)tcp_smallest_anon_port; 16934 16935 if (port > tcp_largest_anon_port) { 16936 if (restart) 16937 return (0); 16938 restart = B_TRUE; 16939 port = (in_port_t)tcp_smallest_anon_port; 16940 } 16941 16942 if (port < tcp_smallest_nonpriv_port) 16943 port = (in_port_t)tcp_smallest_nonpriv_port; 16944 16945 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 16946 if (port == tcp_g_epriv_ports[i]) { 16947 port++; 16948 /* 16949 * Make sure whether the port is in the 16950 * valid range. 16951 */ 16952 goto retry; 16953 } 16954 } 16955 if (is_system_labeled() && 16956 (i = tsol_next_port(crgetzone(tcp->tcp_cred), port, 16957 IPPROTO_TCP, B_TRUE)) != 0) { 16958 port = i; 16959 goto retry; 16960 } 16961 return (port); 16962 } 16963 16964 /* 16965 * Return the next anonymous port in the privileged port range for 16966 * bind checking. It starts at IPPORT_RESERVED - 1 and goes 16967 * downwards. This is the same behavior as documented in the userland 16968 * library call rresvport(3N). 16969 * 16970 * TS note: skip multilevel ports. 16971 */ 16972 static in_port_t 16973 tcp_get_next_priv_port(const tcp_t *tcp) 16974 { 16975 static in_port_t next_priv_port = IPPORT_RESERVED - 1; 16976 in_port_t nextport; 16977 boolean_t restart = B_FALSE; 16978 16979 retry: 16980 if (next_priv_port < tcp_min_anonpriv_port || 16981 next_priv_port >= IPPORT_RESERVED) { 16982 next_priv_port = IPPORT_RESERVED - 1; 16983 if (restart) 16984 return (0); 16985 restart = B_TRUE; 16986 } 16987 if (is_system_labeled() && 16988 (nextport = tsol_next_port(crgetzone(tcp->tcp_cred), 16989 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { 16990 next_priv_port = nextport; 16991 goto retry; 16992 } 16993 return (next_priv_port--); 16994 } 16995 16996 /* The write side r/w procedure. */ 16997 16998 #if CCS_STATS 16999 struct { 17000 struct { 17001 int64_t count, bytes; 17002 } tot, hit; 17003 } wrw_stats; 17004 #endif 17005 17006 /* 17007 * Call by tcp_wput() to handle all non data, except M_PROTO and M_PCPROTO, 17008 * messages. 
17009 */ 17010 /* ARGSUSED */ 17011 static void 17012 tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) 17013 { 17014 conn_t *connp = (conn_t *)arg; 17015 tcp_t *tcp = connp->conn_tcp; 17016 queue_t *q = tcp->tcp_wq; 17017 17018 ASSERT(DB_TYPE(mp) != M_IOCTL); 17019 /* 17020 * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close. 17021 * Once the close starts, streamhead and sockfs will not let any data 17022 * packets come down (close ensures that there are no threads using the 17023 * queue and no new threads will come down) but since qprocsoff() 17024 * hasn't happened yet, a M_FLUSH or some non data message might 17025 * get reflected back (in response to our own FLUSHRW) and get 17026 * processed after tcp_close() is done. The conn would still be valid 17027 * because a ref would have added but we need to check the state 17028 * before actually processing the packet. 17029 */ 17030 if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) { 17031 freemsg(mp); 17032 return; 17033 } 17034 17035 switch (DB_TYPE(mp)) { 17036 case M_IOCDATA: 17037 tcp_wput_iocdata(tcp, mp); 17038 break; 17039 case M_FLUSH: 17040 tcp_wput_flush(tcp, mp); 17041 break; 17042 default: 17043 CALL_IP_WPUT(connp, q, mp); 17044 break; 17045 } 17046 } 17047 17048 /* 17049 * The TCP fast path write put procedure. 17050 * NOTE: the logic of the fast path is duplicated from tcp_wput_data() 17051 */ 17052 /* ARGSUSED */ 17053 void 17054 tcp_output(void *arg, mblk_t *mp, void *arg2) 17055 { 17056 int len; 17057 int hdrlen; 17058 int plen; 17059 mblk_t *mp1; 17060 uchar_t *rptr; 17061 uint32_t snxt; 17062 tcph_t *tcph; 17063 struct datab *db; 17064 uint32_t suna; 17065 uint32_t mss; 17066 ipaddr_t *dst; 17067 ipaddr_t *src; 17068 uint32_t sum; 17069 int usable; 17070 conn_t *connp = (conn_t *)arg; 17071 tcp_t *tcp = connp->conn_tcp; 17072 uint32_t msize; 17073 17074 /* 17075 * Try and ASSERT the minimum possible references on the 17076 * conn early enough. Since we are executing on write side, 17077 * the connection is obviously not detached and that means 17078 * there is a ref each for TCP and IP. Since we are behind 17079 * the squeue, the minimum references needed are 3. If the 17080 * conn is in classifier hash list, there should be an 17081 * extra ref for that (we check both the possibilities). 17082 */ 17083 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 17084 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 17085 17086 ASSERT(DB_TYPE(mp) == M_DATA); 17087 msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 17088 17089 mutex_enter(&connp->conn_lock); 17090 tcp->tcp_squeue_bytes -= msize; 17091 mutex_exit(&connp->conn_lock); 17092 17093 /* Bypass tcp protocol for fused tcp loopback */ 17094 if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 17095 return; 17096 17097 mss = tcp->tcp_mss; 17098 if (tcp->tcp_xmit_zc_clean) 17099 mp = tcp_zcopy_backoff(tcp, mp, 0); 17100 17101 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 17102 len = (int)(mp->b_wptr - mp->b_rptr); 17103 17104 /* 17105 * Criteria for fast path: 17106 * 17107 * 1. no unsent data 17108 * 2. single mblk in request 17109 * 3. connection established 17110 * 4. data in mblk 17111 * 5. len <= mss 17112 * 6. 
no tcp_valid bits 17113 */ 17114 if ((tcp->tcp_unsent != 0) || 17115 (tcp->tcp_cork) || 17116 (mp->b_cont != NULL) || 17117 (tcp->tcp_state != TCPS_ESTABLISHED) || 17118 (len == 0) || 17119 (len > mss) || 17120 (tcp->tcp_valid_bits != 0)) { 17121 tcp_wput_data(tcp, mp, B_FALSE); 17122 return; 17123 } 17124 17125 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 17126 ASSERT(tcp->tcp_fin_sent == 0); 17127 17128 /* queue new packet onto retransmission queue */ 17129 if (tcp->tcp_xmit_head == NULL) { 17130 tcp->tcp_xmit_head = mp; 17131 } else { 17132 tcp->tcp_xmit_last->b_cont = mp; 17133 } 17134 tcp->tcp_xmit_last = mp; 17135 tcp->tcp_xmit_tail = mp; 17136 17137 /* find out how much we can send */ 17138 /* BEGIN CSTYLED */ 17139 /* 17140 * un-acked usable 17141 * |--------------|-----------------| 17142 * tcp_suna tcp_snxt tcp_suna+tcp_swnd 17143 */ 17144 /* END CSTYLED */ 17145 17146 /* start sending from tcp_snxt */ 17147 snxt = tcp->tcp_snxt; 17148 17149 /* 17150 * Check to see if this connection has been idled for some 17151 * time and no ACK is expected. If it is, we need to slow 17152 * start again to get back the connection's "self-clock" as 17153 * described in VJ's paper. 17154 * 17155 * Refer to the comment in tcp_mss_set() for the calculation 17156 * of tcp_cwnd after idle. 17157 */ 17158 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 17159 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 17160 SET_TCP_INIT_CWND(tcp, mss, tcp_slow_start_after_idle); 17161 } 17162 17163 usable = tcp->tcp_swnd; /* tcp window size */ 17164 if (usable > tcp->tcp_cwnd) 17165 usable = tcp->tcp_cwnd; /* congestion window smaller */ 17166 usable -= snxt; /* subtract stuff already sent */ 17167 suna = tcp->tcp_suna; 17168 usable += suna; 17169 /* usable can be < 0 if the congestion window is smaller */ 17170 if (len > usable) { 17171 /* Can't send complete M_DATA in one shot */ 17172 goto slow; 17173 } 17174 17175 if (tcp->tcp_flow_stopped && 17176 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 17177 tcp_clrqfull(tcp); 17178 } 17179 17180 /* 17181 * determine if anything to send (Nagle). 17182 * 17183 * 1. len < tcp_mss (i.e. small) 17184 * 2. unacknowledged data present 17185 * 3. len < nagle limit 17186 * 4. last packet sent < nagle limit (previous packet sent) 17187 */ 17188 if ((len < mss) && (snxt != suna) && 17189 (len < (int)tcp->tcp_naglim) && 17190 (tcp->tcp_last_sent_len < tcp->tcp_naglim)) { 17191 /* 17192 * This was the first unsent packet and normally 17193 * mss < xmit_hiwater so there is no need to worry 17194 * about flow control. The next packet will go 17195 * through the flow control check in tcp_wput_data(). 
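 *
 * The Nagle test applied above, condensed: a small segment is held back
 * only when data is already in flight and both it and the previously
 * sent segment are below the Nagle limit. A stand-alone sketch
 * (hypothetical helper):
 *
 *     // 1 = defer this small segment (Nagle), 0 = send it now.
 *     static int
 *     nagle_defer(int len, int mss, uint32_t snxt, uint32_t suna,
 *         int naglim, int last_sent_len)
 *     {
 *             return (len < mss && snxt != suna &&
 *                 len < naglim && last_sent_len < naglim);
 *     }
 *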
17196 */ 17197 /* leftover work from above */ 17198 tcp->tcp_unsent = len; 17199 tcp->tcp_xmit_tail_unsent = len; 17200 17201 return; 17202 } 17203 17204 /* len <= tcp->tcp_mss && len == unsent so no silly window */ 17205 17206 if (snxt == suna) { 17207 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 17208 } 17209 17210 /* we have always sent something */ 17211 tcp->tcp_rack_cnt = 0; 17212 17213 tcp->tcp_snxt = snxt + len; 17214 tcp->tcp_rack = tcp->tcp_rnxt; 17215 17216 if ((mp1 = dupb(mp)) == 0) 17217 goto no_memory; 17218 mp->b_prev = (mblk_t *)(uintptr_t)lbolt; 17219 mp->b_next = (mblk_t *)(uintptr_t)snxt; 17220 17221 /* adjust tcp header information */ 17222 tcph = tcp->tcp_tcph; 17223 tcph->th_flags[0] = (TH_ACK|TH_PUSH); 17224 17225 sum = len + tcp->tcp_tcp_hdr_len + tcp->tcp_sum; 17226 sum = (sum >> 16) + (sum & 0xFFFF); 17227 U16_TO_ABE16(sum, tcph->th_sum); 17228 17229 U32_TO_ABE32(snxt, tcph->th_seq); 17230 17231 BUMP_MIB(&tcp_mib, tcpOutDataSegs); 17232 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); 17233 BUMP_LOCAL(tcp->tcp_obsegs); 17234 17235 /* Update the latest receive window size in TCP header. */ 17236 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 17237 tcph->th_win); 17238 17239 tcp->tcp_last_sent_len = (ushort_t)len; 17240 17241 plen = len + tcp->tcp_hdr_len; 17242 17243 if (tcp->tcp_ipversion == IPV4_VERSION) { 17244 tcp->tcp_ipha->ipha_length = htons(plen); 17245 } else { 17246 tcp->tcp_ip6h->ip6_plen = htons(plen - 17247 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 17248 } 17249 17250 /* see if we need to allocate a mblk for the headers */ 17251 hdrlen = tcp->tcp_hdr_len; 17252 rptr = mp1->b_rptr - hdrlen; 17253 db = mp1->b_datap; 17254 if ((db->db_ref != 2) || rptr < db->db_base || 17255 (!OK_32PTR(rptr))) { 17256 /* NOTE: we assume allocb returns an OK_32PTR */ 17257 mp = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 17258 tcp_wroff_xtra, BPRI_MED); 17259 if (!mp) { 17260 freemsg(mp1); 17261 goto no_memory; 17262 } 17263 mp->b_cont = mp1; 17264 mp1 = mp; 17265 /* Leave room for Link Level header */ 17266 /* hdrlen = tcp->tcp_hdr_len; */ 17267 rptr = &mp1->b_rptr[tcp_wroff_xtra]; 17268 mp1->b_wptr = &rptr[hdrlen]; 17269 } 17270 mp1->b_rptr = rptr; 17271 17272 /* Fill in the timestamp option. */ 17273 if (tcp->tcp_snd_ts_ok) { 17274 U32_TO_BE32((uint32_t)lbolt, 17275 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 17276 U32_TO_BE32(tcp->tcp_ts_recent, 17277 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 17278 } else { 17279 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 17280 } 17281 17282 /* copy header into outgoing packet */ 17283 dst = (ipaddr_t *)rptr; 17284 src = (ipaddr_t *)tcp->tcp_iphc; 17285 dst[0] = src[0]; 17286 dst[1] = src[1]; 17287 dst[2] = src[2]; 17288 dst[3] = src[3]; 17289 dst[4] = src[4]; 17290 dst[5] = src[5]; 17291 dst[6] = src[6]; 17292 dst[7] = src[7]; 17293 dst[8] = src[8]; 17294 dst[9] = src[9]; 17295 if (hdrlen -= 40) { 17296 hdrlen >>= 2; 17297 dst += 10; 17298 src += 10; 17299 do { 17300 *dst++ = *src++; 17301 } while (--hdrlen); 17302 } 17303 17304 /* 17305 * Set the ECN info in the TCP header. Note that this 17306 * is not the template header. 
17307 */ 17308 if (tcp->tcp_ecn_ok) { 17309 SET_ECT(tcp, rptr); 17310 17311 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 17312 if (tcp->tcp_ecn_echo_on) 17313 tcph->th_flags[0] |= TH_ECE; 17314 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 17315 tcph->th_flags[0] |= TH_CWR; 17316 tcp->tcp_ecn_cwr_sent = B_TRUE; 17317 } 17318 } 17319 17320 if (tcp->tcp_ip_forward_progress) { 17321 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 17322 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 17323 tcp->tcp_ip_forward_progress = B_FALSE; 17324 } 17325 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 17326 tcp_send_data(tcp, tcp->tcp_wq, mp1); 17327 return; 17328 17329 /* 17330 * If we ran out of memory, we pretend to have sent the packet 17331 * and that it was lost on the wire. 17332 */ 17333 no_memory: 17334 return; 17335 17336 slow: 17337 /* leftover work from above */ 17338 tcp->tcp_unsent = len; 17339 tcp->tcp_xmit_tail_unsent = len; 17340 tcp_wput_data(tcp, NULL, B_FALSE); 17341 } 17342 17343 /* 17344 * The function called through squeue to get behind eager's perimeter to 17345 * finish the accept processing. 17346 */ 17347 /* ARGSUSED */ 17348 void 17349 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) 17350 { 17351 conn_t *connp = (conn_t *)arg; 17352 tcp_t *tcp = connp->conn_tcp; 17353 queue_t *q = tcp->tcp_rq; 17354 mblk_t *mp1; 17355 mblk_t *stropt_mp = mp; 17356 struct stroptions *stropt; 17357 uint_t thwin; 17358 17359 /* 17360 * Drop the eager's ref on the listener, that was placed when 17361 * this eager began life in tcp_conn_request. 17362 */ 17363 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 17364 17365 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { 17366 /* 17367 * Someone blew off the eager before we could finish 17368 * the accept. 17369 * 17370 * The only reason the eager exists is because we put in 17371 * a ref on it when conn ind went up. We need to send 17372 * a disconnect indication up while the last reference 17373 * on the eager will be dropped by the squeue when we 17374 * return. 17375 */ 17376 ASSERT(tcp->tcp_listener == NULL); 17377 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { 17378 struct T_discon_ind *tdi; 17379 17380 (void) putnextctl1(q, M_FLUSH, FLUSHRW); 17381 /* 17382 * Let us reuse the incoming mblk to avoid memory 17383 * allocation failure problems. We know that the 17384 * size of the incoming mblk i.e. stroptions is greater 17385 * than sizeof T_discon_ind. So the reallocb below 17386 * can't fail.
17387 */ 17388 freemsg(mp->b_cont); 17389 mp->b_cont = NULL; 17390 ASSERT(DB_REF(mp) == 1); 17391 mp = reallocb(mp, sizeof (struct T_discon_ind), 17392 B_FALSE); 17393 ASSERT(mp != NULL); 17394 DB_TYPE(mp) = M_PROTO; 17395 ((union T_primitives *)mp->b_rptr)->type = T_DISCON_IND; 17396 tdi = (struct T_discon_ind *)mp->b_rptr; 17397 if (tcp->tcp_issocket) { 17398 tdi->DISCON_reason = ECONNREFUSED; 17399 tdi->SEQ_number = 0; 17400 } else { 17401 tdi->DISCON_reason = ENOPROTOOPT; 17402 tdi->SEQ_number = 17403 tcp->tcp_conn_req_seqnum; 17404 } 17405 mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind); 17406 putnext(q, mp); 17407 } else { 17408 freemsg(mp); 17409 } 17410 if (tcp->tcp_hard_binding) { 17411 tcp->tcp_hard_binding = B_FALSE; 17412 tcp->tcp_hard_bound = B_TRUE; 17413 } 17414 tcp->tcp_detached = B_FALSE; 17415 return; 17416 } 17417 17418 mp1 = stropt_mp->b_cont; 17419 stropt_mp->b_cont = NULL; 17420 ASSERT(DB_TYPE(stropt_mp) == M_SETOPTS); 17421 stropt = (struct stroptions *)stropt_mp->b_rptr; 17422 17423 while (mp1 != NULL) { 17424 mp = mp1; 17425 mp1 = mp1->b_cont; 17426 mp->b_cont = NULL; 17427 tcp->tcp_drop_opt_ack_cnt++; 17428 CALL_IP_WPUT(connp, tcp->tcp_wq, mp); 17429 } 17430 mp = NULL; 17431 17432 /* 17433 * For a loopback connection with tcp_direct_sockfs on, note that 17434 * we don't have to protect tcp_rcv_list yet because synchronous 17435 * streams has not yet been enabled and tcp_fuse_rrw() cannot 17436 * possibly race with us. 17437 */ 17438 17439 /* 17440 * Set the max window size (tcp_rq->q_hiwat) of the acceptor 17441 * properly. This is the first time we know of the acceptor' 17442 * queue. So we do it here. 17443 */ 17444 if (tcp->tcp_rcv_list == NULL) { 17445 /* 17446 * Recv queue is empty, tcp_rwnd should not have changed. 17447 * That means it should be equal to the listener's tcp_rwnd. 17448 */ 17449 tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd; 17450 } else { 17451 #ifdef DEBUG 17452 uint_t cnt = 0; 17453 17454 mp1 = tcp->tcp_rcv_list; 17455 while ((mp = mp1) != NULL) { 17456 mp1 = mp->b_next; 17457 cnt += msgdsize(mp); 17458 } 17459 ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt); 17460 #endif 17461 /* There is some data, add them back to get the max. */ 17462 tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; 17463 } 17464 17465 stropt->so_flags = SO_HIWAT; 17466 stropt->so_hiwat = MAX(q->q_hiwat, tcp_sth_rcv_hiwat); 17467 17468 stropt->so_flags |= SO_MAXBLK; 17469 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 17470 17471 /* 17472 * This is the first time we run on the correct 17473 * queue after tcp_accept. So fix all the q parameters 17474 * here. 17475 */ 17476 /* Allocate room for SACK options if needed. */ 17477 stropt->so_flags |= SO_WROFF; 17478 if (tcp->tcp_fused) { 17479 ASSERT(tcp->tcp_loopback); 17480 ASSERT(tcp->tcp_loopback_peer != NULL); 17481 /* 17482 * For fused tcp loopback, set the stream head's write 17483 * offset value to zero since we won't be needing any room 17484 * for TCP/IP headers. This would also improve performance 17485 * since it would reduce the amount of work done by kmem. 17486 * Non-fused tcp loopback case is handled separately below. 17487 */ 17488 stropt->so_wroff = 0; 17489 /* 17490 * Record the stream head's high water mark for this endpoint; 17491 * this is used for flow-control purposes in tcp_fuse_output(). 17492 */ 17493 stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat); 17494 /* 17495 * Update the peer's transmit parameters according to 17496 * our recently calculated high water mark value. 
17497 */ 17498 (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); 17499 } else if (tcp->tcp_snd_sack_ok) { 17500 stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + 17501 (tcp->tcp_loopback ? 0 : tcp_wroff_xtra); 17502 } else { 17503 stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : 17504 tcp_wroff_xtra); 17505 } 17506 17507 /* 17508 * If this is endpoint is handling SSL, then reserve extra 17509 * offset and space at the end. 17510 * Also have the stream head allocate SSL3_MAX_RECORD_LEN packets, 17511 * overriding the previous setting. The extra cost of signing and 17512 * encrypting multiple MSS-size records (12 of them with Ethernet), 17513 * instead of a single contiguous one by the stream head 17514 * largely outweighs the statistical reduction of ACKs, when 17515 * applicable. The peer will also save on decyption and verification 17516 * costs. 17517 */ 17518 if (tcp->tcp_kssl_ctx != NULL) { 17519 stropt->so_wroff += SSL3_WROFFSET; 17520 17521 stropt->so_flags |= SO_TAIL; 17522 stropt->so_tail = SSL3_MAX_TAIL_LEN; 17523 17524 stropt->so_maxblk = SSL3_MAX_RECORD_LEN; 17525 } 17526 17527 /* Send the options up */ 17528 putnext(q, stropt_mp); 17529 17530 /* 17531 * Pass up any data and/or a fin that has been received. 17532 * 17533 * Adjust receive window in case it had decreased 17534 * (because there is data <=> tcp_rcv_list != NULL) 17535 * while the connection was detached. Note that 17536 * in case the eager was flow-controlled, w/o this 17537 * code, the rwnd may never open up again! 17538 */ 17539 if (tcp->tcp_rcv_list != NULL) { 17540 /* We drain directly in case of fused tcp loopback */ 17541 if (!tcp->tcp_fused && canputnext(q)) { 17542 tcp->tcp_rwnd = q->q_hiwat; 17543 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 17544 << tcp->tcp_rcv_ws; 17545 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 17546 if (tcp->tcp_state >= TCPS_ESTABLISHED && 17547 (q->q_hiwat - thwin >= tcp->tcp_mss)) { 17548 tcp_xmit_ctl(NULL, 17549 tcp, (tcp->tcp_swnd == 0) ? 17550 tcp->tcp_suna : tcp->tcp_snxt, 17551 tcp->tcp_rnxt, TH_ACK); 17552 BUMP_MIB(&tcp_mib, tcpOutWinUpdate); 17553 } 17554 17555 } 17556 (void) tcp_rcv_drain(q, tcp); 17557 17558 /* 17559 * For fused tcp loopback, back-enable peer endpoint 17560 * if it's currently flow-controlled. 17561 */ 17562 if (tcp->tcp_fused && 17563 tcp->tcp_loopback_peer->tcp_flow_stopped) { 17564 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 17565 17566 ASSERT(peer_tcp != NULL); 17567 ASSERT(peer_tcp->tcp_fused); 17568 17569 tcp_clrqfull(peer_tcp); 17570 TCP_STAT(tcp_fusion_backenabled); 17571 } 17572 } 17573 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 17574 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 17575 mp = mi_tpi_ordrel_ind(); 17576 if (mp) { 17577 tcp->tcp_ordrel_done = B_TRUE; 17578 putnext(q, mp); 17579 if (tcp->tcp_deferred_clean_death) { 17580 /* 17581 * tcp_clean_death was deferred 17582 * for T_ORDREL_IND - do it now 17583 */ 17584 (void) tcp_clean_death(tcp, 17585 tcp->tcp_client_errno, 21); 17586 tcp->tcp_deferred_clean_death = B_FALSE; 17587 } 17588 } else { 17589 /* 17590 * Run the orderly release in the 17591 * service routine. 
17592 */ 17593 qenable(q); 17594 } 17595 } 17596 if (tcp->tcp_hard_binding) { 17597 tcp->tcp_hard_binding = B_FALSE; 17598 tcp->tcp_hard_bound = B_TRUE; 17599 } 17600 17601 tcp->tcp_detached = B_FALSE; 17602 17603 /* We can enable synchronous streams now */ 17604 if (tcp->tcp_fused) { 17605 tcp_fuse_syncstr_enable_pair(tcp); 17606 } 17607 17608 if (tcp->tcp_ka_enabled) { 17609 tcp->tcp_ka_last_intrvl = 0; 17610 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 17611 MSEC_TO_TICK(tcp->tcp_ka_interval)); 17612 } 17613 17614 /* 17615 * At this point, eager is fully established and will 17616 * have the following references - 17617 * 17618 * 2 references for connection to exist (1 for TCP and 1 for IP). 17619 * 1 reference for the squeue which will be dropped by the squeue as 17620 * soon as this function returns. 17621 * There will be 1 additional reference for being in classifier 17622 * hash list provided something bad hasn't happened. 17623 */ 17624 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 17625 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 17626 } 17627 17628 /* 17629 * The function called through squeue to get behind listener's perimeter to 17630 * send a deferred conn_ind. 17631 */ 17632 /* ARGSUSED */ 17633 void 17634 tcp_send_pending(void *arg, mblk_t *mp, void *arg2) 17635 { 17636 conn_t *connp = (conn_t *)arg; 17637 tcp_t *listener = connp->conn_tcp; 17638 17639 if (listener->tcp_state == TCPS_CLOSED || 17640 TCP_IS_DETACHED(listener)) { 17641 /* 17642 * If listener has closed, it would have caused a 17643 * cleanup/blowoff to happen for the eager. 17644 */ 17645 tcp_t *tcp; 17646 struct T_conn_ind *conn_ind; 17647 17648 conn_ind = (struct T_conn_ind *)mp->b_rptr; 17649 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 17650 conn_ind->OPT_length); 17651 /* 17652 * We need to drop the ref on the eager that was put in 17653 * tcp_rput_data() before trying to send the conn_ind 17654 * to listener. The conn_ind was deferred in tcp_send_conn_ind 17655 * and tcp_wput_accept() is sending this deferred conn_ind but 17656 * listener is closed so we drop the ref. 17657 */ 17658 CONN_DEC_REF(tcp->tcp_connp); 17659 freemsg(mp); 17660 return; 17661 } 17662 putnext(listener->tcp_rq, mp); 17663 } 17664 17665 17666 /* 17667 * This is the STREAMS entry point for T_CONN_RES coming down on 17668 * Acceptor STREAM when sockfs listener does accept processing. 17669 * Read the block comment on top of tcp_conn_request(). 17670 */ 17671 void 17672 tcp_wput_accept(queue_t *q, mblk_t *mp) 17673 { 17674 queue_t *rq = RD(q); 17675 struct T_conn_res *conn_res; 17676 tcp_t *eager; 17677 tcp_t *listener; 17678 struct T_ok_ack *ok; 17679 t_scalar_t PRIM_type; 17680 mblk_t *opt_mp; 17681 conn_t *econnp; 17682 17683 ASSERT(DB_TYPE(mp) == M_PROTO); 17684 17685 conn_res = (struct T_conn_res *)mp->b_rptr; 17686 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 17687 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { 17688 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 17689 if (mp != NULL) 17690 putnext(rq, mp); 17691 return; 17692 } 17693 switch (conn_res->PRIM_type) { 17694 case O_T_CONN_RES: 17695 case T_CONN_RES: 17696 /* 17697 * We pass up an err ack if allocb fails. This will 17698 * cause sockfs to issue a T_DISCON_REQ which will cause 17699 * tcp_eager_blowoff to be called. sockfs will then call 17700 * rq->q_qinfo->qi_qclose to clean up the acceptor stream.
17701 * we need to do the allocb up here because we have to 17702 * make sure rq->q_qinfo->qi_qclose still points to the 17703 * correct function (tcpclose_accept) in case allocb 17704 * fails. 17705 */ 17706 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI); 17707 if (opt_mp == NULL) { 17708 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 17709 if (mp != NULL) 17710 putnext(rq, mp); 17711 return; 17712 } 17713 17714 bcopy(mp->b_rptr + conn_res->OPT_offset, 17715 &eager, conn_res->OPT_length); 17716 PRIM_type = conn_res->PRIM_type; 17717 mp->b_datap->db_type = M_PCPROTO; 17718 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); 17719 ok = (struct T_ok_ack *)mp->b_rptr; 17720 ok->PRIM_type = T_OK_ACK; 17721 ok->CORRECT_prim = PRIM_type; 17722 econnp = eager->tcp_connp; 17723 econnp->conn_dev = (dev_t)q->q_ptr; 17724 eager->tcp_rq = rq; 17725 eager->tcp_wq = q; 17726 rq->q_ptr = econnp; 17727 rq->q_qinfo = &tcp_rinit; 17728 q->q_ptr = econnp; 17729 q->q_qinfo = &tcp_winit; 17730 listener = eager->tcp_listener; 17731 eager->tcp_issocket = B_TRUE; 17732 17733 econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; 17734 econnp->conn_allzones = listener->tcp_connp->conn_allzones; 17735 17736 /* Put the ref for IP */ 17737 CONN_INC_REF(econnp); 17738 17739 /* 17740 * We should have minimum of 3 references on the conn 17741 * at this point. One each for TCP and IP and one for 17742 * the T_conn_ind that was sent up when the 3-way handshake 17743 * completed. In the normal case we would also have another 17744 * reference (making a total of 4) for the conn being in the 17745 * classifier hash list. However the eager could have received 17746 * an RST subsequently and tcp_closei_local could have removed 17747 * the eager from the classifier hash list, hence we can't 17748 * assert that reference. 17749 */ 17750 ASSERT(econnp->conn_ref >= 3); 17751 17752 /* 17753 * Send the new local address also up to sockfs. There 17754 * should already be enough space in the mp that came 17755 * down from soaccept(). 17756 */ 17757 if (eager->tcp_family == AF_INET) { 17758 sin_t *sin; 17759 17760 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 17761 (sizeof (struct T_ok_ack) + sizeof (sin_t))); 17762 sin = (sin_t *)mp->b_wptr; 17763 mp->b_wptr += sizeof (sin_t); 17764 sin->sin_family = AF_INET; 17765 sin->sin_port = eager->tcp_lport; 17766 sin->sin_addr.s_addr = eager->tcp_ipha->ipha_src; 17767 } else { 17768 sin6_t *sin6; 17769 17770 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 17771 sizeof (struct T_ok_ack) + sizeof (sin6_t)); 17772 sin6 = (sin6_t *)mp->b_wptr; 17773 mp->b_wptr += sizeof (sin6_t); 17774 sin6->sin6_family = AF_INET6; 17775 sin6->sin6_port = eager->tcp_lport; 17776 if (eager->tcp_ipversion == IPV4_VERSION) { 17777 sin6->sin6_flowinfo = 0; 17778 IN6_IPADDR_TO_V4MAPPED( 17779 eager->tcp_ipha->ipha_src, 17780 &sin6->sin6_addr); 17781 } else { 17782 ASSERT(eager->tcp_ip6h != NULL); 17783 sin6->sin6_flowinfo = 17784 eager->tcp_ip6h->ip6_vcf & 17785 ~IPV6_VERS_AND_FLOW_MASK; 17786 sin6->sin6_addr = eager->tcp_ip6h->ip6_src; 17787 } 17788 sin6->sin6_scope_id = 0; 17789 sin6->__sin6_src_id = 0; 17790 } 17791 17792 putnext(rq, mp); 17793 17794 opt_mp->b_datap->db_type = M_SETOPTS; 17795 opt_mp->b_wptr += sizeof (struct stroptions); 17796 17797 /* 17798 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO 17799 * from listener to acceptor. The message is chained on the 17800 * bind_mp which tcp_rput_other will send down to IP. 
17801 */ 17802 if (listener->tcp_bound_if != 0) { 17803 /* allocate optmgmt req */ 17804 mp = tcp_setsockopt_mp(IPPROTO_IPV6, 17805 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if, 17806 sizeof (int)); 17807 if (mp != NULL) 17808 linkb(opt_mp, mp); 17809 } 17810 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { 17811 uint_t on = 1; 17812 17813 /* allocate optmgmt req */ 17814 mp = tcp_setsockopt_mp(IPPROTO_IPV6, 17815 IPV6_RECVPKTINFO, (char *)&on, sizeof (on)); 17816 if (mp != NULL) 17817 linkb(opt_mp, mp); 17818 } 17819 17820 17821 mutex_enter(&listener->tcp_eager_lock); 17822 17823 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 17824 17825 tcp_t *tail; 17826 tcp_t *tcp; 17827 mblk_t *mp1; 17828 17829 tcp = listener->tcp_eager_prev_q0; 17830 /* 17831 * listener->tcp_eager_prev_q0 points to the TAIL of the 17832 * deferred T_conn_ind queue. We need to get to the head 17833 * of the queue in order to send up T_conn_ind the same 17834 * order as how the 3WHS is completed. 17835 */ 17836 while (tcp != listener) { 17837 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 && 17838 !tcp->tcp_kssl_pending) 17839 break; 17840 else 17841 tcp = tcp->tcp_eager_prev_q0; 17842 } 17843 /* None of the pending eagers can be sent up now */ 17844 if (tcp == listener) 17845 goto no_more_eagers; 17846 17847 mp1 = tcp->tcp_conn.tcp_eager_conn_ind; 17848 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 17849 /* Move from q0 to q */ 17850 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 17851 listener->tcp_conn_req_cnt_q0--; 17852 listener->tcp_conn_req_cnt_q++; 17853 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 17854 tcp->tcp_eager_prev_q0; 17855 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 17856 tcp->tcp_eager_next_q0; 17857 tcp->tcp_eager_prev_q0 = NULL; 17858 tcp->tcp_eager_next_q0 = NULL; 17859 tcp->tcp_conn_def_q0 = B_FALSE; 17860 17861 /* 17862 * Insert at end of the queue because sockfs sends 17863 * down T_CONN_RES in chronological order. Leaving 17864 * the older conn indications at front of the queue 17865 * helps reducing search time. 17866 */ 17867 tail = listener->tcp_eager_last_q; 17868 if (tail != NULL) { 17869 tail->tcp_eager_next_q = tcp; 17870 } else { 17871 listener->tcp_eager_next_q = tcp; 17872 } 17873 listener->tcp_eager_last_q = tcp; 17874 tcp->tcp_eager_next_q = NULL; 17875 17876 /* Need to get inside the listener perimeter */ 17877 CONN_INC_REF(listener->tcp_connp); 17878 squeue_fill(listener->tcp_connp->conn_sqp, mp1, 17879 tcp_send_pending, listener->tcp_connp, 17880 SQTAG_TCP_SEND_PENDING); 17881 } 17882 no_more_eagers: 17883 tcp_eager_unlink(eager); 17884 mutex_exit(&listener->tcp_eager_lock); 17885 17886 /* 17887 * At this point, the eager is detached from the listener 17888 * but we still have an extra refs on eager (apart from the 17889 * usual tcp references). The ref was placed in tcp_rput_data 17890 * before sending the conn_ind in tcp_send_conn_ind. 17891 * The ref will be dropped in tcp_accept_finish(). 
17892 */ 17893 squeue_enter_nodrain(econnp->conn_sqp, opt_mp, 17894 tcp_accept_finish, econnp, SQTAG_TCP_ACCEPT_FINISH_Q0); 17895 return; 17896 default: 17897 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); 17898 if (mp != NULL) 17899 putnext(rq, mp); 17900 return; 17901 } 17902 } 17903 17904 void 17905 tcp_wput(queue_t *q, mblk_t *mp) 17906 { 17907 conn_t *connp = Q_TO_CONN(q); 17908 tcp_t *tcp; 17909 void (*output_proc)(); 17910 t_scalar_t type; 17911 uchar_t *rptr; 17912 struct iocblk *iocp; 17913 uint32_t msize; 17914 17915 ASSERT(connp->conn_ref >= 2); 17916 17917 switch (DB_TYPE(mp)) { 17918 case M_DATA: 17919 tcp = connp->conn_tcp; 17920 ASSERT(tcp != NULL); 17921 17922 msize = msgdsize(mp); 17923 17924 mutex_enter(&connp->conn_lock); 17925 CONN_INC_REF_LOCKED(connp); 17926 17927 tcp->tcp_squeue_bytes += msize; 17928 if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { 17929 mutex_exit(&connp->conn_lock); 17930 tcp_setqfull(tcp); 17931 } else 17932 mutex_exit(&connp->conn_lock); 17933 17934 (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, 17935 tcp_output, connp, SQTAG_TCP_OUTPUT); 17936 return; 17937 case M_PROTO: 17938 case M_PCPROTO: 17939 /* 17940 * if it is a snmp message, don't get behind the squeue 17941 */ 17942 tcp = connp->conn_tcp; 17943 rptr = mp->b_rptr; 17944 if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 17945 type = ((union T_primitives *)rptr)->type; 17946 } else { 17947 if (tcp->tcp_debug) { 17948 (void) strlog(TCP_MOD_ID, 0, 1, 17949 SL_ERROR|SL_TRACE, 17950 "tcp_wput_proto, dropping one..."); 17951 } 17952 freemsg(mp); 17953 return; 17954 } 17955 if (type == T_SVR4_OPTMGMT_REQ) { 17956 cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred); 17957 if (snmpcom_req(q, mp, tcp_snmp_set, tcp_snmp_get, 17958 cr)) { 17959 /* 17960 * This was a SNMP request 17961 */ 17962 return; 17963 } else { 17964 output_proc = tcp_wput_proto; 17965 } 17966 } else { 17967 output_proc = tcp_wput_proto; 17968 } 17969 break; 17970 case M_IOCTL: 17971 /* 17972 * Most ioctls can be processed right away without going via 17973 * squeues - process them right here. Those that do require 17974 * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK) 17975 * are processed by tcp_wput_ioctl(). 17976 */ 17977 iocp = (struct iocblk *)mp->b_rptr; 17978 tcp = connp->conn_tcp; 17979 17980 switch (iocp->ioc_cmd) { 17981 case TCP_IOC_ABORT_CONN: 17982 tcp_ioctl_abort_conn(q, mp); 17983 return; 17984 case TI_GETPEERNAME: 17985 if (tcp->tcp_state < TCPS_SYN_RCVD) { 17986 iocp->ioc_error = ENOTCONN; 17987 iocp->ioc_count = 0; 17988 mp->b_datap->db_type = M_IOCACK; 17989 qreply(q, mp); 17990 return; 17991 } 17992 /* FALLTHRU */ 17993 case TI_GETMYNAME: 17994 mi_copyin(q, mp, NULL, 17995 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 17996 return; 17997 case ND_SET: 17998 /* nd_getset does the necessary checks */ 17999 case ND_GET: 18000 if (!nd_getset(q, tcp_g_nd, mp)) { 18001 CALL_IP_WPUT(connp, q, mp); 18002 return; 18003 } 18004 qreply(q, mp); 18005 return; 18006 case TCP_IOC_DEFAULT_Q: 18007 /* 18008 * Wants to be the default wq. Check the credentials 18009 * first, the rest is executed via squeue. 
18010 */ 18011 if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { 18012 iocp->ioc_error = EPERM; 18013 iocp->ioc_count = 0; 18014 mp->b_datap->db_type = M_IOCACK; 18015 qreply(q, mp); 18016 return; 18017 } 18018 output_proc = tcp_wput_ioctl; 18019 break; 18020 default: 18021 output_proc = tcp_wput_ioctl; 18022 break; 18023 } 18024 break; 18025 default: 18026 output_proc = tcp_wput_nondata; 18027 break; 18028 } 18029 18030 CONN_INC_REF(connp); 18031 (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, 18032 output_proc, connp, SQTAG_TCP_WPUT_OTHER); 18033 } 18034 18035 /* 18036 * Initial STREAMS write side put() procedure for sockets. It tries to 18037 * handle the T_CAPABILITY_REQ which sockfs sends down while setting 18038 * up the socket without using the squeue. Non T_CAPABILITY_REQ messages 18039 * are handled by tcp_wput() as usual. 18040 * 18041 * All further messages will also be handled by tcp_wput() because we cannot 18042 * be sure that the above short cut is safe later. 18043 */ 18044 static void 18045 tcp_wput_sock(queue_t *wq, mblk_t *mp) 18046 { 18047 conn_t *connp = Q_TO_CONN(wq); 18048 tcp_t *tcp = connp->conn_tcp; 18049 struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr; 18050 18051 ASSERT(wq->q_qinfo == &tcp_sock_winit); 18052 wq->q_qinfo = &tcp_winit; 18053 18054 ASSERT(IPCL_IS_TCP(connp)); 18055 ASSERT(TCP_IS_SOCKET(tcp)); 18056 18057 if (DB_TYPE(mp) == M_PCPROTO && 18058 MBLKL(mp) == sizeof (struct T_capability_req) && 18059 car->PRIM_type == T_CAPABILITY_REQ) { 18060 tcp_capability_req(tcp, mp); 18061 return; 18062 } 18063 18064 tcp_wput(wq, mp); 18065 } 18066 18067 static boolean_t 18068 tcp_zcopy_check(tcp_t *tcp) 18069 { 18070 conn_t *connp = tcp->tcp_connp; 18071 ire_t *ire; 18072 boolean_t zc_enabled = B_FALSE; 18073 18074 if (do_tcpzcopy == 2) 18075 zc_enabled = B_TRUE; 18076 else if (tcp->tcp_ipversion == IPV4_VERSION && 18077 IPCL_IS_CONNECTED(connp) && 18078 (connp->conn_flags & IPCL_CHECK_POLICY) == 0 && 18079 connp->conn_dontroute == 0 && 18080 !connp->conn_nexthop_set && 18081 connp->conn_xmit_if_ill == NULL && 18082 connp->conn_nofailover_ill == NULL && 18083 do_tcpzcopy == 1) { 18084 /* 18085 * the checks above closely resemble the fast path checks 18086 * in tcp_send_data(). 
18087 */ 18088 mutex_enter(&connp->conn_lock); 18089 ire = connp->conn_ire_cache; 18090 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 18091 if (ire != NULL && !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18092 IRE_REFHOLD(ire); 18093 if (ire->ire_stq != NULL) { 18094 ill_t *ill = (ill_t *)ire->ire_stq->q_ptr; 18095 18096 zc_enabled = ill && (ill->ill_capabilities & 18097 ILL_CAPAB_ZEROCOPY) && 18098 (ill->ill_zerocopy_capab-> 18099 ill_zerocopy_flags != 0); 18100 } 18101 IRE_REFRELE(ire); 18102 } 18103 mutex_exit(&connp->conn_lock); 18104 } 18105 tcp->tcp_snd_zcopy_on = zc_enabled; 18106 if (!TCP_IS_DETACHED(tcp)) { 18107 if (zc_enabled) { 18108 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMSAFE); 18109 TCP_STAT(tcp_zcopy_on); 18110 } else { 18111 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); 18112 TCP_STAT(tcp_zcopy_off); 18113 } 18114 } 18115 return (zc_enabled); 18116 } 18117 18118 static mblk_t * 18119 tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp) 18120 { 18121 if (do_tcpzcopy == 2) 18122 return (bp); 18123 else if (tcp->tcp_snd_zcopy_on) { 18124 tcp->tcp_snd_zcopy_on = B_FALSE; 18125 if (!TCP_IS_DETACHED(tcp)) { 18126 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); 18127 TCP_STAT(tcp_zcopy_disable); 18128 } 18129 } 18130 return (tcp_zcopy_backoff(tcp, bp, 0)); 18131 } 18132 18133 /* 18134 * Backoff from a zero-copy mblk by copying data to a new mblk and freeing 18135 * the original desballoca'ed segmapped mblk. 18136 */ 18137 static mblk_t * 18138 tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist) 18139 { 18140 mblk_t *head, *tail, *nbp; 18141 if (IS_VMLOANED_MBLK(bp)) { 18142 TCP_STAT(tcp_zcopy_backoff); 18143 if ((head = copyb(bp)) == NULL) { 18144 /* fail to backoff; leave it for the next backoff */ 18145 tcp->tcp_xmit_zc_clean = B_FALSE; 18146 return (bp); 18147 } 18148 if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 18149 if (fix_xmitlist) 18150 tcp_zcopy_notify(tcp); 18151 else 18152 head->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 18153 } 18154 nbp = bp->b_cont; 18155 if (fix_xmitlist) { 18156 head->b_prev = bp->b_prev; 18157 head->b_next = bp->b_next; 18158 if (tcp->tcp_xmit_tail == bp) 18159 tcp->tcp_xmit_tail = head; 18160 } 18161 bp->b_next = NULL; 18162 bp->b_prev = NULL; 18163 freeb(bp); 18164 } else { 18165 head = bp; 18166 nbp = bp->b_cont; 18167 } 18168 tail = head; 18169 while (nbp) { 18170 if (IS_VMLOANED_MBLK(nbp)) { 18171 TCP_STAT(tcp_zcopy_backoff); 18172 if ((tail->b_cont = copyb(nbp)) == NULL) { 18173 tcp->tcp_xmit_zc_clean = B_FALSE; 18174 tail->b_cont = nbp; 18175 return (head); 18176 } 18177 tail = tail->b_cont; 18178 if (nbp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 18179 if (fix_xmitlist) 18180 tcp_zcopy_notify(tcp); 18181 else 18182 tail->b_datap->db_struioflag |= 18183 STRUIO_ZCNOTIFY; 18184 } 18185 bp = nbp; 18186 nbp = nbp->b_cont; 18187 if (fix_xmitlist) { 18188 tail->b_prev = bp->b_prev; 18189 tail->b_next = bp->b_next; 18190 if (tcp->tcp_xmit_tail == bp) 18191 tcp->tcp_xmit_tail = tail; 18192 } 18193 bp->b_next = NULL; 18194 bp->b_prev = NULL; 18195 freeb(bp); 18196 } else { 18197 tail->b_cont = nbp; 18198 tail = nbp; 18199 nbp = nbp->b_cont; 18200 } 18201 } 18202 if (fix_xmitlist) { 18203 tcp->tcp_xmit_last = tail; 18204 tcp->tcp_xmit_zc_clean = B_TRUE; 18205 } 18206 return (head); 18207 } 18208 18209 static void 18210 tcp_zcopy_notify(tcp_t *tcp) 18211 { 18212 struct stdata *stp; 18213 18214 if (tcp->tcp_detached) 18215 return; 18216 stp = STREAM(tcp->tcp_rq); 18217 mutex_enter(&stp->sd_lock); 18218 stp->sd_flag |= 
STZCNOTIFY; 18219 cv_broadcast(&stp->sd_zcopy_wait); 18220 mutex_exit(&stp->sd_lock); 18221 } 18222 18223 static void 18224 tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) 18225 { 18226 ipha_t *ipha; 18227 ipaddr_t src; 18228 ipaddr_t dst; 18229 uint32_t cksum; 18230 ire_t *ire; 18231 uint16_t *up; 18232 ill_t *ill; 18233 conn_t *connp = tcp->tcp_connp; 18234 uint32_t hcksum_txflags = 0; 18235 mblk_t *ire_fp_mp; 18236 uint_t ire_fp_mp_len; 18237 18238 ASSERT(DB_TYPE(mp) == M_DATA); 18239 18240 if (DB_CRED(mp) == NULL) 18241 mblk_setcred(mp, CONN_CRED(connp)); 18242 18243 ipha = (ipha_t *)mp->b_rptr; 18244 src = ipha->ipha_src; 18245 dst = ipha->ipha_dst; 18246 18247 /* 18248 * Drop off fast path for IPv6 and also if options are present or 18249 * we need to resolve a TS label. 18250 */ 18251 if (tcp->tcp_ipversion != IPV4_VERSION || 18252 !IPCL_IS_CONNECTED(connp) || 18253 (connp->conn_flags & IPCL_CHECK_POLICY) != 0 || 18254 connp->conn_dontroute || 18255 connp->conn_nexthop_set || 18256 connp->conn_xmit_if_ill != NULL || 18257 connp->conn_nofailover_ill != NULL || 18258 !connp->conn_ulp_labeled || 18259 ipha->ipha_ident == IP_HDR_INCLUDED || 18260 ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || 18261 IPP_ENABLED(IPP_LOCAL_OUT)) { 18262 if (tcp->tcp_snd_zcopy_aware) 18263 mp = tcp_zcopy_disable(tcp, mp); 18264 TCP_STAT(tcp_ip_send); 18265 CALL_IP_WPUT(connp, q, mp); 18266 return; 18267 } 18268 18269 mutex_enter(&connp->conn_lock); 18270 ire = connp->conn_ire_cache; 18271 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 18272 if (ire != NULL && ire->ire_addr == dst && 18273 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18274 IRE_REFHOLD(ire); 18275 mutex_exit(&connp->conn_lock); 18276 } else { 18277 boolean_t cached = B_FALSE; 18278 18279 /* force a recheck later on */ 18280 tcp->tcp_ire_ill_check_done = B_FALSE; 18281 18282 TCP_DBGSTAT(tcp_ire_null1); 18283 connp->conn_ire_cache = NULL; 18284 mutex_exit(&connp->conn_lock); 18285 if (ire != NULL) 18286 IRE_REFRELE_NOTR(ire); 18287 ire = ire_cache_lookup(dst, connp->conn_zoneid, 18288 MBLK_GETLABEL(mp)); 18289 if (ire == NULL) { 18290 if (tcp->tcp_snd_zcopy_aware) 18291 mp = tcp_zcopy_backoff(tcp, mp, 0); 18292 TCP_STAT(tcp_ire_null); 18293 CALL_IP_WPUT(connp, q, mp); 18294 return; 18295 } 18296 IRE_REFHOLD_NOTR(ire); 18297 /* 18298 * Since we are inside the squeue, there cannot be another 18299 * thread in TCP trying to set the conn_ire_cache now. The 18300 * check for IRE_MARK_CONDEMNED ensures that an interface 18301 * unplumb thread has not yet started cleaning up the conns. 18302 * Hence we don't need to grab the conn lock. 18303 */ 18304 if (!(connp->conn_state_flags & CONN_CLOSING)) { 18305 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 18306 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18307 connp->conn_ire_cache = ire; 18308 cached = B_TRUE; 18309 } 18310 rw_exit(&ire->ire_bucket->irb_lock); 18311 } 18312 18313 /* 18314 * We can continue to use the ire but since it was 18315 * not cached, we should drop the extra reference. 18316 */ 18317 if (!cached) 18318 IRE_REFRELE_NOTR(ire); 18319 18320 /* 18321 * Rampart note: no need to select a new label here, since 18322 * labels are not allowed to change during the life of a TCP 18323 * connection. 18324 */ 18325 } 18326 18327 /* 18328 * The following if case identifies whether or not 18329 * we are forced to take the slowpath. 
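 *
 * Roughly: stay on the fast path only for an ordinary unicast route with
 * a transmit queue and a resolved link-layer (fastpath) header that fits
 * in the mblk's headroom, and only when the datagram needs no
 * fragmenting; anything else goes through ip_wput(). As a predicate
 * (hypothetical names):
 *
 *     // 1 = fast path is usable, 0 = hand the packet to IP.
 *     static int
 *     fastpath_ok(int multirt, int have_stq, unsigned int max_frag,
 *         unsigned int pkt_len, unsigned int fp_hdr_len,
 *         unsigned int headroom)
 *     {
 *             return (!multirt && have_stq && max_frag >= pkt_len &&
 *                 fp_hdr_len > 0 && fp_hdr_len <= headroom);
 *     }
 *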
18330 */ 18331 if (ire->ire_flags & RTF_MULTIRT || 18332 ire->ire_stq == NULL || 18333 ire->ire_max_frag < ntohs(ipha->ipha_length) || 18334 (ire->ire_nce != NULL && 18335 (ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) || 18336 (ire_fp_mp_len = MBLKL(ire_fp_mp)) > MBLKHEAD(mp)) { 18337 if (tcp->tcp_snd_zcopy_aware) 18338 mp = tcp_zcopy_disable(tcp, mp); 18339 TCP_STAT(tcp_ip_ire_send); 18340 IRE_REFRELE(ire); 18341 CALL_IP_WPUT(connp, q, mp); 18342 return; 18343 } 18344 18345 ill = ire_to_ill(ire); 18346 if (connp->conn_outgoing_ill != NULL) { 18347 ill_t *conn_outgoing_ill = NULL; 18348 /* 18349 * Choose a good ill in the group to send the packets on. 18350 */ 18351 ire = conn_set_outgoing_ill(connp, ire, &conn_outgoing_ill); 18352 ill = ire_to_ill(ire); 18353 } 18354 ASSERT(ill != NULL); 18355 18356 if (!tcp->tcp_ire_ill_check_done) { 18357 tcp_ire_ill_check(tcp, ire, ill, B_TRUE); 18358 tcp->tcp_ire_ill_check_done = B_TRUE; 18359 } 18360 18361 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 18362 ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 18363 #ifndef _BIG_ENDIAN 18364 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 18365 #endif 18366 18367 /* 18368 * Check to see if we need to re-enable MDT for this connection 18369 * because it was previously disabled due to changes in the ill; 18370 * note that by doing it here, this re-enabling only applies when 18371 * the packet is not dispatched through CALL_IP_WPUT(). 18372 * 18373 * That means for IPv4, it is worth re-enabling MDT for the fastpath 18374 * case, since that's how we ended up here. For IPv6, we do the 18375 * re-enabling work in ip_xmit_v6(), albeit indirectly via squeue. 18376 */ 18377 if (connp->conn_mdt_ok && !tcp->tcp_mdt && ILL_MDT_USABLE(ill)) { 18378 /* 18379 * Restore MDT for this connection, so that next time around 18380 * it is eligible to go through tcp_multisend() path again. 18381 */ 18382 TCP_STAT(tcp_mdt_conn_resumed1); 18383 tcp->tcp_mdt = B_TRUE; 18384 ip1dbg(("tcp_send_data: reenabling MDT for connp %p on " 18385 "interface %s\n", (void *)connp, ill->ill_name)); 18386 } 18387 18388 if (tcp->tcp_snd_zcopy_aware) { 18389 if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || 18390 (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) 18391 mp = tcp_zcopy_disable(tcp, mp); 18392 /* 18393 * we shouldn't need to reset ipha as the mp containing 18394 * ipha should never be a zero-copy mp. 18395 */ 18396 } 18397 18398 if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { 18399 ASSERT(ill->ill_hcksum_capab != NULL); 18400 hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; 18401 } 18402 18403 /* pseudo-header checksum (do it in parts for IP header checksum) */ 18404 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 18405 18406 ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); 18407 up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); 18408 18409 IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, 18410 IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); 18411 18412 /* Software checksum? 
*/ 18413 if (DB_CKSUMFLAGS(mp) == 0) { 18414 TCP_STAT(tcp_out_sw_cksum); 18415 TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, 18416 ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); 18417 } 18418 18419 ipha->ipha_fragment_offset_and_flags |= 18420 (uint32_t)htons(ire->ire_frag_flag); 18421 18422 /* Calculate IP header checksum if hardware isn't capable */ 18423 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 18424 IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0], 18425 ((uint16_t *)ipha)[4]); 18426 } 18427 18428 ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); 18429 mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; 18430 bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); 18431 18432 UPDATE_OB_PKT_COUNT(ire); 18433 ire->ire_last_used_time = lbolt; 18434 BUMP_MIB(&ip_mib, ipOutRequests); 18435 18436 if (ILL_DLS_CAPABLE(ill)) { 18437 /* 18438 * Send the packet directly to DLD, where it may be queued 18439 * depending on the availability of transmit resources at 18440 * the media layer. 18441 */ 18442 IP_DLS_ILL_TX(ill, mp); 18443 } else { 18444 putnext(ire->ire_stq, mp); 18445 } 18446 IRE_REFRELE(ire); 18447 } 18448 18449 /* 18450 * This handles the case when the receiver has shrunk its window. Per RFC 1122, 18451 * if the receiver shrinks the window, i.e. moves the right window edge to the 18452 * left, we should not send new data, but should retransmit normally the 18453 * old unacked data between suna and suna + swnd. We might have sent data 18454 * that is now outside the new window; pretend that we didn't send it. 18455 */ 18456 static void 18457 tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count) 18458 { 18459 uint32_t snxt = tcp->tcp_snxt; 18460 mblk_t *xmit_tail; 18461 int32_t offset; 18462 18463 ASSERT(shrunk_count > 0); 18464 18465 /* Pretend we didn't send the data outside the window */ 18466 snxt -= shrunk_count; 18467 18468 /* Get the mblk and the offset in it per the shrunk window */ 18469 xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset); 18470 18471 ASSERT(xmit_tail != NULL); 18472 18473 /* Reset all the values per the now shrunk window */ 18474 tcp->tcp_snxt = snxt; 18475 tcp->tcp_xmit_tail = xmit_tail; 18476 tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr - xmit_tail->b_rptr - 18477 offset; 18478 tcp->tcp_unsent += shrunk_count; 18479 18480 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0) 18481 /* 18482 * Make sure the timer is running so that we will probe a zero 18483 * window. 18484 */ 18485 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 18486 } 18487 18488 18489 /* 18490 * The TCP normal data output path. 18491 * NOTE: the logic of the fast path is duplicated from this function. 18492 */ 18493 static void 18494 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) 18495 { 18496 int len; 18497 mblk_t *local_time; 18498 mblk_t *mp1; 18499 uint32_t snxt; 18500 int tail_unsent; 18501 int tcpstate; 18502 int usable = 0; 18503 mblk_t *xmit_tail; 18504 queue_t *q = tcp->tcp_wq; 18505 int32_t mss; 18506 int32_t num_sack_blk = 0; 18507 int32_t tcp_hdr_len; 18508 int32_t tcp_tcp_hdr_len; 18509 int mdt_thres; 18510 int rc; 18511 18512 tcpstate = tcp->tcp_state; 18513 if (mp == NULL) { 18514 /* 18515 * tcp_wput_data() with NULL mp should only be called when 18516 * there is unsent data. 18517 */ 18518 ASSERT(tcp->tcp_unsent > 0); 18519 /* Really tacky... but we need this for detached closes.
*/ 18520 len = tcp->tcp_unsent; 18521 goto data_null; 18522 } 18523 18524 #if CCS_STATS 18525 wrw_stats.tot.count++; 18526 wrw_stats.tot.bytes += msgdsize(mp); 18527 #endif 18528 ASSERT(mp->b_datap->db_type == M_DATA); 18529 /* 18530 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 18531 * or before a connection attempt has begun. 18532 */ 18533 if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 18534 (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 18535 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 18536 #ifdef DEBUG 18537 cmn_err(CE_WARN, 18538 "tcp_wput_data: data after ordrel, %s", 18539 tcp_display(tcp, NULL, 18540 DISP_ADDR_AND_PORT)); 18541 #else 18542 if (tcp->tcp_debug) { 18543 (void) strlog(TCP_MOD_ID, 0, 1, 18544 SL_TRACE|SL_ERROR, 18545 "tcp_wput_data: data after ordrel, %s\n", 18546 tcp_display(tcp, NULL, 18547 DISP_ADDR_AND_PORT)); 18548 } 18549 #endif /* DEBUG */ 18550 } 18551 if (tcp->tcp_snd_zcopy_aware && 18552 (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0) 18553 tcp_zcopy_notify(tcp); 18554 freemsg(mp); 18555 if (tcp->tcp_flow_stopped && 18556 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 18557 tcp_clrqfull(tcp); 18558 } 18559 return; 18560 } 18561 18562 /* Strip empties */ 18563 for (;;) { 18564 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 18565 (uintptr_t)INT_MAX); 18566 len = (int)(mp->b_wptr - mp->b_rptr); 18567 if (len > 0) 18568 break; 18569 mp1 = mp; 18570 mp = mp->b_cont; 18571 freeb(mp1); 18572 if (!mp) { 18573 return; 18574 } 18575 } 18576 18577 /* If we are the first on the list ... */ 18578 if (tcp->tcp_xmit_head == NULL) { 18579 tcp->tcp_xmit_head = mp; 18580 tcp->tcp_xmit_tail = mp; 18581 tcp->tcp_xmit_tail_unsent = len; 18582 } else { 18583 /* If tiny tx and room in txq tail, pullup to save mblks. */ 18584 struct datab *dp; 18585 18586 mp1 = tcp->tcp_xmit_last; 18587 if (len < tcp_tx_pull_len && 18588 (dp = mp1->b_datap)->db_ref == 1 && 18589 dp->db_lim - mp1->b_wptr >= len) { 18590 ASSERT(len > 0); 18591 ASSERT(!mp1->b_cont); 18592 if (len == 1) { 18593 *mp1->b_wptr++ = *mp->b_rptr; 18594 } else { 18595 bcopy(mp->b_rptr, mp1->b_wptr, len); 18596 mp1->b_wptr += len; 18597 } 18598 if (mp1 == tcp->tcp_xmit_tail) 18599 tcp->tcp_xmit_tail_unsent += len; 18600 mp1->b_cont = mp->b_cont; 18601 if (tcp->tcp_snd_zcopy_aware && 18602 (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 18603 mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 18604 freeb(mp); 18605 mp = mp1; 18606 } else { 18607 tcp->tcp_xmit_last->b_cont = mp; 18608 } 18609 len += tcp->tcp_unsent; 18610 } 18611 18612 /* Tack on however many more positive length mblks we have */ 18613 if ((mp1 = mp->b_cont) != NULL) { 18614 do { 18615 int tlen; 18616 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 18617 (uintptr_t)INT_MAX); 18618 tlen = (int)(mp1->b_wptr - mp1->b_rptr); 18619 if (tlen <= 0) { 18620 mp->b_cont = mp1->b_cont; 18621 freeb(mp1); 18622 } else { 18623 len += tlen; 18624 mp = mp1; 18625 } 18626 } while ((mp1 = mp->b_cont) != NULL); 18627 } 18628 tcp->tcp_xmit_last = mp; 18629 tcp->tcp_unsent = len; 18630 18631 if (urgent) 18632 usable = 1; 18633 18634 data_null: 18635 snxt = tcp->tcp_snxt; 18636 xmit_tail = tcp->tcp_xmit_tail; 18637 tail_unsent = tcp->tcp_xmit_tail_unsent; 18638 18639 /* 18640 * Note that tcp_mss has been adjusted to take into account the 18641 * timestamp option if applicable. Because SACK options do not 18642 * appear in every TCP segments and they are of variable lengths, 18643 * they cannot be included in tcp_mss. 
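 * (A rough worked example, assuming the conventional option sizes of
 * 8 bytes per sack_blk_t, 1 byte per NOP and a 2-byte SACK option
 * header: a segment carrying N SACK blocks spends 8 * N + 4 bytes on
 * options -- 28 bytes for 3 blocks -- so only tcp_mss - 28 bytes of
 * data fit in that segment. This is what the opt_len computation
 * below accounts for.)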
Thus we need to calculate 18644 * the actual segment length when we need to send a segment which 18645 * includes SACK options. 18646 */ 18647 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 18648 int32_t opt_len; 18649 18650 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 18651 tcp->tcp_num_sack_blk); 18652 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 18653 2 + TCPOPT_HEADER_LEN; 18654 mss = tcp->tcp_mss - opt_len; 18655 tcp_hdr_len = tcp->tcp_hdr_len + opt_len; 18656 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + opt_len; 18657 } else { 18658 mss = tcp->tcp_mss; 18659 tcp_hdr_len = tcp->tcp_hdr_len; 18660 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; 18661 } 18662 18663 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 18664 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 18665 SET_TCP_INIT_CWND(tcp, mss, tcp_slow_start_after_idle); 18666 } 18667 if (tcpstate == TCPS_SYN_RCVD) { 18668 /* 18669 * The three-way connection establishment handshake is not 18670 * complete yet. We want to queue the data for transmission 18671 * after entering ESTABLISHED state (RFC793). A jump to 18672 * the "done" label effectively leaves data on the queue. 18673 */ 18674 goto done; 18675 } else { 18676 int usable_r; 18677 18678 /* 18679 * In the special case when cwnd is zero, which can only 18680 * happen if the connection is ECN capable, return now. 18681 * New segments are sent using tcp_timer(). The timer 18682 * is set in tcp_rput_data(). 18683 */ 18684 if (tcp->tcp_cwnd == 0) { 18685 /* 18686 * Note that tcp_cwnd is 0 before the 3-way handshake is 18687 * finished. 18688 */ 18689 ASSERT(tcp->tcp_ecn_ok || 18690 tcp->tcp_state < TCPS_ESTABLISHED); 18691 return; 18692 } 18693 18694 /* NOTE: trouble if xmitting while SYN not acked? */ 18695 usable_r = snxt - tcp->tcp_suna; 18696 usable_r = tcp->tcp_swnd - usable_r; 18697 18698 /* 18699 * Check if the receiver has shrunk the window. If 18700 * tcp_wput_data() with NULL mp is called, tcp_fin_sent 18701 * cannot be set as there is unsent data, so FIN cannot 18702 * be sent out. Otherwise, we need to take the FIN into 18703 * account as it consumes an "invisible" sequence number. 18704 */ 18705 ASSERT(tcp->tcp_fin_sent == 0); 18706 if (usable_r < 0) { 18707 /* 18708 * The receiver has shrunk the window and we have sent 18709 * -usable_r bytes of data beyond the window; re-adjust. 18710 * 18711 * If TCP window scaling is enabled, there can be a 18712 * round-down error as the advertised receive window 18713 * is actually right shifted n bits. This means that 18714 * the lower n bits of info are wiped out. It will look 18715 * like the window has shrunk. Do a check here to 18716 * see if the shrunk amount is actually within the 18717 * error in window calculation. If it is, just 18718 * return. Note that this check is inside the 18719 * shrunk window check. This makes sure that even 18720 * though tcp_process_shrunk_swnd() is not called, 18721 * we will stop further processing. 18722 */ 18723 if ((-usable_r >> tcp->tcp_snd_ws) > 0) { 18724 tcp_process_shrunk_swnd(tcp, -usable_r); 18725 } 18726 return; 18727 } 18728 18729 /* usable = MIN(swnd, cwnd) - unacked_bytes */ 18730 if (tcp->tcp_swnd > tcp->tcp_cwnd) 18731 usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd; 18732 18733 /* usable = MIN(usable, unsent) */ 18734 if (usable_r > len) 18735 usable_r = len; 18736 18737 /* usable = MAX(usable, {1 for urgent, 0 for data}) */ 18738 if (usable_r > 0) { 18739 usable = usable_r; 18740 } else { 18741 /* Bypass all other unnecessary processing.
*/ 18742 goto done; 18743 } 18744 } 18745 18746 local_time = (mblk_t *)lbolt; 18747 18748 /* 18749 * "Our" Nagle Algorithm. This is not the same as in the old 18750 * BSD. This is more in line with the true intent of Nagle. 18751 * 18752 * The conditions are: 18753 * 1. The amount of unsent data (or amount of data which can be 18754 * sent, whichever is smaller) is less than the Nagle limit. 18755 * 2. The last sent size is also less than the Nagle limit. 18756 * 3. There is unack'ed data. 18757 * 4. Urgent pointer is not set; urgent data is sent ignoring the 18758 * Nagle algorithm. This reduces the probability that urgent 18759 * bytes get "merged" together. 18760 * 5. The app has not closed the connection. This eliminates the 18761 * wait time of the receiving side waiting for the last piece of 18762 * (small) data. 18763 * 18764 * If all are satisfied, exit without sending anything. Note 18765 * that the Nagle limit can be smaller than 1 MSS. The Nagle limit is 18766 * the smaller of 1 MSS and the global tcp_naglim_def (which defaults to 18767 * 4095). 18768 */ 18769 if (usable < (int)tcp->tcp_naglim && 18770 tcp->tcp_naglim > tcp->tcp_last_sent_len && 18771 snxt != tcp->tcp_suna && 18772 !(tcp->tcp_valid_bits & TCP_URG_VALID) && 18773 !(tcp->tcp_valid_bits & TCP_FSS_VALID)) { 18774 goto done; 18775 } 18776 18777 if (tcp->tcp_cork) { 18778 /* 18779 * If the tcp->tcp_cork option is set, then we have to force 18780 * TCP not to send partial segments (smaller than MSS bytes). 18781 * We calculate the usable amount now based on a full mss and 18782 * save the rest of the remaining data for later. 18783 */ 18784 if (usable < mss) 18785 goto done; 18786 usable = (usable / mss) * mss; 18787 } 18788 18789 /* Update the latest receive window size in TCP header. */ 18790 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 18791 tcp->tcp_tcph->th_win); 18792 18793 /* 18794 * Determine if it's worthwhile to attempt MDT, based on: 18795 * 18796 * 1. Simple TCP/IP{v4,v6} (no options). 18797 * 2. IPSEC/IPQoS processing is not needed for the TCP connection. 18798 * 3. The TCP connection is in ESTABLISHED state. 18799 * 4. The TCP is not detached. 18800 * 18801 * If any of the above conditions have changed during the 18802 * connection, stop using MDT and restore the stream head 18803 * parameters accordingly.
18804 */ 18805 if (tcp->tcp_mdt && 18806 ((tcp->tcp_ipversion == IPV4_VERSION && 18807 tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || 18808 (tcp->tcp_ipversion == IPV6_VERSION && 18809 tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) || 18810 tcp->tcp_state != TCPS_ESTABLISHED || 18811 TCP_IS_DETACHED(tcp) || !CONN_IS_MD_FASTPATH(tcp->tcp_connp) || 18812 CONN_IPSEC_OUT_ENCAPSULATED(tcp->tcp_connp) || 18813 IPP_ENABLED(IPP_LOCAL_OUT))) { 18814 tcp->tcp_connp->conn_mdt_ok = B_FALSE; 18815 tcp->tcp_mdt = B_FALSE; 18816 18817 /* Anything other than detached is considered pathological */ 18818 if (!TCP_IS_DETACHED(tcp)) { 18819 TCP_STAT(tcp_mdt_conn_halted1); 18820 (void) tcp_maxpsz_set(tcp, B_TRUE); 18821 } 18822 } 18823 18824 /* Use MDT if sendable amount is greater than the threshold */ 18825 if (tcp->tcp_mdt && 18826 (mdt_thres = mss << tcp_mdt_smss_threshold, usable > mdt_thres) && 18827 (tail_unsent > mdt_thres || (xmit_tail->b_cont != NULL && 18828 MBLKL(xmit_tail->b_cont) > mdt_thres)) && 18829 (tcp->tcp_valid_bits == 0 || 18830 tcp->tcp_valid_bits == TCP_FSS_VALID)) { 18831 ASSERT(tcp->tcp_connp->conn_mdt_ok); 18832 rc = tcp_multisend(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, 18833 num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, 18834 local_time, mdt_thres); 18835 } else { 18836 rc = tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, 18837 num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, 18838 local_time, INT_MAX); 18839 } 18840 18841 /* Pretend that all we were trying to send really got sent */ 18842 if (rc < 0 && tail_unsent < 0) { 18843 do { 18844 xmit_tail = xmit_tail->b_cont; 18845 xmit_tail->b_prev = local_time; 18846 ASSERT((uintptr_t)(xmit_tail->b_wptr - 18847 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 18848 tail_unsent += (int)(xmit_tail->b_wptr - 18849 xmit_tail->b_rptr); 18850 } while (tail_unsent < 0); 18851 } 18852 done:; 18853 tcp->tcp_xmit_tail = xmit_tail; 18854 tcp->tcp_xmit_tail_unsent = tail_unsent; 18855 len = tcp->tcp_snxt - snxt; 18856 if (len) { 18857 /* 18858 * If new data was sent, need to update the notsack 18859 * list, which is, afterall, data blocks that have 18860 * not been sack'ed by the receiver. New data is 18861 * not sack'ed. 18862 */ 18863 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 18864 /* len is a negative value. */ 18865 tcp->tcp_pipe -= len; 18866 tcp_notsack_update(&(tcp->tcp_notsack_list), 18867 tcp->tcp_snxt, snxt, 18868 &(tcp->tcp_num_notsack_blk), 18869 &(tcp->tcp_cnt_notsack_list)); 18870 } 18871 tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; 18872 tcp->tcp_rack = tcp->tcp_rnxt; 18873 tcp->tcp_rack_cnt = 0; 18874 if ((snxt + len) == tcp->tcp_suna) { 18875 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 18876 } 18877 } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { 18878 /* 18879 * Didn't send anything. Make sure the timer is running 18880 * so that we will probe a zero window. 18881 */ 18882 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 18883 } 18884 /* Note that len is the amount we just sent but with a negative sign */ 18885 tcp->tcp_unsent += len; 18886 if (tcp->tcp_flow_stopped) { 18887 if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 18888 tcp_clrqfull(tcp); 18889 } 18890 } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) { 18891 tcp_setqfull(tcp); 18892 } 18893 } 18894 18895 /* 18896 * tcp_fill_header is called by tcp_send() and tcp_multisend() to fill the 18897 * outgoing TCP header with the template header, as well as other 18898 * options such as time-stamp, ECN and/or SACK. 
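 * (A rough sizing sketch, assuming the usual layouts: for a plain IPv4
 * connection with the timestamp option negotiated, the template is
 * 20 bytes of IP header, 20 bytes of basic TCP header and 12 bytes of
 * NOP/NOP/timestamp option, i.e. a tcp_hdr_len of 52; the unrolled
 * word copy below then moves the first 40 bytes and the trailing loop
 * the remaining 12.)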
18899 */ 18900 static void 18901 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) 18902 { 18903 tcph_t *tcp_tmpl, *tcp_h; 18904 uint32_t *dst, *src; 18905 int hdrlen; 18906 18907 ASSERT(OK_32PTR(rptr)); 18908 18909 /* Template header */ 18910 tcp_tmpl = tcp->tcp_tcph; 18911 18912 /* Header of outgoing packet */ 18913 tcp_h = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 18914 18915 /* dst and src are opaque 32-bit fields, used for copying */ 18916 dst = (uint32_t *)rptr; 18917 src = (uint32_t *)tcp->tcp_iphc; 18918 hdrlen = tcp->tcp_hdr_len; 18919 18920 /* Fill time-stamp option if needed */ 18921 if (tcp->tcp_snd_ts_ok) { 18922 U32_TO_BE32((uint32_t)now, 18923 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); 18924 U32_TO_BE32(tcp->tcp_ts_recent, 18925 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); 18926 } else { 18927 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 18928 } 18929 18930 /* 18931 * Copy the template header; is this really more efficient than 18932 * calling bcopy()? For simple IPv4/TCP, it may be the case, 18933 * but perhaps not for other scenarios. 18934 */ 18935 dst[0] = src[0]; 18936 dst[1] = src[1]; 18937 dst[2] = src[2]; 18938 dst[3] = src[3]; 18939 dst[4] = src[4]; 18940 dst[5] = src[5]; 18941 dst[6] = src[6]; 18942 dst[7] = src[7]; 18943 dst[8] = src[8]; 18944 dst[9] = src[9]; 18945 if (hdrlen -= 40) { 18946 hdrlen >>= 2; 18947 dst += 10; 18948 src += 10; 18949 do { 18950 *dst++ = *src++; 18951 } while (--hdrlen); 18952 } 18953 18954 /* 18955 * Set the ECN info in the TCP header if it is not a zero 18956 * window probe. Zero window probe is only sent in 18957 * tcp_wput_data() and tcp_timer(). 18958 */ 18959 if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 18960 SET_ECT(tcp, rptr); 18961 18962 if (tcp->tcp_ecn_echo_on) 18963 tcp_h->th_flags[0] |= TH_ECE; 18964 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 18965 tcp_h->th_flags[0] |= TH_CWR; 18966 tcp->tcp_ecn_cwr_sent = B_TRUE; 18967 } 18968 } 18969 18970 /* Fill in SACK options */ 18971 if (num_sack_blk > 0) { 18972 uchar_t *wptr = rptr + tcp->tcp_hdr_len; 18973 sack_blk_t *tmp; 18974 int32_t i; 18975 18976 wptr[0] = TCPOPT_NOP; 18977 wptr[1] = TCPOPT_NOP; 18978 wptr[2] = TCPOPT_SACK; 18979 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 18980 sizeof (sack_blk_t); 18981 wptr += TCPOPT_REAL_SACK_LEN; 18982 18983 tmp = tcp->tcp_sack_list; 18984 for (i = 0; i < num_sack_blk; i++) { 18985 U32_TO_BE32(tmp[i].begin, wptr); 18986 wptr += sizeof (tcp_seq); 18987 U32_TO_BE32(tmp[i].end, wptr); 18988 wptr += sizeof (tcp_seq); 18989 } 18990 tcp_h->th_offset_and_rsrvd[0] += 18991 ((num_sack_blk * 2 + 1) << 4); 18992 } 18993 } 18994 18995 /* 18996 * tcp_mdt_add_attrs() is called by tcp_multisend() in order to attach 18997 * the destination address and SAP attribute, and if necessary, the 18998 * hardware checksum offload attribute to a Multidata message. 
18999 */ 19000 static int 19001 tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, 19002 const uint32_t start, const uint32_t stuff, const uint32_t end, 19003 const uint32_t flags) 19004 { 19005 /* Add global destination address & SAP attribute */ 19006 if (dlmp == NULL || !ip_md_addr_attr(mmd, NULL, dlmp)) { 19007 ip1dbg(("tcp_mdt_add_attrs: can't add global physical " 19008 "destination address+SAP\n")); 19009 19010 if (dlmp != NULL) 19011 TCP_STAT(tcp_mdt_allocfail); 19012 return (-1); 19013 } 19014 19015 /* Add global hwcksum attribute */ 19016 if (hwcksum && 19017 !ip_md_hcksum_attr(mmd, NULL, start, stuff, end, flags)) { 19018 ip1dbg(("tcp_mdt_add_attrs: can't add global hardware " 19019 "checksum attribute\n")); 19020 19021 TCP_STAT(tcp_mdt_allocfail); 19022 return (-1); 19023 } 19024 19025 return (0); 19026 } 19027 19028 /* 19029 * Smaller and private version of pdescinfo_t used specifically for TCP, 19030 * which allows for only two payload spans per packet. 19031 */ 19032 typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t; 19033 19034 /* 19035 * tcp_multisend() is called by tcp_wput_data() for Multidata Transmit 19036 * scheme, and returns one the following: 19037 * 19038 * -1 = failed allocation. 19039 * 0 = success; burst count reached, or usable send window is too small, 19040 * and that we'd rather wait until later before sending again. 19041 */ 19042 static int 19043 tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, 19044 const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, 19045 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 19046 const int mdt_thres) 19047 { 19048 mblk_t *md_mp_head, *md_mp, *md_pbuf, *md_pbuf_nxt, *md_hbuf; 19049 multidata_t *mmd; 19050 uint_t obsegs, obbytes, hdr_frag_sz; 19051 uint_t cur_hdr_off, cur_pld_off, base_pld_off, first_snxt; 19052 int num_burst_seg, max_pld; 19053 pdesc_t *pkt; 19054 tcp_pdescinfo_t tcp_pkt_info; 19055 pdescinfo_t *pkt_info; 19056 int pbuf_idx, pbuf_idx_nxt; 19057 int seg_len, len, spill, af; 19058 boolean_t add_buffer, zcopy, clusterwide; 19059 boolean_t rconfirm = B_FALSE; 19060 boolean_t done = B_FALSE; 19061 uint32_t cksum; 19062 uint32_t hwcksum_flags; 19063 ire_t *ire; 19064 ill_t *ill; 19065 ipha_t *ipha; 19066 ip6_t *ip6h; 19067 ipaddr_t src, dst; 19068 ill_zerocopy_capab_t *zc_cap = NULL; 19069 uint16_t *up; 19070 int err; 19071 conn_t *connp; 19072 19073 #ifdef _BIG_ENDIAN 19074 #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) 19075 #else 19076 #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7) 19077 #endif 19078 19079 #define PREP_NEW_MULTIDATA() { \ 19080 mmd = NULL; \ 19081 md_mp = md_hbuf = NULL; \ 19082 cur_hdr_off = 0; \ 19083 max_pld = tcp->tcp_mdt_max_pld; \ 19084 pbuf_idx = pbuf_idx_nxt = -1; \ 19085 add_buffer = B_TRUE; \ 19086 zcopy = B_FALSE; \ 19087 } 19088 19089 #define PREP_NEW_PBUF() { \ 19090 md_pbuf = md_pbuf_nxt = NULL; \ 19091 pbuf_idx = pbuf_idx_nxt = -1; \ 19092 cur_pld_off = 0; \ 19093 first_snxt = *snxt; \ 19094 ASSERT(*tail_unsent > 0); \ 19095 base_pld_off = MBLKL(*xmit_tail) - *tail_unsent; \ 19096 } 19097 19098 ASSERT(mdt_thres >= mss); 19099 ASSERT(*usable > 0 && *usable > mdt_thres); 19100 ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); 19101 ASSERT(!TCP_IS_DETACHED(tcp)); 19102 ASSERT(tcp->tcp_valid_bits == 0 || 19103 tcp->tcp_valid_bits == TCP_FSS_VALID); 19104 ASSERT((tcp->tcp_ipversion == IPV4_VERSION && 19105 tcp->tcp_ip_hdr_len == IP_SIMPLE_HDR_LENGTH) || 19106 
(tcp->tcp_ipversion == IPV6_VERSION && 19107 tcp->tcp_ip_hdr_len == IPV6_HDR_LEN)); 19108 19109 connp = tcp->tcp_connp; 19110 ASSERT(connp != NULL); 19111 ASSERT(CONN_IS_MD_FASTPATH(connp)); 19112 ASSERT(!CONN_IPSEC_OUT_ENCAPSULATED(connp)); 19113 19114 /* 19115 * Note that tcp will only declare at most 2 payload spans per 19116 * packet, which is much lower than the maximum allowable number 19117 * of packet spans per Multidata. For this reason, we use the 19118 * privately declared and smaller descriptor info structure, in 19119 * order to save some stack space. 19120 */ 19121 pkt_info = (pdescinfo_t *)&tcp_pkt_info; 19122 19123 af = (tcp->tcp_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6; 19124 if (af == AF_INET) { 19125 dst = tcp->tcp_ipha->ipha_dst; 19126 src = tcp->tcp_ipha->ipha_src; 19127 ASSERT(!CLASSD(dst)); 19128 } 19129 ASSERT(af == AF_INET || 19130 !IN6_IS_ADDR_MULTICAST(&tcp->tcp_ip6h->ip6_dst)); 19131 19132 obsegs = obbytes = 0; 19133 num_burst_seg = tcp->tcp_snd_burst; 19134 md_mp_head = NULL; 19135 PREP_NEW_MULTIDATA(); 19136 19137 /* 19138 * Before we go on further, make sure there is an IRE that we can 19139 * use, and that the ILL supports MDT. Otherwise, there's no point 19140 * in proceeding any further, and we should just hand everything 19141 * off to the legacy path. 19142 */ 19143 mutex_enter(&connp->conn_lock); 19144 ire = connp->conn_ire_cache; 19145 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 19146 if (ire != NULL && ((af == AF_INET && ire->ire_addr == dst) || 19147 (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, 19148 &tcp->tcp_ip6h->ip6_dst))) && 19149 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 19150 IRE_REFHOLD(ire); 19151 mutex_exit(&connp->conn_lock); 19152 } else { 19153 boolean_t cached = B_FALSE; 19154 ts_label_t *tsl; 19155 19156 /* force a recheck later on */ 19157 tcp->tcp_ire_ill_check_done = B_FALSE; 19158 19159 TCP_DBGSTAT(tcp_ire_null1); 19160 connp->conn_ire_cache = NULL; 19161 mutex_exit(&connp->conn_lock); 19162 19163 /* Release the old ire */ 19164 if (ire != NULL) 19165 IRE_REFRELE_NOTR(ire); 19166 19167 tsl = crgetlabel(CONN_CRED(connp)); 19168 ire = (af == AF_INET) ? 19169 ire_cache_lookup(dst, connp->conn_zoneid, tsl) : 19170 ire_cache_lookup_v6(&tcp->tcp_ip6h->ip6_dst, 19171 connp->conn_zoneid, tsl); 19172 19173 if (ire == NULL) { 19174 TCP_STAT(tcp_ire_null); 19175 goto legacy_send_no_md; 19176 } 19177 19178 IRE_REFHOLD_NOTR(ire); 19179 /* 19180 * Since we are inside the squeue, there cannot be another 19181 * thread in TCP trying to set the conn_ire_cache now. The 19182 * check for IRE_MARK_CONDEMNED ensures that an interface 19183 * unplumb thread has not yet started cleaning up the conns. 19184 * Hence we don't need to grab the conn lock. 19185 */ 19186 if (!(connp->conn_state_flags & CONN_CLOSING)) { 19187 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 19188 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 19189 connp->conn_ire_cache = ire; 19190 cached = B_TRUE; 19191 } 19192 rw_exit(&ire->ire_bucket->irb_lock); 19193 } 19194 19195 /* 19196 * We can continue to use the ire but since it was not 19197 * cached, we should drop the extra reference. 
19198 */ 19199 if (!cached) 19200 IRE_REFRELE_NOTR(ire); 19201 } 19202 19203 ASSERT(ire != NULL); 19204 ASSERT(af != AF_INET || ire->ire_ipversion == IPV4_VERSION); 19205 ASSERT(af == AF_INET || !IN6_IS_ADDR_V4MAPPED(&(ire->ire_addr_v6))); 19206 ASSERT(af == AF_INET || ire->ire_nce != NULL); 19207 ASSERT(!(ire->ire_type & IRE_BROADCAST)); 19208 /* 19209 * If we do support loopback for MDT (which requires modifications 19210 * to the receiving paths), the following assertions should go away, 19211 * and we would be sending the Multidata to loopback conn later on. 19212 */ 19213 ASSERT(!IRE_IS_LOCAL(ire)); 19214 ASSERT(ire->ire_stq != NULL); 19215 19216 ill = ire_to_ill(ire); 19217 ASSERT(ill != NULL); 19218 ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL); 19219 19220 if (!tcp->tcp_ire_ill_check_done) { 19221 tcp_ire_ill_check(tcp, ire, ill, B_TRUE); 19222 tcp->tcp_ire_ill_check_done = B_TRUE; 19223 } 19224 19225 /* 19226 * If the underlying interface conditions have changed, or if the 19227 * new interface does not support MDT, go back to legacy path. 19228 */ 19229 if (!ILL_MDT_USABLE(ill) || (ire->ire_flags & RTF_MULTIRT) != 0) { 19230 /* don't go through this path anymore for this connection */ 19231 TCP_STAT(tcp_mdt_conn_halted2); 19232 tcp->tcp_mdt = B_FALSE; 19233 ip1dbg(("tcp_multisend: disabling MDT for connp %p on " 19234 "interface %s\n", (void *)connp, ill->ill_name)); 19235 /* IRE will be released prior to returning */ 19236 goto legacy_send_no_md; 19237 } 19238 19239 if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) 19240 zc_cap = ill->ill_zerocopy_capab; 19241 19242 /* 19243 * Check if we can take tcp fast-path. Note that "incomplete" 19244 * ire's (where the link-layer for next hop is not resolved 19245 * or where the fast-path header in nce_fp_mp is not available 19246 * yet) are sent down the legacy (slow) path. 19247 * NOTE: We should fix ip_xmit_v4 to handle M_MULTIDATA 19248 */ 19249 if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { 19250 /* IRE will be released prior to returning */ 19251 goto legacy_send_no_md; 19252 } 19253 19254 /* go to legacy path if interface doesn't support zerocopy */ 19255 if (tcp->tcp_snd_zcopy_aware && do_tcpzcopy != 2 && 19256 (zc_cap == NULL || zc_cap->ill_zerocopy_flags == 0)) { 19257 /* IRE will be released prior to returning */ 19258 goto legacy_send_no_md; 19259 } 19260 19261 /* does the interface support hardware checksum offload? */ 19262 hwcksum_flags = 0; 19263 if (ILL_HCKSUM_CAPABLE(ill) && 19264 (ill->ill_hcksum_capab->ill_hcksum_txflags & 19265 (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL | 19266 HCKSUM_IPHDRCKSUM)) && dohwcksum) { 19267 if (ill->ill_hcksum_capab->ill_hcksum_txflags & 19268 HCKSUM_IPHDRCKSUM) 19269 hwcksum_flags = HCK_IPV4_HDRCKSUM; 19270 19271 if (ill->ill_hcksum_capab->ill_hcksum_txflags & 19272 (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) 19273 hwcksum_flags |= HCK_FULLCKSUM; 19274 else if (ill->ill_hcksum_capab->ill_hcksum_txflags & 19275 HCKSUM_INET_PARTIAL) 19276 hwcksum_flags |= HCK_PARTIALCKSUM; 19277 } 19278 19279 /* 19280 * Each header fragment consists of the leading extra space, 19281 * followed by the TCP/IP header, and the trailing extra space. 19282 * We make sure that each header fragment begins on a 32-bit 19283 * aligned memory address (tcp_mdt_hdr_head is already 32-bit 19284 * aligned in tcp_mdt_update). 
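 * (Illustrative numbers only, assuming a hypothetical driver that asks
 * for 2 bytes of leading space and none trailing: with a 52-byte
 * TCP/IP header, hdr_frag_sz below becomes roundup(2 + 52 + 0, 4) = 56,
 * keeping every per-packet header 32-bit aligned within the header
 * buffer.)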
19285 */ 19286 hdr_frag_sz = roundup((tcp->tcp_mdt_hdr_head + tcp_hdr_len + 19287 tcp->tcp_mdt_hdr_tail), 4); 19288 19289 /* are we starting from the beginning of data block? */ 19290 if (*tail_unsent == 0) { 19291 *xmit_tail = (*xmit_tail)->b_cont; 19292 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= (uintptr_t)INT_MAX); 19293 *tail_unsent = (int)MBLKL(*xmit_tail); 19294 } 19295 19296 /* 19297 * Here we create one or more Multidata messages, each made up of 19298 * one header buffer and up to N payload buffers. This entire 19299 * operation is done within two loops: 19300 * 19301 * The outer loop mostly deals with creating the Multidata message, 19302 * as well as the header buffer that gets added to it. It also 19303 * links the Multidata messages together such that all of them can 19304 * be sent down to the lower layer in a single putnext call; this 19305 * linking behavior depends on the tcp_mdt_chain tunable. 19306 * 19307 * The inner loop takes an existing Multidata message, and adds 19308 * one or more (up to tcp_mdt_max_pld) payload buffers to it. It 19309 * packetizes those buffers by filling up the corresponding header 19310 * buffer fragments with the proper IP and TCP headers, and by 19311 * describing the layout of each packet in the packet descriptors 19312 * that get added to the Multidata. 19313 */ 19314 do { 19315 /* 19316 * If usable send window is too small, or data blocks in 19317 * transmit list are smaller than our threshold (i.e. app 19318 * performs large writes followed by small ones), we hand 19319 * off the control over to the legacy path. Note that we'll 19320 * get back the control once it encounters a large block. 19321 */ 19322 if (*usable < mss || (*tail_unsent <= mdt_thres && 19323 (*xmit_tail)->b_cont != NULL && 19324 MBLKL((*xmit_tail)->b_cont) <= mdt_thres)) { 19325 /* send down what we've got so far */ 19326 if (md_mp_head != NULL) { 19327 tcp_multisend_data(tcp, ire, ill, md_mp_head, 19328 obsegs, obbytes, &rconfirm); 19329 } 19330 /* 19331 * Pass control over to tcp_send(), but tell it to 19332 * return to us once a large-size transmission is 19333 * possible. 19334 */ 19335 TCP_STAT(tcp_mdt_legacy_small); 19336 if ((err = tcp_send(q, tcp, mss, tcp_hdr_len, 19337 tcp_tcp_hdr_len, num_sack_blk, usable, snxt, 19338 tail_unsent, xmit_tail, local_time, 19339 mdt_thres)) <= 0) { 19340 /* burst count reached, or alloc failed */ 19341 IRE_REFRELE(ire); 19342 return (err); 19343 } 19344 19345 /* tcp_send() may have sent everything, so check */ 19346 if (*usable <= 0) { 19347 IRE_REFRELE(ire); 19348 return (0); 19349 } 19350 19351 TCP_STAT(tcp_mdt_legacy_ret); 19352 /* 19353 * We may have delivered the Multidata, so make sure 19354 * to re-initialize before the next round. 19355 */ 19356 md_mp_head = NULL; 19357 obsegs = obbytes = 0; 19358 num_burst_seg = tcp->tcp_snd_burst; 19359 PREP_NEW_MULTIDATA(); 19360 19361 /* are we starting from the beginning of data block? */ 19362 if (*tail_unsent == 0) { 19363 *xmit_tail = (*xmit_tail)->b_cont; 19364 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 19365 (uintptr_t)INT_MAX); 19366 *tail_unsent = (int)MBLKL(*xmit_tail); 19367 } 19368 } 19369 19370 /* 19371 * max_pld limits the number of mblks in tcp's transmit 19372 * queue that can be added to a Multidata message. Once 19373 * this counter reaches zero, no more additional mblks 19374 * can be added to it. What happens afterwards depends 19375 * on whether or not we are set to chain the Multidata 19376 * messages. 
If we are to link them together, reset 19377 * max_pld to its original value (tcp_mdt_max_pld) and 19378 * prepare to create a new Multidata message which will 19379 * get linked to md_mp_head. Else, leave it alone and 19380 * let the inner loop break on its own. 19381 */ 19382 if (tcp_mdt_chain && max_pld == 0) 19383 PREP_NEW_MULTIDATA(); 19384 19385 /* adding a payload buffer; re-initialize values */ 19386 if (add_buffer) 19387 PREP_NEW_PBUF(); 19388 19389 /* 19390 * If we don't have a Multidata, either because we just 19391 * (re)entered this outer loop, or after we branched off 19392 * to tcp_send above, setup the Multidata and header 19393 * buffer to be used. 19394 */ 19395 if (md_mp == NULL) { 19396 int md_hbuflen; 19397 uint32_t start, stuff; 19398 19399 /* 19400 * Calculate Multidata header buffer size large enough 19401 * to hold all of the headers that can possibly be 19402 * sent at this moment. We'd rather over-estimate 19403 * the size than running out of space; this is okay 19404 * since this buffer is small anyway. 19405 */ 19406 md_hbuflen = (howmany(*usable, mss) + 1) * hdr_frag_sz; 19407 19408 /* 19409 * Start and stuff offset for partial hardware 19410 * checksum offload; these are currently for IPv4. 19411 * For full checksum offload, they are set to zero. 19412 */ 19413 if ((hwcksum_flags & HCK_PARTIALCKSUM)) { 19414 if (af == AF_INET) { 19415 start = IP_SIMPLE_HDR_LENGTH; 19416 stuff = IP_SIMPLE_HDR_LENGTH + 19417 TCP_CHECKSUM_OFFSET; 19418 } else { 19419 start = IPV6_HDR_LEN; 19420 stuff = IPV6_HDR_LEN + 19421 TCP_CHECKSUM_OFFSET; 19422 } 19423 } else { 19424 start = stuff = 0; 19425 } 19426 19427 /* 19428 * Create the header buffer, Multidata, as well as 19429 * any necessary attributes (destination address, 19430 * SAP and hardware checksum offload) that should 19431 * be associated with the Multidata message. 19432 */ 19433 ASSERT(cur_hdr_off == 0); 19434 if ((md_hbuf = allocb(md_hbuflen, BPRI_HI)) == NULL || 19435 ((md_hbuf->b_wptr += md_hbuflen), 19436 (mmd = mmd_alloc(md_hbuf, &md_mp, 19437 KM_NOSLEEP)) == NULL) || (tcp_mdt_add_attrs(mmd, 19438 /* fastpath mblk */ 19439 ire->ire_nce->nce_res_mp, 19440 /* hardware checksum enabled */ 19441 (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)), 19442 /* hardware checksum offsets */ 19443 start, stuff, 0, 19444 /* hardware checksum flag */ 19445 hwcksum_flags) != 0)) { 19446 legacy_send: 19447 if (md_mp != NULL) { 19448 /* Unlink message from the chain */ 19449 if (md_mp_head != NULL) { 19450 err = (intptr_t)rmvb(md_mp_head, 19451 md_mp); 19452 /* 19453 * We can't assert that rmvb 19454 * did not return -1, since we 19455 * may get here before linkb 19456 * happens. We do, however, 19457 * check if we just removed the 19458 * only element in the list. 19459 */ 19460 if (err == 0) 19461 md_mp_head = NULL; 19462 } 19463 /* md_hbuf gets freed automatically */ 19464 TCP_STAT(tcp_mdt_discarded); 19465 freeb(md_mp); 19466 } else { 19467 /* Either allocb or mmd_alloc failed */ 19468 TCP_STAT(tcp_mdt_allocfail); 19469 if (md_hbuf != NULL) 19470 freeb(md_hbuf); 19471 } 19472 19473 /* send down what we've got so far */ 19474 if (md_mp_head != NULL) { 19475 tcp_multisend_data(tcp, ire, ill, 19476 md_mp_head, obsegs, obbytes, 19477 &rconfirm); 19478 } 19479 legacy_send_no_md: 19480 if (ire != NULL) 19481 IRE_REFRELE(ire); 19482 /* 19483 * Too bad; let the legacy path handle this. 19484 * We specify INT_MAX for the threshold, since 19485 * we gave up with the Multidata processings 19486 * and let the old path have it all. 
19487 */ 19488 TCP_STAT(tcp_mdt_legacy_all); 19489 return (tcp_send(q, tcp, mss, tcp_hdr_len, 19490 tcp_tcp_hdr_len, num_sack_blk, usable, 19491 snxt, tail_unsent, xmit_tail, local_time, 19492 INT_MAX)); 19493 } 19494 19495 /* link to any existing ones, if applicable */ 19496 TCP_STAT(tcp_mdt_allocd); 19497 if (md_mp_head == NULL) { 19498 md_mp_head = md_mp; 19499 } else if (tcp_mdt_chain) { 19500 TCP_STAT(tcp_mdt_linked); 19501 linkb(md_mp_head, md_mp); 19502 } 19503 } 19504 19505 ASSERT(md_mp_head != NULL); 19506 ASSERT(tcp_mdt_chain || md_mp_head->b_cont == NULL); 19507 ASSERT(md_mp != NULL && mmd != NULL); 19508 ASSERT(md_hbuf != NULL); 19509 19510 /* 19511 * Packetize the transmittable portion of the data block; 19512 * each data block is essentially added to the Multidata 19513 * as a payload buffer. We also deal with adding more 19514 * than one payload buffers, which happens when the remaining 19515 * packetized portion of the current payload buffer is less 19516 * than MSS, while the next data block in transmit queue 19517 * has enough data to make up for one. This "spillover" 19518 * case essentially creates a split-packet, where portions 19519 * of the packet's payload fragments may span across two 19520 * virtually discontiguous address blocks. 19521 */ 19522 seg_len = mss; 19523 do { 19524 len = seg_len; 19525 19526 ASSERT(len > 0); 19527 ASSERT(max_pld >= 0); 19528 ASSERT(!add_buffer || cur_pld_off == 0); 19529 19530 /* 19531 * First time around for this payload buffer; note 19532 * in the case of a spillover, the following has 19533 * been done prior to adding the split-packet 19534 * descriptor to Multidata, and we don't want to 19535 * repeat the process. 19536 */ 19537 if (add_buffer) { 19538 ASSERT(mmd != NULL); 19539 ASSERT(md_pbuf == NULL); 19540 ASSERT(md_pbuf_nxt == NULL); 19541 ASSERT(pbuf_idx == -1 && pbuf_idx_nxt == -1); 19542 19543 /* 19544 * Have we reached the limit? We'd get to 19545 * this case when we're not chaining the 19546 * Multidata messages together, and since 19547 * we're done, terminate this loop. 19548 */ 19549 if (max_pld == 0) 19550 break; /* done */ 19551 19552 if ((md_pbuf = dupb(*xmit_tail)) == NULL) { 19553 TCP_STAT(tcp_mdt_allocfail); 19554 goto legacy_send; /* out_of_mem */ 19555 } 19556 19557 if (IS_VMLOANED_MBLK(md_pbuf) && !zcopy && 19558 zc_cap != NULL) { 19559 if (!ip_md_zcopy_attr(mmd, NULL, 19560 zc_cap->ill_zerocopy_flags)) { 19561 freeb(md_pbuf); 19562 TCP_STAT(tcp_mdt_allocfail); 19563 /* out_of_mem */ 19564 goto legacy_send; 19565 } 19566 zcopy = B_TRUE; 19567 } 19568 19569 md_pbuf->b_rptr += base_pld_off; 19570 19571 /* 19572 * Add a payload buffer to the Multidata; this 19573 * operation must not fail, or otherwise our 19574 * logic in this routine is broken. There 19575 * is no memory allocation done by the 19576 * routine, so any returned failure simply 19577 * tells us that we've done something wrong. 19578 * 19579 * A failure tells us that either we're adding 19580 * the same payload buffer more than once, or 19581 * we're trying to add more buffers than 19582 * allowed (max_pld calculation is wrong). 19583 * None of the above cases should happen, and 19584 * we panic because either there's horrible 19585 * heap corruption, and/or programming mistake. 
19586 */ 19587 pbuf_idx = mmd_addpldbuf(mmd, md_pbuf); 19588 if (pbuf_idx < 0) { 19589 cmn_err(CE_PANIC, "tcp_multisend: " 19590 "payload buffer logic error " 19591 "detected for tcp %p mmd %p " 19592 "pbuf %p (%d)\n", 19593 (void *)tcp, (void *)mmd, 19594 (void *)md_pbuf, pbuf_idx); 19595 } 19596 19597 ASSERT(max_pld > 0); 19598 --max_pld; 19599 add_buffer = B_FALSE; 19600 } 19601 19602 ASSERT(md_mp_head != NULL); 19603 ASSERT(md_pbuf != NULL); 19604 ASSERT(md_pbuf_nxt == NULL); 19605 ASSERT(pbuf_idx != -1); 19606 ASSERT(pbuf_idx_nxt == -1); 19607 ASSERT(*usable > 0); 19608 19609 /* 19610 * We spillover to the next payload buffer only 19611 * if all of the following is true: 19612 * 19613 * 1. There is not enough data on the current 19614 * payload buffer to make up `len', 19615 * 2. We are allowed to send `len', 19616 * 3. The next payload buffer length is large 19617 * enough to accomodate `spill'. 19618 */ 19619 if ((spill = len - *tail_unsent) > 0 && 19620 *usable >= len && 19621 MBLKL((*xmit_tail)->b_cont) >= spill && 19622 max_pld > 0) { 19623 md_pbuf_nxt = dupb((*xmit_tail)->b_cont); 19624 if (md_pbuf_nxt == NULL) { 19625 TCP_STAT(tcp_mdt_allocfail); 19626 goto legacy_send; /* out_of_mem */ 19627 } 19628 19629 if (IS_VMLOANED_MBLK(md_pbuf_nxt) && !zcopy && 19630 zc_cap != NULL) { 19631 if (!ip_md_zcopy_attr(mmd, NULL, 19632 zc_cap->ill_zerocopy_flags)) { 19633 freeb(md_pbuf_nxt); 19634 TCP_STAT(tcp_mdt_allocfail); 19635 /* out_of_mem */ 19636 goto legacy_send; 19637 } 19638 zcopy = B_TRUE; 19639 } 19640 19641 /* 19642 * See comments above on the first call to 19643 * mmd_addpldbuf for explanation on the panic. 19644 */ 19645 pbuf_idx_nxt = mmd_addpldbuf(mmd, md_pbuf_nxt); 19646 if (pbuf_idx_nxt < 0) { 19647 panic("tcp_multisend: " 19648 "next payload buffer logic error " 19649 "detected for tcp %p mmd %p " 19650 "pbuf %p (%d)\n", 19651 (void *)tcp, (void *)mmd, 19652 (void *)md_pbuf_nxt, pbuf_idx_nxt); 19653 } 19654 19655 ASSERT(max_pld > 0); 19656 --max_pld; 19657 } else if (spill > 0) { 19658 /* 19659 * If there's a spillover, but the following 19660 * xmit_tail couldn't give us enough octets 19661 * to reach "len", then stop the current 19662 * Multidata creation and let the legacy 19663 * tcp_send() path take over. We don't want 19664 * to send the tiny segment as part of this 19665 * Multidata for performance reasons; instead, 19666 * we let the legacy path deal with grouping 19667 * it with the subsequent small mblks. 19668 */ 19669 if (*usable >= len && 19670 MBLKL((*xmit_tail)->b_cont) < spill) { 19671 max_pld = 0; 19672 break; /* done */ 19673 } 19674 19675 /* 19676 * We can't spillover, and we are near 19677 * the end of the current payload buffer, 19678 * so send what's left. 19679 */ 19680 ASSERT(*tail_unsent > 0); 19681 len = *tail_unsent; 19682 } 19683 19684 /* tail_unsent is negated if there is a spillover */ 19685 *tail_unsent -= len; 19686 *usable -= len; 19687 ASSERT(*usable >= 0); 19688 19689 if (*usable < mss) 19690 seg_len = *usable; 19691 /* 19692 * Sender SWS avoidance; see comments in tcp_send(); 19693 * everything else is the same, except that we only 19694 * do this here if there is no more data to be sent 19695 * following the current xmit_tail. We don't check 19696 * for 1-byte urgent data because we shouldn't get 19697 * here if TCP_URG_VALID is set. 
19698 */ 19699 if (*usable > 0 && *usable < mss && 19700 ((md_pbuf_nxt == NULL && 19701 (*xmit_tail)->b_cont == NULL) || 19702 (md_pbuf_nxt != NULL && 19703 (*xmit_tail)->b_cont->b_cont == NULL)) && 19704 seg_len < (tcp->tcp_max_swnd >> 1) && 19705 (tcp->tcp_unsent - 19706 ((*snxt + len) - tcp->tcp_snxt)) > seg_len && 19707 !tcp->tcp_zero_win_probe) { 19708 if ((*snxt + len) == tcp->tcp_snxt && 19709 (*snxt + len) == tcp->tcp_suna) { 19710 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 19711 } 19712 done = B_TRUE; 19713 } 19714 19715 /* 19716 * Prime pump for IP's checksumming on our behalf; 19717 * include the adjustment for a source route if any. 19718 * Do this only for software/partial hardware checksum 19719 * offload, as this field gets zeroed out later for 19720 * the full hardware checksum offload case. 19721 */ 19722 if (!(hwcksum_flags & HCK_FULLCKSUM)) { 19723 cksum = len + tcp_tcp_hdr_len + tcp->tcp_sum; 19724 cksum = (cksum >> 16) + (cksum & 0xFFFF); 19725 U16_TO_ABE16(cksum, tcp->tcp_tcph->th_sum); 19726 } 19727 19728 U32_TO_ABE32(*snxt, tcp->tcp_tcph->th_seq); 19729 *snxt += len; 19730 19731 tcp->tcp_tcph->th_flags[0] = TH_ACK; 19732 /* 19733 * We set the PUSH bit only if TCP has no more buffered 19734 * data to be transmitted (or if sender SWS avoidance 19735 * takes place), as opposed to setting it for every 19736 * last packet in the burst. 19737 */ 19738 if (done || 19739 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) == 0) 19740 tcp->tcp_tcph->th_flags[0] |= TH_PUSH; 19741 19742 /* 19743 * Set FIN bit if this is our last segment; snxt 19744 * already includes its length, and it will not 19745 * be adjusted after this point. 19746 */ 19747 if (tcp->tcp_valid_bits == TCP_FSS_VALID && 19748 *snxt == tcp->tcp_fss) { 19749 if (!tcp->tcp_fin_acked) { 19750 tcp->tcp_tcph->th_flags[0] |= TH_FIN; 19751 BUMP_MIB(&tcp_mib, tcpOutControl); 19752 } 19753 if (!tcp->tcp_fin_sent) { 19754 tcp->tcp_fin_sent = B_TRUE; 19755 /* 19756 * tcp state must be ESTABLISHED 19757 * in order for us to get here in 19758 * the first place. 19759 */ 19760 tcp->tcp_state = TCPS_FIN_WAIT_1; 19761 19762 /* 19763 * Upon returning from this routine, 19764 * tcp_wput_data() will set tcp_snxt 19765 * to be equal to snxt + tcp_fin_sent. 19766 * This is essentially the same as 19767 * setting it to tcp_fss + 1. 
19768 */ 19769 } 19770 } 19771 19772 tcp->tcp_last_sent_len = (ushort_t)len; 19773 19774 len += tcp_hdr_len; 19775 if (tcp->tcp_ipversion == IPV4_VERSION) 19776 tcp->tcp_ipha->ipha_length = htons(len); 19777 else 19778 tcp->tcp_ip6h->ip6_plen = htons(len - 19779 ((char *)&tcp->tcp_ip6h[1] - 19780 tcp->tcp_iphc)); 19781 19782 pkt_info->flags = (PDESC_HBUF_REF | PDESC_PBUF_REF); 19783 19784 /* setup header fragment */ 19785 PDESC_HDR_ADD(pkt_info, 19786 md_hbuf->b_rptr + cur_hdr_off, /* base */ 19787 tcp->tcp_mdt_hdr_head, /* head room */ 19788 tcp_hdr_len, /* len */ 19789 tcp->tcp_mdt_hdr_tail); /* tail room */ 19790 19791 ASSERT(pkt_info->hdr_lim - pkt_info->hdr_base == 19792 hdr_frag_sz); 19793 ASSERT(MBLKIN(md_hbuf, 19794 (pkt_info->hdr_base - md_hbuf->b_rptr), 19795 PDESC_HDRSIZE(pkt_info))); 19796 19797 /* setup first payload fragment */ 19798 PDESC_PLD_INIT(pkt_info); 19799 PDESC_PLD_SPAN_ADD(pkt_info, 19800 pbuf_idx, /* index */ 19801 md_pbuf->b_rptr + cur_pld_off, /* start */ 19802 tcp->tcp_last_sent_len); /* len */ 19803 19804 /* create a split-packet in case of a spillover */ 19805 if (md_pbuf_nxt != NULL) { 19806 ASSERT(spill > 0); 19807 ASSERT(pbuf_idx_nxt > pbuf_idx); 19808 ASSERT(!add_buffer); 19809 19810 md_pbuf = md_pbuf_nxt; 19811 md_pbuf_nxt = NULL; 19812 pbuf_idx = pbuf_idx_nxt; 19813 pbuf_idx_nxt = -1; 19814 cur_pld_off = spill; 19815 19816 /* trim out first payload fragment */ 19817 PDESC_PLD_SPAN_TRIM(pkt_info, 0, spill); 19818 19819 /* setup second payload fragment */ 19820 PDESC_PLD_SPAN_ADD(pkt_info, 19821 pbuf_idx, /* index */ 19822 md_pbuf->b_rptr, /* start */ 19823 spill); /* len */ 19824 19825 if ((*xmit_tail)->b_next == NULL) { 19826 /* 19827 * Store the lbolt used for RTT 19828 * estimation. We can only record one 19829 * timestamp per mblk so we do it when 19830 * we reach the end of the payload 19831 * buffer. Also we only take a new 19832 * timestamp sample when the previous 19833 * timed data from the same mblk has 19834 * been ack'ed. 19835 */ 19836 (*xmit_tail)->b_prev = local_time; 19837 (*xmit_tail)->b_next = 19838 (mblk_t *)(uintptr_t)first_snxt; 19839 } 19840 19841 first_snxt = *snxt - spill; 19842 19843 /* 19844 * Advance xmit_tail; usable could be 0 by 19845 * the time we got here, but we made sure 19846 * above that we would only spillover to 19847 * the next data block if usable includes 19848 * the spilled-over amount prior to the 19849 * subtraction. Therefore, we are sure 19850 * that xmit_tail->b_cont can't be NULL. 19851 */ 19852 ASSERT((*xmit_tail)->b_cont != NULL); 19853 *xmit_tail = (*xmit_tail)->b_cont; 19854 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 19855 (uintptr_t)INT_MAX); 19856 *tail_unsent = (int)MBLKL(*xmit_tail) - spill; 19857 } else { 19858 cur_pld_off += tcp->tcp_last_sent_len; 19859 } 19860 19861 /* 19862 * Fill in the header using the template header, and 19863 * add options such as time-stamp, ECN and/or SACK, 19864 * as needed. 19865 */ 19866 tcp_fill_header(tcp, pkt_info->hdr_rptr, 19867 (clock_t)local_time, num_sack_blk); 19868 19869 /* take care of some IP header businesses */ 19870 if (af == AF_INET) { 19871 ipha = (ipha_t *)pkt_info->hdr_rptr; 19872 19873 ASSERT(OK_32PTR((uchar_t *)ipha)); 19874 ASSERT(PDESC_HDRL(pkt_info) >= 19875 IP_SIMPLE_HDR_LENGTH); 19876 ASSERT(ipha->ipha_version_and_hdr_length == 19877 IP_SIMPLE_HDR_VERSION); 19878 19879 /* 19880 * Assign ident value for current packet; see 19881 * related comments in ip_wput_ire() about the 19882 * contract private interface with clustering 19883 * group. 
19884 */ 19885 clusterwide = B_FALSE; 19886 if (cl_inet_ipident != NULL) { 19887 ASSERT(cl_inet_isclusterwide != NULL); 19888 if ((*cl_inet_isclusterwide)(IPPROTO_IP, 19889 AF_INET, 19890 (uint8_t *)(uintptr_t)src)) { 19891 ipha->ipha_ident = 19892 (*cl_inet_ipident) 19893 (IPPROTO_IP, AF_INET, 19894 (uint8_t *)(uintptr_t)src, 19895 (uint8_t *)(uintptr_t)dst); 19896 clusterwide = B_TRUE; 19897 } 19898 } 19899 19900 if (!clusterwide) { 19901 ipha->ipha_ident = (uint16_t) 19902 atomic_add_32_nv( 19903 &ire->ire_ident, 1); 19904 } 19905 #ifndef _BIG_ENDIAN 19906 ipha->ipha_ident = (ipha->ipha_ident << 8) | 19907 (ipha->ipha_ident >> 8); 19908 #endif 19909 } else { 19910 ip6h = (ip6_t *)pkt_info->hdr_rptr; 19911 19912 ASSERT(OK_32PTR((uchar_t *)ip6h)); 19913 ASSERT(IPVER(ip6h) == IPV6_VERSION); 19914 ASSERT(ip6h->ip6_nxt == IPPROTO_TCP); 19915 ASSERT(PDESC_HDRL(pkt_info) >= 19916 (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET + 19917 TCP_CHECKSUM_SIZE)); 19918 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 19919 19920 if (tcp->tcp_ip_forward_progress) { 19921 rconfirm = B_TRUE; 19922 tcp->tcp_ip_forward_progress = B_FALSE; 19923 } 19924 } 19925 19926 /* at least one payload span, and at most two */ 19927 ASSERT(pkt_info->pld_cnt > 0 && pkt_info->pld_cnt < 3); 19928 19929 /* add the packet descriptor to Multidata */ 19930 if ((pkt = mmd_addpdesc(mmd, pkt_info, &err, 19931 KM_NOSLEEP)) == NULL) { 19932 /* 19933 * Any failure other than ENOMEM indicates 19934 * that we have passed in invalid pkt_info 19935 * or parameters to mmd_addpdesc, which must 19936 * not happen. 19937 * 19938 * EINVAL is a result of failure on boundary 19939 * checks against the pkt_info contents. It 19940 * should not happen, and we panic because 19941 * either there's horrible heap corruption, 19942 * and/or programming mistake. 
19943 */ 19944 if (err != ENOMEM) { 19945 cmn_err(CE_PANIC, "tcp_multisend: " 19946 "pdesc logic error detected for " 19947 "tcp %p mmd %p pinfo %p (%d)\n", 19948 (void *)tcp, (void *)mmd, 19949 (void *)pkt_info, err); 19950 } 19951 TCP_STAT(tcp_mdt_addpdescfail); 19952 goto legacy_send; /* out_of_mem */ 19953 } 19954 ASSERT(pkt != NULL); 19955 19956 /* calculate IP header and TCP checksums */ 19957 if (af == AF_INET) { 19958 /* calculate pseudo-header checksum */ 19959 cksum = (dst >> 16) + (dst & 0xFFFF) + 19960 (src >> 16) + (src & 0xFFFF); 19961 19962 /* offset for TCP header checksum */ 19963 up = IPH_TCPH_CHECKSUMP(ipha, 19964 IP_SIMPLE_HDR_LENGTH); 19965 } else { 19966 up = (uint16_t *)&ip6h->ip6_src; 19967 19968 /* calculate pseudo-header checksum */ 19969 cksum = up[0] + up[1] + up[2] + up[3] + 19970 up[4] + up[5] + up[6] + up[7] + 19971 up[8] + up[9] + up[10] + up[11] + 19972 up[12] + up[13] + up[14] + up[15]; 19973 19974 /* Fold the initial sum */ 19975 cksum = (cksum & 0xffff) + (cksum >> 16); 19976 19977 up = (uint16_t *)(((uchar_t *)ip6h) + 19978 IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET); 19979 } 19980 19981 if (hwcksum_flags & HCK_FULLCKSUM) { 19982 /* clear checksum field for hardware */ 19983 *up = 0; 19984 } else if (hwcksum_flags & HCK_PARTIALCKSUM) { 19985 uint32_t sum; 19986 19987 /* pseudo-header checksumming */ 19988 sum = *up + cksum + IP_TCP_CSUM_COMP; 19989 sum = (sum & 0xFFFF) + (sum >> 16); 19990 *up = (sum & 0xFFFF) + (sum >> 16); 19991 } else { 19992 /* software checksumming */ 19993 TCP_STAT(tcp_out_sw_cksum); 19994 TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, 19995 tcp->tcp_hdr_len + tcp->tcp_last_sent_len); 19996 *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len, 19997 cksum + IP_TCP_CSUM_COMP); 19998 if (*up == 0) 19999 *up = 0xFFFF; 20000 } 20001 20002 /* IPv4 header checksum */ 20003 if (af == AF_INET) { 20004 ipha->ipha_fragment_offset_and_flags |= 20005 (uint32_t)htons(ire->ire_frag_flag); 20006 20007 if (hwcksum_flags & HCK_IPV4_HDRCKSUM) { 20008 ipha->ipha_hdr_checksum = 0; 20009 } else { 20010 IP_HDR_CKSUM(ipha, cksum, 20011 ((uint32_t *)ipha)[0], 20012 ((uint16_t *)ipha)[4]); 20013 } 20014 } 20015 20016 /* advance header offset */ 20017 cur_hdr_off += hdr_frag_sz; 20018 20019 obbytes += tcp->tcp_last_sent_len; 20020 ++obsegs; 20021 } while (!done && *usable > 0 && --num_burst_seg > 0 && 20022 *tail_unsent > 0); 20023 20024 if ((*xmit_tail)->b_next == NULL) { 20025 /* 20026 * Store the lbolt used for RTT estimation. We can only 20027 * record one timestamp per mblk so we do it when we 20028 * reach the end of the payload buffer. Also we only 20029 * take a new timestamp sample when the previous timed 20030 * data from the same mblk has been ack'ed. 20031 */ 20032 (*xmit_tail)->b_prev = local_time; 20033 (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)first_snxt; 20034 } 20035 20036 ASSERT(*tail_unsent >= 0); 20037 if (*tail_unsent > 0) { 20038 /* 20039 * We got here because we broke out of the above 20040 * loop due to of one of the following cases: 20041 * 20042 * 1. len < adjusted MSS (i.e. small), 20043 * 2. Sender SWS avoidance, 20044 * 3. max_pld is zero. 20045 * 20046 * We are done for this Multidata, so trim our 20047 * last payload buffer (if any) accordingly. 
20048 */ 20049 if (md_pbuf != NULL) 20050 md_pbuf->b_wptr -= *tail_unsent; 20051 } else if (*usable > 0) { 20052 *xmit_tail = (*xmit_tail)->b_cont; 20053 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 20054 (uintptr_t)INT_MAX); 20055 *tail_unsent = (int)MBLKL(*xmit_tail); 20056 add_buffer = B_TRUE; 20057 } 20058 } while (!done && *usable > 0 && num_burst_seg > 0 && 20059 (tcp_mdt_chain || max_pld > 0)); 20060 20061 /* send everything down */ 20062 tcp_multisend_data(tcp, ire, ill, md_mp_head, obsegs, obbytes, 20063 &rconfirm); 20064 20065 #undef PREP_NEW_MULTIDATA 20066 #undef PREP_NEW_PBUF 20067 #undef IPVER 20068 20069 IRE_REFRELE(ire); 20070 return (0); 20071 } 20072 20073 /* 20074 * A wrapper function for sending one or more Multidata messages down to 20075 * the module below ip; this routine does not release the reference of the 20076 * IRE (caller does that). This routine is analogous to tcp_send_data(). 20077 */ 20078 static void 20079 tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, 20080 const uint_t obsegs, const uint_t obbytes, boolean_t *rconfirm) 20081 { 20082 uint64_t delta; 20083 nce_t *nce; 20084 20085 ASSERT(ire != NULL && ill != NULL); 20086 ASSERT(ire->ire_stq != NULL); 20087 ASSERT(md_mp_head != NULL); 20088 ASSERT(rconfirm != NULL); 20089 20090 /* adjust MIBs and IRE timestamp */ 20091 TCP_RECORD_TRACE(tcp, md_mp_head, TCP_TRACE_SEND_PKT); 20092 tcp->tcp_obsegs += obsegs; 20093 UPDATE_MIB(&tcp_mib, tcpOutDataSegs, obsegs); 20094 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, obbytes); 20095 TCP_STAT_UPDATE(tcp_mdt_pkt_out, obsegs); 20096 20097 if (tcp->tcp_ipversion == IPV4_VERSION) { 20098 TCP_STAT_UPDATE(tcp_mdt_pkt_out_v4, obsegs); 20099 UPDATE_MIB(&ip_mib, ipOutRequests, obsegs); 20100 } else { 20101 TCP_STAT_UPDATE(tcp_mdt_pkt_out_v6, obsegs); 20102 UPDATE_MIB(&ip6_mib, ipv6OutRequests, obsegs); 20103 } 20104 20105 ire->ire_ob_pkt_count += obsegs; 20106 if (ire->ire_ipif != NULL) 20107 atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, obsegs); 20108 ire->ire_last_used_time = lbolt; 20109 20110 /* send it down */ 20111 putnext(ire->ire_stq, md_mp_head); 20112 20113 /* we're done for TCP/IPv4 */ 20114 if (tcp->tcp_ipversion == IPV4_VERSION) 20115 return; 20116 20117 nce = ire->ire_nce; 20118 20119 ASSERT(nce != NULL); 20120 ASSERT(!(nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT))); 20121 ASSERT(nce->nce_state != ND_INCOMPLETE); 20122 20123 /* reachability confirmation? */ 20124 if (*rconfirm) { 20125 nce->nce_last = TICK_TO_MSEC(lbolt64); 20126 if (nce->nce_state != ND_REACHABLE) { 20127 mutex_enter(&nce->nce_lock); 20128 nce->nce_state = ND_REACHABLE; 20129 nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; 20130 mutex_exit(&nce->nce_lock); 20131 (void) untimeout(nce->nce_timeout_id); 20132 if (ip_debug > 2) { 20133 /* ip1dbg */ 20134 pr_addr_dbg("tcp_multisend_data: state " 20135 "for %s changed to REACHABLE\n", 20136 AF_INET6, &ire->ire_addr_v6); 20137 } 20138 } 20139 /* reset transport reachability confirmation */ 20140 *rconfirm = B_FALSE; 20141 } 20142 20143 delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; 20144 ip1dbg(("tcp_multisend_data: delta = %" PRId64 20145 " ill_reachable_time = %d \n", delta, ill->ill_reachable_time)); 20146 20147 if (delta > (uint64_t)ill->ill_reachable_time) { 20148 mutex_enter(&nce->nce_lock); 20149 switch (nce->nce_state) { 20150 case ND_REACHABLE: 20151 case ND_STALE: 20152 /* 20153 * ND_REACHABLE is identical to ND_STALE in this 20154 * specific case. 
If reachable time has expired for 20155 * this neighbor (delta is greater than reachable 20156 * time), conceptually, the neighbor cache is no 20157 * longer in REACHABLE state, but already in STALE 20158 * state. So the correct transition here is to 20159 * ND_DELAY. 20160 */ 20161 nce->nce_state = ND_DELAY; 20162 mutex_exit(&nce->nce_lock); 20163 NDP_RESTART_TIMER(nce, delay_first_probe_time); 20164 if (ip_debug > 3) { 20165 /* ip2dbg */ 20166 pr_addr_dbg("tcp_multisend_data: state " 20167 "for %s changed to DELAY\n", 20168 AF_INET6, &ire->ire_addr_v6); 20169 } 20170 break; 20171 case ND_DELAY: 20172 case ND_PROBE: 20173 mutex_exit(&nce->nce_lock); 20174 /* Timers have already started */ 20175 break; 20176 case ND_UNREACHABLE: 20177 /* 20178 * ndp timer has detected that this nce is 20179 * unreachable and initiated deleting this nce 20180 * and all its associated IREs. This is a race 20181 * where we found the ire before it was deleted 20182 * and have just sent out a packet using this 20183 * unreachable nce. 20184 */ 20185 mutex_exit(&nce->nce_lock); 20186 break; 20187 default: 20188 ASSERT(0); 20189 } 20190 } 20191 } 20192 20193 /* 20194 * tcp_send() is called by tcp_wput_data() for non-Multidata transmission 20195 * scheme, and returns one of the following: 20196 * 20197 * -1 = failed allocation. 20198 * 0 = success; burst count reached, or usable send window is too small, 20199 * and that we'd rather wait until later before sending again. 20200 * 1 = success; we are called from tcp_multisend(), and both usable send 20201 * window and tail_unsent are greater than the MDT threshold, and thus 20202 * Multidata Transmit should be used instead. 20203 */ 20204 static int 20205 tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, 20206 const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, 20207 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 20208 const int mdt_thres) 20209 { 20210 int num_burst_seg = tcp->tcp_snd_burst; 20211 20212 for (;;) { 20213 struct datab *db; 20214 tcph_t *tcph; 20215 uint32_t sum; 20216 mblk_t *mp, *mp1; 20217 uchar_t *rptr; 20218 int len; 20219 20220 /* 20221 * If we're called by tcp_multisend(), and the amount of 20222 * sendable data as well as the size of current xmit_tail 20223 * is beyond the MDT threshold, return to the caller and 20224 * let the large data transmit be done using MDT. 20225 */ 20226 if (*usable > 0 && *usable > mdt_thres && 20227 (*tail_unsent > mdt_thres || (*tail_unsent == 0 && 20228 MBLKL((*xmit_tail)->b_cont) > mdt_thres))) { 20229 ASSERT(tcp->tcp_mdt); 20230 return (1); /* success; do large send */ 20231 } 20232 20233 if (num_burst_seg-- == 0) 20234 break; /* success; burst count reached */ 20235 20236 len = mss; 20237 if (len > *usable) { 20238 len = *usable; 20239 if (len <= 0) { 20240 /* Terminate the loop */ 20241 break; /* success; too small */ 20242 } 20243 /* 20244 * Sender silly-window avoidance. 20245 * Ignore this if we are going to send a 20246 * zero window probe out. 20247 * 20248 * TODO: force data into microscopic window? 20249 * ==> (!pushed || (unsent > usable)) 20250 */ 20251 if (len < (tcp->tcp_max_swnd >> 1) && 20252 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len && 20253 !((tcp->tcp_valid_bits & TCP_URG_VALID) && 20254 len == 1) && (! tcp->tcp_zero_win_probe)) { 20255 /* 20256 * If the retransmit timer is not running 20257 * we start it so that we will retransmit 20258 * in the case when the the receiver has 20259 * decremented the window. 
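 *
 * (Presumably this is cheap insurance: if the peer never reopens
 * its window, the retransmit timeout, rather than this send path,
 * eventually becomes responsible for probing it.)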
20260 */ 20261 if (*snxt == tcp->tcp_snxt && 20262 *snxt == tcp->tcp_suna) { 20263 /* 20264 * We are not supposed to send 20265 * anything. So let's wait a little 20266 * bit longer before breaking SWS 20267 * avoidance. 20268 * 20269 * What should the value be? 20270 * Suggestion: MAX(init rexmit time, 20271 * tcp->tcp_rto) 20272 */ 20273 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 20274 } 20275 break; /* success; too small */ 20276 } 20277 } 20278 20279 tcph = tcp->tcp_tcph; 20280 20281 *usable -= len; /* Approximate - can be adjusted later */ 20282 if (*usable > 0) 20283 tcph->th_flags[0] = TH_ACK; 20284 else 20285 tcph->th_flags[0] = (TH_ACK | TH_PUSH); 20286 20287 /* 20288 * Prime pump for IP's checksumming on our behalf 20289 * Include the adjustment for a source route if any. 20290 */ 20291 sum = len + tcp_tcp_hdr_len + tcp->tcp_sum; 20292 sum = (sum >> 16) + (sum & 0xFFFF); 20293 U16_TO_ABE16(sum, tcph->th_sum); 20294 20295 U32_TO_ABE32(*snxt, tcph->th_seq); 20296 20297 /* 20298 * Branch off to tcp_xmit_mp() if any of the VALID bits is 20299 * set. For the case when TCP_FSS_VALID is the only valid 20300 * bit (normal active close), branch off only when we think 20301 * that the FIN flag needs to be set. Note for this case, 20302 * that (snxt + len) may not reflect the actual seg_len, 20303 * as len may be further reduced in tcp_xmit_mp(). If len 20304 * gets modified, we will end up here again. 20305 */ 20306 if (tcp->tcp_valid_bits != 0 && 20307 (tcp->tcp_valid_bits != TCP_FSS_VALID || 20308 ((*snxt + len) == tcp->tcp_fss))) { 20309 uchar_t *prev_rptr; 20310 uint32_t prev_snxt = tcp->tcp_snxt; 20311 20312 if (*tail_unsent == 0) { 20313 ASSERT((*xmit_tail)->b_cont != NULL); 20314 *xmit_tail = (*xmit_tail)->b_cont; 20315 prev_rptr = (*xmit_tail)->b_rptr; 20316 *tail_unsent = (int)((*xmit_tail)->b_wptr - 20317 (*xmit_tail)->b_rptr); 20318 } else { 20319 prev_rptr = (*xmit_tail)->b_rptr; 20320 (*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr - 20321 *tail_unsent; 20322 } 20323 mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL, 20324 *snxt, B_FALSE, (uint32_t *)&len, B_FALSE); 20325 /* Restore tcp_snxt so we get amount sent right. */ 20326 tcp->tcp_snxt = prev_snxt; 20327 if (prev_rptr == (*xmit_tail)->b_rptr) { 20328 /* 20329 * If the previous timestamp is still in use, 20330 * don't stomp on it. 20331 */ 20332 if ((*xmit_tail)->b_next == NULL) { 20333 (*xmit_tail)->b_prev = local_time; 20334 (*xmit_tail)->b_next = 20335 (mblk_t *)(uintptr_t)(*snxt); 20336 } 20337 } else 20338 (*xmit_tail)->b_rptr = prev_rptr; 20339 20340 if (mp == NULL) 20341 return (-1); 20342 mp1 = mp->b_cont; 20343 20344 tcp->tcp_last_sent_len = (ushort_t)len; 20345 while (mp1->b_cont) { 20346 *xmit_tail = (*xmit_tail)->b_cont; 20347 (*xmit_tail)->b_prev = local_time; 20348 (*xmit_tail)->b_next = 20349 (mblk_t *)(uintptr_t)(*snxt); 20350 mp1 = mp1->b_cont; 20351 } 20352 *snxt += len; 20353 *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; 20354 BUMP_LOCAL(tcp->tcp_obsegs); 20355 BUMP_MIB(&tcp_mib, tcpOutDataSegs); 20356 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); 20357 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 20358 tcp_send_data(tcp, q, mp); 20359 continue; 20360 } 20361 20362 *snxt += len; /* Adjust later if we don't send all of len */ 20363 BUMP_MIB(&tcp_mib, tcpOutDataSegs); 20364 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); 20365 20366 if (*tail_unsent) { 20367 /* Are the bytes above us in flight? 
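 * (b_wptr - *tail_unsent) is the first unsent byte of this mblk.
 * If that is not b_rptr, the leading bytes of the mblk belong to
 * segments already handed to IP and possibly still unacknowledged,
 * so we must not build a header in front of them in place; we
 * dupb() the block and take the must_alloc path to get a separate
 * header mblk instead.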
*/ 20368 rptr = (*xmit_tail)->b_wptr - *tail_unsent; 20369 if (rptr != (*xmit_tail)->b_rptr) { 20370 *tail_unsent -= len; 20371 tcp->tcp_last_sent_len = (ushort_t)len; 20372 len += tcp_hdr_len; 20373 if (tcp->tcp_ipversion == IPV4_VERSION) 20374 tcp->tcp_ipha->ipha_length = htons(len); 20375 else 20376 tcp->tcp_ip6h->ip6_plen = 20377 htons(len - 20378 ((char *)&tcp->tcp_ip6h[1] - 20379 tcp->tcp_iphc)); 20380 mp = dupb(*xmit_tail); 20381 if (!mp) 20382 return (-1); /* out_of_mem */ 20383 mp->b_rptr = rptr; 20384 /* 20385 * If the old timestamp is no longer in use, 20386 * sample a new timestamp now. 20387 */ 20388 if ((*xmit_tail)->b_next == NULL) { 20389 (*xmit_tail)->b_prev = local_time; 20390 (*xmit_tail)->b_next = 20391 (mblk_t *)(uintptr_t)(*snxt-len); 20392 } 20393 goto must_alloc; 20394 } 20395 } else { 20396 *xmit_tail = (*xmit_tail)->b_cont; 20397 ASSERT((uintptr_t)((*xmit_tail)->b_wptr - 20398 (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX); 20399 *tail_unsent = (int)((*xmit_tail)->b_wptr - 20400 (*xmit_tail)->b_rptr); 20401 } 20402 20403 (*xmit_tail)->b_prev = local_time; 20404 (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len); 20405 20406 *tail_unsent -= len; 20407 tcp->tcp_last_sent_len = (ushort_t)len; 20408 20409 len += tcp_hdr_len; 20410 if (tcp->tcp_ipversion == IPV4_VERSION) 20411 tcp->tcp_ipha->ipha_length = htons(len); 20412 else 20413 tcp->tcp_ip6h->ip6_plen = htons(len - 20414 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 20415 20416 mp = dupb(*xmit_tail); 20417 if (!mp) 20418 return (-1); /* out_of_mem */ 20419 20420 len = tcp_hdr_len; 20421 /* 20422 * There are four reasons to allocate a new hdr mblk: 20423 * 1) The bytes above us are in use by another packet 20424 * 2) We don't have good alignment 20425 * 3) The mblk is being shared 20426 * 4) We don't have enough room for a header 20427 */ 20428 rptr = mp->b_rptr - len; 20429 if (!OK_32PTR(rptr) || 20430 ((db = mp->b_datap), db->db_ref != 2) || 20431 rptr < db->db_base) { 20432 /* NOTE: we assume allocb returns an OK_32PTR */ 20433 20434 must_alloc:; 20435 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 20436 tcp_wroff_xtra, BPRI_MED); 20437 if (!mp1) { 20438 freemsg(mp); 20439 return (-1); /* out_of_mem */ 20440 } 20441 mp1->b_cont = mp; 20442 mp = mp1; 20443 /* Leave room for Link Level header */ 20444 len = tcp_hdr_len; 20445 rptr = &mp->b_rptr[tcp_wroff_xtra]; 20446 mp->b_wptr = &rptr[len]; 20447 } 20448 20449 /* 20450 * Fill in the header using the template header, and add 20451 * options such as time-stamp, ECN and/or SACK, as needed. 20452 */ 20453 tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); 20454 20455 mp->b_rptr = rptr; 20456 20457 if (*tail_unsent) { 20458 int spill = *tail_unsent; 20459 20460 mp1 = mp->b_cont; 20461 if (!mp1) 20462 mp1 = mp; 20463 20464 /* 20465 * If we're a little short, tack on more mblks until 20466 * there is no more spillover. 20467 */ 20468 while (spill < 0) { 20469 mblk_t *nmp; 20470 int nmpsz; 20471 20472 nmp = (*xmit_tail)->b_cont; 20473 nmpsz = MBLKL(nmp); 20474 20475 /* 20476 * Excess data in mblk; can we split it? 20477 * If MDT is enabled for the connection, 20478 * keep on splitting as this is a transient 20479 * send path. 20480 */ 20481 if (!tcp->tcp_mdt && (spill + nmpsz > 0)) { 20482 /* 20483 * Don't split if stream head was 20484 * told to break up larger writes 20485 * into smaller ones. 
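 * A positive tcp_maxpsz means tcp_maxpsz_set() has programmed a
 * finite maxpsz at the stream head, i.e. writes are presumably
 * already being chopped up for us, so we honor that and leave the
 * mblk alone here.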
20486 */ 20487 if (tcp->tcp_maxpsz > 0) 20488 break; 20489 20490 /* 20491 * Next mblk is less than SMSS/2 20492 * rounded up to nearest 64-byte; 20493 * let it get sent as part of the 20494 * next segment. 20495 */ 20496 if (tcp->tcp_localnet && 20497 !tcp->tcp_cork && 20498 (nmpsz < roundup((mss >> 1), 64))) 20499 break; 20500 } 20501 20502 *xmit_tail = nmp; 20503 ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX); 20504 /* Stash for rtt use later */ 20505 (*xmit_tail)->b_prev = local_time; 20506 (*xmit_tail)->b_next = 20507 (mblk_t *)(uintptr_t)(*snxt - len); 20508 mp1->b_cont = dupb(*xmit_tail); 20509 mp1 = mp1->b_cont; 20510 20511 spill += nmpsz; 20512 if (mp1 == NULL) { 20513 *tail_unsent = spill; 20514 freemsg(mp); 20515 return (-1); /* out_of_mem */ 20516 } 20517 } 20518 20519 /* Trim back any surplus on the last mblk */ 20520 if (spill >= 0) { 20521 mp1->b_wptr -= spill; 20522 *tail_unsent = spill; 20523 } else { 20524 /* 20525 * We did not send everything we could in 20526 * order to remain within the b_cont limit. 20527 */ 20528 *usable -= spill; 20529 *snxt += spill; 20530 tcp->tcp_last_sent_len += spill; 20531 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, spill); 20532 /* 20533 * Adjust the checksum 20534 */ 20535 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 20536 sum += spill; 20537 sum = (sum >> 16) + (sum & 0xFFFF); 20538 U16_TO_ABE16(sum, tcph->th_sum); 20539 if (tcp->tcp_ipversion == IPV4_VERSION) { 20540 sum = ntohs( 20541 ((ipha_t *)rptr)->ipha_length) + 20542 spill; 20543 ((ipha_t *)rptr)->ipha_length = 20544 htons(sum); 20545 } else { 20546 sum = ntohs( 20547 ((ip6_t *)rptr)->ip6_plen) + 20548 spill; 20549 ((ip6_t *)rptr)->ip6_plen = 20550 htons(sum); 20551 } 20552 *tail_unsent = 0; 20553 } 20554 } 20555 if (tcp->tcp_ip_forward_progress) { 20556 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 20557 *(uint32_t *)mp->b_rptr |= IP_FORWARD_PROG; 20558 tcp->tcp_ip_forward_progress = B_FALSE; 20559 } 20560 20561 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 20562 tcp_send_data(tcp, q, mp); 20563 BUMP_LOCAL(tcp->tcp_obsegs); 20564 } 20565 20566 return (0); 20567 } 20568 20569 /* Unlink and return any mblk that looks like it contains a MDT info */ 20570 static mblk_t * 20571 tcp_mdt_info_mp(mblk_t *mp) 20572 { 20573 mblk_t *prev_mp; 20574 20575 for (;;) { 20576 prev_mp = mp; 20577 /* no more to process? */ 20578 if ((mp = mp->b_cont) == NULL) 20579 break; 20580 20581 switch (DB_TYPE(mp)) { 20582 case M_CTL: 20583 if (*(uint32_t *)mp->b_rptr != MDT_IOC_INFO_UPDATE) 20584 continue; 20585 ASSERT(prev_mp != NULL); 20586 prev_mp->b_cont = mp->b_cont; 20587 mp->b_cont = NULL; 20588 return (mp); 20589 default: 20590 break; 20591 } 20592 } 20593 return (mp); 20594 } 20595 20596 /* MDT info update routine, called when IP notifies us about MDT */ 20597 static void 20598 tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first) 20599 { 20600 boolean_t prev_state; 20601 20602 /* 20603 * IP is telling us to abort MDT on this connection? We know 20604 * this because the capability is only turned off when IP 20605 * encounters some pathological cases, e.g. link-layer change 20606 * where the new driver doesn't support MDT, or in situation 20607 * where MDT usage on the link-layer has been switched off. 20608 * IP would not have sent us the initial MDT_IOC_INFO_UPDATE 20609 * if the link-layer doesn't support MDT, and if it does, it 20610 * will indicate that the feature is to be turned on. 
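 *
 * The code below therefore just latches ill_mdt_on into tcp_mdt,
 * remembers the previous state, and counts/logs the transition
 * when MDT is halted or resumed on an existing connection (the
 * !first cases).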
20611 */ 20612 prev_state = tcp->tcp_mdt; 20613 tcp->tcp_mdt = (mdt_capab->ill_mdt_on != 0); 20614 if (!tcp->tcp_mdt && !first) { 20615 TCP_STAT(tcp_mdt_conn_halted3); 20616 ip1dbg(("tcp_mdt_update: disabling MDT for connp %p\n", 20617 (void *)tcp->tcp_connp)); 20618 } 20619 20620 /* 20621 * We currently only support MDT on simple TCP/{IPv4,IPv6}, 20622 * so disable MDT otherwise. The checks are done here 20623 * and in tcp_wput_data(). 20624 */ 20625 if (tcp->tcp_mdt && 20626 (tcp->tcp_ipversion == IPV4_VERSION && 20627 tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || 20628 (tcp->tcp_ipversion == IPV6_VERSION && 20629 tcp->tcp_ip_hdr_len != IPV6_HDR_LEN)) 20630 tcp->tcp_mdt = B_FALSE; 20631 20632 if (tcp->tcp_mdt) { 20633 if (mdt_capab->ill_mdt_version != MDT_VERSION_2) { 20634 cmn_err(CE_NOTE, "tcp_mdt_update: unknown MDT " 20635 "version (%d), expected version is %d", 20636 mdt_capab->ill_mdt_version, MDT_VERSION_2); 20637 tcp->tcp_mdt = B_FALSE; 20638 return; 20639 } 20640 20641 /* 20642 * We need the driver to be able to handle at least three 20643 * spans per packet in order for tcp MDT to be utilized. 20644 * The first is for the header portion, while the rest are 20645 * needed to handle a packet that straddles across two 20646 * virtually non-contiguous buffers; a typical tcp packet 20647 * therefore consists of only two spans. Note that we take 20648 * a zero as "don't care". 20649 */ 20650 if (mdt_capab->ill_mdt_span_limit > 0 && 20651 mdt_capab->ill_mdt_span_limit < 3) { 20652 tcp->tcp_mdt = B_FALSE; 20653 return; 20654 } 20655 20656 /* a zero means driver wants default value */ 20657 tcp->tcp_mdt_max_pld = MIN(mdt_capab->ill_mdt_max_pld, 20658 tcp_mdt_max_pbufs); 20659 if (tcp->tcp_mdt_max_pld == 0) 20660 tcp->tcp_mdt_max_pld = tcp_mdt_max_pbufs; 20661 20662 /* ensure 32-bit alignment */ 20663 tcp->tcp_mdt_hdr_head = roundup(MAX(tcp_mdt_hdr_head_min, 20664 mdt_capab->ill_mdt_hdr_head), 4); 20665 tcp->tcp_mdt_hdr_tail = roundup(MAX(tcp_mdt_hdr_tail_min, 20666 mdt_capab->ill_mdt_hdr_tail), 4); 20667 20668 if (!first && !prev_state) { 20669 TCP_STAT(tcp_mdt_conn_resumed2); 20670 ip1dbg(("tcp_mdt_update: reenabling MDT for connp %p\n", 20671 (void *)tcp->tcp_connp)); 20672 } 20673 } 20674 } 20675 20676 static void 20677 tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_mdt) 20678 { 20679 conn_t *connp = tcp->tcp_connp; 20680 20681 ASSERT(ire != NULL); 20682 20683 /* 20684 * We may be in the fastpath here, and although we essentially do 20685 * similar checks as in ip_bind_connected{_v6}/ip_mdinfo_return, 20686 * we try to keep things as brief as possible. After all, these 20687 * are only best-effort checks, and we do more thorough ones prior 20688 * to calling tcp_multisend(). 20689 */ 20690 if (ip_multidata_outbound && check_mdt && 20691 !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 20692 ill != NULL && ILL_MDT_CAPABLE(ill) && 20693 !CONN_IPSEC_OUT_ENCAPSULATED(connp) && 20694 !(ire->ire_flags & RTF_MULTIRT) && 20695 !IPP_ENABLED(IPP_LOCAL_OUT) && 20696 CONN_IS_MD_FASTPATH(connp)) { 20697 /* Remember the result */ 20698 connp->conn_mdt_ok = B_TRUE; 20699 20700 ASSERT(ill->ill_mdt_capab != NULL); 20701 if (!ill->ill_mdt_capab->ill_mdt_on) { 20702 /* 20703 * If MDT has been previously turned off in the past, 20704 * and we currently can do MDT (due to IPQoS policy 20705 * removal, etc.) then enable it for this interface. 
20706 */ 20707 ill->ill_mdt_capab->ill_mdt_on = 1; 20708 ip1dbg(("tcp_ire_ill_check: connp %p enables MDT for " 20709 "interface %s\n", (void *)connp, ill->ill_name)); 20710 } 20711 tcp_mdt_update(tcp, ill->ill_mdt_capab, B_TRUE); 20712 } 20713 20714 /* 20715 * The goal is to reduce the number of generated tcp segments by 20716 * setting the maxpsz multiplier to 0; this will have an effect on 20717 * tcp_maxpsz_set(). With this behavior, tcp will pack more data 20718 * into each packet, up to SMSS bytes. Doing this reduces the number 20719 * of outbound segments and incoming ACKs, thus allowing for better 20720 * network and system performance. In contrast, the legacy behavior 20721 * may result in sending less than SMSS size, because the last mblk 20722 * for some packets may have more data than needed to make up SMSS, 20723 * and the legacy code refused to "split" it. 20724 * 20725 * We apply the new behavior in the following situations: 20726 * 20727 * 1) Loopback connections, 20728 * 2) Connections in which the remote peer is not on local subnet, 20729 * 3) Local subnet connections over the bge interface (see below). 20730 * 20731 * Ideally, we would like this behavior to apply to interfaces other 20732 * than bge. However, doing so would negatively impact drivers which 20733 * perform dynamic mapping and unmapping of DMA resources, which are 20734 * increased by setting the maxpsz multiplier to 0 (more mblks per 20735 * packet will be generated by tcp). The bge driver does not suffer 20736 * from this, as it copies the mblks into pre-mapped buffers, and 20737 * therefore does not require more I/O resources than before. 20738 * 20739 * Otherwise, this behavior is present on all network interfaces when 20740 * the destination endpoint is non-local, since reducing the number 20741 * of packets in general is good for the network. 20742 * 20743 * TODO We need to remove this hard-coded conditional for bge once 20744 * a better "self-tuning" mechanism, or a way to comprehend 20745 * the driver transmit strategy is devised. Until the solution 20746 * is found and well understood, we live with this hack. 20747 */ 20748 if (!tcp_static_maxpsz && 20749 (tcp->tcp_loopback || !tcp->tcp_localnet || 20750 (ill->ill_name_length > 3 && bcmp(ill->ill_name, "bge", 3) == 0))) { 20751 /* override the default value */ 20752 tcp->tcp_maxpsz = 0; 20753 20754 ip3dbg(("tcp_ire_ill_check: connp %p tcp_maxpsz %d on " 20755 "interface %s\n", (void *)connp, tcp->tcp_maxpsz, 20756 ill != NULL ? ill->ill_name : ipif_loopback_name)); 20757 } 20758 20759 /* set the stream head parameters accordingly */ 20760 (void) tcp_maxpsz_set(tcp, B_TRUE); 20761 } 20762 20763 /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */ 20764 static void 20765 tcp_wput_flush(tcp_t *tcp, mblk_t *mp) 20766 { 20767 uchar_t fval = *mp->b_rptr; 20768 mblk_t *tail; 20769 queue_t *q = tcp->tcp_wq; 20770 20771 /* TODO: How should flush interact with urgent data? */ 20772 if ((fval & FLUSHW) && tcp->tcp_xmit_head && 20773 !(tcp->tcp_valid_bits & TCP_URG_VALID)) { 20774 /* 20775 * Flush only data that has not yet been put on the wire. If 20776 * we flush data that we have already transmitted, life, as we 20777 * know it, may come to an end.
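 * The unsent region starts tcp_xmit_tail_unsent bytes before the
 * tail mblk's b_wptr; below we pull that b_wptr back, free the
 * wholly-unsent mblks chained after it, zero tcp_unsent, and then
 * re-enable flow control if the queue had been marked full.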
20778 */ 20779 tail = tcp->tcp_xmit_tail; 20780 tail->b_wptr -= tcp->tcp_xmit_tail_unsent; 20781 tcp->tcp_xmit_tail_unsent = 0; 20782 tcp->tcp_unsent = 0; 20783 if (tail->b_wptr != tail->b_rptr) 20784 tail = tail->b_cont; 20785 if (tail) { 20786 mblk_t **excess = &tcp->tcp_xmit_head; 20787 for (;;) { 20788 mblk_t *mp1 = *excess; 20789 if (mp1 == tail) 20790 break; 20791 tcp->tcp_xmit_tail = mp1; 20792 tcp->tcp_xmit_last = mp1; 20793 excess = &mp1->b_cont; 20794 } 20795 *excess = NULL; 20796 tcp_close_mpp(&tail); 20797 if (tcp->tcp_snd_zcopy_aware) 20798 tcp_zcopy_notify(tcp); 20799 } 20800 /* 20801 * We have no unsent data, so unsent must be less than 20802 * tcp_xmit_lowater, so re-enable flow. 20803 */ 20804 if (tcp->tcp_flow_stopped) { 20805 tcp_clrqfull(tcp); 20806 } 20807 } 20808 /* 20809 * TODO: you can't just flush these, you have to increase rwnd for one 20810 * thing. For another, how should urgent data interact? 20811 */ 20812 if (fval & FLUSHR) { 20813 *mp->b_rptr = fval & ~FLUSHW; 20814 /* XXX */ 20815 qreply(q, mp); 20816 return; 20817 } 20818 freemsg(mp); 20819 } 20820 20821 /* 20822 * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA 20823 * messages. 20824 */ 20825 static void 20826 tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) 20827 { 20828 mblk_t *mp1; 20829 STRUCT_HANDLE(strbuf, sb); 20830 uint16_t port; 20831 queue_t *q = tcp->tcp_wq; 20832 in6_addr_t v6addr; 20833 ipaddr_t v4addr; 20834 uint32_t flowinfo = 0; 20835 int addrlen; 20836 20837 /* Make sure it is one of ours. */ 20838 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 20839 case TI_GETMYNAME: 20840 case TI_GETPEERNAME: 20841 break; 20842 default: 20843 CALL_IP_WPUT(tcp->tcp_connp, q, mp); 20844 return; 20845 } 20846 switch (mi_copy_state(q, mp, &mp1)) { 20847 case -1: 20848 return; 20849 case MI_COPY_CASE(MI_COPY_IN, 1): 20850 break; 20851 case MI_COPY_CASE(MI_COPY_OUT, 1): 20852 /* Copy out the strbuf. */ 20853 mi_copyout(q, mp); 20854 return; 20855 case MI_COPY_CASE(MI_COPY_OUT, 2): 20856 /* All done. */ 20857 mi_copy_done(q, mp, 0); 20858 return; 20859 default: 20860 mi_copy_done(q, mp, EPROTO); 20861 return; 20862 } 20863 /* Check alignment of the strbuf */ 20864 if (!OK_32PTR(mp1->b_rptr)) { 20865 mi_copy_done(q, mp, EINVAL); 20866 return; 20867 } 20868 20869 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 20870 (void *)mp1->b_rptr); 20871 addrlen = tcp->tcp_family == AF_INET ? 
sizeof (sin_t) : sizeof (sin6_t); 20872 20873 if (STRUCT_FGET(sb, maxlen) < addrlen) { 20874 mi_copy_done(q, mp, EINVAL); 20875 return; 20876 } 20877 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 20878 case TI_GETMYNAME: 20879 if (tcp->tcp_family == AF_INET) { 20880 if (tcp->tcp_ipversion == IPV4_VERSION) { 20881 v4addr = tcp->tcp_ipha->ipha_src; 20882 } else { 20883 /* can't return an address in this case */ 20884 v4addr = 0; 20885 } 20886 } else { 20887 /* tcp->tcp_family == AF_INET6 */ 20888 if (tcp->tcp_ipversion == IPV4_VERSION) { 20889 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 20890 &v6addr); 20891 } else { 20892 v6addr = tcp->tcp_ip6h->ip6_src; 20893 } 20894 } 20895 port = tcp->tcp_lport; 20896 break; 20897 case TI_GETPEERNAME: 20898 if (tcp->tcp_family == AF_INET) { 20899 if (tcp->tcp_ipversion == IPV4_VERSION) { 20900 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6, 20901 v4addr); 20902 } else { 20903 /* can't return an address in this case */ 20904 v4addr = 0; 20905 } 20906 } else { 20907 /* tcp->tcp_family == AF_INET6) */ 20908 v6addr = tcp->tcp_remote_v6; 20909 if (tcp->tcp_ipversion == IPV6_VERSION) { 20910 /* 20911 * No flowinfo if tcp->tcp_ipversion is v4. 20912 * 20913 * flowinfo was already initialized to zero 20914 * where it was declared above, so only 20915 * set it if ipversion is v6. 20916 */ 20917 flowinfo = tcp->tcp_ip6h->ip6_vcf & 20918 ~IPV6_VERS_AND_FLOW_MASK; 20919 } 20920 } 20921 port = tcp->tcp_fport; 20922 break; 20923 default: 20924 mi_copy_done(q, mp, EPROTO); 20925 return; 20926 } 20927 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 20928 if (!mp1) 20929 return; 20930 20931 if (tcp->tcp_family == AF_INET) { 20932 sin_t *sin; 20933 20934 STRUCT_FSET(sb, len, (int)sizeof (sin_t)); 20935 sin = (sin_t *)mp1->b_rptr; 20936 mp1->b_wptr = (uchar_t *)&sin[1]; 20937 *sin = sin_null; 20938 sin->sin_family = AF_INET; 20939 sin->sin_addr.s_addr = v4addr; 20940 sin->sin_port = port; 20941 } else { 20942 /* tcp->tcp_family == AF_INET6 */ 20943 sin6_t *sin6; 20944 20945 STRUCT_FSET(sb, len, (int)sizeof (sin6_t)); 20946 sin6 = (sin6_t *)mp1->b_rptr; 20947 mp1->b_wptr = (uchar_t *)&sin6[1]; 20948 *sin6 = sin6_null; 20949 sin6->sin6_family = AF_INET6; 20950 sin6->sin6_flowinfo = flowinfo; 20951 sin6->sin6_addr = v6addr; 20952 sin6->sin6_port = port; 20953 } 20954 /* Copy out the address */ 20955 mi_copyout(q, mp); 20956 } 20957 20958 /* 20959 * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL 20960 * messages. 20961 */ 20962 /* ARGSUSED */ 20963 static void 20964 tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) 20965 { 20966 conn_t *connp = (conn_t *)arg; 20967 tcp_t *tcp = connp->conn_tcp; 20968 queue_t *q = tcp->tcp_wq; 20969 struct iocblk *iocp; 20970 20971 ASSERT(DB_TYPE(mp) == M_IOCTL); 20972 /* 20973 * Try and ASSERT the minimum possible references on the 20974 * conn early enough. Since we are executing on write side, 20975 * the connection is obviously not detached and that means 20976 * there is a ref each for TCP and IP. Since we are behind 20977 * the squeue, the minimum references needed are 3. If the 20978 * conn is in classifier hash list, there should be an 20979 * extra ref for that (we check both the possibilities). 20980 */ 20981 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 20982 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 20983 20984 iocp = (struct iocblk *)mp->b_rptr; 20985 switch (iocp->ioc_cmd) { 20986 case TCP_IOC_DEFAULT_Q: 20987 /* Wants to be the default wq. 
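 * This is a privileged operation: secpolicy_net_config() must
 * succeed, otherwise the ioctl is acked back with EPERM below.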
*/ 20988 if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { 20989 iocp->ioc_error = EPERM; 20990 iocp->ioc_count = 0; 20991 mp->b_datap->db_type = M_IOCACK; 20992 qreply(q, mp); 20993 return; 20994 } 20995 tcp_def_q_set(tcp, mp); 20996 return; 20997 case _SIOCSOCKFALLBACK: 20998 /* 20999 * Either sockmod is about to be popped and the socket 21000 * would now be treated as a plain stream, or a module 21001 * is about to be pushed so we could no longer use read- 21002 * side synchronous streams for fused loopback tcp. 21003 * Drain any queued data and disable direct sockfs 21004 * interface from now on. 21005 */ 21006 if (!tcp->tcp_issocket) { 21007 DB_TYPE(mp) = M_IOCNAK; 21008 iocp->ioc_error = EINVAL; 21009 } else { 21010 #ifdef _ILP32 21011 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); 21012 #else 21013 tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; 21014 #endif 21015 /* 21016 * Insert this socket into the acceptor hash. 21017 * We might need it for T_CONN_RES message 21018 */ 21019 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 21020 21021 if (tcp->tcp_fused) { 21022 /* 21023 * This is a fused loopback tcp; disable 21024 * read-side synchronous streams interface 21025 * and drain any queued data. It is okay 21026 * to do this for non-synchronous streams 21027 * fused tcp as well. 21028 */ 21029 tcp_fuse_disable_pair(tcp, B_FALSE); 21030 } 21031 tcp->tcp_issocket = B_FALSE; 21032 TCP_STAT(tcp_sock_fallback); 21033 21034 DB_TYPE(mp) = M_IOCACK; 21035 iocp->ioc_error = 0; 21036 } 21037 iocp->ioc_count = 0; 21038 iocp->ioc_rval = 0; 21039 qreply(q, mp); 21040 return; 21041 } 21042 CALL_IP_WPUT(connp, q, mp); 21043 } 21044 21045 /* 21046 * This routine is called by tcp_wput() to handle all TPI requests. 21047 */ 21048 /* ARGSUSED */ 21049 static void 21050 tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) 21051 { 21052 conn_t *connp = (conn_t *)arg; 21053 tcp_t *tcp = connp->conn_tcp; 21054 union T_primitives *tprim = (union T_primitives *)mp->b_rptr; 21055 uchar_t *rptr; 21056 t_scalar_t type; 21057 int len; 21058 cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred); 21059 21060 /* 21061 * Try and ASSERT the minimum possible references on the 21062 * conn early enough. Since we are executing on write side, 21063 * the connection is obviously not detached and that means 21064 * there is a ref each for TCP and IP. Since we are behind 21065 * the squeue, the minimum references needed are 3. If the 21066 * conn is in classifier hash list, there should be an 21067 * extra ref for that (we check both the possibilities). 21068 */ 21069 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 21070 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 21071 21072 rptr = mp->b_rptr; 21073 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 21074 if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 21075 type = ((union T_primitives *)rptr)->type; 21076 if (type == T_EXDATA_REQ) { 21077 uint32_t msize = msgdsize(mp->b_cont); 21078 21079 len = msize - 1; 21080 if (len < 0) { 21081 freemsg(mp); 21082 return; 21083 } 21084 /* 21085 * Try to force urgent data out on the wire. 21086 * Even if we have unsent data this will 21087 * at least send the urgent flag. 21088 * XXX does not handle more flag correctly. 
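 * The arithmetic below leaves tcp_urg at
 * tcp_snxt + tcp_unsent + msgdsize(mp->b_cont) - 1, i.e. the
 * sequence number of the last byte of the urgent data being
 * queued; TCP_URG_VALID tells the output path to carry the URG
 * flag and urgent pointer.  (Hypothetical numbers for
 * illustration: snxt 1000, 100 bytes unsent, and a 1-byte urgent
 * write give tcp_urg = 1100, the sequence of that byte.)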
21089 */ 21090 len += tcp->tcp_unsent; 21091 len += tcp->tcp_snxt; 21092 tcp->tcp_urg = len; 21093 tcp->tcp_valid_bits |= TCP_URG_VALID; 21094 21095 /* Bypass tcp protocol for fused tcp loopback */ 21096 if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 21097 return; 21098 } else if (type != T_DATA_REQ) { 21099 goto non_urgent_data; 21100 } 21101 /* TODO: options, flags, ... from user */ 21102 /* Set length to zero for reclamation below */ 21103 tcp_wput_data(tcp, mp->b_cont, B_TRUE); 21104 freeb(mp); 21105 return; 21106 } else { 21107 if (tcp->tcp_debug) { 21108 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 21109 "tcp_wput_proto, dropping one..."); 21110 } 21111 freemsg(mp); 21112 return; 21113 } 21114 21115 non_urgent_data: 21116 21117 switch ((int)tprim->type) { 21118 case T_SSL_PROXY_BIND_REQ: /* an SSL proxy endpoint bind request */ 21119 /* 21120 * save the kssl_ent_t from the next block, and convert this 21121 * back to a normal bind_req. 21122 */ 21123 if (mp->b_cont != NULL) { 21124 ASSERT(MBLKL(mp->b_cont) >= sizeof (kssl_ent_t)); 21125 21126 if (tcp->tcp_kssl_ent != NULL) { 21127 kssl_release_ent(tcp->tcp_kssl_ent, NULL, 21128 KSSL_NO_PROXY); 21129 tcp->tcp_kssl_ent = NULL; 21130 } 21131 bcopy(mp->b_cont->b_rptr, &tcp->tcp_kssl_ent, 21132 sizeof (kssl_ent_t)); 21133 kssl_hold_ent(tcp->tcp_kssl_ent); 21134 freemsg(mp->b_cont); 21135 mp->b_cont = NULL; 21136 } 21137 tprim->type = T_BIND_REQ; 21138 21139 /* FALLTHROUGH */ 21140 case O_T_BIND_REQ: /* bind request */ 21141 case T_BIND_REQ: /* new semantics bind request */ 21142 tcp_bind(tcp, mp); 21143 break; 21144 case T_UNBIND_REQ: /* unbind request */ 21145 tcp_unbind(tcp, mp); 21146 break; 21147 case O_T_CONN_RES: /* old connection response XXX */ 21148 case T_CONN_RES: /* connection response */ 21149 tcp_accept(tcp, mp); 21150 break; 21151 case T_CONN_REQ: /* connection request */ 21152 tcp_connect(tcp, mp); 21153 break; 21154 case T_DISCON_REQ: /* disconnect request */ 21155 tcp_disconnect(tcp, mp); 21156 break; 21157 case T_CAPABILITY_REQ: 21158 tcp_capability_req(tcp, mp); /* capability request */ 21159 break; 21160 case T_INFO_REQ: /* information request */ 21161 tcp_info_req(tcp, mp); 21162 break; 21163 case T_SVR4_OPTMGMT_REQ: /* manage options req */ 21164 /* Only IP is allowed to return meaningful value */ 21165 (void) svr4_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj); 21166 break; 21167 case T_OPTMGMT_REQ: 21168 /* 21169 * Note: no support for snmpcom_req() through new 21170 * T_OPTMGMT_REQ. See comments in ip.c 21171 */ 21172 /* Only IP is allowed to return meaningful value */ 21173 (void) tpi_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj); 21174 break; 21175 21176 case T_UNITDATA_REQ: /* unitdata request */ 21177 tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 21178 break; 21179 case T_ORDREL_REQ: /* orderly release req */ 21180 freemsg(mp); 21181 21182 if (tcp->tcp_fused) 21183 tcp_unfuse(tcp); 21184 21185 if (tcp_xmit_end(tcp) != 0) { 21186 /* 21187 * We were crossing FINs and got a reset from 21188 * the other side. Just ignore it. 
21189 */ 21190 if (tcp->tcp_debug) { 21191 (void) strlog(TCP_MOD_ID, 0, 1, 21192 SL_ERROR|SL_TRACE, 21193 "tcp_wput_proto, T_ORDREL_REQ out of " 21194 "state %s", 21195 tcp_display(tcp, NULL, 21196 DISP_ADDR_AND_PORT)); 21197 } 21198 } 21199 break; 21200 case T_ADDR_REQ: 21201 tcp_addr_req(tcp, mp); 21202 break; 21203 default: 21204 if (tcp->tcp_debug) { 21205 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 21206 "tcp_wput_proto, bogus TPI msg, type %d", 21207 tprim->type); 21208 } 21209 /* 21210 * We used to M_ERROR. Sending TNOTSUPPORT gives the user 21211 * to recover. 21212 */ 21213 tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 21214 break; 21215 } 21216 } 21217 21218 /* 21219 * The TCP write service routine should never be called... 21220 */ 21221 /* ARGSUSED */ 21222 static void 21223 tcp_wsrv(queue_t *q) 21224 { 21225 TCP_STAT(tcp_wsrv_called); 21226 } 21227 21228 /* Non overlapping byte exchanger */ 21229 static void 21230 tcp_xchg(uchar_t *a, uchar_t *b, int len) 21231 { 21232 uchar_t uch; 21233 21234 while (len-- > 0) { 21235 uch = a[len]; 21236 a[len] = b[len]; 21237 b[len] = uch; 21238 } 21239 } 21240 21241 /* 21242 * Send out a control packet on the tcp connection specified. This routine 21243 * is typically called where we need a simple ACK or RST generated. 21244 */ 21245 static void 21246 tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) 21247 { 21248 uchar_t *rptr; 21249 tcph_t *tcph; 21250 ipha_t *ipha = NULL; 21251 ip6_t *ip6h = NULL; 21252 uint32_t sum; 21253 int tcp_hdr_len; 21254 int tcp_ip_hdr_len; 21255 mblk_t *mp; 21256 21257 /* 21258 * Save sum for use in source route later. 21259 */ 21260 ASSERT(tcp != NULL); 21261 sum = tcp->tcp_tcp_hdr_len + tcp->tcp_sum; 21262 tcp_hdr_len = tcp->tcp_hdr_len; 21263 tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; 21264 21265 /* If a text string is passed in with the request, pass it to strlog. */ 21266 if (str != NULL && tcp->tcp_debug) { 21267 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 21268 "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", 21269 str, seq, ack, ctl); 21270 } 21271 mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 21272 BPRI_MED); 21273 if (mp == NULL) { 21274 return; 21275 } 21276 rptr = &mp->b_rptr[tcp_wroff_xtra]; 21277 mp->b_rptr = rptr; 21278 mp->b_wptr = &rptr[tcp_hdr_len]; 21279 bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); 21280 21281 if (tcp->tcp_ipversion == IPV4_VERSION) { 21282 ipha = (ipha_t *)rptr; 21283 ipha->ipha_length = htons(tcp_hdr_len); 21284 } else { 21285 ip6h = (ip6_t *)rptr; 21286 ASSERT(tcp != NULL); 21287 ip6h->ip6_plen = htons(tcp->tcp_hdr_len - 21288 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 21289 } 21290 tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; 21291 tcph->th_flags[0] = (uint8_t)ctl; 21292 if (ctl & TH_RST) { 21293 BUMP_MIB(&tcp_mib, tcpOutRsts); 21294 BUMP_MIB(&tcp_mib, tcpOutControl); 21295 /* 21296 * Don't send TSopt w/ TH_RST packets per RFC 1323. 
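 * Dropping the option shrinks the header by TCPOPT_REAL_TS_LEN
 * bytes (two NOPs plus the 10-byte timestamp option, i.e. 12
 * bytes or 3 TCP header words), which is why b_wptr, the IP
 * length and the checksum seed are reduced by that amount and
 * th_offset by 3 below.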
21297 */ 21298 if (tcp->tcp_snd_ts_ok && 21299 tcp->tcp_state > TCPS_SYN_SENT) { 21300 mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; 21301 *(mp->b_wptr) = TCPOPT_EOL; 21302 if (tcp->tcp_ipversion == IPV4_VERSION) { 21303 ipha->ipha_length = htons(tcp_hdr_len - 21304 TCPOPT_REAL_TS_LEN); 21305 } else { 21306 ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - 21307 TCPOPT_REAL_TS_LEN); 21308 } 21309 tcph->th_offset_and_rsrvd[0] -= (3 << 4); 21310 sum -= TCPOPT_REAL_TS_LEN; 21311 } 21312 } 21313 if (ctl & TH_ACK) { 21314 if (tcp->tcp_snd_ts_ok) { 21315 U32_TO_BE32(lbolt, 21316 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 21317 U32_TO_BE32(tcp->tcp_ts_recent, 21318 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 21319 } 21320 21321 /* Update the latest receive window size in TCP header. */ 21322 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 21323 tcph->th_win); 21324 tcp->tcp_rack = ack; 21325 tcp->tcp_rack_cnt = 0; 21326 BUMP_MIB(&tcp_mib, tcpOutAck); 21327 } 21328 BUMP_LOCAL(tcp->tcp_obsegs); 21329 U32_TO_BE32(seq, tcph->th_seq); 21330 U32_TO_BE32(ack, tcph->th_ack); 21331 /* 21332 * Include the adjustment for a source route if any. 21333 */ 21334 sum = (sum >> 16) + (sum & 0xFFFF); 21335 U16_TO_BE16(sum, tcph->th_sum); 21336 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 21337 tcp_send_data(tcp, tcp->tcp_wq, mp); 21338 } 21339 21340 /* 21341 * If this routine returns B_TRUE, TCP can generate a RST in response 21342 * to a segment. If it returns B_FALSE, TCP should not respond. 21343 */ 21344 static boolean_t 21345 tcp_send_rst_chk(void) 21346 { 21347 clock_t now; 21348 21349 /* 21350 * TCP needs to protect itself from generating too many RSTs. 21351 * This can be a DoS attack by sending us random segments 21352 * soliciting RSTs. 21353 * 21354 * What we do here is to have a limit of tcp_rst_sent_rate RSTs 21355 * in each 1 second interval. In this way, TCP still generate 21356 * RSTs in normal cases but when under attack, the impact is 21357 * limited. 21358 */ 21359 if (tcp_rst_sent_rate_enabled != 0) { 21360 now = lbolt; 21361 /* lbolt can wrap around. */ 21362 if ((tcp_last_rst_intrvl > now) || 21363 (TICK_TO_MSEC(now - tcp_last_rst_intrvl) > 1*SECONDS)) { 21364 tcp_last_rst_intrvl = now; 21365 tcp_rst_cnt = 1; 21366 } else if (++tcp_rst_cnt > tcp_rst_sent_rate) { 21367 return (B_FALSE); 21368 } 21369 } 21370 return (B_TRUE); 21371 } 21372 21373 /* 21374 * Send down the advice IP ioctl to tell IP to mark an IRE temporary. 21375 */ 21376 static void 21377 tcp_ip_ire_mark_advice(tcp_t *tcp) 21378 { 21379 mblk_t *mp; 21380 ipic_t *ipic; 21381 21382 if (tcp->tcp_ipversion == IPV4_VERSION) { 21383 mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, 21384 &ipic); 21385 } else { 21386 mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, 21387 &ipic); 21388 } 21389 if (mp == NULL) 21390 return; 21391 ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; 21392 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 21393 } 21394 21395 /* 21396 * Return an IP advice ioctl mblk and set ipic to be the pointer 21397 * to the advice structure. 
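 * The result is a two-mblk chain, roughly:
 *
 *	iocblk from mkiocb(IP_IOCTL),
 *	    ioc_count = sizeof (ipic_t) + addr_len
 *	  b_cont -> [ ipic_t | address (addr_len bytes) ]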
21398 */ 21399 static mblk_t * 21400 tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic) 21401 { 21402 struct iocblk *ioc; 21403 mblk_t *mp, *mp1; 21404 21405 mp = allocb(sizeof (ipic_t) + addr_len, BPRI_HI); 21406 if (mp == NULL) 21407 return (NULL); 21408 bzero(mp->b_rptr, sizeof (ipic_t) + addr_len); 21409 *ipic = (ipic_t *)mp->b_rptr; 21410 (*ipic)->ipic_cmd = IP_IOC_IRE_ADVISE_NO_REPLY; 21411 (*ipic)->ipic_addr_offset = sizeof (ipic_t); 21412 21413 bcopy(addr, *ipic + 1, addr_len); 21414 21415 (*ipic)->ipic_addr_length = addr_len; 21416 mp->b_wptr = &mp->b_rptr[sizeof (ipic_t) + addr_len]; 21417 21418 mp1 = mkiocb(IP_IOCTL); 21419 if (mp1 == NULL) { 21420 freemsg(mp); 21421 return (NULL); 21422 } 21423 mp1->b_cont = mp; 21424 ioc = (struct iocblk *)mp1->b_rptr; 21425 ioc->ioc_count = sizeof (ipic_t) + addr_len; 21426 21427 return (mp1); 21428 } 21429 21430 /* 21431 * Generate a reset based on an inbound packet for which there is no active 21432 * tcp state that we can find. 21433 * 21434 * IPSEC NOTE : Try to send the reply with the same protection as it came 21435 * in. We still have the ipsec_mp that the packet was attached to. Thus 21436 * the packet will go out at the same level of protection as it came in by 21437 * converting the IPSEC_IN to IPSEC_OUT. 21438 */ 21439 static void 21440 tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, 21441 uint32_t ack, int ctl, uint_t ip_hdr_len, zoneid_t zoneid) 21442 { 21443 ipha_t *ipha = NULL; 21444 ip6_t *ip6h = NULL; 21445 ushort_t len; 21446 tcph_t *tcph; 21447 int i; 21448 mblk_t *ipsec_mp; 21449 boolean_t mctl_present; 21450 ipic_t *ipic; 21451 ipaddr_t v4addr; 21452 in6_addr_t v6addr; 21453 int addr_len; 21454 void *addr; 21455 queue_t *q = tcp_g_q; 21456 tcp_t *tcp = Q_TO_TCP(q); 21457 cred_t *cr; 21458 mblk_t *nmp; 21459 21460 if (!tcp_send_rst_chk()) { 21461 tcp_rst_unsent++; 21462 freemsg(mp); 21463 return; 21464 } 21465 21466 if (mp->b_datap->db_type == M_CTL) { 21467 ipsec_mp = mp; 21468 mp = mp->b_cont; 21469 mctl_present = B_TRUE; 21470 } else { 21471 ipsec_mp = mp; 21472 mctl_present = B_FALSE; 21473 } 21474 21475 if (str && q && tcp_dbg) { 21476 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 21477 "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 21478 "flags 0x%x", 21479 str, seq, ack, ctl); 21480 } 21481 if (mp->b_datap->db_ref != 1) { 21482 mblk_t *mp1 = copyb(mp); 21483 freemsg(mp); 21484 mp = mp1; 21485 if (!mp) { 21486 if (mctl_present) 21487 freeb(ipsec_mp); 21488 return; 21489 } else { 21490 if (mctl_present) { 21491 ipsec_mp->b_cont = mp; 21492 } else { 21493 ipsec_mp = mp; 21494 } 21495 } 21496 } else if (mp->b_cont) { 21497 freemsg(mp->b_cont); 21498 mp->b_cont = NULL; 21499 } 21500 /* 21501 * We skip reversing source route here. 21502 * (for now we replace all IP options with EOL) 21503 */ 21504 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21505 ipha = (ipha_t *)mp->b_rptr; 21506 for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 21507 mp->b_rptr[i] = IPOPT_EOL; 21508 /* 21509 * Make sure that src address isn't flagrantly invalid. 21510 * Not all broadcast address checking for the src address 21511 * is possible, since we don't know the netmask of the src 21512 * addr. No check for destination address is done, since 21513 * IP will not pass up a packet with a broadcast dest 21514 * address to TCP. Similar checks are done below for IPv6. 
21515 */ 21516 if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || 21517 CLASSD(ipha->ipha_src)) { 21518 freemsg(ipsec_mp); 21519 BUMP_MIB(&ip_mib, ipInDiscards); 21520 return; 21521 } 21522 } else { 21523 ip6h = (ip6_t *)mp->b_rptr; 21524 21525 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || 21526 IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { 21527 freemsg(ipsec_mp); 21528 BUMP_MIB(&ip6_mib, ipv6InDiscards); 21529 return; 21530 } 21531 21532 /* Remove any extension headers assuming partial overlay */ 21533 if (ip_hdr_len > IPV6_HDR_LEN) { 21534 uint8_t *to; 21535 21536 to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; 21537 ovbcopy(ip6h, to, IPV6_HDR_LEN); 21538 mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; 21539 ip_hdr_len = IPV6_HDR_LEN; 21540 ip6h = (ip6_t *)mp->b_rptr; 21541 ip6h->ip6_nxt = IPPROTO_TCP; 21542 } 21543 } 21544 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 21545 if (tcph->th_flags[0] & TH_RST) { 21546 freemsg(ipsec_mp); 21547 return; 21548 } 21549 tcph->th_offset_and_rsrvd[0] = (5 << 4); 21550 len = ip_hdr_len + sizeof (tcph_t); 21551 mp->b_wptr = &mp->b_rptr[len]; 21552 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21553 ipha->ipha_length = htons(len); 21554 /* Swap addresses */ 21555 v4addr = ipha->ipha_src; 21556 ipha->ipha_src = ipha->ipha_dst; 21557 ipha->ipha_dst = v4addr; 21558 ipha->ipha_ident = 0; 21559 ipha->ipha_ttl = (uchar_t)tcp_ipv4_ttl; 21560 addr_len = IP_ADDR_LEN; 21561 addr = &v4addr; 21562 } else { 21563 /* No ip6i_t in this case */ 21564 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 21565 /* Swap addresses */ 21566 v6addr = ip6h->ip6_src; 21567 ip6h->ip6_src = ip6h->ip6_dst; 21568 ip6h->ip6_dst = v6addr; 21569 ip6h->ip6_hops = (uchar_t)tcp_ipv6_hoplimit; 21570 addr_len = IPV6_ADDR_LEN; 21571 addr = &v6addr; 21572 } 21573 tcp_xchg(tcph->th_fport, tcph->th_lport, 2); 21574 U32_TO_BE32(ack, tcph->th_ack); 21575 U32_TO_BE32(seq, tcph->th_seq); 21576 U16_TO_BE16(0, tcph->th_win); 21577 U16_TO_BE16(sizeof (tcph_t), tcph->th_sum); 21578 tcph->th_flags[0] = (uint8_t)ctl; 21579 if (ctl & TH_RST) { 21580 BUMP_MIB(&tcp_mib, tcpOutRsts); 21581 BUMP_MIB(&tcp_mib, tcpOutControl); 21582 } 21583 21584 /* IP trusts us to set up labels when required. 
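 * On a labeled (Trusted Extensions) system, tsol_check_label{,_v6}()
 * below may substitute a new mblk whose header is resized to carry
 * the correct label; 'adjust' is the change in length it reports,
 * which for IPv4 we fold back into ipha_length.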
*/ 21585 if (is_system_labeled() && (cr = DB_CRED(mp)) != NULL && 21586 crgetlabel(cr) != NULL) { 21587 int err, adjust; 21588 21589 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) 21590 err = tsol_check_label(cr, &mp, &adjust, 21591 tcp->tcp_connp->conn_mac_exempt); 21592 else 21593 err = tsol_check_label_v6(cr, &mp, &adjust, 21594 tcp->tcp_connp->conn_mac_exempt); 21595 if (mctl_present) 21596 ipsec_mp->b_cont = mp; 21597 else 21598 ipsec_mp = mp; 21599 if (err != 0) { 21600 freemsg(ipsec_mp); 21601 return; 21602 } 21603 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21604 ipha = (ipha_t *)mp->b_rptr; 21605 adjust += ntohs(ipha->ipha_length); 21606 ipha->ipha_length = htons(adjust); 21607 } else { 21608 ip6h = (ip6_t *)mp->b_rptr; 21609 } 21610 } 21611 21612 if (mctl_present) { 21613 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 21614 21615 ASSERT(ii->ipsec_in_type == IPSEC_IN); 21616 if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h)) { 21617 return; 21618 } 21619 } 21620 if (zoneid == ALL_ZONES) 21621 zoneid = GLOBAL_ZONEID; 21622 21623 /* Add the zoneid so ip_output routes it properly */ 21624 if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid)) == NULL) { 21625 freemsg(ipsec_mp); 21626 return; 21627 } 21628 ipsec_mp = nmp; 21629 21630 /* 21631 * NOTE: one might consider tracing a TCP packet here, but 21632 * this function has no active TCP state and no tcp structure 21633 * that has a trace buffer. If we traced here, we would have 21634 * to keep a local trace buffer in tcp_record_trace(). 21635 * 21636 * TSol note: The mblk that contains the incoming packet was 21637 * reused by tcp_xmit_listener_reset, so it already contains 21638 * the right credentials and we don't need to call mblk_setcred. 21639 * Also the conn's cred is not right since it is associated 21640 * with tcp_g_q. 21641 */ 21642 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp); 21643 21644 /* 21645 * Tell IP to mark the IRE used for this destination temporary. 21646 * This way, we can limit our exposure to DoS attack because IP 21647 * creates an IRE for each destination. If there are too many, 21648 * the time to do any routing lookup will be extremely long. And 21649 * the lookup can be in interrupt context. 21650 * 21651 * Note that in normal circumstances, this marking should not 21652 * affect anything. It would be nice if only 1 message is 21653 * needed to inform IP that the IRE created for this RST should 21654 * not be added to the cache table. But there is currently 21655 * not such communication mechanism between TCP and IP. So 21656 * the best we can do now is to send the advice ioctl to IP 21657 * to mark the IRE temporary. 21658 */ 21659 if ((mp = tcp_ip_advise_mblk(addr, addr_len, &ipic)) != NULL) { 21660 ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; 21661 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 21662 } 21663 } 21664 21665 /* 21666 * Initiate closedown sequence on an active connection. (May be called as 21667 * writer.) Return value zero for OK return, non-zero for error return. 21668 */ 21669 static int 21670 tcp_xmit_end(tcp_t *tcp) 21671 { 21672 ipic_t *ipic; 21673 mblk_t *mp; 21674 21675 if (tcp->tcp_state < TCPS_SYN_RCVD || 21676 tcp->tcp_state > TCPS_CLOSE_WAIT) { 21677 /* 21678 * Invalid state, only states TCPS_SYN_RCVD, 21679 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid 21680 */ 21681 return (-1); 21682 } 21683 21684 tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 21685 tcp->tcp_valid_bits |= TCP_FSS_VALID; 21686 /* 21687 * If there is nothing more unsent, send the FIN now. 
21688 * Otherwise, it will go out with the last segment. 21689 */ 21690 if (tcp->tcp_unsent == 0) { 21691 mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 21692 tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 21693 21694 if (mp) { 21695 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 21696 tcp_send_data(tcp, tcp->tcp_wq, mp); 21697 } else { 21698 /* 21699 * Couldn't allocate msg. Pretend we got it out. 21700 * Wait for rexmit timeout. 21701 */ 21702 tcp->tcp_snxt = tcp->tcp_fss + 1; 21703 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 21704 } 21705 21706 /* 21707 * If needed, update tcp_rexmit_snxt as tcp_snxt is 21708 * changed. 21709 */ 21710 if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 21711 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 21712 } 21713 } else { 21714 /* 21715 * If tcp->tcp_cork is set, then the data will not get sent, 21716 * so we have to check that and unset it first. 21717 */ 21718 if (tcp->tcp_cork) 21719 tcp->tcp_cork = B_FALSE; 21720 tcp_wput_data(tcp, NULL, B_FALSE); 21721 } 21722 21723 /* 21724 * If TCP does not get enough samples of RTT or tcp_rtt_updates 21725 * is 0, don't update the cache. 21726 */ 21727 if (tcp_rtt_updates == 0 || tcp->tcp_rtt_update < tcp_rtt_updates) 21728 return (0); 21729 21730 /* 21731 * NOTE: should not update if source routes i.e. if tcp_remote if 21732 * different from the destination. 21733 */ 21734 if (tcp->tcp_ipversion == IPV4_VERSION) { 21735 if (tcp->tcp_remote != tcp->tcp_ipha->ipha_dst) { 21736 return (0); 21737 } 21738 mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, 21739 &ipic); 21740 } else { 21741 if (!(IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, 21742 &tcp->tcp_ip6h->ip6_dst))) { 21743 return (0); 21744 } 21745 mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, 21746 &ipic); 21747 } 21748 21749 /* Record route attributes in the IRE for use by future connections. */ 21750 if (mp == NULL) 21751 return (0); 21752 21753 /* 21754 * We do not have a good algorithm to update ssthresh at this time. 21755 * So don't do any update. 21756 */ 21757 ipic->ipic_rtt = tcp->tcp_rtt_sa; 21758 ipic->ipic_rtt_sd = tcp->tcp_rtt_sd; 21759 21760 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 21761 return (0); 21762 } 21763 21764 /* 21765 * Generate a "no listener here" RST in response to an "unknown" segment. 21766 * Note that we are reusing the incoming mp to construct the outgoing 21767 * RST. 
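 *
 * The reply follows RFC 793 reset generation: if the offending
 * segment carried ACK, send a bare RST with seq = seg_ack;
 * otherwise send RST|ACK with seq 0 and ack = seg_seq + seg_len
 * (counting a SYN as one).  Incoming RSTs, and segments with
 * neither ACK nor SYN, are dropped without a reply.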
21768 */ 21769 void 21770 tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid) 21771 { 21772 uchar_t *rptr; 21773 uint32_t seg_len; 21774 tcph_t *tcph; 21775 uint32_t seg_seq; 21776 uint32_t seg_ack; 21777 uint_t flags; 21778 mblk_t *ipsec_mp; 21779 ipha_t *ipha; 21780 ip6_t *ip6h; 21781 boolean_t mctl_present = B_FALSE; 21782 boolean_t check = B_TRUE; 21783 boolean_t policy_present; 21784 21785 TCP_STAT(tcp_no_listener); 21786 21787 ipsec_mp = mp; 21788 21789 if (mp->b_datap->db_type == M_CTL) { 21790 ipsec_in_t *ii; 21791 21792 mctl_present = B_TRUE; 21793 mp = mp->b_cont; 21794 21795 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 21796 ASSERT(ii->ipsec_in_type == IPSEC_IN); 21797 if (ii->ipsec_in_dont_check) { 21798 check = B_FALSE; 21799 if (!ii->ipsec_in_secure) { 21800 freeb(ipsec_mp); 21801 mctl_present = B_FALSE; 21802 ipsec_mp = mp; 21803 } 21804 } 21805 } 21806 21807 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21808 policy_present = ipsec_inbound_v4_policy_present; 21809 ipha = (ipha_t *)mp->b_rptr; 21810 ip6h = NULL; 21811 } else { 21812 policy_present = ipsec_inbound_v6_policy_present; 21813 ipha = NULL; 21814 ip6h = (ip6_t *)mp->b_rptr; 21815 } 21816 21817 if (check && policy_present) { 21818 /* 21819 * The conn_t parameter is NULL because we already know 21820 * nobody's home. 21821 */ 21822 ipsec_mp = ipsec_check_global_policy( 21823 ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present); 21824 if (ipsec_mp == NULL) 21825 return; 21826 } 21827 if (is_system_labeled() && !tsol_can_reply_error(mp)) { 21828 DTRACE_PROBE2( 21829 tx__ip__log__error__nolistener__tcp, 21830 char *, "Could not reply with RST to mp(1)", 21831 mblk_t *, mp); 21832 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); 21833 freemsg(ipsec_mp); 21834 return; 21835 } 21836 21837 rptr = mp->b_rptr; 21838 21839 tcph = (tcph_t *)&rptr[ip_hdr_len]; 21840 seg_seq = BE32_TO_U32(tcph->th_seq); 21841 seg_ack = BE32_TO_U32(tcph->th_ack); 21842 flags = tcph->th_flags[0]; 21843 21844 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); 21845 if (flags & TH_RST) { 21846 freemsg(ipsec_mp); 21847 } else if (flags & TH_ACK) { 21848 tcp_xmit_early_reset("no tcp, reset", 21849 ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid); 21850 } else { 21851 if (flags & TH_SYN) { 21852 seg_len++; 21853 } else { 21854 /* 21855 * Here we violate the RFC. Note that a normal 21856 * TCP will never send a segment without the ACK 21857 * flag, except for RST or SYN segment. This 21858 * segment is neither. Just drop it on the 21859 * floor. 21860 */ 21861 freemsg(ipsec_mp); 21862 tcp_rst_unsent++; 21863 return; 21864 } 21865 21866 tcp_xmit_early_reset("no tcp, reset/ack", 21867 ipsec_mp, 0, seg_seq + seg_len, 21868 TH_RST | TH_ACK, ip_hdr_len, zoneid); 21869 } 21870 } 21871 21872 /* 21873 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 21874 * ip and tcp header ready to pass down to IP. If the mp passed in is 21875 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 21876 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 21877 * otherwise it will dup partial mblks.) 21878 * Otherwise, an appropriate ACK packet will be generated. This 21879 * routine is not usually called to send new data for the first time. It 21880 * is mostly called out of the timer for retransmits, and to generate ACKs. 21881 * 21882 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 21883 * be adjusted by *offset. 
And after dupb(), the offset and the ending mblk 21884 * of the original mblk chain will be returned in *offset and *end_mp. 21885 */ 21886 static mblk_t * 21887 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 21888 mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 21889 boolean_t rexmit) 21890 { 21891 int data_length; 21892 int32_t off = 0; 21893 uint_t flags; 21894 mblk_t *mp1; 21895 mblk_t *mp2; 21896 uchar_t *rptr; 21897 tcph_t *tcph; 21898 int32_t num_sack_blk = 0; 21899 int32_t sack_opt_len = 0; 21900 21901 /* Allocate for our maximum TCP header + link-level */ 21902 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 21903 BPRI_MED); 21904 if (!mp1) 21905 return (NULL); 21906 data_length = 0; 21907 21908 /* 21909 * Note that tcp_mss has been adjusted to take into account the 21910 * timestamp option if applicable. Because SACK options do not 21911 * appear in every TCP segments and they are of variable lengths, 21912 * they cannot be included in tcp_mss. Thus we need to calculate 21913 * the actual segment length when we need to send a segment which 21914 * includes SACK options. 21915 */ 21916 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 21917 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 21918 tcp->tcp_num_sack_blk); 21919 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 21920 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 21921 if (max_to_send + sack_opt_len > tcp->tcp_mss) 21922 max_to_send -= sack_opt_len; 21923 } 21924 21925 if (offset != NULL) { 21926 off = *offset; 21927 /* We use offset as an indicator that end_mp is not NULL. */ 21928 *end_mp = NULL; 21929 } 21930 for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { 21931 /* This could be faster with cooperation from downstream */ 21932 if (mp2 != mp1 && !sendall && 21933 data_length + (int)(mp->b_wptr - mp->b_rptr) > 21934 max_to_send) 21935 /* 21936 * Don't send the next mblk since the whole mblk 21937 * does not fit. 21938 */ 21939 break; 21940 mp2->b_cont = dupb(mp); 21941 mp2 = mp2->b_cont; 21942 if (!mp2) { 21943 freemsg(mp1); 21944 return (NULL); 21945 } 21946 mp2->b_rptr += off; 21947 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 21948 (uintptr_t)INT_MAX); 21949 21950 data_length += (int)(mp2->b_wptr - mp2->b_rptr); 21951 if (data_length > max_to_send) { 21952 mp2->b_wptr -= data_length - max_to_send; 21953 data_length = max_to_send; 21954 off = mp2->b_wptr - mp->b_rptr; 21955 break; 21956 } else { 21957 off = 0; 21958 } 21959 } 21960 if (offset != NULL) { 21961 *offset = off; 21962 *end_mp = mp; 21963 } 21964 if (seg_len != NULL) { 21965 *seg_len = data_length; 21966 } 21967 21968 /* Update the latest receive window size in TCP header. */ 21969 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 21970 tcp->tcp_tcph->th_win); 21971 21972 rptr = mp1->b_rptr + tcp_wroff_xtra; 21973 mp1->b_rptr = rptr; 21974 mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; 21975 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 21976 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 21977 U32_TO_ABE32(seq, tcph->th_seq); 21978 21979 /* 21980 * Use tcp_unsent to determine if the PUSH bit should be used assumes 21981 * that this function was called from tcp_wput_data. Thus, when called 21982 * to retransmit data the setting of the PUSH bit may appear some 21983 * what random in that it might get set when it should not. This 21984 * should not pose any performance issues. 
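 *
 * In other words, TH_PUSH is set only when the segment being built
 * appears to carry the last of the currently queued data (tcp_unsent
 * is zero or exactly equal to the amount being sent); intermediate
 * segments of a larger write go out with TH_ACK alone.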
21985 */ 21986 if (data_length != 0 && (tcp->tcp_unsent == 0 || 21987 tcp->tcp_unsent == data_length)) { 21988 flags = TH_ACK | TH_PUSH; 21989 } else { 21990 flags = TH_ACK; 21991 } 21992 21993 if (tcp->tcp_ecn_ok) { 21994 if (tcp->tcp_ecn_echo_on) 21995 flags |= TH_ECE; 21996 21997 /* 21998 * Only set ECT bit and ECN_CWR if a segment contains new data. 21999 * There is no TCP flow control for non-data segments, and 22000 * only data segment is transmitted reliably. 22001 */ 22002 if (data_length > 0 && !rexmit) { 22003 SET_ECT(tcp, rptr); 22004 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 22005 flags |= TH_CWR; 22006 tcp->tcp_ecn_cwr_sent = B_TRUE; 22007 } 22008 } 22009 } 22010 22011 if (tcp->tcp_valid_bits) { 22012 uint32_t u1; 22013 22014 if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && 22015 seq == tcp->tcp_iss) { 22016 uchar_t *wptr; 22017 22018 /* 22019 * If TCP_ISS_VALID and the seq number is tcp_iss, 22020 * TCP can only be in SYN-SENT, SYN-RCVD or 22021 * FIN-WAIT-1 state. It can be FIN-WAIT-1 if 22022 * our SYN is not ack'ed but the app closes this 22023 * TCP connection. 22024 */ 22025 ASSERT(tcp->tcp_state == TCPS_SYN_SENT || 22026 tcp->tcp_state == TCPS_SYN_RCVD || 22027 tcp->tcp_state == TCPS_FIN_WAIT_1); 22028 22029 /* 22030 * Tack on the MSS option. It is always needed 22031 * for both active and passive open. 22032 * 22033 * MSS option value should be interface MTU - MIN 22034 * TCP/IP header according to RFC 793 as it means 22035 * the maximum segment size TCP can receive. But 22036 * to get around some broken middle boxes/end hosts 22037 * out there, we allow the option value to be the 22038 * same as the MSS option size on the peer side. 22039 * In this way, the other side will not send 22040 * anything larger than they can receive. 22041 * 22042 * Note that for SYN_SENT state, the ndd param 22043 * tcp_use_smss_as_mss_opt has no effect as we 22044 * don't know the peer's MSS option value. So 22045 * the only case we need to take care of is in 22046 * SYN_RCVD state, which is done later. 22047 */ 22048 wptr = mp1->b_wptr; 22049 wptr[0] = TCPOPT_MAXSEG; 22050 wptr[1] = TCPOPT_MAXSEG_LEN; 22051 wptr += 2; 22052 u1 = tcp->tcp_if_mtu - 22053 (tcp->tcp_ipversion == IPV4_VERSION ? 22054 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - 22055 TCP_MIN_HEADER_LENGTH; 22056 U16_TO_BE16(u1, wptr); 22057 mp1->b_wptr = wptr + 2; 22058 /* Update the offset to cover the additional word */ 22059 tcph->th_offset_and_rsrvd[0] += (1 << 4); 22060 22061 /* 22062 * Note that the following way of filling in 22063 * TCP options are not optimal. Some NOPs can 22064 * be saved. But there is no need at this time 22065 * to optimize it. When it is needed, we will 22066 * do it. 22067 */ 22068 switch (tcp->tcp_state) { 22069 case TCPS_SYN_SENT: 22070 flags = TH_SYN; 22071 22072 if (tcp->tcp_snd_ts_ok) { 22073 uint32_t llbolt = (uint32_t)lbolt; 22074 22075 wptr = mp1->b_wptr; 22076 wptr[0] = TCPOPT_NOP; 22077 wptr[1] = TCPOPT_NOP; 22078 wptr[2] = TCPOPT_TSTAMP; 22079 wptr[3] = TCPOPT_TSTAMP_LEN; 22080 wptr += 4; 22081 U32_TO_BE32(llbolt, wptr); 22082 wptr += 4; 22083 ASSERT(tcp->tcp_ts_recent == 0); 22084 U32_TO_BE32(0L, wptr); 22085 mp1->b_wptr += TCPOPT_REAL_TS_LEN; 22086 tcph->th_offset_and_rsrvd[0] += 22087 (3 << 4); 22088 } 22089 22090 /* 22091 * Set up all the bits to tell other side 22092 * we are ECN capable. 
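 *
 * Per RFC 3168 an ECN-setup SYN carries both ECE and CWR, while
 * an ECN-setup SYN-ACK carries ECE only; the TCPS_SYN_RCVD case
 * below does the latter.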
22093 */ 22094 if (tcp->tcp_ecn_ok) { 22095 flags |= (TH_ECE | TH_CWR); 22096 } 22097 break; 22098 case TCPS_SYN_RCVD: 22099 flags |= TH_SYN; 22100 22101 /* 22102 * Reset the MSS option value to be SMSS 22103 * We should probably add back the bytes 22104 * for timestamp option and IPsec. We 22105 * don't do that as this is a workaround 22106 * for broken middle boxes/end hosts, it 22107 * is better for us to be more cautious. 22108 * They may not take these things into 22109 * account in their SMSS calculation. Thus 22110 * the peer's calculated SMSS may be smaller 22111 * than what it can be. This should be OK. 22112 */ 22113 if (tcp_use_smss_as_mss_opt) { 22114 u1 = tcp->tcp_mss; 22115 U16_TO_BE16(u1, wptr); 22116 } 22117 22118 /* 22119 * If the other side is ECN capable, reply 22120 * that we are also ECN capable. 22121 */ 22122 if (tcp->tcp_ecn_ok) 22123 flags |= TH_ECE; 22124 break; 22125 default: 22126 /* 22127 * The above ASSERT() makes sure that this 22128 * must be FIN-WAIT-1 state. Our SYN has 22129 * not been ack'ed so retransmit it. 22130 */ 22131 flags |= TH_SYN; 22132 break; 22133 } 22134 22135 if (tcp->tcp_snd_ws_ok) { 22136 wptr = mp1->b_wptr; 22137 wptr[0] = TCPOPT_NOP; 22138 wptr[1] = TCPOPT_WSCALE; 22139 wptr[2] = TCPOPT_WS_LEN; 22140 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 22141 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 22142 tcph->th_offset_and_rsrvd[0] += (1 << 4); 22143 } 22144 22145 if (tcp->tcp_snd_sack_ok) { 22146 wptr = mp1->b_wptr; 22147 wptr[0] = TCPOPT_NOP; 22148 wptr[1] = TCPOPT_NOP; 22149 wptr[2] = TCPOPT_SACK_PERMITTED; 22150 wptr[3] = TCPOPT_SACK_OK_LEN; 22151 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 22152 tcph->th_offset_and_rsrvd[0] += (1 << 4); 22153 } 22154 22155 /* allocb() of adequate mblk assures space */ 22156 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 22157 (uintptr_t)INT_MAX); 22158 u1 = (int)(mp1->b_wptr - mp1->b_rptr); 22159 /* 22160 * Get IP set to checksum on our behalf 22161 * Include the adjustment for a source route if any. 22162 */ 22163 u1 += tcp->tcp_sum; 22164 u1 = (u1 >> 16) + (u1 & 0xFFFF); 22165 U16_TO_BE16(u1, tcph->th_sum); 22166 BUMP_MIB(&tcp_mib, tcpOutControl); 22167 } 22168 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 22169 (seq + data_length) == tcp->tcp_fss) { 22170 if (!tcp->tcp_fin_acked) { 22171 flags |= TH_FIN; 22172 BUMP_MIB(&tcp_mib, tcpOutControl); 22173 } 22174 if (!tcp->tcp_fin_sent) { 22175 tcp->tcp_fin_sent = B_TRUE; 22176 switch (tcp->tcp_state) { 22177 case TCPS_SYN_RCVD: 22178 case TCPS_ESTABLISHED: 22179 tcp->tcp_state = TCPS_FIN_WAIT_1; 22180 break; 22181 case TCPS_CLOSE_WAIT: 22182 tcp->tcp_state = TCPS_LAST_ACK; 22183 break; 22184 } 22185 if (tcp->tcp_suna == tcp->tcp_snxt) 22186 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 22187 tcp->tcp_snxt = tcp->tcp_fss + 1; 22188 } 22189 } 22190 /* 22191 * Note the trick here. u1 is unsigned. When tcp_urg 22192 * is smaller than seq, u1 will become a very huge value. 22193 * So the comparison will fail. Also note that tcp_urp 22194 * should be positive, see RFC 793 page 17. 
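 *
 * A worked example with 32 bit unsigned arithmetic: if tcp_urg is
 * 1000 and seq is 2000, the subtraction wraps to a value close to
 * 2^32, which fails the "less than 64K" test below and TH_URG stays
 * clear.  TH_URG is advertised only when the urgent pointer sits at
 * or ahead of seq and within 64K of it.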
22195 */ 22196 u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION; 22197 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 && 22198 u1 < (uint32_t)(64 * 1024)) { 22199 flags |= TH_URG; 22200 BUMP_MIB(&tcp_mib, tcpOutUrg); 22201 U32_TO_ABE16(u1, tcph->th_urp); 22202 } 22203 } 22204 tcph->th_flags[0] = (uchar_t)flags; 22205 tcp->tcp_rack = tcp->tcp_rnxt; 22206 tcp->tcp_rack_cnt = 0; 22207 22208 if (tcp->tcp_snd_ts_ok) { 22209 if (tcp->tcp_state != TCPS_SYN_SENT) { 22210 uint32_t llbolt = (uint32_t)lbolt; 22211 22212 U32_TO_BE32(llbolt, 22213 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 22214 U32_TO_BE32(tcp->tcp_ts_recent, 22215 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 22216 } 22217 } 22218 22219 if (num_sack_blk > 0) { 22220 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 22221 sack_blk_t *tmp; 22222 int32_t i; 22223 22224 wptr[0] = TCPOPT_NOP; 22225 wptr[1] = TCPOPT_NOP; 22226 wptr[2] = TCPOPT_SACK; 22227 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 22228 sizeof (sack_blk_t); 22229 wptr += TCPOPT_REAL_SACK_LEN; 22230 22231 tmp = tcp->tcp_sack_list; 22232 for (i = 0; i < num_sack_blk; i++) { 22233 U32_TO_BE32(tmp[i].begin, wptr); 22234 wptr += sizeof (tcp_seq); 22235 U32_TO_BE32(tmp[i].end, wptr); 22236 wptr += sizeof (tcp_seq); 22237 } 22238 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); 22239 } 22240 ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 22241 data_length += (int)(mp1->b_wptr - rptr); 22242 if (tcp->tcp_ipversion == IPV4_VERSION) { 22243 ((ipha_t *)rptr)->ipha_length = htons(data_length); 22244 } else { 22245 ip6_t *ip6 = (ip6_t *)(rptr + 22246 (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 22247 sizeof (ip6i_t) : 0)); 22248 22249 ip6->ip6_plen = htons(data_length - 22250 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 22251 } 22252 22253 /* 22254 * Prime pump for IP 22255 * Include the adjustment for a source route if any. 22256 */ 22257 data_length -= tcp->tcp_ip_hdr_len; 22258 data_length += tcp->tcp_sum; 22259 data_length = (data_length >> 16) + (data_length & 0xFFFF); 22260 U16_TO_ABE16(data_length, tcph->th_sum); 22261 if (tcp->tcp_ip_forward_progress) { 22262 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 22263 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 22264 tcp->tcp_ip_forward_progress = B_FALSE; 22265 } 22266 return (mp1); 22267 } 22268 22269 /* This function handles the push timeout. */ 22270 void 22271 tcp_push_timer(void *arg) 22272 { 22273 conn_t *connp = (conn_t *)arg; 22274 tcp_t *tcp = connp->conn_tcp; 22275 22276 TCP_DBGSTAT(tcp_push_timer_cnt); 22277 22278 ASSERT(tcp->tcp_listener == NULL); 22279 22280 /* 22281 * We need to plug synchronous streams during our drain to prevent 22282 * a race with tcp_fuse_rrw() or tcp_fusion_rinfop(). 22283 */ 22284 TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); 22285 tcp->tcp_push_tid = 0; 22286 if ((tcp->tcp_rcv_list != NULL) && 22287 (tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED)) 22288 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 22289 TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); 22290 } 22291 22292 /* 22293 * This function handles delayed ACK timeout. 22294 */ 22295 static void 22296 tcp_ack_timer(void *arg) 22297 { 22298 conn_t *connp = (conn_t *)arg; 22299 tcp_t *tcp = connp->conn_tcp; 22300 mblk_t *mp; 22301 22302 TCP_DBGSTAT(tcp_ack_timer_cnt); 22303 22304 tcp->tcp_ack_tid = 0; 22305 22306 if (tcp->tcp_fused) 22307 return; 22308 22309 /* 22310 * Do not send ACK if there is no outstanding unack'ed data. 
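 *
 * tcp_rnxt is the next sequence number expected from the peer and
 * tcp_rack is the highest sequence number we have already ACKed
 * (they are set equal each time an ACK goes out), so when the two
 * match there is nothing left for the delayed ACK timer to do.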
22311 */ 22312 if (tcp->tcp_rnxt == tcp->tcp_rack) { 22313 return; 22314 } 22315 22316 if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) { 22317 /* 22318 * Make sure we don't allow deferred ACKs to result in 22319 * timer-based ACKing. If we have held off an ACK 22320 * when there was more than an mss here, and the timer 22321 * goes off, we have to worry about the possibility 22322 * that the sender isn't doing slow-start, or is out 22323 * of step with us for some other reason. We fall 22324 * permanently back in the direction of 22325 * ACK-every-other-packet as suggested in RFC 1122. 22326 */ 22327 if (tcp->tcp_rack_abs_max > 2) 22328 tcp->tcp_rack_abs_max--; 22329 tcp->tcp_rack_cur_max = 2; 22330 } 22331 mp = tcp_ack_mp(tcp); 22332 22333 if (mp != NULL) { 22334 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 22335 BUMP_LOCAL(tcp->tcp_obsegs); 22336 BUMP_MIB(&tcp_mib, tcpOutAck); 22337 BUMP_MIB(&tcp_mib, tcpOutAckDelayed); 22338 tcp_send_data(tcp, tcp->tcp_wq, mp); 22339 } 22340 } 22341 22342 22343 /* Generate an ACK-only (no data) segment for a TCP endpoint */ 22344 static mblk_t * 22345 tcp_ack_mp(tcp_t *tcp) 22346 { 22347 uint32_t seq_no; 22348 22349 /* 22350 * There are a few cases to be considered while setting the sequence no. 22351 * Essentially, we can come here while processing an unacceptable pkt 22352 * in the TCPS_SYN_RCVD state, in which case we set the sequence number 22353 * to snxt (per RFC 793), note the swnd wouldn't have been set yet. 22354 * If we are here for a zero window probe, stick with suna. In all 22355 * other cases, we check if suna + swnd encompasses snxt and set 22356 * the sequence number to snxt, if so. If snxt falls outside the 22357 * window (the receiver probably shrunk its window), we will go with 22358 * suna + swnd, otherwise the sequence no will be unacceptable to the 22359 * receiver. 22360 */ 22361 if (tcp->tcp_zero_win_probe) { 22362 seq_no = tcp->tcp_suna; 22363 } else if (tcp->tcp_state == TCPS_SYN_RCVD) { 22364 ASSERT(tcp->tcp_swnd == 0); 22365 seq_no = tcp->tcp_snxt; 22366 } else { 22367 seq_no = SEQ_GT(tcp->tcp_snxt, 22368 (tcp->tcp_suna + tcp->tcp_swnd)) ? 22369 (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt; 22370 } 22371 22372 if (tcp->tcp_valid_bits) { 22373 /* 22374 * For the complex case where we have to send some 22375 * controls (FIN or SYN), let tcp_xmit_mp do it. 22376 */ 22377 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE, 22378 NULL, B_FALSE)); 22379 } else { 22380 /* Generate a simple ACK */ 22381 int data_length; 22382 uchar_t *rptr; 22383 tcph_t *tcph; 22384 mblk_t *mp1; 22385 int32_t tcp_hdr_len; 22386 int32_t tcp_tcp_hdr_len; 22387 int32_t num_sack_blk = 0; 22388 int32_t sack_opt_len; 22389 22390 /* 22391 * Allocate space for TCP + IP headers 22392 * and link-level header 22393 */ 22394 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 22395 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 22396 tcp->tcp_num_sack_blk); 22397 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 22398 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 22399 tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; 22400 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + sack_opt_len; 22401 } else { 22402 tcp_hdr_len = tcp->tcp_hdr_len; 22403 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; 22404 } 22405 mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, BPRI_MED); 22406 if (!mp1) 22407 return (NULL); 22408 22409 /* Update the latest receive window size in TCP header. 
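 * The value on the wire is tcp_rwnd scaled down by the receive window
 * shift negotiated at connection setup; e.g. (illustrative numbers) a
 * 128K tcp_rwnd with tcp_rcv_ws of 2 is advertised as 32768.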
*/ 22410 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 22411 tcp->tcp_tcph->th_win); 22412 /* copy in prototype TCP + IP header */ 22413 rptr = mp1->b_rptr + tcp_wroff_xtra; 22414 mp1->b_rptr = rptr; 22415 mp1->b_wptr = rptr + tcp_hdr_len; 22416 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 22417 22418 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 22419 22420 /* Set the TCP sequence number. */ 22421 U32_TO_ABE32(seq_no, tcph->th_seq); 22422 22423 /* Set up the TCP flag field. */ 22424 tcph->th_flags[0] = (uchar_t)TH_ACK; 22425 if (tcp->tcp_ecn_echo_on) 22426 tcph->th_flags[0] |= TH_ECE; 22427 22428 tcp->tcp_rack = tcp->tcp_rnxt; 22429 tcp->tcp_rack_cnt = 0; 22430 22431 /* fill in timestamp option if in use */ 22432 if (tcp->tcp_snd_ts_ok) { 22433 uint32_t llbolt = (uint32_t)lbolt; 22434 22435 U32_TO_BE32(llbolt, 22436 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 22437 U32_TO_BE32(tcp->tcp_ts_recent, 22438 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 22439 } 22440 22441 /* Fill in SACK options */ 22442 if (num_sack_blk > 0) { 22443 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 22444 sack_blk_t *tmp; 22445 int32_t i; 22446 22447 wptr[0] = TCPOPT_NOP; 22448 wptr[1] = TCPOPT_NOP; 22449 wptr[2] = TCPOPT_SACK; 22450 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 22451 sizeof (sack_blk_t); 22452 wptr += TCPOPT_REAL_SACK_LEN; 22453 22454 tmp = tcp->tcp_sack_list; 22455 for (i = 0; i < num_sack_blk; i++) { 22456 U32_TO_BE32(tmp[i].begin, wptr); 22457 wptr += sizeof (tcp_seq); 22458 U32_TO_BE32(tmp[i].end, wptr); 22459 wptr += sizeof (tcp_seq); 22460 } 22461 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 22462 << 4); 22463 } 22464 22465 if (tcp->tcp_ipversion == IPV4_VERSION) { 22466 ((ipha_t *)rptr)->ipha_length = htons(tcp_hdr_len); 22467 } else { 22468 /* Check for ip6i_t header in sticky hdrs */ 22469 ip6_t *ip6 = (ip6_t *)(rptr + 22470 (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 22471 sizeof (ip6i_t) : 0)); 22472 22473 ip6->ip6_plen = htons(tcp_hdr_len - 22474 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 22475 } 22476 22477 /* 22478 * Prime pump for checksum calculation in IP. Include the 22479 * adjustment for a source route if any. 22480 */ 22481 data_length = tcp_tcp_hdr_len + tcp->tcp_sum; 22482 data_length = (data_length >> 16) + (data_length & 0xFFFF); 22483 U16_TO_ABE16(data_length, tcph->th_sum); 22484 22485 if (tcp->tcp_ip_forward_progress) { 22486 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 22487 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 22488 tcp->tcp_ip_forward_progress = B_FALSE; 22489 } 22490 return (mp1); 22491 } 22492 } 22493 22494 /* 22495 * To create a temporary tcp structure for inserting into bind hash list. 22496 * The parameter is assumed to be in network byte order, ready for use. 22497 */ 22498 /* ARGSUSED */ 22499 static tcp_t * 22500 tcp_alloc_temp_tcp(in_port_t port) 22501 { 22502 conn_t *connp; 22503 tcp_t *tcp; 22504 22505 connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP); 22506 if (connp == NULL) 22507 return (NULL); 22508 22509 tcp = connp->conn_tcp; 22510 22511 /* 22512 * Only initialize the necessary info in those structures. Note 22513 * that since INADDR_ANY is all 0, we do not need to set 22514 * tcp_bound_source to INADDR_ANY here. 22515 */ 22516 tcp->tcp_state = TCPS_BOUND; 22517 tcp->tcp_lport = port; 22518 tcp->tcp_exclbind = 1; 22519 tcp->tcp_reserved_port = 1; 22520 22521 /* Just for place holding... 
*/ 22522 tcp->tcp_ipversion = IPV4_VERSION; 22523 22524 return (tcp); 22525 } 22526 22527 /* 22528 * To remove a port range specified by lo_port and hi_port from the 22529 * reserved port ranges. This is one of the three public functions of 22530 * the reserved port interface. Note that a port range has to be removed 22531 * as a whole. Ports in a range cannot be removed individually. 22532 * 22533 * Params: 22534 * in_port_t lo_port: the beginning port of the reserved port range to 22535 * be deleted. 22536 * in_port_t hi_port: the ending port of the reserved port range to 22537 * be deleted. 22538 * 22539 * Return: 22540 * B_TRUE if the deletion is successful, B_FALSE otherwise. 22541 */ 22542 boolean_t 22543 tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port) 22544 { 22545 int i, j; 22546 int size; 22547 tcp_t **temp_tcp_array; 22548 tcp_t *tcp; 22549 22550 rw_enter(&tcp_reserved_port_lock, RW_WRITER); 22551 22552 /* First make sure that the port ranage is indeed reserved. */ 22553 for (i = 0; i < tcp_reserved_port_array_size; i++) { 22554 if (tcp_reserved_port[i].lo_port == lo_port) { 22555 hi_port = tcp_reserved_port[i].hi_port; 22556 temp_tcp_array = tcp_reserved_port[i].temp_tcp_array; 22557 break; 22558 } 22559 } 22560 if (i == tcp_reserved_port_array_size) { 22561 rw_exit(&tcp_reserved_port_lock); 22562 return (B_FALSE); 22563 } 22564 22565 /* 22566 * Remove the range from the array. This simple loop is possible 22567 * because port ranges are inserted in ascending order. 22568 */ 22569 for (j = i; j < tcp_reserved_port_array_size - 1; j++) { 22570 tcp_reserved_port[j].lo_port = tcp_reserved_port[j+1].lo_port; 22571 tcp_reserved_port[j].hi_port = tcp_reserved_port[j+1].hi_port; 22572 tcp_reserved_port[j].temp_tcp_array = 22573 tcp_reserved_port[j+1].temp_tcp_array; 22574 } 22575 22576 /* Remove all the temporary tcp structures. */ 22577 size = hi_port - lo_port + 1; 22578 while (size > 0) { 22579 tcp = temp_tcp_array[size - 1]; 22580 ASSERT(tcp != NULL); 22581 tcp_bind_hash_remove(tcp); 22582 CONN_DEC_REF(tcp->tcp_connp); 22583 size--; 22584 } 22585 kmem_free(temp_tcp_array, (hi_port - lo_port + 1) * sizeof (tcp_t *)); 22586 tcp_reserved_port_array_size--; 22587 rw_exit(&tcp_reserved_port_lock); 22588 return (B_TRUE); 22589 } 22590 22591 /* 22592 * Macro to remove temporary tcp structure from the bind hash list. The 22593 * first parameter is the list of tcp to be removed. The second parameter 22594 * is the number of tcps in the array. 22595 */ 22596 #define TCP_TMP_TCP_REMOVE(tcp_array, num) \ 22597 { \ 22598 while ((num) > 0) { \ 22599 tcp_t *tcp = (tcp_array)[(num) - 1]; \ 22600 tf_t *tbf; \ 22601 tcp_t *tcpnext; \ 22602 tbf = &tcp_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)]; \ 22603 mutex_enter(&tbf->tf_lock); \ 22604 tcpnext = tcp->tcp_bind_hash; \ 22605 if (tcpnext) { \ 22606 tcpnext->tcp_ptpbhn = \ 22607 tcp->tcp_ptpbhn; \ 22608 } \ 22609 *tcp->tcp_ptpbhn = tcpnext; \ 22610 mutex_exit(&tbf->tf_lock); \ 22611 kmem_free(tcp, sizeof (tcp_t)); \ 22612 (tcp_array)[(num) - 1] = NULL; \ 22613 (num)--; \ 22614 } \ 22615 } 22616 22617 /* 22618 * The public interface for other modules to call to reserve a port range 22619 * in TCP. The caller passes in how large a port range it wants. TCP 22620 * will try to find a range and return it via lo_port and hi_port. This is 22621 * used by NCA's nca_conn_init. 22622 * NCA can only be used in the global zone so this only affects the global 22623 * zone's ports. 
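 *
 * Each port in a successfully reserved range is held by a temporary
 * tcp_t left in TCPS_BOUND state (see tcp_alloc_temp_tcp() above) and
 * inserted into the bind hash, so ordinary binds cannot claim the port
 * for as long as the range stays reserved.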
22624 * 22625 * Params: 22626 * int size: the size of the port range to be reserved. 22627 * in_port_t *lo_port (referenced): returns the beginning port of the 22628 * reserved port range added. 22629 * in_port_t *hi_port (referenced): returns the ending port of the 22630 * reserved port range added. 22631 * 22632 * Return: 22633 * B_TRUE if the port reservation is successful, B_FALSE otherwise. 22634 */ 22635 boolean_t 22636 tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) 22637 { 22638 tcp_t *tcp; 22639 tcp_t *tmp_tcp; 22640 tcp_t **temp_tcp_array; 22641 tf_t *tbf; 22642 in_port_t net_port; 22643 in_port_t port; 22644 int32_t cur_size; 22645 int i, j; 22646 boolean_t used; 22647 tcp_rport_t tmp_ports[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE]; 22648 zoneid_t zoneid = GLOBAL_ZONEID; 22649 22650 /* Sanity check. */ 22651 if (size <= 0 || size > TCP_RESERVED_PORTS_RANGE_MAX) { 22652 return (B_FALSE); 22653 } 22654 22655 rw_enter(&tcp_reserved_port_lock, RW_WRITER); 22656 if (tcp_reserved_port_array_size == TCP_RESERVED_PORTS_ARRAY_MAX_SIZE) { 22657 rw_exit(&tcp_reserved_port_lock); 22658 return (B_FALSE); 22659 } 22660 22661 /* 22662 * Find the starting port to try. Since the port ranges are ordered 22663 * in the reserved port array, we can do a simple search here. 22664 */ 22665 *lo_port = TCP_SMALLEST_RESERVED_PORT; 22666 *hi_port = TCP_LARGEST_RESERVED_PORT; 22667 for (i = 0; i < tcp_reserved_port_array_size; 22668 *lo_port = tcp_reserved_port[i].hi_port + 1, i++) { 22669 if (tcp_reserved_port[i].lo_port - *lo_port >= size) { 22670 *hi_port = tcp_reserved_port[i].lo_port - 1; 22671 break; 22672 } 22673 } 22674 /* No available port range. */ 22675 if (i == tcp_reserved_port_array_size && *hi_port - *lo_port < size) { 22676 rw_exit(&tcp_reserved_port_lock); 22677 return (B_FALSE); 22678 } 22679 22680 temp_tcp_array = kmem_zalloc(size * sizeof (tcp_t *), KM_NOSLEEP); 22681 if (temp_tcp_array == NULL) { 22682 rw_exit(&tcp_reserved_port_lock); 22683 return (B_FALSE); 22684 } 22685 22686 /* Go thru the port range to see if some ports are already bound. */ 22687 for (port = *lo_port, cur_size = 0; 22688 cur_size < size && port <= *hi_port; 22689 cur_size++, port++) { 22690 used = B_FALSE; 22691 net_port = htons(port); 22692 tbf = &tcp_bind_fanout[TCP_BIND_HASH(net_port)]; 22693 mutex_enter(&tbf->tf_lock); 22694 for (tcp = tbf->tf_tcp; tcp != NULL; 22695 tcp = tcp->tcp_bind_hash) { 22696 if (IPCL_ZONE_MATCH(tcp->tcp_connp, zoneid) && 22697 net_port == tcp->tcp_lport) { 22698 /* 22699 * A port is already bound. Search again 22700 * starting from port + 1. Release all 22701 * temporary tcps. 22702 */ 22703 mutex_exit(&tbf->tf_lock); 22704 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); 22705 *lo_port = port + 1; 22706 cur_size = -1; 22707 used = B_TRUE; 22708 break; 22709 } 22710 } 22711 if (!used) { 22712 if ((tmp_tcp = tcp_alloc_temp_tcp(net_port)) == NULL) { 22713 /* 22714 * Allocation failure. Just fail the request. 22715 * Need to remove all those temporary tcp 22716 * structures. 22717 */ 22718 mutex_exit(&tbf->tf_lock); 22719 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); 22720 rw_exit(&tcp_reserved_port_lock); 22721 kmem_free(temp_tcp_array, 22722 (hi_port - lo_port + 1) * 22723 sizeof (tcp_t *)); 22724 return (B_FALSE); 22725 } 22726 temp_tcp_array[cur_size] = tmp_tcp; 22727 tcp_bind_hash_insert(tbf, tmp_tcp, B_TRUE); 22728 mutex_exit(&tbf->tf_lock); 22729 } 22730 } 22731 22732 /* 22733 * The current range is not large enough. 
We can actually do another 22734 * search if this search is done between 2 reserved port ranges. But 22735 * for first release, we just stop here and return saying that no port 22736 * range is available. 22737 */ 22738 if (cur_size < size) { 22739 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); 22740 rw_exit(&tcp_reserved_port_lock); 22741 kmem_free(temp_tcp_array, size * sizeof (tcp_t *)); 22742 return (B_FALSE); 22743 } 22744 *hi_port = port - 1; 22745 22746 /* 22747 * Insert range into array in ascending order. Since this function 22748 * must not be called often, we choose to use the simplest method. 22749 * The above array should not consume excessive stack space as 22750 * the size must be very small. If in future releases, we find 22751 * that we should provide more reserved port ranges, this function 22752 * has to be modified to be more efficient. 22753 */ 22754 if (tcp_reserved_port_array_size == 0) { 22755 tcp_reserved_port[0].lo_port = *lo_port; 22756 tcp_reserved_port[0].hi_port = *hi_port; 22757 tcp_reserved_port[0].temp_tcp_array = temp_tcp_array; 22758 } else { 22759 for (i = 0, j = 0; i < tcp_reserved_port_array_size; i++, j++) { 22760 if (*lo_port < tcp_reserved_port[i].lo_port && i == j) { 22761 tmp_ports[j].lo_port = *lo_port; 22762 tmp_ports[j].hi_port = *hi_port; 22763 tmp_ports[j].temp_tcp_array = temp_tcp_array; 22764 j++; 22765 } 22766 tmp_ports[j].lo_port = tcp_reserved_port[i].lo_port; 22767 tmp_ports[j].hi_port = tcp_reserved_port[i].hi_port; 22768 tmp_ports[j].temp_tcp_array = 22769 tcp_reserved_port[i].temp_tcp_array; 22770 } 22771 if (j == i) { 22772 tmp_ports[j].lo_port = *lo_port; 22773 tmp_ports[j].hi_port = *hi_port; 22774 tmp_ports[j].temp_tcp_array = temp_tcp_array; 22775 } 22776 bcopy(tmp_ports, tcp_reserved_port, sizeof (tmp_ports)); 22777 } 22778 tcp_reserved_port_array_size++; 22779 rw_exit(&tcp_reserved_port_lock); 22780 return (B_TRUE); 22781 } 22782 22783 /* 22784 * Check to see if a port is in any reserved port range. 22785 * 22786 * Params: 22787 * in_port_t port: the port to be verified. 22788 * 22789 * Return: 22790 * B_TRUE is the port is inside a reserved port range, B_FALSE otherwise. 22791 */ 22792 boolean_t 22793 tcp_reserved_port_check(in_port_t port) 22794 { 22795 int i; 22796 22797 rw_enter(&tcp_reserved_port_lock, RW_READER); 22798 for (i = 0; i < tcp_reserved_port_array_size; i++) { 22799 if (port >= tcp_reserved_port[i].lo_port || 22800 port <= tcp_reserved_port[i].hi_port) { 22801 rw_exit(&tcp_reserved_port_lock); 22802 return (B_TRUE); 22803 } 22804 } 22805 rw_exit(&tcp_reserved_port_lock); 22806 return (B_FALSE); 22807 } 22808 22809 /* 22810 * To list all reserved port ranges. This is the function to handle 22811 * ndd tcp_reserved_port_list. 22812 */ 22813 /* ARGSUSED */ 22814 static int 22815 tcp_reserved_port_list(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 22816 { 22817 int i; 22818 22819 rw_enter(&tcp_reserved_port_lock, RW_READER); 22820 if (tcp_reserved_port_array_size > 0) 22821 (void) mi_mpprintf(mp, "The following ports are reserved:"); 22822 else 22823 (void) mi_mpprintf(mp, "No port is reserved."); 22824 for (i = 0; i < tcp_reserved_port_array_size; i++) { 22825 (void) mi_mpprintf(mp, "%d-%d", 22826 tcp_reserved_port[i].lo_port, tcp_reserved_port[i].hi_port); 22827 } 22828 rw_exit(&tcp_reserved_port_lock); 22829 return (0); 22830 } 22831 22832 /* 22833 * Hash list insertion routine for tcp_t structures. 
22834 * Inserts entries with the ones bound to a specific IP address first 22835 * followed by those bound to INADDR_ANY. 22836 */ 22837 static void 22838 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) 22839 { 22840 tcp_t **tcpp; 22841 tcp_t *tcpnext; 22842 22843 if (tcp->tcp_ptpbhn != NULL) { 22844 ASSERT(!caller_holds_lock); 22845 tcp_bind_hash_remove(tcp); 22846 } 22847 tcpp = &tbf->tf_tcp; 22848 if (!caller_holds_lock) { 22849 mutex_enter(&tbf->tf_lock); 22850 } else { 22851 ASSERT(MUTEX_HELD(&tbf->tf_lock)); 22852 } 22853 tcpnext = tcpp[0]; 22854 if (tcpnext) { 22855 /* 22856 * If the new tcp bound to the INADDR_ANY address 22857 * and the first one in the list is not bound to 22858 * INADDR_ANY we skip all entries until we find the 22859 * first one bound to INADDR_ANY. 22860 * This makes sure that applications binding to a 22861 * specific address get preference over those binding to 22862 * INADDR_ANY. 22863 */ 22864 if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) && 22865 !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) { 22866 while ((tcpnext = tcpp[0]) != NULL && 22867 !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) 22868 tcpp = &(tcpnext->tcp_bind_hash); 22869 if (tcpnext) 22870 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash; 22871 } else 22872 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash; 22873 } 22874 tcp->tcp_bind_hash = tcpnext; 22875 tcp->tcp_ptpbhn = tcpp; 22876 tcpp[0] = tcp; 22877 if (!caller_holds_lock) 22878 mutex_exit(&tbf->tf_lock); 22879 } 22880 22881 /* 22882 * Hash list removal routine for tcp_t structures. 22883 */ 22884 static void 22885 tcp_bind_hash_remove(tcp_t *tcp) 22886 { 22887 tcp_t *tcpnext; 22888 kmutex_t *lockp; 22889 22890 if (tcp->tcp_ptpbhn == NULL) 22891 return; 22892 22893 /* 22894 * Extract the lock pointer in case there are concurrent 22895 * hash_remove's for this instance. 22896 */ 22897 ASSERT(tcp->tcp_lport != 0); 22898 lockp = &tcp_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock; 22899 22900 ASSERT(lockp != NULL); 22901 mutex_enter(lockp); 22902 if (tcp->tcp_ptpbhn) { 22903 tcpnext = tcp->tcp_bind_hash; 22904 if (tcpnext) { 22905 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 22906 tcp->tcp_bind_hash = NULL; 22907 } 22908 *tcp->tcp_ptpbhn = tcpnext; 22909 tcp->tcp_ptpbhn = NULL; 22910 } 22911 mutex_exit(lockp); 22912 } 22913 22914 22915 /* 22916 * Hash list lookup routine for tcp_t structures. 22917 * Returns with a CONN_INC_REF tcp structure. Caller must do a CONN_DEC_REF. 22918 */ 22919 static tcp_t * 22920 tcp_acceptor_hash_lookup(t_uscalar_t id) 22921 { 22922 tf_t *tf; 22923 tcp_t *tcp; 22924 22925 tf = &tcp_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 22926 mutex_enter(&tf->tf_lock); 22927 for (tcp = tf->tf_tcp; tcp != NULL; 22928 tcp = tcp->tcp_acceptor_hash) { 22929 if (tcp->tcp_acceptor_id == id) { 22930 CONN_INC_REF(tcp->tcp_connp); 22931 mutex_exit(&tf->tf_lock); 22932 return (tcp); 22933 } 22934 } 22935 mutex_exit(&tf->tf_lock); 22936 return (NULL); 22937 } 22938 22939 22940 /* 22941 * Hash list insertion routine for tcp_t structures. 
22942 */ 22943 void 22944 tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp) 22945 { 22946 tf_t *tf; 22947 tcp_t **tcpp; 22948 tcp_t *tcpnext; 22949 22950 tf = &tcp_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 22951 22952 if (tcp->tcp_ptpahn != NULL) 22953 tcp_acceptor_hash_remove(tcp); 22954 tcpp = &tf->tf_tcp; 22955 mutex_enter(&tf->tf_lock); 22956 tcpnext = tcpp[0]; 22957 if (tcpnext) 22958 tcpnext->tcp_ptpahn = &tcp->tcp_acceptor_hash; 22959 tcp->tcp_acceptor_hash = tcpnext; 22960 tcp->tcp_ptpahn = tcpp; 22961 tcpp[0] = tcp; 22962 tcp->tcp_acceptor_lockp = &tf->tf_lock; /* For tcp_*_hash_remove */ 22963 mutex_exit(&tf->tf_lock); 22964 } 22965 22966 /* 22967 * Hash list removal routine for tcp_t structures. 22968 */ 22969 static void 22970 tcp_acceptor_hash_remove(tcp_t *tcp) 22971 { 22972 tcp_t *tcpnext; 22973 kmutex_t *lockp; 22974 22975 /* 22976 * Extract the lock pointer in case there are concurrent 22977 * hash_remove's for this instance. 22978 */ 22979 lockp = tcp->tcp_acceptor_lockp; 22980 22981 if (tcp->tcp_ptpahn == NULL) 22982 return; 22983 22984 ASSERT(lockp != NULL); 22985 mutex_enter(lockp); 22986 if (tcp->tcp_ptpahn) { 22987 tcpnext = tcp->tcp_acceptor_hash; 22988 if (tcpnext) { 22989 tcpnext->tcp_ptpahn = tcp->tcp_ptpahn; 22990 tcp->tcp_acceptor_hash = NULL; 22991 } 22992 *tcp->tcp_ptpahn = tcpnext; 22993 tcp->tcp_ptpahn = NULL; 22994 } 22995 mutex_exit(lockp); 22996 tcp->tcp_acceptor_lockp = NULL; 22997 } 22998 22999 /* ARGSUSED */ 23000 static int 23001 tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) 23002 { 23003 int error = 0; 23004 int retval; 23005 char *end; 23006 23007 tcp_hsp_t *hsp; 23008 tcp_hsp_t *hspprev; 23009 23010 ipaddr_t addr = 0; /* Address we're looking for */ 23011 in6_addr_t v6addr; /* Address we're looking for */ 23012 uint32_t hash; /* Hash of that address */ 23013 23014 /* 23015 * If the following variables are still zero after parsing the input 23016 * string, the user didn't specify them and we don't change them in 23017 * the HSP. 
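 *
 * The value string parsed below is an address followed by optional
 * keywords.  Illustrative examples only (assuming this callback is
 * registered with ndd as tcp_host_param / tcp_host_param_ipv6):
 *
 *	10.0.0.5 mask 255.255.255.0 sendspace 65536 recvspace 65536 timestamp 1
 *	10.0.0.5 delete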
23018 */ 23019 23020 ipaddr_t mask = 0; /* Subnet mask */ 23021 in6_addr_t v6mask; 23022 long sendspace = 0; /* Send buffer size */ 23023 long recvspace = 0; /* Receive buffer size */ 23024 long timestamp = 0; /* Originate TCP TSTAMP option, 1 = yes */ 23025 boolean_t delete = B_FALSE; /* User asked to delete this HSP */ 23026 23027 rw_enter(&tcp_hsp_lock, RW_WRITER); 23028 23029 /* Parse and validate address */ 23030 if (af == AF_INET) { 23031 retval = inet_pton(af, value, &addr); 23032 if (retval == 1) 23033 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 23034 } else if (af == AF_INET6) { 23035 retval = inet_pton(af, value, &v6addr); 23036 } else { 23037 error = EINVAL; 23038 goto done; 23039 } 23040 if (retval == 0) { 23041 error = EINVAL; 23042 goto done; 23043 } 23044 23045 while ((*value) && *value != ' ') 23046 value++; 23047 23048 /* Parse individual keywords, set variables if found */ 23049 while (*value) { 23050 /* Skip leading blanks */ 23051 23052 while (*value == ' ' || *value == '\t') 23053 value++; 23054 23055 /* If at end of string, we're done */ 23056 23057 if (!*value) 23058 break; 23059 23060 /* We have a word, figure out what it is */ 23061 23062 if (strncmp("mask", value, 4) == 0) { 23063 value += 4; 23064 while (*value == ' ' || *value == '\t') 23065 value++; 23066 /* Parse subnet mask */ 23067 if (af == AF_INET) { 23068 retval = inet_pton(af, value, &mask); 23069 if (retval == 1) { 23070 V4MASK_TO_V6(mask, v6mask); 23071 } 23072 } else if (af == AF_INET6) { 23073 retval = inet_pton(af, value, &v6mask); 23074 } 23075 if (retval != 1) { 23076 error = EINVAL; 23077 goto done; 23078 } 23079 while ((*value) && *value != ' ') 23080 value++; 23081 } else if (strncmp("sendspace", value, 9) == 0) { 23082 value += 9; 23083 23084 if (ddi_strtol(value, &end, 0, &sendspace) != 0 || 23085 sendspace < TCP_XMIT_HIWATER || 23086 sendspace >= (1L<<30)) { 23087 error = EINVAL; 23088 goto done; 23089 } 23090 value = end; 23091 } else if (strncmp("recvspace", value, 9) == 0) { 23092 value += 9; 23093 23094 if (ddi_strtol(value, &end, 0, &recvspace) != 0 || 23095 recvspace < TCP_RECV_HIWATER || 23096 recvspace >= (1L<<30)) { 23097 error = EINVAL; 23098 goto done; 23099 } 23100 value = end; 23101 } else if (strncmp("timestamp", value, 9) == 0) { 23102 value += 9; 23103 23104 if (ddi_strtol(value, &end, 0, ×tamp) != 0 || 23105 timestamp < 0 || timestamp > 1) { 23106 error = EINVAL; 23107 goto done; 23108 } 23109 23110 /* 23111 * We increment timestamp so we know it's been set; 23112 * this is undone when we put it in the HSP 23113 */ 23114 timestamp++; 23115 value = end; 23116 } else if (strncmp("delete", value, 6) == 0) { 23117 value += 6; 23118 delete = B_TRUE; 23119 } else { 23120 error = EINVAL; 23121 goto done; 23122 } 23123 } 23124 23125 /* Hash address for lookup */ 23126 23127 hash = TCP_HSP_HASH(addr); 23128 23129 if (delete) { 23130 /* 23131 * Note that deletes don't return an error if the thing 23132 * we're trying to delete isn't there. 
23133 */ 23134 if (tcp_hsp_hash == NULL) 23135 goto done; 23136 hsp = tcp_hsp_hash[hash]; 23137 23138 if (hsp) { 23139 if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, 23140 &v6addr)) { 23141 tcp_hsp_hash[hash] = hsp->tcp_hsp_next; 23142 mi_free((char *)hsp); 23143 } else { 23144 hspprev = hsp; 23145 while ((hsp = hsp->tcp_hsp_next) != NULL) { 23146 if (IN6_ARE_ADDR_EQUAL( 23147 &hsp->tcp_hsp_addr_v6, &v6addr)) { 23148 hspprev->tcp_hsp_next = 23149 hsp->tcp_hsp_next; 23150 mi_free((char *)hsp); 23151 break; 23152 } 23153 hspprev = hsp; 23154 } 23155 } 23156 } 23157 } else { 23158 /* 23159 * We're adding/modifying an HSP. If we haven't already done 23160 * so, allocate the hash table. 23161 */ 23162 23163 if (!tcp_hsp_hash) { 23164 tcp_hsp_hash = (tcp_hsp_t **) 23165 mi_zalloc(sizeof (tcp_hsp_t *) * TCP_HSP_HASH_SIZE); 23166 if (!tcp_hsp_hash) { 23167 error = EINVAL; 23168 goto done; 23169 } 23170 } 23171 23172 /* Get head of hash chain */ 23173 23174 hsp = tcp_hsp_hash[hash]; 23175 23176 /* Try to find pre-existing hsp on hash chain */ 23177 /* Doesn't handle CIDR prefixes. */ 23178 while (hsp) { 23179 if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, &v6addr)) 23180 break; 23181 hsp = hsp->tcp_hsp_next; 23182 } 23183 23184 /* 23185 * If we didn't, create one with default values and put it 23186 * at head of hash chain 23187 */ 23188 23189 if (!hsp) { 23190 hsp = (tcp_hsp_t *)mi_zalloc(sizeof (tcp_hsp_t)); 23191 if (!hsp) { 23192 error = EINVAL; 23193 goto done; 23194 } 23195 hsp->tcp_hsp_next = tcp_hsp_hash[hash]; 23196 tcp_hsp_hash[hash] = hsp; 23197 } 23198 23199 /* Set values that the user asked us to change */ 23200 23201 hsp->tcp_hsp_addr_v6 = v6addr; 23202 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) 23203 hsp->tcp_hsp_vers = IPV4_VERSION; 23204 else 23205 hsp->tcp_hsp_vers = IPV6_VERSION; 23206 hsp->tcp_hsp_subnet_v6 = v6mask; 23207 if (sendspace > 0) 23208 hsp->tcp_hsp_sendspace = sendspace; 23209 if (recvspace > 0) 23210 hsp->tcp_hsp_recvspace = recvspace; 23211 if (timestamp > 0) 23212 hsp->tcp_hsp_tstamp = timestamp - 1; 23213 } 23214 23215 done: 23216 rw_exit(&tcp_hsp_lock); 23217 return (error); 23218 } 23219 23220 /* Set callback routine passed to nd_load by tcp_param_register. */ 23221 /* ARGSUSED */ 23222 static int 23223 tcp_host_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 23224 { 23225 return (tcp_host_param_setvalue(q, mp, value, cp, AF_INET)); 23226 } 23227 /* ARGSUSED */ 23228 static int 23229 tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 23230 cred_t *cr) 23231 { 23232 return (tcp_host_param_setvalue(q, mp, value, cp, AF_INET6)); 23233 } 23234 23235 /* TCP host parameters report triggered via the Named Dispatch mechanism. 
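 * The report produced below looks roughly like this (illustrative values):
 *
 *	Hash	HSP			Address		Subnet Mask	Send		Receive		TStamp
 *	 005	fffffe8012345678	10.0.0.5	255.255.255.0	0000065536	0000065536	1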
*/ 23236 /* ARGSUSED */ 23237 static int 23238 tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 23239 { 23240 tcp_hsp_t *hsp; 23241 int i; 23242 char addrbuf[INET6_ADDRSTRLEN], subnetbuf[INET6_ADDRSTRLEN]; 23243 23244 rw_enter(&tcp_hsp_lock, RW_READER); 23245 (void) mi_mpprintf(mp, 23246 "Hash HSP " MI_COL_HDRPAD_STR 23247 "Address Subnet Mask Send Receive TStamp"); 23248 if (tcp_hsp_hash) { 23249 for (i = 0; i < TCP_HSP_HASH_SIZE; i++) { 23250 hsp = tcp_hsp_hash[i]; 23251 while (hsp) { 23252 if (hsp->tcp_hsp_vers == IPV4_VERSION) { 23253 (void) inet_ntop(AF_INET, 23254 &hsp->tcp_hsp_addr, 23255 addrbuf, sizeof (addrbuf)); 23256 (void) inet_ntop(AF_INET, 23257 &hsp->tcp_hsp_subnet, 23258 subnetbuf, sizeof (subnetbuf)); 23259 } else { 23260 (void) inet_ntop(AF_INET6, 23261 &hsp->tcp_hsp_addr_v6, 23262 addrbuf, sizeof (addrbuf)); 23263 (void) inet_ntop(AF_INET6, 23264 &hsp->tcp_hsp_subnet_v6, 23265 subnetbuf, sizeof (subnetbuf)); 23266 } 23267 (void) mi_mpprintf(mp, 23268 " %03d " MI_COL_PTRFMT_STR 23269 "%s %s %010d %010d %d", 23270 i, 23271 (void *)hsp, 23272 addrbuf, 23273 subnetbuf, 23274 hsp->tcp_hsp_sendspace, 23275 hsp->tcp_hsp_recvspace, 23276 hsp->tcp_hsp_tstamp); 23277 23278 hsp = hsp->tcp_hsp_next; 23279 } 23280 } 23281 } 23282 rw_exit(&tcp_hsp_lock); 23283 return (0); 23284 } 23285 23286 23287 /* Data for fast netmask macro used by tcp_hsp_lookup */ 23288 23289 static ipaddr_t netmasks[] = { 23290 IN_CLASSA_NET, IN_CLASSA_NET, IN_CLASSB_NET, 23291 IN_CLASSC_NET | IN_CLASSD_NET /* Class C,D,E */ 23292 }; 23293 23294 #define netmask(addr) (netmasks[(ipaddr_t)(addr) >> 30]) 23295 23296 /* 23297 * XXX This routine should go away and instead we should use the metrics 23298 * associated with the routes to determine the default sndspace and rcvspace. 23299 */ 23300 static tcp_hsp_t * 23301 tcp_hsp_lookup(ipaddr_t addr) 23302 { 23303 tcp_hsp_t *hsp = NULL; 23304 23305 /* Quick check without acquiring the lock. */ 23306 if (tcp_hsp_hash == NULL) 23307 return (NULL); 23308 23309 rw_enter(&tcp_hsp_lock, RW_READER); 23310 23311 /* This routine finds the best-matching HSP for address addr. */ 23312 23313 if (tcp_hsp_hash) { 23314 int i; 23315 ipaddr_t srchaddr; 23316 tcp_hsp_t *hsp_net; 23317 23318 /* We do three passes: host, network, and subnet. */ 23319 23320 srchaddr = addr; 23321 23322 for (i = 1; i <= 3; i++) { 23323 /* Look for exact match on srchaddr */ 23324 23325 hsp = tcp_hsp_hash[TCP_HSP_HASH(srchaddr)]; 23326 while (hsp) { 23327 if (hsp->tcp_hsp_vers == IPV4_VERSION && 23328 hsp->tcp_hsp_addr == srchaddr) 23329 break; 23330 hsp = hsp->tcp_hsp_next; 23331 } 23332 ASSERT(hsp == NULL || 23333 hsp->tcp_hsp_vers == IPV4_VERSION); 23334 23335 /* 23336 * If this is the first pass: 23337 * If we found a match, great, return it. 23338 * If not, search for the network on the second pass. 23339 */ 23340 23341 if (i == 1) 23342 if (hsp) 23343 break; 23344 else 23345 { 23346 srchaddr = addr & netmask(addr); 23347 continue; 23348 } 23349 23350 /* 23351 * If this is the second pass: 23352 * If we found a match, but there's a subnet mask, 23353 * save the match but try again using the subnet 23354 * mask on the third pass. 23355 * Otherwise, return whatever we found. 23356 */ 23357 23358 if (i == 2) { 23359 if (hsp && hsp->tcp_hsp_subnet) { 23360 hsp_net = hsp; 23361 srchaddr = addr & hsp->tcp_hsp_subnet; 23362 continue; 23363 } else { 23364 break; 23365 } 23366 } 23367 23368 /* 23369 * This must be the third pass. 
If we didn't find 23370 * anything, return the saved network HSP instead. 23371 */ 23372 23373 if (!hsp) 23374 hsp = hsp_net; 23375 } 23376 } 23377 23378 rw_exit(&tcp_hsp_lock); 23379 return (hsp); 23380 } 23381 23382 /* 23383 * XXX Equally broken as the IPv4 routine. Doesn't handle longest 23384 * match lookup. 23385 */ 23386 static tcp_hsp_t * 23387 tcp_hsp_lookup_ipv6(in6_addr_t *v6addr) 23388 { 23389 tcp_hsp_t *hsp = NULL; 23390 23391 /* Quick check without acquiring the lock. */ 23392 if (tcp_hsp_hash == NULL) 23393 return (NULL); 23394 23395 rw_enter(&tcp_hsp_lock, RW_READER); 23396 23397 /* This routine finds the best-matching HSP for address addr. */ 23398 23399 if (tcp_hsp_hash) { 23400 int i; 23401 in6_addr_t v6srchaddr; 23402 tcp_hsp_t *hsp_net; 23403 23404 /* We do three passes: host, network, and subnet. */ 23405 23406 v6srchaddr = *v6addr; 23407 23408 for (i = 1; i <= 3; i++) { 23409 /* Look for exact match on srchaddr */ 23410 23411 hsp = tcp_hsp_hash[TCP_HSP_HASH( 23412 V4_PART_OF_V6(v6srchaddr))]; 23413 while (hsp) { 23414 if (hsp->tcp_hsp_vers == IPV6_VERSION && 23415 IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, 23416 &v6srchaddr)) 23417 break; 23418 hsp = hsp->tcp_hsp_next; 23419 } 23420 23421 /* 23422 * If this is the first pass: 23423 * If we found a match, great, return it. 23424 * If not, search for the network on the second pass. 23425 */ 23426 23427 if (i == 1) 23428 if (hsp) 23429 break; 23430 else { 23431 /* Assume a 64 bit mask */ 23432 v6srchaddr.s6_addr32[0] = 23433 v6addr->s6_addr32[0]; 23434 v6srchaddr.s6_addr32[1] = 23435 v6addr->s6_addr32[1]; 23436 v6srchaddr.s6_addr32[2] = 0; 23437 v6srchaddr.s6_addr32[3] = 0; 23438 continue; 23439 } 23440 23441 /* 23442 * If this is the second pass: 23443 * If we found a match, but there's a subnet mask, 23444 * save the match but try again using the subnet 23445 * mask on the third pass. 23446 * Otherwise, return whatever we found. 23447 */ 23448 23449 if (i == 2) { 23450 ASSERT(hsp == NULL || 23451 hsp->tcp_hsp_vers == IPV6_VERSION); 23452 if (hsp && 23453 !IN6_IS_ADDR_UNSPECIFIED( 23454 &hsp->tcp_hsp_subnet_v6)) { 23455 hsp_net = hsp; 23456 V6_MASK_COPY(*v6addr, 23457 hsp->tcp_hsp_subnet_v6, v6srchaddr); 23458 continue; 23459 } else { 23460 break; 23461 } 23462 } 23463 23464 /* 23465 * This must be the third pass. If we didn't find 23466 * anything, return the saved network HSP instead. 23467 */ 23468 23469 if (!hsp) 23470 hsp = hsp_net; 23471 } 23472 } 23473 23474 rw_exit(&tcp_hsp_lock); 23475 return (hsp); 23476 } 23477 23478 /* 23479 * Type three generator adapted from the random() function in 4.4 BSD: 23480 */ 23481 23482 /* 23483 * Copyright (c) 1983, 1993 23484 * The Regents of the University of California. All rights reserved. 23485 * 23486 * Redistribution and use in source and binary forms, with or without 23487 * modification, are permitted provided that the following conditions 23488 * are met: 23489 * 1. Redistributions of source code must retain the above copyright 23490 * notice, this list of conditions and the following disclaimer. 23491 * 2. Redistributions in binary form must reproduce the above copyright 23492 * notice, this list of conditions and the following disclaimer in the 23493 * documentation and/or other materials provided with the distribution. 23494 * 3. All advertising materials mentioning features or use of this software 23495 * must display the following acknowledgement: 23496 * This product includes software developed by the University of 23497 * California, Berkeley and its contributors. 
23498 * 4. Neither the name of the University nor the names of its contributors 23499 * may be used to endorse or promote products derived from this software 23500 * without specific prior written permission. 23501 * 23502 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23503 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23504 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23505 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23506 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23507 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23508 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23509 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23510 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23511 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23512 * SUCH DAMAGE. 23513 */ 23514 23515 /* Type 3 -- x**31 + x**3 + 1 */ 23516 #define DEG_3 31 23517 #define SEP_3 3 23518 23519 23520 /* Protected by tcp_random_lock */ 23521 static int tcp_randtbl[DEG_3 + 1]; 23522 23523 static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 23524 static int *tcp_random_rptr = &tcp_randtbl[1]; 23525 23526 static int *tcp_random_state = &tcp_randtbl[1]; 23527 static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 23528 23529 kmutex_t tcp_random_lock; 23530 23531 void 23532 tcp_random_init(void) 23533 { 23534 int i; 23535 hrtime_t hrt; 23536 time_t wallclock; 23537 uint64_t result; 23538 23539 /* 23540 * Use high-res timer and current time for seed. Gethrtime() returns 23541 * a longlong, which may contain resolution down to nanoseconds. 23542 * The current time will either be a 32-bit or a 64-bit quantity. 23543 * XOR the two together in a 64-bit result variable. 23544 * Convert the result to a 32-bit value by multiplying the high-order 23545 * 32-bits by the low-order 32-bits. 23546 */ 23547 23548 hrt = gethrtime(); 23549 (void) drv_getparm(TIME, &wallclock); 23550 result = (uint64_t)wallclock ^ (uint64_t)hrt; 23551 mutex_enter(&tcp_random_lock); 23552 tcp_random_state[0] = ((result >> 32) & 0xffffffff) * 23553 (result & 0xffffffff); 23554 23555 for (i = 1; i < DEG_3; i++) 23556 tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 23557 + 12345; 23558 tcp_random_fptr = &tcp_random_state[SEP_3]; 23559 tcp_random_rptr = &tcp_random_state[0]; 23560 mutex_exit(&tcp_random_lock); 23561 for (i = 0; i < 10 * DEG_3; i++) 23562 (void) tcp_random(); 23563 } 23564 23565 /* 23566 * tcp_random: Return a random number in the range [1 - (128K + 1)]. 23567 * This range is selected to be approximately centered on TCP_ISS / 2, 23568 * and easy to compute. We get this value by generating a 32-bit random 23569 * number, selecting out the high-order 17 bits, and then adding one so 23570 * that we never return zero. 23571 */ 23572 int 23573 tcp_random(void) 23574 { 23575 int i; 23576 23577 mutex_enter(&tcp_random_lock); 23578 *tcp_random_fptr += *tcp_random_rptr; 23579 23580 /* 23581 * The high-order bits are more random than the low-order bits, 23582 * so we select out the high-order 17 bits and add one so that 23583 * we never return zero. 
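 *
 * That is, (x >> 15) & 0x1ffff keeps bits 15..31 of the 32 bit state
 * word, a value in [0, 131071]; adding one shifts the result into
 * [1, 131072], i.e. the roughly 128K range described above tcp_random().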
23584 */ 23585 i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 23586 if (++tcp_random_fptr >= tcp_random_end_ptr) { 23587 tcp_random_fptr = tcp_random_state; 23588 ++tcp_random_rptr; 23589 } else if (++tcp_random_rptr >= tcp_random_end_ptr) 23590 tcp_random_rptr = tcp_random_state; 23591 23592 mutex_exit(&tcp_random_lock); 23593 return (i); 23594 } 23595 23596 /* 23597 * XXX This will go away when TPI is extended to send 23598 * info reqs to sockfs/timod ..... 23599 * Given a queue, set the max packet size for the write 23600 * side of the queue below stream head. This value is 23601 * cached on the stream head. 23602 * Returns 1 on success, 0 otherwise. 23603 */ 23604 static int 23605 setmaxps(queue_t *q, int maxpsz) 23606 { 23607 struct stdata *stp; 23608 queue_t *wq; 23609 stp = STREAM(q); 23610 23611 /* 23612 * At this point change of a queue parameter is not allowed 23613 * when a multiplexor is sitting on top. 23614 */ 23615 if (stp->sd_flag & STPLEX) 23616 return (0); 23617 23618 claimstr(stp->sd_wrq); 23619 wq = stp->sd_wrq->q_next; 23620 ASSERT(wq != NULL); 23621 (void) strqset(wq, QMAXPSZ, 0, maxpsz); 23622 releasestr(stp->sd_wrq); 23623 return (1); 23624 } 23625 23626 static int 23627 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, 23628 int *t_errorp, int *sys_errorp) 23629 { 23630 int error; 23631 int is_absreq_failure; 23632 t_scalar_t *opt_lenp; 23633 t_scalar_t opt_offset; 23634 int prim_type; 23635 struct T_conn_req *tcreqp; 23636 struct T_conn_res *tcresp; 23637 cred_t *cr; 23638 23639 cr = DB_CREDDEF(mp, tcp->tcp_cred); 23640 23641 prim_type = ((union T_primitives *)mp->b_rptr)->type; 23642 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || 23643 prim_type == T_CONN_RES); 23644 23645 switch (prim_type) { 23646 case T_CONN_REQ: 23647 tcreqp = (struct T_conn_req *)mp->b_rptr; 23648 opt_offset = tcreqp->OPT_offset; 23649 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; 23650 break; 23651 case O_T_CONN_RES: 23652 case T_CONN_RES: 23653 tcresp = (struct T_conn_res *)mp->b_rptr; 23654 opt_offset = tcresp->OPT_offset; 23655 opt_lenp = (t_scalar_t *)&tcresp->OPT_length; 23656 break; 23657 } 23658 23659 *t_errorp = 0; 23660 *sys_errorp = 0; 23661 *do_disconnectp = 0; 23662 23663 error = tpi_optcom_buf(tcp->tcp_wq, mp, opt_lenp, 23664 opt_offset, cr, &tcp_opt_obj, 23665 NULL, &is_absreq_failure); 23666 23667 switch (error) { 23668 case 0: /* no error */ 23669 ASSERT(is_absreq_failure == 0); 23670 return (0); 23671 case ENOPROTOOPT: 23672 *t_errorp = TBADOPT; 23673 break; 23674 case EACCES: 23675 *t_errorp = TACCES; 23676 break; 23677 default: 23678 *t_errorp = TSYSERR; *sys_errorp = error; 23679 break; 23680 } 23681 if (is_absreq_failure != 0) { 23682 /* 23683 * The connection request should get the local ack 23684 * T_OK_ACK and then a T_DISCON_IND. 23685 */ 23686 *do_disconnectp = 1; 23687 } 23688 return (-1); 23689 } 23690 23691 /* 23692 * Split this function out so that if the secret changes, I'm okay. 23693 * 23694 * Initialize the tcp_iss_cookie and tcp_iss_key. 23695 */ 23696 23697 #define PASSWD_SIZE 16 /* MUST be multiple of 4 */ 23698 23699 static void 23700 tcp_iss_key_init(uint8_t *phrase, int len) 23701 { 23702 struct { 23703 int32_t current_time; 23704 uint32_t randnum; 23705 uint16_t pad; 23706 uint8_t ether[6]; 23707 uint8_t passwd[PASSWD_SIZE]; 23708 } tcp_iss_cookie; 23709 time_t t; 23710 23711 /* 23712 * Start with the current absolute time. 
23713 */ 23714 (void) drv_getparm(TIME, &t); 23715 tcp_iss_cookie.current_time = t; 23716 23717 /* 23718 * XXX - Need a more random number per RFC 1750, not this crap. 23719 * OTOH, if what follows is pretty random, then I'm in better shape. 23720 */ 23721 tcp_iss_cookie.randnum = (uint32_t)(gethrtime() + tcp_random()); 23722 tcp_iss_cookie.pad = 0x365c; /* Picked from HMAC pad values. */ 23723 23724 /* 23725 * The cpu_type_info is pretty non-random. Ugggh. It does serve 23726 * as a good template. 23727 */ 23728 bcopy(&cpu_list->cpu_type_info, &tcp_iss_cookie.passwd, 23729 min(PASSWD_SIZE, sizeof (cpu_list->cpu_type_info))); 23730 23731 /* 23732 * The pass-phrase. Normally this is supplied by user-called NDD. 23733 */ 23734 bcopy(phrase, &tcp_iss_cookie.passwd, min(PASSWD_SIZE, len)); 23735 23736 /* 23737 * See 4010593 if this section becomes a problem again, 23738 * but the local ethernet address is useful here. 23739 */ 23740 (void) localetheraddr(NULL, 23741 (struct ether_addr *)&tcp_iss_cookie.ether); 23742 23743 /* 23744 * Hash 'em all together. The MD5Final is called per-connection. 23745 */ 23746 mutex_enter(&tcp_iss_key_lock); 23747 MD5Init(&tcp_iss_key); 23748 MD5Update(&tcp_iss_key, (uchar_t *)&tcp_iss_cookie, 23749 sizeof (tcp_iss_cookie)); 23750 mutex_exit(&tcp_iss_key_lock); 23751 } 23752 23753 /* 23754 * Set the RFC 1948 pass phrase 23755 */ 23756 /* ARGSUSED */ 23757 static int 23758 tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 23759 cred_t *cr) 23760 { 23761 /* 23762 * Basically, value contains a new pass phrase. Pass it along! 23763 */ 23764 tcp_iss_key_init((uint8_t *)value, strlen(value)); 23765 return (0); 23766 } 23767 23768 /* ARGSUSED */ 23769 static int 23770 tcp_sack_info_constructor(void *buf, void *cdrarg, int kmflags) 23771 { 23772 bzero(buf, sizeof (tcp_sack_info_t)); 23773 return (0); 23774 } 23775 23776 /* ARGSUSED */ 23777 static int 23778 tcp_iphc_constructor(void *buf, void *cdrarg, int kmflags) 23779 { 23780 bzero(buf, TCP_MAX_COMBINED_HEADER_LENGTH); 23781 return (0); 23782 } 23783 23784 void 23785 tcp_ddi_init(void) 23786 { 23787 int i; 23788 23789 /* Initialize locks */ 23790 rw_init(&tcp_hsp_lock, NULL, RW_DEFAULT, NULL); 23791 mutex_init(&tcp_g_q_lock, NULL, MUTEX_DEFAULT, NULL); 23792 mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); 23793 mutex_init(&tcp_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); 23794 mutex_init(&tcp_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); 23795 rw_init(&tcp_reserved_port_lock, NULL, RW_DEFAULT, NULL); 23796 23797 for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { 23798 mutex_init(&tcp_bind_fanout[i].tf_lock, NULL, 23799 MUTEX_DEFAULT, NULL); 23800 } 23801 23802 for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { 23803 mutex_init(&tcp_acceptor_fanout[i].tf_lock, NULL, 23804 MUTEX_DEFAULT, NULL); 23805 } 23806 23807 /* TCP's IPsec code calls the packet dropper. */ 23808 ip_drop_register(&tcp_dropper, "TCP IPsec policy enforcement"); 23809 23810 if (!tcp_g_nd) { 23811 if (!tcp_param_register(tcp_param_arr, A_CNT(tcp_param_arr))) { 23812 nd_free(&tcp_g_nd); 23813 } 23814 } 23815 23816 /* 23817 * Note: To really walk the device tree you need the devinfo 23818 * pointer to your device which is only available after probe/attach. 
23819 * The following is safe only because it uses ddi_root_node() 23820 */ 23821 tcp_max_optsize = optcom_max_optsize(tcp_opt_obj.odb_opt_des_arr, 23822 tcp_opt_obj.odb_opt_arr_cnt); 23823 23824 tcp_timercache = kmem_cache_create("tcp_timercache", 23825 sizeof (tcp_timer_t) + sizeof (mblk_t), 0, 23826 NULL, NULL, NULL, NULL, NULL, 0); 23827 23828 tcp_sack_info_cache = kmem_cache_create("tcp_sack_info_cache", 23829 sizeof (tcp_sack_info_t), 0, 23830 tcp_sack_info_constructor, NULL, NULL, NULL, NULL, 0); 23831 23832 tcp_iphc_cache = kmem_cache_create("tcp_iphc_cache", 23833 TCP_MAX_COMBINED_HEADER_LENGTH, 0, 23834 tcp_iphc_constructor, NULL, NULL, NULL, NULL, 0); 23835 23836 tcp_squeue_wput_proc = tcp_squeue_switch(tcp_squeue_wput); 23837 tcp_squeue_close_proc = tcp_squeue_switch(tcp_squeue_close); 23838 23839 ip_squeue_init(tcp_squeue_add); 23840 23841 /* Initialize the random number generator */ 23842 tcp_random_init(); 23843 23844 /* 23845 * Initialize RFC 1948 secret values. This will probably be reset once 23846 * by the boot scripts. 23847 * 23848 * Use NULL name, as the name is caught by the new lockstats. 23849 * 23850 * Initialize with some random, non-guessable string, like the global 23851 * T_INFO_ACK. 23852 */ 23853 23854 tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack, 23855 sizeof (tcp_g_t_info_ack)); 23856 23857 if ((tcp_kstat = kstat_create(TCP_MOD_NAME, 0, "tcpstat", 23858 "net", KSTAT_TYPE_NAMED, 23859 sizeof (tcp_statistics) / sizeof (kstat_named_t), 23860 KSTAT_FLAG_VIRTUAL)) != NULL) { 23861 tcp_kstat->ks_data = &tcp_statistics; 23862 kstat_install(tcp_kstat); 23863 } 23864 23865 tcp_kstat_init(); 23866 } 23867 23868 void 23869 tcp_ddi_destroy(void) 23870 { 23871 int i; 23872 23873 nd_free(&tcp_g_nd); 23874 23875 for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { 23876 mutex_destroy(&tcp_bind_fanout[i].tf_lock); 23877 } 23878 23879 for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { 23880 mutex_destroy(&tcp_acceptor_fanout[i].tf_lock); 23881 } 23882 23883 mutex_destroy(&tcp_iss_key_lock); 23884 rw_destroy(&tcp_hsp_lock); 23885 mutex_destroy(&tcp_g_q_lock); 23886 mutex_destroy(&tcp_random_lock); 23887 mutex_destroy(&tcp_epriv_port_lock); 23888 rw_destroy(&tcp_reserved_port_lock); 23889 23890 ip_drop_unregister(&tcp_dropper); 23891 23892 kmem_cache_destroy(tcp_timercache); 23893 kmem_cache_destroy(tcp_sack_info_cache); 23894 kmem_cache_destroy(tcp_iphc_cache); 23895 23896 tcp_kstat_fini(); 23897 } 23898 23899 /* 23900 * Generate ISS, taking into account NDD changes may happen halfway through. 23901 * (If the iss is not zero, set it.) 
23902 */ 23903 23904 static void 23905 tcp_iss_init(tcp_t *tcp) 23906 { 23907 MD5_CTX context; 23908 struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; 23909 uint32_t answer[4]; 23910 23911 tcp_iss_incr_extra += (ISS_INCR >> 1); 23912 tcp->tcp_iss = tcp_iss_incr_extra; 23913 switch (tcp_strong_iss) { 23914 case 2: 23915 mutex_enter(&tcp_iss_key_lock); 23916 context = tcp_iss_key; 23917 mutex_exit(&tcp_iss_key_lock); 23918 arg.ports = tcp->tcp_ports; 23919 if (tcp->tcp_ipversion == IPV4_VERSION) { 23920 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 23921 &arg.src); 23922 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_dst, 23923 &arg.dst); 23924 } else { 23925 arg.src = tcp->tcp_ip6h->ip6_src; 23926 arg.dst = tcp->tcp_ip6h->ip6_dst; 23927 } 23928 MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); 23929 MD5Final((uchar_t *)answer, &context); 23930 tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3]; 23931 /* 23932 * Now that we've hashed into a unique per-connection sequence 23933 * space, add a random increment per strong_iss == 1. So I 23934 * guess we'll have to... 23935 */ 23936 /* FALLTHRU */ 23937 case 1: 23938 tcp->tcp_iss += (gethrtime() >> ISS_NSEC_SHT) + tcp_random(); 23939 break; 23940 default: 23941 tcp->tcp_iss += (uint32_t)gethrestime_sec() * ISS_INCR; 23942 break; 23943 } 23944 tcp->tcp_valid_bits = TCP_ISS_VALID; 23945 tcp->tcp_fss = tcp->tcp_iss - 1; 23946 tcp->tcp_suna = tcp->tcp_iss; 23947 tcp->tcp_snxt = tcp->tcp_iss + 1; 23948 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 23949 tcp->tcp_csuna = tcp->tcp_snxt; 23950 } 23951 23952 /* 23953 * Exported routine for extracting active tcp connection status. 23954 * 23955 * This is used by the Solaris Cluster Networking software to 23956 * gather a list of connections that need to be forwarded to 23957 * specific nodes in the cluster when configuration changes occur. 23958 * 23959 * The callback is invoked for each tcp_t structure. Returning 23960 * non-zero from the callback routine terminates the search. 23961 */ 23962 int 23963 cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg) 23964 { 23965 tcp_t *tcp; 23966 cl_tcp_info_t cl_tcpi; 23967 connf_t *connfp; 23968 conn_t *connp; 23969 int i; 23970 23971 ASSERT(callback != NULL); 23972 23973 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 23974 23975 connfp = &ipcl_globalhash_fanout[i]; 23976 connp = NULL; 23977 23978 while ((connp = 23979 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 23980 23981 tcp = connp->conn_tcp; 23982 cl_tcpi.cl_tcpi_version = CL_TCPI_V1; 23983 cl_tcpi.cl_tcpi_ipversion = tcp->tcp_ipversion; 23984 cl_tcpi.cl_tcpi_state = tcp->tcp_state; 23985 cl_tcpi.cl_tcpi_lport = tcp->tcp_lport; 23986 cl_tcpi.cl_tcpi_fport = tcp->tcp_fport; 23987 /* 23988 * The macros tcp_laddr and tcp_faddr give the IPv4 23989 * addresses. They are copied implicitly below as 23990 * mapped addresses. 23991 */ 23992 cl_tcpi.cl_tcpi_laddr_v6 = tcp->tcp_ip_src_v6; 23993 if (tcp->tcp_ipversion == IPV4_VERSION) { 23994 cl_tcpi.cl_tcpi_faddr = 23995 tcp->tcp_ipha->ipha_dst; 23996 } else { 23997 cl_tcpi.cl_tcpi_faddr_v6 = 23998 tcp->tcp_ip6h->ip6_dst; 23999 } 24000 24001 /* 24002 * If the callback returns non-zero 24003 * we terminate the traversal. 24004 */ 24005 if ((*callback)(&cl_tcpi, arg) != 0) { 24006 CONN_DEC_REF(tcp->tcp_connp); 24007 return (1); 24008 } 24009 } 24010 } 24011 24012 return (0); 24013 } 24014 24015 /* 24016 * Macros used for accessing the different types of sockaddr 24017 * structures inside a tcp_ioc_abort_conn_t. 
24018 */ 24019 #define TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local) 24020 #define TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote) 24021 #define TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr) 24022 #define TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr) 24023 #define TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port) 24024 #define TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port) 24025 #define TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local) 24026 #define TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote) 24027 #define TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr) 24028 #define TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr) 24029 #define TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port) 24030 #define TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port) 24031 24032 /* 24033 * Return the correct error code to mimic the behavior 24034 * of a connection reset. 24035 */ 24036 #define TCP_AC_GET_ERRCODE(state, err) { \ 24037 switch ((state)) { \ 24038 case TCPS_SYN_SENT: \ 24039 case TCPS_SYN_RCVD: \ 24040 (err) = ECONNREFUSED; \ 24041 break; \ 24042 case TCPS_ESTABLISHED: \ 24043 case TCPS_FIN_WAIT_1: \ 24044 case TCPS_FIN_WAIT_2: \ 24045 case TCPS_CLOSE_WAIT: \ 24046 (err) = ECONNRESET; \ 24047 break; \ 24048 case TCPS_CLOSING: \ 24049 case TCPS_LAST_ACK: \ 24050 case TCPS_TIME_WAIT: \ 24051 (err) = 0; \ 24052 break; \ 24053 default: \ 24054 (err) = ENXIO; \ 24055 } \ 24056 } 24057 24058 /* 24059 * Check if a tcp structure matches the info in acp. 24060 */ 24061 #define TCP_AC_ADDR_MATCH(acp, tcp) \ 24062 (((acp)->ac_local.ss_family == AF_INET) ? \ 24063 ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ 24064 TCP_AC_V4LOCAL((acp)) == (tcp)->tcp_ip_src) && \ 24065 (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ 24066 TCP_AC_V4REMOTE((acp)) == (tcp)->tcp_remote) && \ 24067 (TCP_AC_V4LPORT((acp)) == 0 || \ 24068 TCP_AC_V4LPORT((acp)) == (tcp)->tcp_lport) && \ 24069 (TCP_AC_V4RPORT((acp)) == 0 || \ 24070 TCP_AC_V4RPORT((acp)) == (tcp)->tcp_fport) && \ 24071 (acp)->ac_start <= (tcp)->tcp_state && \ 24072 (acp)->ac_end >= (tcp)->tcp_state) : \ 24073 ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ 24074 IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ 24075 &(tcp)->tcp_ip_src_v6)) && \ 24076 (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ 24077 IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ 24078 &(tcp)->tcp_remote_v6)) && \ 24079 (TCP_AC_V6LPORT((acp)) == 0 || \ 24080 TCP_AC_V6LPORT((acp)) == (tcp)->tcp_lport) && \ 24081 (TCP_AC_V6RPORT((acp)) == 0 || \ 24082 TCP_AC_V6RPORT((acp)) == (tcp)->tcp_fport) && \ 24083 (acp)->ac_start <= (tcp)->tcp_state && \ 24084 (acp)->ac_end >= (tcp)->tcp_state)) 24085 24086 #define TCP_AC_MATCH(acp, tcp) \ 24087 (((acp)->ac_zoneid == ALL_ZONES || \ 24088 (acp)->ac_zoneid == tcp->tcp_connp->conn_zoneid) ? \ 24089 TCP_AC_ADDR_MATCH(acp, tcp) : 0) 24090 24091 /* 24092 * Build a message containing a tcp_ioc_abort_conn_t structure 24093 * which is filled in with information from acp and tp. 
24094 */ 24095 static mblk_t * 24096 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) 24097 { 24098 mblk_t *mp; 24099 tcp_ioc_abort_conn_t *tacp; 24100 24101 mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO); 24102 if (mp == NULL) 24103 return (NULL); 24104 24105 mp->b_datap->db_type = M_CTL; 24106 24107 *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; 24108 tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + 24109 sizeof (uint32_t)); 24110 24111 tacp->ac_start = acp->ac_start; 24112 tacp->ac_end = acp->ac_end; 24113 tacp->ac_zoneid = acp->ac_zoneid; 24114 24115 if (acp->ac_local.ss_family == AF_INET) { 24116 tacp->ac_local.ss_family = AF_INET; 24117 tacp->ac_remote.ss_family = AF_INET; 24118 TCP_AC_V4LOCAL(tacp) = tp->tcp_ip_src; 24119 TCP_AC_V4REMOTE(tacp) = tp->tcp_remote; 24120 TCP_AC_V4LPORT(tacp) = tp->tcp_lport; 24121 TCP_AC_V4RPORT(tacp) = tp->tcp_fport; 24122 } else { 24123 tacp->ac_local.ss_family = AF_INET6; 24124 tacp->ac_remote.ss_family = AF_INET6; 24125 TCP_AC_V6LOCAL(tacp) = tp->tcp_ip_src_v6; 24126 TCP_AC_V6REMOTE(tacp) = tp->tcp_remote_v6; 24127 TCP_AC_V6LPORT(tacp) = tp->tcp_lport; 24128 TCP_AC_V6RPORT(tacp) = tp->tcp_fport; 24129 } 24130 mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); 24131 return (mp); 24132 } 24133 24134 /* 24135 * Print a tcp_ioc_abort_conn_t structure. 24136 */ 24137 static void 24138 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) 24139 { 24140 char lbuf[128]; 24141 char rbuf[128]; 24142 sa_family_t af; 24143 in_port_t lport, rport; 24144 ushort_t logflags; 24145 24146 af = acp->ac_local.ss_family; 24147 24148 if (af == AF_INET) { 24149 (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp), 24150 lbuf, 128); 24151 (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp), 24152 rbuf, 128); 24153 lport = ntohs(TCP_AC_V4LPORT(acp)); 24154 rport = ntohs(TCP_AC_V4RPORT(acp)); 24155 } else { 24156 (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp), 24157 lbuf, 128); 24158 (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp), 24159 rbuf, 128); 24160 lport = ntohs(TCP_AC_V6LPORT(acp)); 24161 rport = ntohs(TCP_AC_V6RPORT(acp)); 24162 } 24163 24164 logflags = SL_TRACE | SL_NOTE; 24165 /* 24166 * Don't print this message to the console if the operation was done 24167 * to a non-global zone. 24168 */ 24169 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 24170 logflags |= SL_CONSOLE; 24171 (void) strlog(TCP_MOD_ID, 0, 1, logflags, 24172 "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " 24173 "start = %d, end = %d\n", lbuf, lport, rbuf, rport, 24174 acp->ac_start, acp->ac_end); 24175 } 24176 24177 /* 24178 * Called inside tcp_rput when a message built using 24179 * tcp_ioctl_abort_build_msg is put into a queue. 24180 * Note that when we get here there is no wildcard in acp any more. 24181 */ 24182 static void 24183 tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp) 24184 { 24185 tcp_ioc_abort_conn_t *acp; 24186 24187 acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); 24188 if (tcp->tcp_state <= acp->ac_end) { 24189 /* 24190 * If we get here, we are already on the correct 24191 * squeue. 
This ioctl follows the following path 24192 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn 24193 * ->tcp_ioctl_abort->squeue_fill (if on a 24194 * different squeue) 24195 */ 24196 int errcode; 24197 24198 TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode); 24199 (void) tcp_clean_death(tcp, errcode, 26); 24200 } 24201 freemsg(mp); 24202 } 24203 24204 /* 24205 * Abort all matching connections on a hash chain. 24206 */ 24207 static int 24208 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, 24209 boolean_t exact) 24210 { 24211 int nmatch, err = 0; 24212 tcp_t *tcp; 24213 MBLKP mp, last, listhead = NULL; 24214 conn_t *tconnp; 24215 connf_t *connfp = &ipcl_conn_fanout[index]; 24216 24217 startover: 24218 nmatch = 0; 24219 24220 mutex_enter(&connfp->connf_lock); 24221 for (tconnp = connfp->connf_head; tconnp != NULL; 24222 tconnp = tconnp->conn_next) { 24223 tcp = tconnp->conn_tcp; 24224 if (TCP_AC_MATCH(acp, tcp)) { 24225 CONN_INC_REF(tcp->tcp_connp); 24226 mp = tcp_ioctl_abort_build_msg(acp, tcp); 24227 if (mp == NULL) { 24228 err = ENOMEM; 24229 CONN_DEC_REF(tcp->tcp_connp); 24230 break; 24231 } 24232 mp->b_prev = (mblk_t *)tcp; 24233 24234 if (listhead == NULL) { 24235 listhead = mp; 24236 last = mp; 24237 } else { 24238 last->b_next = mp; 24239 last = mp; 24240 } 24241 nmatch++; 24242 if (exact) 24243 break; 24244 } 24245 24246 /* Avoid holding lock for too long. */ 24247 if (nmatch >= 500) 24248 break; 24249 } 24250 mutex_exit(&connfp->connf_lock); 24251 24252 /* Pass mp into the correct tcp */ 24253 while ((mp = listhead) != NULL) { 24254 listhead = listhead->b_next; 24255 tcp = (tcp_t *)mp->b_prev; 24256 mp->b_next = mp->b_prev = NULL; 24257 squeue_fill(tcp->tcp_connp->conn_sqp, mp, 24258 tcp_input, tcp->tcp_connp, SQTAG_TCP_ABORT_BUCKET); 24259 } 24260 24261 *count += nmatch; 24262 if (nmatch >= 500 && err == 0) 24263 goto startover; 24264 return (err); 24265 } 24266 24267 /* 24268 * Abort all connections that matches the attributes specified in acp. 24269 */ 24270 static int 24271 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp) 24272 { 24273 sa_family_t af; 24274 uint32_t ports; 24275 uint16_t *pports; 24276 int err = 0, count = 0; 24277 boolean_t exact = B_FALSE; /* set when there is no wildcard */ 24278 int index = -1; 24279 ushort_t logflags; 24280 24281 af = acp->ac_local.ss_family; 24282 24283 if (af == AF_INET) { 24284 if (TCP_AC_V4REMOTE(acp) != INADDR_ANY && 24285 TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) { 24286 pports = (uint16_t *)&ports; 24287 pports[1] = TCP_AC_V4LPORT(acp); 24288 pports[0] = TCP_AC_V4RPORT(acp); 24289 exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY); 24290 } 24291 } else { 24292 if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) && 24293 TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) { 24294 pports = (uint16_t *)&ports; 24295 pports[1] = TCP_AC_V6LPORT(acp); 24296 pports[0] = TCP_AC_V6RPORT(acp); 24297 exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp)); 24298 } 24299 } 24300 24301 /* 24302 * For cases where remote addr, local port, and remote port are non- 24303 * wildcards, tcp_ioctl_abort_bucket will only be called once. 
24304 */ 24305 if (index != -1) { 24306 err = tcp_ioctl_abort_bucket(acp, index, 24307 &count, exact); 24308 } else { 24309 /* 24310 * loop through all entries for wildcard case 24311 */ 24312 for (index = 0; index < ipcl_conn_fanout_size; index++) { 24313 err = tcp_ioctl_abort_bucket(acp, index, 24314 &count, exact); 24315 if (err != 0) 24316 break; 24317 } 24318 } 24319 24320 logflags = SL_TRACE | SL_NOTE; 24321 /* 24322 * Don't print this message to the console if the operation was done 24323 * to a non-global zone. 24324 */ 24325 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 24326 logflags |= SL_CONSOLE; 24327 (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " 24328 "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); 24329 if (err == 0 && count == 0) 24330 err = ENOENT; 24331 return (err); 24332 } 24333 24334 /* 24335 * Process the TCP_IOC_ABORT_CONN ioctl request. 24336 */ 24337 static void 24338 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) 24339 { 24340 int err; 24341 IOCP iocp; 24342 MBLKP mp1; 24343 sa_family_t laf, raf; 24344 tcp_ioc_abort_conn_t *acp; 24345 zone_t *zptr; 24346 zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid; 24347 24348 iocp = (IOCP)mp->b_rptr; 24349 24350 if ((mp1 = mp->b_cont) == NULL || 24351 iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) { 24352 err = EINVAL; 24353 goto out; 24354 } 24355 24356 /* check permissions */ 24357 if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { 24358 err = EPERM; 24359 goto out; 24360 } 24361 24362 if (mp1->b_cont != NULL) { 24363 freemsg(mp1->b_cont); 24364 mp1->b_cont = NULL; 24365 } 24366 24367 acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr; 24368 laf = acp->ac_local.ss_family; 24369 raf = acp->ac_remote.ss_family; 24370 24371 /* check that a zone with the supplied zoneid exists */ 24372 if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) { 24373 zptr = zone_find_by_id(zoneid); 24374 if (zptr != NULL) { 24375 zone_rele(zptr); 24376 } else { 24377 err = EINVAL; 24378 goto out; 24379 } 24380 } 24381 24382 if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || 24383 acp->ac_start > acp->ac_end || laf != raf || 24384 (laf != AF_INET && laf != AF_INET6)) { 24385 err = EINVAL; 24386 goto out; 24387 } 24388 24389 tcp_ioctl_abort_dump(acp); 24390 err = tcp_ioctl_abort(acp); 24391 24392 out: 24393 if (mp1 != NULL) { 24394 freemsg(mp1); 24395 mp->b_cont = NULL; 24396 } 24397 24398 if (err != 0) 24399 miocnak(q, mp, 0, err); 24400 else 24401 miocack(q, mp, 0, 0); 24402 } 24403 24404 /* 24405 * tcp_time_wait_processing() handles processing of incoming packets when 24406 * the tcp is in the TIME_WAIT state. 24407 * A TIME_WAIT tcp that has an associated open TCP stream is never put 24408 * on the time wait list. 24409 */ 24410 void 24411 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, 24412 uint32_t seg_ack, int seg_len, tcph_t *tcph) 24413 { 24414 int32_t bytes_acked; 24415 int32_t gap; 24416 int32_t rgap; 24417 tcp_opt_t tcpopt; 24418 uint_t flags; 24419 uint32_t new_swnd = 0; 24420 conn_t *connp; 24421 24422 BUMP_LOCAL(tcp->tcp_ibsegs); 24423 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT); 24424 24425 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 24426 new_swnd = BE16_TO_U16(tcph->th_win) << 24427 ((tcph->th_flags[0] & TH_SYN) ? 
0 : tcp->tcp_snd_ws); 24428 if (tcp->tcp_snd_ts_ok) { 24429 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 24430 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 24431 tcp->tcp_rnxt, TH_ACK); 24432 goto done; 24433 } 24434 } 24435 gap = seg_seq - tcp->tcp_rnxt; 24436 rgap = tcp->tcp_rwnd - (gap + seg_len); 24437 if (gap < 0) { 24438 BUMP_MIB(&tcp_mib, tcpInDataDupSegs); 24439 UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, 24440 (seg_len > -gap ? -gap : seg_len)); 24441 seg_len += gap; 24442 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 24443 if (flags & TH_RST) { 24444 goto done; 24445 } 24446 if ((flags & TH_FIN) && seg_len == -1) { 24447 /* 24448 * When TCP receives a duplicate FIN in 24449 * TIME_WAIT state, restart the 2 MSL timer. 24450 * See page 73 in RFC 793. Make sure this TCP 24451 * is already on the TIME_WAIT list. If not, 24452 * just restart the timer. 24453 */ 24454 if (TCP_IS_DETACHED(tcp)) { 24455 tcp_time_wait_remove(tcp, NULL); 24456 tcp_time_wait_append(tcp); 24457 TCP_DBGSTAT(tcp_rput_time_wait); 24458 } else { 24459 ASSERT(tcp != NULL); 24460 TCP_TIMER_RESTART(tcp, 24461 tcp_time_wait_interval); 24462 } 24463 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 24464 tcp->tcp_rnxt, TH_ACK); 24465 goto done; 24466 } 24467 flags |= TH_ACK_NEEDED; 24468 seg_len = 0; 24469 goto process_ack; 24470 } 24471 24472 /* Fix seg_seq, and chew the gap off the front. */ 24473 seg_seq = tcp->tcp_rnxt; 24474 } 24475 24476 if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 24477 /* 24478 * Make sure that when we accept the connection, pick 24479 * an ISS greater than (tcp_snxt + ISS_INCR/2) for the 24480 * old connection. 24481 * 24482 * The next ISS generated is equal to tcp_iss_incr_extra 24483 * + ISS_INCR/2 + other components depending on the 24484 * value of tcp_strong_iss. We pre-calculate the new 24485 * ISS here and compare with tcp_snxt to determine if 24486 * we need to make adjustment to tcp_iss_incr_extra. 24487 * 24488 * The above calculation is ugly and is a 24489 * waste of CPU cycles... 24490 */ 24491 uint32_t new_iss = tcp_iss_incr_extra; 24492 int32_t adj; 24493 24494 switch (tcp_strong_iss) { 24495 case 2: { 24496 /* Add time and MD5 components. */ 24497 uint32_t answer[4]; 24498 struct { 24499 uint32_t ports; 24500 in6_addr_t src; 24501 in6_addr_t dst; 24502 } arg; 24503 MD5_CTX context; 24504 24505 mutex_enter(&tcp_iss_key_lock); 24506 context = tcp_iss_key; 24507 mutex_exit(&tcp_iss_key_lock); 24508 arg.ports = tcp->tcp_ports; 24509 /* We use MAPPED addresses in tcp_iss_init */ 24510 arg.src = tcp->tcp_ip_src_v6; 24511 if (tcp->tcp_ipversion == IPV4_VERSION) { 24512 IN6_IPADDR_TO_V4MAPPED( 24513 tcp->tcp_ipha->ipha_dst, 24514 &arg.dst); 24515 } else { 24516 arg.dst = 24517 tcp->tcp_ip6h->ip6_dst; 24518 } 24519 MD5Update(&context, (uchar_t *)&arg, 24520 sizeof (arg)); 24521 MD5Final((uchar_t *)answer, &context); 24522 answer[0] ^= answer[1] ^ answer[2] ^ answer[3]; 24523 new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0]; 24524 break; 24525 } 24526 case 1: 24527 /* Add time component and min random (i.e. 1). */ 24528 new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1; 24529 break; 24530 default: 24531 /* Add only time component. */ 24532 new_iss += (uint32_t)gethrestime_sec() * ISS_INCR; 24533 break; 24534 } 24535 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 24536 /* 24537 * New ISS not guaranteed to be ISS_INCR/2 24538 * ahead of the current tcp_snxt, so add the 24539 * difference to tcp_iss_incr_extra. 
24540 */ 24541 tcp_iss_incr_extra += adj; 24542 } 24543 /* 24544 * If tcp_clean_death() can not perform the task now, 24545 * drop the SYN packet and let the other side re-xmit. 24546 * Otherwise pass the SYN packet back in, since the 24547 * old tcp state has been cleaned up or freed. 24548 */ 24549 if (tcp_clean_death(tcp, 0, 27) == -1) 24550 goto done; 24551 /* 24552 * We will come back to tcp_rput_data 24553 * on the global queue. Packets destined 24554 * for the global queue will be checked 24555 * with global policy. But the policy for 24556 * this packet has already been checked as 24557 * this was destined for the detached 24558 * connection. We need to bypass policy 24559 * check this time by attaching a dummy 24560 * ipsec_in with ipsec_in_dont_check set. 24561 */ 24562 if ((connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid)) != 24563 NULL) { 24564 TCP_STAT(tcp_time_wait_syn_success); 24565 tcp_reinput(connp, mp, tcp->tcp_connp->conn_sqp); 24566 return; 24567 } 24568 goto done; 24569 } 24570 24571 /* 24572 * rgap is the amount of stuff received out of window. A negative 24573 * value is the amount out of window. 24574 */ 24575 if (rgap < 0) { 24576 BUMP_MIB(&tcp_mib, tcpInDataPastWinSegs); 24577 UPDATE_MIB(&tcp_mib, tcpInDataPastWinBytes, -rgap); 24578 /* Fix seg_len and make sure there is something left. */ 24579 seg_len += rgap; 24580 if (seg_len <= 0) { 24581 if (flags & TH_RST) { 24582 goto done; 24583 } 24584 flags |= TH_ACK_NEEDED; 24585 seg_len = 0; 24586 goto process_ack; 24587 } 24588 } 24589 /* 24590 * Check whether we can update tcp_ts_recent. This test is 24591 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 24592 * Extensions for High Performance: An Update", Internet Draft. 24593 */ 24594 if (tcp->tcp_snd_ts_ok && 24595 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 24596 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 24597 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 24598 tcp->tcp_last_rcv_lbolt = lbolt64; 24599 } 24600 24601 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 24602 /* Always ack out of order packets */ 24603 flags |= TH_ACK_NEEDED; 24604 seg_len = 0; 24605 } else if (seg_len > 0) { 24606 BUMP_MIB(&tcp_mib, tcpInClosed); 24607 BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); 24608 UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, seg_len); 24609 } 24610 if (flags & TH_RST) { 24611 (void) tcp_clean_death(tcp, 0, 28); 24612 goto done; 24613 } 24614 if (flags & TH_SYN) { 24615 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 24616 TH_RST|TH_ACK); 24617 /* 24618 * Do not delete the TCP structure if it is in 24619 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 24620 */ 24621 goto done; 24622 } 24623 process_ack: 24624 if (flags & TH_ACK) { 24625 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 24626 if (bytes_acked <= 0) { 24627 if (bytes_acked == 0 && seg_len == 0 && 24628 new_swnd == tcp->tcp_swnd) 24629 BUMP_MIB(&tcp_mib, tcpInDupAck); 24630 } else { 24631 /* Acks something not sent */ 24632 flags |= TH_ACK_NEEDED; 24633 } 24634 } 24635 if (flags & TH_ACK_NEEDED) { 24636 /* 24637 * Time to send an ack for some reason. 24638 */ 24639 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 24640 tcp->tcp_rnxt, TH_ACK); 24641 } 24642 done: 24643 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 24644 DB_CKSUMSTART(mp) = 0; 24645 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 24646 TCP_STAT(tcp_time_wait_syn_fail); 24647 } 24648 freemsg(mp); 24649 } 24650 24651 /* 24652 * Allocate a T_SVR4_OPTMGMT_REQ. 
24653 * The caller needs to increment tcp_drop_opt_ack_cnt when sending these so 24654 * that tcp_rput_other can drop the acks. 24655 */ 24656 static mblk_t * 24657 tcp_setsockopt_mp(int level, int cmd, char *opt, int optlen) 24658 { 24659 mblk_t *mp; 24660 struct T_optmgmt_req *tor; 24661 struct opthdr *oh; 24662 uint_t size; 24663 char *optptr; 24664 24665 size = sizeof (*tor) + sizeof (*oh) + optlen; 24666 mp = allocb(size, BPRI_MED); 24667 if (mp == NULL) 24668 return (NULL); 24669 24670 mp->b_wptr += size; 24671 mp->b_datap->db_type = M_PROTO; 24672 tor = (struct T_optmgmt_req *)mp->b_rptr; 24673 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 24674 tor->MGMT_flags = T_NEGOTIATE; 24675 tor->OPT_length = sizeof (*oh) + optlen; 24676 tor->OPT_offset = (t_scalar_t)sizeof (*tor); 24677 24678 oh = (struct opthdr *)&tor[1]; 24679 oh->level = level; 24680 oh->name = cmd; 24681 oh->len = optlen; 24682 if (optlen != 0) { 24683 optptr = (char *)&oh[1]; 24684 bcopy(opt, optptr, optlen); 24685 } 24686 return (mp); 24687 } 24688 24689 /* 24690 * TCP Timers Implementation. 24691 */ 24692 timeout_id_t 24693 tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim) 24694 { 24695 mblk_t *mp; 24696 tcp_timer_t *tcpt; 24697 tcp_t *tcp = connp->conn_tcp; 24698 24699 ASSERT(connp->conn_sqp != NULL); 24700 24701 TCP_DBGSTAT(tcp_timeout_calls); 24702 24703 if (tcp->tcp_timercache == NULL) { 24704 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); 24705 } else { 24706 TCP_DBGSTAT(tcp_timeout_cached_alloc); 24707 mp = tcp->tcp_timercache; 24708 tcp->tcp_timercache = mp->b_next; 24709 mp->b_next = NULL; 24710 ASSERT(mp->b_wptr == NULL); 24711 } 24712 24713 CONN_INC_REF(connp); 24714 tcpt = (tcp_timer_t *)mp->b_rptr; 24715 tcpt->connp = connp; 24716 tcpt->tcpt_proc = f; 24717 tcpt->tcpt_tid = timeout(tcp_timer_callback, mp, tim); 24718 return ((timeout_id_t)mp); 24719 } 24720 24721 static void 24722 tcp_timer_callback(void *arg) 24723 { 24724 mblk_t *mp = (mblk_t *)arg; 24725 tcp_timer_t *tcpt; 24726 conn_t *connp; 24727 24728 tcpt = (tcp_timer_t *)mp->b_rptr; 24729 connp = tcpt->connp; 24730 squeue_fill(connp->conn_sqp, mp, 24731 tcp_timer_handler, connp, SQTAG_TCP_TIMER); 24732 } 24733 24734 static void 24735 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) 24736 { 24737 tcp_timer_t *tcpt; 24738 conn_t *connp = (conn_t *)arg; 24739 tcp_t *tcp = connp->conn_tcp; 24740 24741 tcpt = (tcp_timer_t *)mp->b_rptr; 24742 ASSERT(connp == tcpt->connp); 24743 ASSERT((squeue_t *)arg2 == connp->conn_sqp); 24744 24745 /* 24746 * If the TCP has reached the closed state, don't proceed any 24747 * further. This TCP logically does not exist on the system. 24748 * tcpt_proc could for example access queues, that have already 24749 * been qprocoff'ed off. Also see comments at the start of tcp_input 24750 */ 24751 if (tcp->tcp_state != TCPS_CLOSED) { 24752 (*tcpt->tcpt_proc)(connp); 24753 } else { 24754 tcp->tcp_timer_tid = 0; 24755 } 24756 tcp_timer_free(connp->conn_tcp, mp); 24757 } 24758 24759 /* 24760 * There is potential race with untimeout and the handler firing at the same 24761 * time. The mblock may be freed by the handler while we are trying to use 24762 * it. But since both should execute on the same squeue, this race should not 24763 * occur. 
24764 */ 24765 clock_t 24766 tcp_timeout_cancel(conn_t *connp, timeout_id_t id) 24767 { 24768 mblk_t *mp = (mblk_t *)id; 24769 tcp_timer_t *tcpt; 24770 clock_t delta; 24771 24772 TCP_DBGSTAT(tcp_timeout_cancel_reqs); 24773 24774 if (mp == NULL) 24775 return (-1); 24776 24777 tcpt = (tcp_timer_t *)mp->b_rptr; 24778 ASSERT(tcpt->connp == connp); 24779 24780 delta = untimeout(tcpt->tcpt_tid); 24781 24782 if (delta >= 0) { 24783 TCP_DBGSTAT(tcp_timeout_canceled); 24784 tcp_timer_free(connp->conn_tcp, mp); 24785 CONN_DEC_REF(connp); 24786 } 24787 24788 return (delta); 24789 } 24790 24791 /* 24792 * Allocate space for the timer event. The allocation looks like mblk, but it is 24793 * not a proper mblk. To avoid confusion we set b_wptr to NULL. 24794 * 24795 * Dealing with failures: If we can't allocate from the timer cache we try 24796 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr 24797 * points to b_rptr. 24798 * If we can't allocate anything using allocb_tryhard(), we perform a last 24799 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and 24800 * save the actual allocation size in b_datap. 24801 */ 24802 mblk_t * 24803 tcp_timermp_alloc(int kmflags) 24804 { 24805 mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache, 24806 kmflags & ~KM_PANIC); 24807 24808 if (mp != NULL) { 24809 mp->b_next = mp->b_prev = NULL; 24810 mp->b_rptr = (uchar_t *)(&mp[1]); 24811 mp->b_wptr = NULL; 24812 mp->b_datap = NULL; 24813 mp->b_queue = NULL; 24814 } else if (kmflags & KM_PANIC) { 24815 /* 24816 * Failed to allocate memory for the timer. Try allocating from 24817 * dblock caches. 24818 */ 24819 TCP_STAT(tcp_timermp_allocfail); 24820 mp = allocb_tryhard(sizeof (tcp_timer_t)); 24821 if (mp == NULL) { 24822 size_t size = 0; 24823 /* 24824 * Memory is really low. Try tryhard allocation. 24825 */ 24826 TCP_STAT(tcp_timermp_allocdblfail); 24827 mp = kmem_alloc_tryhard(sizeof (mblk_t) + 24828 sizeof (tcp_timer_t), &size, kmflags); 24829 mp->b_rptr = (uchar_t *)(&mp[1]); 24830 mp->b_next = mp->b_prev = NULL; 24831 mp->b_wptr = (uchar_t *)-1; 24832 mp->b_datap = (dblk_t *)size; 24833 mp->b_queue = NULL; 24834 } 24835 ASSERT(mp->b_wptr != NULL); 24836 } 24837 TCP_DBGSTAT(tcp_timermp_alloced); 24838 24839 return (mp); 24840 } 24841 24842 /* 24843 * Free per-tcp timer cache. 24844 * It can only contain entries from tcp_timercache. 24845 */ 24846 void 24847 tcp_timermp_free(tcp_t *tcp) 24848 { 24849 mblk_t *mp; 24850 24851 while ((mp = tcp->tcp_timercache) != NULL) { 24852 ASSERT(mp->b_wptr == NULL); 24853 tcp->tcp_timercache = tcp->tcp_timercache->b_next; 24854 kmem_cache_free(tcp_timercache, mp); 24855 } 24856 } 24857 24858 /* 24859 * Free timer event. Put it on the per-tcp timer cache if there is not too many 24860 * events there already (currently at most two events are cached). 24861 * If the event is not allocated from the timer cache, free it right away. 24862 */ 24863 static void 24864 tcp_timer_free(tcp_t *tcp, mblk_t *mp) 24865 { 24866 mblk_t *mp1 = tcp->tcp_timercache; 24867 24868 if (mp->b_wptr != NULL) { 24869 /* 24870 * This allocation is not from a timer cache, free it right 24871 * away. 
24872 */ 24873 if (mp->b_wptr != (uchar_t *)-1) 24874 freeb(mp); 24875 else 24876 kmem_free(mp, (size_t)mp->b_datap); 24877 } else if (mp1 == NULL || mp1->b_next == NULL) { 24878 /* Cache this timer block for future allocations */ 24879 mp->b_rptr = (uchar_t *)(&mp[1]); 24880 mp->b_next = mp1; 24881 tcp->tcp_timercache = mp; 24882 } else { 24883 kmem_cache_free(tcp_timercache, mp); 24884 TCP_DBGSTAT(tcp_timermp_freed); 24885 } 24886 } 24887 24888 /* 24889 * End of TCP Timers implementation. 24890 */ 24891 24892 /* 24893 * tcp_{set,clr}qfull() functions are used to either set or clear QFULL 24894 * on the specified backing STREAMS q. Note, the caller may make the 24895 * decision to call based on the tcp_t.tcp_flow_stopped value which 24896 * when check outside the q's lock is only an advisory check ... 24897 */ 24898 24899 void 24900 tcp_setqfull(tcp_t *tcp) 24901 { 24902 queue_t *q = tcp->tcp_wq; 24903 24904 if (!(q->q_flag & QFULL)) { 24905 mutex_enter(QLOCK(q)); 24906 if (!(q->q_flag & QFULL)) { 24907 /* still need to set QFULL */ 24908 q->q_flag |= QFULL; 24909 tcp->tcp_flow_stopped = B_TRUE; 24910 mutex_exit(QLOCK(q)); 24911 TCP_STAT(tcp_flwctl_on); 24912 } else { 24913 mutex_exit(QLOCK(q)); 24914 } 24915 } 24916 } 24917 24918 void 24919 tcp_clrqfull(tcp_t *tcp) 24920 { 24921 queue_t *q = tcp->tcp_wq; 24922 24923 if (q->q_flag & QFULL) { 24924 mutex_enter(QLOCK(q)); 24925 if (q->q_flag & QFULL) { 24926 q->q_flag &= ~QFULL; 24927 tcp->tcp_flow_stopped = B_FALSE; 24928 mutex_exit(QLOCK(q)); 24929 if (q->q_flag & QWANTW) 24930 qbackenable(q, 0); 24931 } else { 24932 mutex_exit(QLOCK(q)); 24933 } 24934 } 24935 } 24936 24937 /* 24938 * TCP Kstats implementation 24939 */ 24940 static void 24941 tcp_kstat_init(void) 24942 { 24943 tcp_named_kstat_t template = { 24944 { "rtoAlgorithm", KSTAT_DATA_INT32, 0 }, 24945 { "rtoMin", KSTAT_DATA_INT32, 0 }, 24946 { "rtoMax", KSTAT_DATA_INT32, 0 }, 24947 { "maxConn", KSTAT_DATA_INT32, 0 }, 24948 { "activeOpens", KSTAT_DATA_UINT32, 0 }, 24949 { "passiveOpens", KSTAT_DATA_UINT32, 0 }, 24950 { "attemptFails", KSTAT_DATA_UINT32, 0 }, 24951 { "estabResets", KSTAT_DATA_UINT32, 0 }, 24952 { "currEstab", KSTAT_DATA_UINT32, 0 }, 24953 { "inSegs", KSTAT_DATA_UINT32, 0 }, 24954 { "outSegs", KSTAT_DATA_UINT32, 0 }, 24955 { "retransSegs", KSTAT_DATA_UINT32, 0 }, 24956 { "connTableSize", KSTAT_DATA_INT32, 0 }, 24957 { "outRsts", KSTAT_DATA_UINT32, 0 }, 24958 { "outDataSegs", KSTAT_DATA_UINT32, 0 }, 24959 { "outDataBytes", KSTAT_DATA_UINT32, 0 }, 24960 { "retransBytes", KSTAT_DATA_UINT32, 0 }, 24961 { "outAck", KSTAT_DATA_UINT32, 0 }, 24962 { "outAckDelayed", KSTAT_DATA_UINT32, 0 }, 24963 { "outUrg", KSTAT_DATA_UINT32, 0 }, 24964 { "outWinUpdate", KSTAT_DATA_UINT32, 0 }, 24965 { "outWinProbe", KSTAT_DATA_UINT32, 0 }, 24966 { "outControl", KSTAT_DATA_UINT32, 0 }, 24967 { "outFastRetrans", KSTAT_DATA_UINT32, 0 }, 24968 { "inAckSegs", KSTAT_DATA_UINT32, 0 }, 24969 { "inAckBytes", KSTAT_DATA_UINT32, 0 }, 24970 { "inDupAck", KSTAT_DATA_UINT32, 0 }, 24971 { "inAckUnsent", KSTAT_DATA_UINT32, 0 }, 24972 { "inDataInorderSegs", KSTAT_DATA_UINT32, 0 }, 24973 { "inDataInorderBytes", KSTAT_DATA_UINT32, 0 }, 24974 { "inDataUnorderSegs", KSTAT_DATA_UINT32, 0 }, 24975 { "inDataUnorderBytes", KSTAT_DATA_UINT32, 0 }, 24976 { "inDataDupSegs", KSTAT_DATA_UINT32, 0 }, 24977 { "inDataDupBytes", KSTAT_DATA_UINT32, 0 }, 24978 { "inDataPartDupSegs", KSTAT_DATA_UINT32, 0 }, 24979 { "inDataPartDupBytes", KSTAT_DATA_UINT32, 0 }, 24980 { "inDataPastWinSegs", KSTAT_DATA_UINT32, 0 }, 24981 { 
"inDataPastWinBytes", KSTAT_DATA_UINT32, 0 }, 24982 { "inWinProbe", KSTAT_DATA_UINT32, 0 }, 24983 { "inWinUpdate", KSTAT_DATA_UINT32, 0 }, 24984 { "inClosed", KSTAT_DATA_UINT32, 0 }, 24985 { "rttUpdate", KSTAT_DATA_UINT32, 0 }, 24986 { "rttNoUpdate", KSTAT_DATA_UINT32, 0 }, 24987 { "timRetrans", KSTAT_DATA_UINT32, 0 }, 24988 { "timRetransDrop", KSTAT_DATA_UINT32, 0 }, 24989 { "timKeepalive", KSTAT_DATA_UINT32, 0 }, 24990 { "timKeepaliveProbe", KSTAT_DATA_UINT32, 0 }, 24991 { "timKeepaliveDrop", KSTAT_DATA_UINT32, 0 }, 24992 { "listenDrop", KSTAT_DATA_UINT32, 0 }, 24993 { "listenDropQ0", KSTAT_DATA_UINT32, 0 }, 24994 { "halfOpenDrop", KSTAT_DATA_UINT32, 0 }, 24995 { "outSackRetransSegs", KSTAT_DATA_UINT32, 0 }, 24996 { "connTableSize6", KSTAT_DATA_INT32, 0 } 24997 }; 24998 24999 tcp_mibkp = kstat_create(TCP_MOD_NAME, 0, TCP_MOD_NAME, 25000 "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0); 25001 25002 if (tcp_mibkp == NULL) 25003 return; 25004 25005 template.rtoAlgorithm.value.ui32 = 4; 25006 template.rtoMin.value.ui32 = tcp_rexmit_interval_min; 25007 template.rtoMax.value.ui32 = tcp_rexmit_interval_max; 25008 template.maxConn.value.i32 = -1; 25009 25010 bcopy(&template, tcp_mibkp->ks_data, sizeof (template)); 25011 25012 tcp_mibkp->ks_update = tcp_kstat_update; 25013 25014 kstat_install(tcp_mibkp); 25015 } 25016 25017 static void 25018 tcp_kstat_fini(void) 25019 { 25020 25021 if (tcp_mibkp != NULL) { 25022 kstat_delete(tcp_mibkp); 25023 tcp_mibkp = NULL; 25024 } 25025 } 25026 25027 static int 25028 tcp_kstat_update(kstat_t *kp, int rw) 25029 { 25030 tcp_named_kstat_t *tcpkp; 25031 tcp_t *tcp; 25032 connf_t *connfp; 25033 conn_t *connp; 25034 int i; 25035 25036 if (!kp || !kp->ks_data) 25037 return (EIO); 25038 25039 if (rw == KSTAT_WRITE) 25040 return (EACCES); 25041 25042 tcpkp = (tcp_named_kstat_t *)kp->ks_data; 25043 25044 tcpkp->currEstab.value.ui32 = 0; 25045 25046 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 25047 connfp = &ipcl_globalhash_fanout[i]; 25048 connp = NULL; 25049 while ((connp = 25050 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 25051 tcp = connp->conn_tcp; 25052 switch (tcp_snmp_state(tcp)) { 25053 case MIB2_TCP_established: 25054 case MIB2_TCP_closeWait: 25055 tcpkp->currEstab.value.ui32++; 25056 break; 25057 } 25058 } 25059 } 25060 25061 tcpkp->activeOpens.value.ui32 = tcp_mib.tcpActiveOpens; 25062 tcpkp->passiveOpens.value.ui32 = tcp_mib.tcpPassiveOpens; 25063 tcpkp->attemptFails.value.ui32 = tcp_mib.tcpAttemptFails; 25064 tcpkp->estabResets.value.ui32 = tcp_mib.tcpEstabResets; 25065 tcpkp->inSegs.value.ui32 = tcp_mib.tcpInSegs; 25066 tcpkp->outSegs.value.ui32 = tcp_mib.tcpOutSegs; 25067 tcpkp->retransSegs.value.ui32 = tcp_mib.tcpRetransSegs; 25068 tcpkp->connTableSize.value.i32 = tcp_mib.tcpConnTableSize; 25069 tcpkp->outRsts.value.ui32 = tcp_mib.tcpOutRsts; 25070 tcpkp->outDataSegs.value.ui32 = tcp_mib.tcpOutDataSegs; 25071 tcpkp->outDataBytes.value.ui32 = tcp_mib.tcpOutDataBytes; 25072 tcpkp->retransBytes.value.ui32 = tcp_mib.tcpRetransBytes; 25073 tcpkp->outAck.value.ui32 = tcp_mib.tcpOutAck; 25074 tcpkp->outAckDelayed.value.ui32 = tcp_mib.tcpOutAckDelayed; 25075 tcpkp->outUrg.value.ui32 = tcp_mib.tcpOutUrg; 25076 tcpkp->outWinUpdate.value.ui32 = tcp_mib.tcpOutWinUpdate; 25077 tcpkp->outWinProbe.value.ui32 = tcp_mib.tcpOutWinProbe; 25078 tcpkp->outControl.value.ui32 = tcp_mib.tcpOutControl; 25079 tcpkp->outFastRetrans.value.ui32 = tcp_mib.tcpOutFastRetrans; 25080 tcpkp->inAckSegs.value.ui32 = tcp_mib.tcpInAckSegs; 25081 
tcpkp->inAckBytes.value.ui32 = tcp_mib.tcpInAckBytes; 25082 tcpkp->inDupAck.value.ui32 = tcp_mib.tcpInDupAck; 25083 tcpkp->inAckUnsent.value.ui32 = tcp_mib.tcpInAckUnsent; 25084 tcpkp->inDataInorderSegs.value.ui32 = tcp_mib.tcpInDataInorderSegs; 25085 tcpkp->inDataInorderBytes.value.ui32 = tcp_mib.tcpInDataInorderBytes; 25086 tcpkp->inDataUnorderSegs.value.ui32 = tcp_mib.tcpInDataUnorderSegs; 25087 tcpkp->inDataUnorderBytes.value.ui32 = tcp_mib.tcpInDataUnorderBytes; 25088 tcpkp->inDataDupSegs.value.ui32 = tcp_mib.tcpInDataDupSegs; 25089 tcpkp->inDataDupBytes.value.ui32 = tcp_mib.tcpInDataDupBytes; 25090 tcpkp->inDataPartDupSegs.value.ui32 = tcp_mib.tcpInDataPartDupSegs; 25091 tcpkp->inDataPartDupBytes.value.ui32 = tcp_mib.tcpInDataPartDupBytes; 25092 tcpkp->inDataPastWinSegs.value.ui32 = tcp_mib.tcpInDataPastWinSegs; 25093 tcpkp->inDataPastWinBytes.value.ui32 = tcp_mib.tcpInDataPastWinBytes; 25094 tcpkp->inWinProbe.value.ui32 = tcp_mib.tcpInWinProbe; 25095 tcpkp->inWinUpdate.value.ui32 = tcp_mib.tcpInWinUpdate; 25096 tcpkp->inClosed.value.ui32 = tcp_mib.tcpInClosed; 25097 tcpkp->rttNoUpdate.value.ui32 = tcp_mib.tcpRttNoUpdate; 25098 tcpkp->rttUpdate.value.ui32 = tcp_mib.tcpRttUpdate; 25099 tcpkp->timRetrans.value.ui32 = tcp_mib.tcpTimRetrans; 25100 tcpkp->timRetransDrop.value.ui32 = tcp_mib.tcpTimRetransDrop; 25101 tcpkp->timKeepalive.value.ui32 = tcp_mib.tcpTimKeepalive; 25102 tcpkp->timKeepaliveProbe.value.ui32 = tcp_mib.tcpTimKeepaliveProbe; 25103 tcpkp->timKeepaliveDrop.value.ui32 = tcp_mib.tcpTimKeepaliveDrop; 25104 tcpkp->listenDrop.value.ui32 = tcp_mib.tcpListenDrop; 25105 tcpkp->listenDropQ0.value.ui32 = tcp_mib.tcpListenDropQ0; 25106 tcpkp->halfOpenDrop.value.ui32 = tcp_mib.tcpHalfOpenDrop; 25107 tcpkp->outSackRetransSegs.value.ui32 = tcp_mib.tcpOutSackRetransSegs; 25108 tcpkp->connTableSize6.value.i32 = tcp_mib.tcp6ConnTableSize; 25109 25110 return (0); 25111 } 25112 25113 void 25114 tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) 25115 { 25116 uint16_t hdr_len; 25117 ipha_t *ipha; 25118 uint8_t *nexthdrp; 25119 tcph_t *tcph; 25120 25121 /* Already has an eager */ 25122 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 25123 TCP_STAT(tcp_reinput_syn); 25124 squeue_enter(connp->conn_sqp, mp, connp->conn_recv, 25125 connp, SQTAG_TCP_REINPUT_EAGER); 25126 return; 25127 } 25128 25129 switch (IPH_HDR_VERSION(mp->b_rptr)) { 25130 case IPV4_VERSION: 25131 ipha = (ipha_t *)mp->b_rptr; 25132 hdr_len = IPH_HDR_LENGTH(ipha); 25133 break; 25134 case IPV6_VERSION: 25135 if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, 25136 &hdr_len, &nexthdrp)) { 25137 CONN_DEC_REF(connp); 25138 freemsg(mp); 25139 return; 25140 } 25141 break; 25142 } 25143 25144 tcph = (tcph_t *)&mp->b_rptr[hdr_len]; 25145 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 25146 mp->b_datap->db_struioflag |= STRUIO_EAGER; 25147 DB_CKSUMSTART(mp) = (intptr_t)sqp; 25148 } 25149 25150 squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp, 25151 SQTAG_TCP_REINPUT); 25152 } 25153 25154 static squeue_func_t 25155 tcp_squeue_switch(int val) 25156 { 25157 squeue_func_t rval = squeue_fill; 25158 25159 switch (val) { 25160 case 1: 25161 rval = squeue_enter_nodrain; 25162 break; 25163 case 2: 25164 rval = squeue_enter; 25165 break; 25166 default: 25167 break; 25168 } 25169 return (rval); 25170 } 25171 25172 static void 25173 tcp_squeue_add(squeue_t *sqp) 25174 { 25175 tcp_squeue_priv_t *tcp_time_wait = kmem_zalloc( 25176 sizeof (tcp_squeue_priv_t), KM_SLEEP); 25177 25178 *squeue_getprivate(sqp, 
SQPRIVATE_TCP) = (intptr_t)tcp_time_wait; 25179 tcp_time_wait->tcp_time_wait_tid = timeout(tcp_time_wait_collector, 25180 sqp, TCP_TIME_WAIT_DELAY); 25181 if (tcp_free_list_max_cnt == 0) { 25182 int tcp_ncpus = ((boot_max_ncpus == -1) ? 25183 max_ncpus : boot_max_ncpus); 25184 25185 /* 25186 * Limit number of entries to 1% of available memory / tcp_ncpus 25187 */ 25188 tcp_free_list_max_cnt = (freemem * PAGESIZE) / 25189 (tcp_ncpus * sizeof (tcp_t) * 100); 25190 } 25191 tcp_time_wait->tcp_free_list_cnt = 0; 25192 } 25193
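
/*
 * Illustrative sketch only (not part of the original file): how a caller is
 * expected to use the tcp_timeout()/tcp_timeout_cancel() pair implemented
 * above.  The helper names tcp_example_expire() and tcp_example_arm() are
 * hypothetical and exist solely to show the pattern: save the returned id,
 * cancel any pending timer before re-arming, and do the real work from the
 * callback, which tcp_timer_handler() runs on the connection's squeue.
 * Because arming, cancelling, and firing all execute on the same squeue,
 * no additional locking is needed around tcp_timer_tid.
 */
static void
tcp_example_expire(void *arg)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;

	/*
	 * Invoked via tcp_timer_handler() behind the squeue, so tcp state
	 * may be examined and modified directly here.
	 */
	tcp->tcp_timer_tid = 0;
}

static void
tcp_example_arm(tcp_t *tcp, clock_t tim)
{
	conn_t	*connp = tcp->tcp_connp;

	/* Cancel a pending timer, if any, before arming a new one. */
	if (tcp->tcp_timer_tid != 0)
		(void) tcp_timeout_cancel(connp, tcp->tcp_timer_tid);
	tcp->tcp_timer_tid = tcp_timeout(connp, tcp_example_expire, tim);
}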