/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

#pragma ident	"%Z%%M%	%I%	%E% SMI"
const char tcp_version[] = "%Z%%M%	%I%	%E% SMI";


#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/ethernet.h>
#include <sys/cpuvar.h>
#include <sys/dlpi.h>
#include <sys/multidata.h>
#include <sys/multidata_impl.h>
#include <sys/pattr.h>
#include <sys/policy.h>
#include <sys/priv.h>
#include <sys/zone.h>

#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/isa_defs.h>
#include <sys/md5.h>
#include <sys/random.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <net/if.h>
#include <net/route.h>
#include <inet/ipsec_impl.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/ipdrop.h>
#include <inet/tcp_trace.h>

#include <inet/ipclassifier.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_if.h>
#include <inet/ipp_common.h>
#include <sys/squeue.h>
#include <inet/kssl/ksslapi.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
#include <sys/sdt.h>
#include <rpc/pmap_prot.h>

/*
 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
 *
 * (Read the detailed design doc in PSARC case directory)
 *
 * The entire tcp state is contained in the tcp_t and conn_t structures,
 * which are allocated in tandem using ipcl_conn_create() and passing
 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
 * the references on the tcp_t.
 * The tcp_t structure is never compressed
 * and packets always land on the correct TCP perimeter from the time the
 * eager is created till the time the tcp_t dies (as such the old mentat
 * TCP global queue is not used for detached state and no IPSEC checking
 * is required). The global queue is still allocated to send out resets
 * for connections which have no listeners, and IP directly calls
 * tcp_xmit_listeners_reset() which does any policy check.
 *
 * Protection and Synchronisation mechanism:
 *
 * The tcp data structure does not use any kind of lock for protecting
 * its state but instead uses 'squeues' for mutual exclusion from various
 * read and write side threads. To access a tcp member, the thread should
 * always be behind the squeue (via squeue_enter, squeue_enter_nodrain, or
 * squeue_fill). Since the squeues allow a direct function call, the caller
 * can pass any tcp function having a prototype of edesc_t as argument
 * (different from the traditional STREAMs model where packets come in only
 * at designated entry points). The list of functions that can be directly
 * called via squeue appears before the usual function prototypes.
 *
 * Referencing:
 *
 * TCP is MT-Hot and we use a reference based scheme to make sure that the
 * tcp structure doesn't disappear when it's needed. When the application
 * creates an outgoing connection or accepts an incoming connection, we
 * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
 * The IP reference is just a symbolic reference since ip_tcpclose()
 * looks at the tcp structure after tcp_close_output() returns, which could
 * have dropped the last TCP reference. So as long as the connection is
 * in attached state, i.e. !TCP_IS_DETACHED, we have 2 references on the
 * conn_t. The classifier puts its own reference when the connection is
 * inserted in the listen or connected hash. Anytime a thread needs to enter
 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
 * on the write side or by doing a classify on the read side and then puts a
 * reference on the conn before doing squeue_enter/tryenter/fill. For the
 * read side, the classifier itself puts the reference under the fanout lock
 * to make sure that the tcp can't disappear before it gets processed. The
 * squeue will drop this reference automatically so the called function
 * doesn't have to do a DEC_REF.
 *
 * Opening a new connection:
 *
 * The outgoing connection open is pretty simple. ip_tcpopen() does the
 * work in creating the conn/tcp structure and initializing it. The
 * squeue assignment is done based on the CPU the application
 * is running on. So for outbound connections, processing is always done
 * on the application CPU, which might be different from the incoming CPU
 * being interrupted by the NIC. An optimal way would be to figure out
 * the NIC <-> CPU binding at listen time, and assign the outgoing
 * connection to the squeue attached to the CPU that will be interrupted
 * for incoming packets (we know the NIC based on the bind IP address).
 * This might seem like a problem if more data is going out, but the
 * fact is that in most cases the transmit is ACK driven, where the
 * outgoing data normally sits on TCP's xmit queue waiting to be
 * transmitted.
 *
 * Accepting a connection:
 *
 * This is a more interesting case because of various races involved in
 * establishing an eager in its own perimeter. Read the meta comment on
 * top of tcp_conn_request(). But briefly, the squeue is picked by
 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
 *
 * Closing a connection:
 *
 * The close is fairly straightforward. tcp_close() calls tcp_close_output()
 * via squeue to do the close and mark the tcp as detached if the connection
 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
 * reference but tcp_close() always drops IP's reference. So if the tcp was
 * not killed, it is sitting in the time_wait list with 2 references - 1 for
 * TCP and 1 because it is in the classifier's connected hash. This is the
 * condition we use to determine that it's OK to clean up the tcp outside the
 * squeue when time wait expires (check the ref under fanout and conn_lock and
 * if it is 2, remove it from the fanout hash and kill it).
 *
 * Although close just drops the necessary references and marks the
 * tcp_detached state, tcp_close needs to know that tcp_detached has been
 * set (under squeue) before letting the STREAM go away (because an
 * inbound packet might attempt to go up the STREAM while the close
 * has happened and tcp_detached is not set). So a special lock and
 * flag is used along with a condition variable (tcp_closelock, tcp_closed,
 * and tcp_closecv) to signal tcp_close that tcp_close_out() has marked
 * tcp_detached.
 *
 * Special provisions and fast paths:
 *
 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
 * can't have 'ipv6_recvpktinfo' set, and for these types of sockets IP
 * will never send an M_CTL to TCP. As such, ip_tcp_input(), which handles
 * all TCP packets from the wire, makes an IPCL_IS_TCP4_CONNECTED_NO_POLICY
 * check to send packets directly to tcp_rput_data via squeue. Everyone
 * else comes through tcp_input() on the read side.
 *
 * We also make special provisions for sockfs by marking tcp_issocket
 * whenever we have only sockfs on top of TCP. This allows us to skip
 * putting the tcp in the acceptor hash since a sockfs listener can never
 * become an acceptor, and also avoids allocating a tcp_t for the acceptor
 * STREAM since the eager has already been allocated and the accept now
 * happens on the acceptor STREAM. There is a big blob of comment on top of
 * tcp_conn_request explaining the new accept. When the socket is POP'd,
 * sockfs sends us an ioctl to mark the fact and we go back to the old
 * behaviour. Once tcp_issocket is unset, it's never set again for the
 * life of that connection.
 *
 * IPsec notes :
 *
 * Since a packet is always executed on the correct TCP perimeter,
 * all IPsec processing is deferred to IP, including checking new
 * connections and setting IPSEC policies for new connections. The
 * only exception is tcp_xmit_listeners_reset(), which is called
 * directly from IP and needs to do a policy check to see if TH_RST
 * can be sent out.
 */

extern major_t TCP6_MAJ;

/*
 * Values for squeue switch:
 * 1: squeue_enter_nodrain
 * 2: squeue_enter
 * 3: squeue_fill
 */
int tcp_squeue_close = 2;
int tcp_squeue_wput = 2;

squeue_func_t tcp_squeue_close_proc;
squeue_func_t tcp_squeue_wput_proc;

/*
 * This controls how tiny a write must be before we try to copy it
 * into the mblk on the tail of the transmit queue. Not much
 * speedup is observed for values larger than sixteen. Zero will
 * disable the optimisation.
 */
int tcp_tx_pull_len = 16;

/*
 * TCP Statistics.
 *
 * How TCP statistics work.
 *
 * There are two types of statistics invoked by two macros.
 *
 * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
 * supposed to be used in non MT-hot paths of the code.
 *
 * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
 * supposed to be used for DEBUG purposes and may be used on a hot path.
 *
 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
 * (use "kstat tcp" to get them).
 *
 * There is also an additional debugging facility that marks tcp_clean_death()
 * instances and saves them in the tcp_t structure. It is triggered by
 * the TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters
 * for tcp_clean_death() calls that counts the number of times each tag was
 * hit. It is triggered by the TCP_CLD_COUNTERS define.
 *
 * How to add new counters.
 *
 * 1) Add a field in the tcp_stat structure describing your counter.
 * 2) Add a line in tcp_statistics with the name of the counter.
 *
 *    IMPORTANT!! - make sure that both are in sync !!
 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
 *
 * Please avoid using private counters which are not kstat-exported.
 *
 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
 * in the tcp_t structure.
 *
 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
 */
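
/*
 * A minimal sketch of the steps above, using a hypothetical counter
 * "tcp_example_cnt" (not part of this file): add a kstat_named_t field
 * for it to the tcp_stat structure, add the matching line to
 * tcp_statistics below,
 *
 *	{ "tcp_example_cnt",		KSTAT_DATA_UINT64 },
 *
 * and then bump it from the code path of interest:
 *
 *	TCP_STAT(tcp_example_cnt);	(non MT-hot paths)
 *	TCP_DBGSTAT(tcp_example_cnt);	(DEBUG counter, may be a hot path)
 *
 * The new counter is then visible via "kstat tcp" like the others.
 */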

#ifndef	TCP_DEBUG_COUNTER
#ifdef	DEBUG
#define	TCP_DEBUG_COUNTER 1
#else
#define	TCP_DEBUG_COUNTER 0
#endif
#endif

#define	TCP_CLD_COUNTERS 0

#define	TCP_TAG_CLEAN_DEATH 1
#define	TCP_MAX_CLEAN_DEATH_TAG 32

#ifdef lint
static int _lint_dummy_;
#endif

#if TCP_CLD_COUNTERS
static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
#define	TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
#elif defined(lint)
#define	TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define	TCP_CLD_STAT(x)
#endif

#if TCP_DEBUG_COUNTER
#define	TCP_DBGSTAT(x) atomic_add_64(&(tcp_statistics.x.value.ui64), 1)
#elif defined(lint)
#define	TCP_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define	TCP_DBGSTAT(x)
#endif

tcp_stat_t tcp_statistics = {
	{ "tcp_time_wait",		KSTAT_DATA_UINT64 },
	{ "tcp_time_wait_syn",		KSTAT_DATA_UINT64 },
	{ "tcp_time_wait_success",	KSTAT_DATA_UINT64 },
	{ "tcp_time_wait_fail",		KSTAT_DATA_UINT64 },
	{ "tcp_reinput_syn",		KSTAT_DATA_UINT64 },
	{ "tcp_ip_output",		KSTAT_DATA_UINT64 },
	{ "tcp_detach_non_time_wait",	KSTAT_DATA_UINT64 },
	{ "tcp_detach_time_wait",	KSTAT_DATA_UINT64 },
	{ "tcp_time_wait_reap",		KSTAT_DATA_UINT64 },
	{ "tcp_clean_death_nondetached",	KSTAT_DATA_UINT64 },
	{ "tcp_reinit_calls",		KSTAT_DATA_UINT64 },
	{ "tcp_eager_err1",		KSTAT_DATA_UINT64 },
	{ "tcp_eager_err2",		KSTAT_DATA_UINT64 },
	{ "tcp_eager_blowoff_calls",	KSTAT_DATA_UINT64 },
	{ "tcp_eager_blowoff_q",	KSTAT_DATA_UINT64 },
	{ "tcp_eager_blowoff_q0",	KSTAT_DATA_UINT64 },
	{ "tcp_not_hard_bound",		KSTAT_DATA_UINT64 },
	{ "tcp_no_listener",		KSTAT_DATA_UINT64 },
	{ "tcp_found_eager",		KSTAT_DATA_UINT64 },
	{ "tcp_wrong_queue",		KSTAT_DATA_UINT64 },
	{ "tcp_found_eager_binding1",	KSTAT_DATA_UINT64 },
	{ "tcp_found_eager_bound1",	KSTAT_DATA_UINT64 },
	{ "tcp_eager_has_listener1",	KSTAT_DATA_UINT64 },
	{ "tcp_open_alloc",		KSTAT_DATA_UINT64 },
	{ "tcp_open_detached_alloc",	KSTAT_DATA_UINT64 },
	{ "tcp_rput_time_wait",		KSTAT_DATA_UINT64 },
	{ "tcp_listendrop",		KSTAT_DATA_UINT64 },
	{ "tcp_listendropq0",		KSTAT_DATA_UINT64 },
	{ "tcp_wrong_rq",		KSTAT_DATA_UINT64 },
	{ "tcp_rsrv_calls",		KSTAT_DATA_UINT64 },
	{ "tcp_eagerfree2",		KSTAT_DATA_UINT64 },
	{ "tcp_eagerfree3",		KSTAT_DATA_UINT64 },
	{ "tcp_eagerfree4",		KSTAT_DATA_UINT64 },
	{ "tcp_eagerfree5",		KSTAT_DATA_UINT64 },
	{ "tcp_timewait_syn_fail",	KSTAT_DATA_UINT64 },
	{ "tcp_listen_badflags",	KSTAT_DATA_UINT64 },
	{ "tcp_timeout_calls",		KSTAT_DATA_UINT64 },
	{ "tcp_timeout_cached_alloc",	KSTAT_DATA_UINT64 },
	{ "tcp_timeout_cancel_reqs",	KSTAT_DATA_UINT64 },
	{ "tcp_timeout_canceled",	KSTAT_DATA_UINT64 },
	{ "tcp_timermp_alloced",	KSTAT_DATA_UINT64 },
	{ "tcp_timermp_freed",		KSTAT_DATA_UINT64 },
	{ "tcp_timermp_allocfail",	KSTAT_DATA_UINT64 },
	{ "tcp_timermp_allocdblfail",	KSTAT_DATA_UINT64 },
	{ "tcp_push_timer_cnt",		KSTAT_DATA_UINT64 },
	{ "tcp_ack_timer_cnt",		KSTAT_DATA_UINT64 },
	{ "tcp_ire_null1",		KSTAT_DATA_UINT64 },
	{ "tcp_ire_null",		KSTAT_DATA_UINT64 },
	{ "tcp_ip_send",		KSTAT_DATA_UINT64 },
	{ "tcp_ip_ire_send",		KSTAT_DATA_UINT64 },
	{ "tcp_wsrv_called",		KSTAT_DATA_UINT64 },
	{ "tcp_flwctl_on",		KSTAT_DATA_UINT64 },
	{ "tcp_timer_fire_early",	KSTAT_DATA_UINT64 },
	{ "tcp_timer_fire_miss",	KSTAT_DATA_UINT64 },
	{ "tcp_freelist_cleanup",	KSTAT_DATA_UINT64 },
"tcp_rput_v6_error", KSTAT_DATA_UINT64 }, 378 { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, 379 { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 380 { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, 381 { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, 382 { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, 383 { "tcp_zcopy_disable", KSTAT_DATA_UINT64 }, 384 { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 }, 385 { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 }, 386 { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 }, 387 { "tcp_mdt_discarded", KSTAT_DATA_UINT64 }, 388 { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 }, 389 { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 }, 390 { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 }, 391 { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 }, 392 { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 }, 393 { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 }, 394 { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 }, 395 { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 }, 396 { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 }, 397 { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 }, 398 { "tcp_mdt_allocd", KSTAT_DATA_UINT64 }, 399 { "tcp_mdt_linked", KSTAT_DATA_UINT64 }, 400 { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, 401 { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, 402 { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, 403 { "tcp_fusion_putnext", KSTAT_DATA_UINT64 }, 404 { "tcp_fusion_unfusable", KSTAT_DATA_UINT64 }, 405 { "tcp_fusion_aborted", KSTAT_DATA_UINT64 }, 406 { "tcp_fusion_unqualified", KSTAT_DATA_UINT64 }, 407 { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64 }, 408 { "tcp_fusion_rrw_msgcnt", KSTAT_DATA_UINT64 }, 409 { "tcp_fusion_rrw_plugged", KSTAT_DATA_UINT64 }, 410 { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 }, 411 { "tcp_sock_fallback", KSTAT_DATA_UINT64 }, 412 }; 413 414 static kstat_t *tcp_kstat; 415 416 /* 417 * Call either ip_output or ip_output_v6. This replaces putnext() calls on the 418 * tcp write side. 419 */ 420 #define CALL_IP_WPUT(connp, q, mp) { \ 421 ASSERT(((q)->q_flag & QREADR) == 0); \ 422 TCP_DBGSTAT(tcp_ip_output); \ 423 connp->conn_send(connp, (mp), (q), IP_WPUT); \ 424 } 425 426 /* Macros for timestamp comparisons */ 427 #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 428 #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 429 430 /* 431 * Parameters for TCP Initial Send Sequence number (ISS) generation. When 432 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated 433 * by adding three components: a time component which grows by 1 every 4096 434 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); 435 * a per-connection component which grows by 125000 for every new connection; 436 * and an "extra" component that grows by a random amount centered 437 * approximately on 64000. This causes the the ISS generator to cycle every 438 * 4.89 hours if no TCP connections are made, and faster if connections are 439 * made. 440 * 441 * When tcp_strong_iss is set to 0, ISS is calculated by adding two 442 * components: a time component which grows by 250000 every second; and 443 * a per-connection component which grows by 125000 for every new connections. 444 * 445 * A third method, when tcp_strong_iss is set to 2, for generating ISS is 446 * prescribed by Steve Bellovin. This involves adding time, the 125000 per 447 * connection, and a one-way hash (MD5) of the connection ID <sport, dport, 448 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered 449 * password. 
#define	ISS_INCR	250000
#define	ISS_NSEC_SHT	12

static uint32_t tcp_iss_incr_extra;	/* Incremented for each connection */
static kmutex_t tcp_iss_key_lock;
static MD5_CTX tcp_iss_key;
static sin_t	sin_null;	/* Zero address for quick clears */
static sin6_t	sin6_null;	/* Zero address for quick clears */

/* Packet dropper for TCP IPsec policy drops. */
static ipdropper_t tcp_dropper;

/*
 * This implementation follows the 4.3BSD interpretation of the urgent
 * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
 * incompatible changes in protocols like telnet and rlogin.
 */
#define	TCP_OLD_URP_INTERPRETATION	1

#define	TCP_IS_DETACHED_NONEAGER(tcp)	\
	(TCP_IS_DETACHED(tcp) &&	\
	    (!(tcp)->tcp_hard_binding))

/*
 * TCP reassembly macros. We hide starting and ending sequence numbers in
 * b_next and b_prev of messages on the reassembly queue. The messages are
 * chained using b_cont. These macros are used in tcp_reass() so we don't
 * have to see the ugly casts and assignments.
 */
#define	TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))
#define	TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
					(mblk_t *)(uintptr_t)(u))
#define	TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))
#define	TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
					(mblk_t *)(uintptr_t)(u))

/*
 * Implementation of TCP Timers.
 * =============================
 *
 * INTERFACE:
 *
 * There are two basic functions and a restart macro dealing with tcp timers:
 *
 *	timeout_id_t	tcp_timeout(connp, func, time)
 *	clock_t		tcp_timeout_cancel(connp, timeout_id)
 *	TCP_TIMER_RESTART(tcp, intvl)
 *
 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
 * after 'time' ticks have passed. The function called by timeout() must adhere
 * to the same restrictions as a driver soft interrupt handler - it must not
 * sleep or call other functions that might sleep. The value returned is the
 * opaque non-zero timeout identifier that can be passed to
 * tcp_timeout_cancel() to cancel the request. The call to tcp_timeout() may
 * fail, in which case it returns zero. This is different from the timeout(9F)
 * function which never fails.
 *
 * The call-back function 'func' always receives 'connp' as its single
 * argument. It is always executed in the squeue corresponding to the tcp
 * structure. The tcp structure is guaranteed to be present at the time the
 * call-back is called.
 *
 * NOTE: The call-back function 'func' is never called if tcp is in
 * the TCPS_CLOSED state.
 *
 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
 * request. Locks acquired by the call-back routine should not be held across
 * the call to tcp_timeout_cancel() or a deadlock may result.
 *
 * tcp_timeout_cancel() returns -1 if it cannot cancel the timeout request.
 * Otherwise, it returns an integer value greater than or equal to 0. In
 * particular, if the call-back function is already placed on the squeue, it
 * cannot be canceled.
 *
 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
 *	within the squeue context corresponding to the tcp instance. Since the
 *	call-back is also called via the same squeue, there are no race
 *	conditions described in the untimeout(9F) manual page since all calls
 *	are strictly serialized.
 *
 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
 * stored in tcp_timer_tid and starts a new one using
 * MSEC_TO_TICK(intvl). It always uses the tcp_timer() function as a call-back
 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
 * field.
 *
 * NOTE: since the timeout cancellation is not guaranteed, the cancelled
 * call-back may still be called, so it is possible tcp_timer() will be
 * called several times. This should not be a problem since tcp_timer()
 * should always check the tcp instance state.
 *
 *
 * IMPLEMENTATION:
 *
 * TCP timers are implemented using a three-stage process. The call to
 * tcp_timeout() uses the timeout(9F) function to call tcp_timer_callback()
 * when the timer expires. The tcp_timer_callback() arranges the call of the
 * tcp_timer_handler() function via the squeue corresponding to the tcp
 * instance. The tcp_timer_handler() calls the actual requested timeout
 * call-back and passes the tcp instance as an argument to it. Information is
 * passed between stages using the tcp_timer_t structure which contains the
 * connp pointer, the tcp call-back to call and the timeout id returned by
 * timeout(9F).
 *
 * The tcp_timer_t structure is not used directly, it is embedded in an
 * mblk_t-like structure that is used to enter an squeue. The mp->b_rptr of
 * this pseudo mblk points to the beginning of the tcp_timer_t structure. The
 * tcp_timeout() returns the pointer to this mblk.
 *
 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
 * looks like a normal mblk without an actual dblk attached to it.
 *
 * To optimize performance each tcp instance holds a small cache of timer
 * mblocks. In the current implementation it caches up to two timer mblocks per
 * tcp instance. The cache is preserved over tcp frees and is only freed when
 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
 * timer processing happens on a corresponding squeue, the cache manipulation
 * does not require any locks. Experiments show that the majority of timer
 * mblock allocations are satisfied from the tcp cache and do not involve kmem
 * calls.
 *
 * The tcp_timeout() places a refhold on the connp instance which guarantees
 * that it will be present at the time the call-back function fires. The
 * tcp_timer_handler() drops the reference after calling the call-back, so the
 * call-back function does not need to manipulate the references explicitly.
 */
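
/*
 * A minimal usage sketch of the interface described above (illustrative
 * only; tcp_example_timer is a hypothetical call-back and the interval
 * is arbitrary):
 *
 *	timeout_id_t tid;
 *
 *	tid = tcp_timeout(connp, tcp_example_timer, MSEC_TO_TICK(500));
 *	if (tid == 0) {
 *		... allocation failed; unlike timeout(9F) this can happen ...
 *	}
 *
 *	... later, from the squeue context for this tcp ...
 *	if (tcp_timeout_cancel(connp, tid) == -1) {
 *		... already queued; tcp_example_timer() will still run ...
 *	}
 */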

typedef struct tcp_timer_s {
	conn_t	*connp;
	void	(*tcpt_proc)(void *);
	timeout_id_t	tcpt_tid;
} tcp_timer_t;

static kmem_cache_t *tcp_timercache;
kmem_cache_t	*tcp_sack_info_cache;
kmem_cache_t	*tcp_iphc_cache;

/*
 * For scalability, we must not run a timer for every TCP connection
 * in TIME_WAIT state.  To see why, consider (for time wait interval of
 * 4 minutes):
 *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
 *
 * This list is ordered by time, so you need only delete from the head
 * until you get to entries which aren't old enough to delete yet.
 * The list consists of only the detached TIME_WAIT connections.
 *
 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
 * becomes detached TIME_WAIT (either by changing the state and already
 * being detached or the other way around). This means that the TIME_WAIT
 * state can be extended (up to doubled) if the connection doesn't become
 * detached for a long time.
 *
 * The list manipulations (including tcp_time_wait_next/prev)
 * are protected by the tcp_time_wait_lock. The content of the
 * detached TIME_WAIT connections is protected by the normal perimeters.
 */

typedef struct tcp_squeue_priv_s {
	kmutex_t	tcp_time_wait_lock;
				/* Protects the next 3 globals */
	timeout_id_t	tcp_time_wait_tid;
	tcp_t		*tcp_time_wait_head;
	tcp_t		*tcp_time_wait_tail;
	tcp_t		*tcp_free_list;
	uint_t		tcp_free_list_cnt;
} tcp_squeue_priv_t;

/*
 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
 * Running it every 5 seconds seems to give the best results.
 */
#define	TCP_TIME_WAIT_DELAY drv_usectohz(5000000)

/*
 * To prevent memory hogging, limit the number of entries in tcp_free_list
 * to 1% of available memory / number of cpus
 */
uint_t tcp_free_list_max_cnt = 0;

#define	TCP_XMIT_LOWATER	4096
#define	TCP_XMIT_HIWATER	49152
#define	TCP_RECV_LOWATER	2048
#define	TCP_RECV_HIWATER	49152

/*
 *  PAWS needs a timer for 24 days.  This is the number of ticks in 24 days
 */
#define	PAWS_TIMEOUT	((clock_t)(24*24*60*60*hz))

#define	TIDUSZ	4096	/* transport interface data unit size */

/*
 * Bind hash list size and hash function.  It has to be a power of 2 for
 * hashing.
 */
#define	TCP_BIND_FANOUT_SIZE	512
#define	TCP_BIND_HASH(lport)	(ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
/*
 * Size of the listen and acceptor hash lists.  It has to be a power of 2 for
 * hashing.
 */
#define	TCP_FANOUT_SIZE		256

#ifdef	_ILP32
#define	TCP_ACCEPTOR_HASH(accid)					\
		(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
#else
#define	TCP_ACCEPTOR_HASH(accid)					\
		((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
#endif	/* _ILP32 */

#define	IP_ADDR_CACHE_SIZE	2048
#define	IP_ADDR_CACHE_HASH(faddr)					\
	(ntohl(faddr) & (IP_ADDR_CACHE_SIZE - 1))

/* Hash for HSPs uses all 32 bits, since both networks and hosts are in table */
#define	TCP_HSP_HASH_SIZE 256

#define	TCP_HSP_HASH(addr)					\
	(((addr>>24) ^ (addr >>16) ^			\
	    (addr>>8) ^ (addr)) % TCP_HSP_HASH_SIZE)

/*
 * TCP options struct returned from tcp_parse_options.
 */
typedef struct tcp_opt_s {
	uint32_t	tcp_opt_mss;
	uint32_t	tcp_opt_wscale;
	uint32_t	tcp_opt_ts_val;
	uint32_t	tcp_opt_ts_ecr;
	tcp_t		*tcp;
} tcp_opt_t;

/*
 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
 */

#ifdef _BIG_ENDIAN
#define	TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
	(TCPOPT_TSTAMP << 8) | 10)
#else
#define	TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
	(TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif
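
/*
 * With the standard option kinds TCPOPT_NOP == 1 and TCPOPT_TSTAMP == 8,
 * the constant above is the 32-bit word 0x0101080A, i.e. the bytes
 * NOP, NOP, TSTAMP, length 10 as they appear on the wire; an incoming
 * timestamp option in this recommended form can thus be recognized with
 * a single aligned 32-bit comparison.
 */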

/*
 * Flags returned from tcp_parse_options.
 */
#define	TCP_OPT_MSS_PRESENT	1
#define	TCP_OPT_WSCALE_PRESENT	2
#define	TCP_OPT_TSTAMP_PRESENT	4
#define	TCP_OPT_SACK_OK_PRESENT	8
#define	TCP_OPT_SACK_PRESENT	16

/* TCP option length */
#define	TCPOPT_NOP_LEN		1
#define	TCPOPT_MAXSEG_LEN	4
#define	TCPOPT_WS_LEN		3
#define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
#define	TCPOPT_TSTAMP_LEN	10
#define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
#define	TCPOPT_SACK_OK_LEN	2
#define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
#define	TCPOPT_REAL_SACK_LEN	4
#define	TCPOPT_MAX_SACK_LEN	36
#define	TCPOPT_HEADER_LEN	2

/* TCP cwnd burst factor. */
#define	TCP_CWND_INFINITE	65535
#define	TCP_CWND_SS		3
#define	TCP_CWND_NORMAL		5

/* Maximum TCP initial cwin (start/restart). */
#define	TCP_MAX_INIT_CWND	8

/*
 * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
 * either tcp_slow_start_initial or tcp_slow_start_after_idle
 * depending on the caller.  If the upper layer has not used the
 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
 * If the upper layer has set tcp_init_cwnd, just use it to
 * calculate the tcp_cwnd.
 */
#define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
{									\
	if ((tcp)->tcp_init_cwnd == 0) {				\
		(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss),	\
		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
	} else {							\
		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
	}								\
	tcp->tcp_cwnd_cnt = 0;						\
}
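
/*
 * Worked example of the RFC 3390 branch above (illustrative numbers only):
 * with mss = 1460 and tcp_init_cwnd == 0, the inner term is
 * MIN(4 * 1460, MAX(2 * 1460, 4380 / 1460 * 1460)) = MIN(5840, 4380) = 4380,
 * i.e. three segments, further capped by def_max_init_cwnd * mss.  With
 * mss = 536 the same expression gives MIN(2144, 4288) = 2144, i.e. four
 * segments.
 */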

/* TCP Timer control structure */
typedef struct tcpt_s {
	pfv_t	tcpt_pfv;	/* The routine we are to call */
	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
} tcpt_t;

/* Host Specific Parameter structure */
typedef struct tcp_hsp {
	struct tcp_hsp	*tcp_hsp_next;
	in6_addr_t	tcp_hsp_addr_v6;
	in6_addr_t	tcp_hsp_subnet_v6;
	uint_t		tcp_hsp_vers;	/* IPV4_VERSION | IPV6_VERSION */
	int32_t		tcp_hsp_sendspace;
	int32_t		tcp_hsp_recvspace;
	int32_t		tcp_hsp_tstamp;
} tcp_hsp_t;
#define	tcp_hsp_addr	V4_PART_OF_V6(tcp_hsp_addr_v6)
#define	tcp_hsp_subnet	V4_PART_OF_V6(tcp_hsp_subnet_v6)

/*
 * Functions called directly via squeue having a prototype of edesc_t.
 */
void		tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
void		tcp_input(void *arg, mblk_t *mp, void *arg2);
void		tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2);
void		tcp_output(void *arg, mblk_t *mp, void *arg2);
static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);


/* Prototypes for TCP functions */
static void	tcp_random_init(void);
int		tcp_random(void);
static void	tcp_accept(tcp_t *tcp, mblk_t *mp);
static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
		    tcp_t *eager);
static int	tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    boolean_t user_specified);
static void	tcp_closei_local(tcp_t *tcp);
static void	tcp_close_detached(tcp_t *tcp);
static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
		    mblk_t *idmp, mblk_t **defermp);
static void	tcp_connect(tcp_t *tcp, mblk_t *mp);
static void	tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
		    in_port_t dstport, uint_t srcid);
static void	tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
		    in_port_t dstport, uint32_t flowinfo, uint_t srcid,
		    uint32_t scope_id);
static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
static void	tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
static char	*tcp_display(tcp_t *tcp, char *, char);
static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
static void	tcp_eager_unlink(tcp_t *tcp);
static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
		    int unixerr);
static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
		    int tlierr, int unixerr);
static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *cr);
static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *cr);
static int	tcp_tpistate(tcp_t *tcp);
static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    int caller_holds_lock);
static void	tcp_bind_hash_remove(tcp_t *tcp);
static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id);
void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
static void	tcp_acceptor_hash_remove(tcp_t *tcp);
static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
static int	tcp_header_init_ipv4(tcp_t *tcp);
static int	tcp_header_init_ipv6(tcp_t *tcp);
int		tcp_init(tcp_t *tcp, queue_t *q);
static int	tcp_init_values(tcp_t *tcp);
static mblk_t	*tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
static mblk_t	*tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim,
		    t_scalar_t addr_length);
static void	tcp_ip_ire_mark_advice(tcp_t *tcp);
static void	tcp_ip_notify(tcp_t *tcp);
static mblk_t	*tcp_ire_mp(mblk_t *mp);
static void	tcp_iss_init(tcp_t *tcp);
static void	tcp_keepalive_killer(void *arg);
static int	tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
static void	tcp_mss_set(tcp_t *tcp, uint32_t size);
static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
static boolean_t tcp_allow_connopt_set(int level, int name);
int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
int		tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
int		tcp_opt_set(queue_t *q, uint_t optset_context, int level,
		    int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
		    uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
		    mblk_t *mblk);
static void	tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
static int	tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
		    uchar_t *ptr, uint_t len);
static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t tcp_param_register(tcpparam_t *tcppa, int cnt);
static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static void	tcp_iss_key_init(uint8_t *phrase, int len);
static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
static void	tcp_reinit(tcp_t *tcp);
static void	tcp_reinit_values(tcp_t *tcp);
static void	tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
		    tcp_t *thisstream, cred_t *cr);

static uint_t	tcp_rcv_drain(queue_t *q, tcp_t *tcp);
static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(void);
static void	tcp_ss_rexmit(tcp_t *tcp);
static mblk_t	*tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
static void	tcp_process_options(tcp_t *, tcph_t *);
static void	tcp_rput_common(tcp_t *tcp, mblk_t *mp);
static void	tcp_rsrv(queue_t *q);
static int	tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
static int	tcp_snmp_state(tcp_t *tcp);
static int	tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_host_param_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static void	tcp_timer(void *arg);
static void	tcp_timer_callback(void *);
static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
    boolean_t random);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
void		tcp_wput_accept(queue_t *q, mblk_t *mp);
static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
static int	tcp_send(queue_t *q, tcp_t *tcp, const int mss,
		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
		    const int num_sack_blk, int *usable, uint_t *snxt,
		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
		    const int mdt_thres);
static int	tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
		    const int num_sack_blk, int *usable, uint_t *snxt,
		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
		    const int mdt_thres);
static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
		    int num_sack_blk);
static void	tcp_wsrv(queue_t *q);
static int	tcp_xmit_end(tcp_t *tcp);
static mblk_t	*tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send,
		    int32_t *offset, mblk_t **end_mp, uint32_t seq,
		    boolean_t sendall, uint32_t *seg_len, boolean_t rexmit);
static void	tcp_ack_timer(void *arg);
static mblk_t	*tcp_ack_mp(tcp_t *tcp);
static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
		    uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len,
		    zoneid_t zoneid);
static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
		    uint32_t ack, int ctl);
static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr);
static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr);
static int	setmaxps(queue_t *q, int maxpsz);
static void	tcp_set_rto(tcp_t *, time_t);
static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
		    boolean_t, boolean_t);
static void	tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
		    boolean_t ipsec_mctl);
static mblk_t	*tcp_setsockopt_mp(int level, int cmd,
		    char *opt, int optlen);
static int	tcp_build_hdrs(queue_t *, tcp_t *);
static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
		    uint32_t seg_seq, uint32_t seg_ack, int seg_len,
		    tcph_t *tcph);
boolean_t	tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
boolean_t	tcp_reserved_port_add(int, in_port_t *, in_port_t *);
boolean_t	tcp_reserved_port_del(in_port_t, in_port_t);
boolean_t	tcp_reserved_port_check(in_port_t);
static tcp_t	*tcp_alloc_temp_tcp(in_port_t);
static int	tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
static mblk_t	*tcp_mdt_info_mp(mblk_t *);
static void	tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
static int	tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
		    const boolean_t, const uint32_t, const uint32_t,
		    const uint32_t, const uint32_t);
static void	tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
		    const uint_t, const uint_t, boolean_t *);
static void	tcp_send_data(tcp_t *, queue_t *, mblk_t *);
extern mblk_t	*tcp_timermp_alloc(int);
extern void	tcp_timermp_free(tcp_t *);
static void	tcp_timer_free(tcp_t *tcp, mblk_t *mp);
static void	tcp_stop_lingering(tcp_t *tcp);
static void	tcp_close_linger_timeout(void *arg);
void		tcp_ddi_init(void);
void		tcp_ddi_destroy(void);
static void	tcp_kstat_init(void);
static void	tcp_kstat_fini(void);
static int	tcp_kstat_update(kstat_t *kp, int rw);
void		tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
static int	tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
		    tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
static int	tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
		    tcph_t *tcph, mblk_t *idmp);
static squeue_func_t tcp_squeue_switch(int);

static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *);
static int	tcp_close(queue_t *, int);
static int	tcpclose_accept(queue_t *);
static int	tcp_modclose(queue_t *);
static void	tcp_wput_mod(queue_t *, mblk_t *);

static void	tcp_squeue_add(squeue_t *);
static boolean_t tcp_zcopy_check(tcp_t *);
static void	tcp_zcopy_notify(tcp_t *);
static mblk_t	*tcp_zcopy_disable(tcp_t *, mblk_t *);
static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
static void	tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);

extern void	tcp_kssl_input(tcp_t *, mblk_t *);

/*
 * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
 *
 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
 * (defined in tcp.h) needs to be filled in and passed into the kernel
 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
 * structure contains the four-tuple of a TCP connection and a range of TCP
 * states (specified by ac_start and ac_end). The use of wildcard addresses
 * and ports is allowed. Connections with a matching four-tuple and a state
 * within the specified range will be aborted. The valid states for the
 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
 * inclusive.
 *
 * An application which has its connection aborted by this ioctl will receive
 * an error that is dependent on the connection state at the time of the abort.
 * If the connection state is < TCPS_TIME_WAIT, an application should behave as
 * though an RST packet has been received.  If the connection state is equal to
 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
 * and all resources associated with the connection will be freed.
 */
static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
static void	tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *);
static void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
		    boolean_t);
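
/*
 * A userland sketch of driving TCP_IOC_ABORT_CONN (illustrative only;
 * tcp_fd is a hypothetical descriptor on /dev/tcp, and the four-tuple
 * fields of tcp_ioc_abort_conn_t are elided - see tcp.h for the exact
 * layout and streamio(7I) for I_STR):
 *
 *	tcp_ioc_abort_conn_t ac;
 *	struct strioctl si;
 *
 *	(void) memset(&ac, 0, sizeof (ac));
 *	... fill in the local/remote addresses and ports, wildcards OK ...
 *	ac.ac_start = TCPS_SYN_SENT;
 *	ac.ac_end = TCPS_TIME_WAIT;
 *
 *	si.ic_cmd = TCP_IOC_ABORT_CONN;
 *	si.ic_timout = -1;
 *	si.ic_len = sizeof (ac);
 *	si.ic_dp = (char *)&ac;
 *	if (ioctl(tcp_fd, I_STR, &si) < 0)
 *		... no matching connections, or caller lacks privilege ...
 */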

static struct module_info tcp_rinfo =  {
	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
};

static struct module_info tcp_winfo =  {
	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
};

/*
 * Entry points for TCP as a module. It only allows SNMP requests
 * to pass through.
 */
struct qinit tcp_mod_rinit = {
	(pfi_t)putnext, NULL, tcp_open, ip_snmpmod_close, NULL, &tcp_rinfo,
};

struct qinit tcp_mod_winit = {
	(pfi_t)ip_snmpmod_wput, NULL, tcp_open, ip_snmpmod_close, NULL,
	&tcp_rinfo
};

/*
 * Entry points for TCP as a device. The normal case which supports
 * the TCP functionality.
 */
struct qinit tcp_rinit = {
	NULL, (pfi_t)tcp_rsrv, tcp_open, tcp_close, NULL, &tcp_rinfo
};

struct qinit tcp_winit = {
	(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};

/* Initial entry point for TCP in socket mode. */
struct qinit tcp_sock_winit = {
	(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};

/*
 * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
 * an accept. Avoid allocating data structures since the eager has already
 * been created.
 */
struct qinit tcp_acceptor_rinit = {
	NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo
};

struct qinit tcp_acceptor_winit = {
	(pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
};

/*
 * Entry points for TCP loopback (read side only)
 */
struct qinit tcp_loopback_rinit = {
	(pfi_t)0, (pfi_t)tcp_rsrv, tcp_open, tcp_close, (pfi_t)0,
	&tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
};

struct streamtab tcpinfo = {
	&tcp_rinit, &tcp_winit
};

extern squeue_func_t tcp_squeue_wput_proc;
extern squeue_func_t tcp_squeue_timer_proc;

/* Protected by tcp_g_q_lock */
static queue_t	*tcp_g_q;	/* Default queue used during detached closes */
kmutex_t tcp_g_q_lock;

/* Protected by tcp_hsp_lock */
/*
 * XXX The host param mechanism should go away and instead we should use
 * the metrics associated with the routes to determine the default sndspace
 * and rcvspace.
 */
static tcp_hsp_t **tcp_hsp_hash;	/* Hash table for HSPs */
krwlock_t tcp_hsp_lock;

/*
 * Extra privileged ports. In host byte order.
 * Protected by tcp_epriv_port_lock.
 */
#define	TCP_NUM_EPRIV_PORTS	64
static int	tcp_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS;
static uint16_t	tcp_g_epriv_ports[TCP_NUM_EPRIV_PORTS] = { 2049, 4045 };
kmutex_t tcp_epriv_port_lock;

/*
 * The smallest anonymous port in the privileged port range in which TCP
 * looks for a free port.  Used with the TCP_ANONPRIVBIND option.
 */
static in_port_t tcp_min_anonpriv_port = 512;

/* Only modified during _init and _fini thus no locking is needed. */
static caddr_t	tcp_g_nd;	/* Head of 'named dispatch' variable list */

/* Hint not protected by any lock */
static uint_t	tcp_next_port_to_try;


/* TCP bind hash list - all tcp_t with state >= BOUND. */
tf_t	tcp_bind_fanout[TCP_BIND_FANOUT_SIZE];

/* TCP queue hash list - all tcp_t in case they will be an acceptor. */
static tf_t	tcp_acceptor_fanout[TCP_FANOUT_SIZE];

/*
 * TCP has a private interface for other kernel modules to reserve a
 * port range for them to use.  Once reserved, TCP will not use any ports
 * in the range.  This interface relies on the TCP_EXCLBIND feature.  If
 * the semantics of TCP_EXCLBIND is changed, the implementation of this
 * interface has to be verified.
 *
 * There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges.  Each port
 * range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports.  A port
 * range is [port a, port b] inclusive.  And each port range is between
 * TCP_SMALLEST_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive.
 *
 * Note that the default anonymous port range starts from 32768.  There is
 * no port "collision" between that and the reserved port range.  If there
 * is port collision (because the default smallest anonymous port is lowered
 * or some apps specifically bind to ports in the reserved port range), the
 * system may not be able to reserve a port range even if there are enough
 * unbound ports, since a reserved port range contains consecutive ports.
 */
#define	TCP_RESERVED_PORTS_ARRAY_MAX_SIZE	5
#define	TCP_RESERVED_PORTS_RANGE_MAX		1000
#define	TCP_SMALLEST_RESERVED_PORT		10240
#define	TCP_LARGEST_RESERVED_PORT		20480

/* Structure to represent those reserved port ranges. */
typedef struct tcp_rport_s {
	in_port_t	lo_port;
	in_port_t	hi_port;
	tcp_t		**temp_tcp_array;
} tcp_rport_t;

/* The reserved port array. */
static tcp_rport_t tcp_reserved_port[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE];

/* Locks to protect the tcp_reserved_ports array. */
static krwlock_t tcp_reserved_port_lock;

/* The number of ranges in the array. */
uint32_t tcp_reserved_port_array_size = 0;

/*
 * MIB-2 stuff for SNMP
 * Note: tcpInErrs {tcp 15} is accumulated in ip.c
 */
mib2_tcp_t	tcp_mib;	/* SNMP fixed size info */
kstat_t		*tcp_mibkp;	/* kstat exporting tcp_mib data */

boolean_t tcp_icmp_source_quench = B_FALSE;
/*
 * Following assumes TPI alignment requirements stay along 32 bit
 * boundaries
 */
#define	ROUNDUP32(x) \
	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))

/* Template for response to info request. */
static struct T_info_ack tcp_g_t_info_ack = {
	T_INFO_ACK,		/* PRIM_type */
	0,			/* TSDU_size */
	T_INFINITE,		/* ETSDU_size */
	T_INVALID,		/* CDATA_size */
	T_INVALID,		/* DDATA_size */
	sizeof (sin_t),		/* ADDR_size */
	0,			/* OPT_size - not initialized here */
	TIDUSZ,			/* TIDU_size */
	T_COTS_ORD,		/* SERV_type */
	TCPS_IDLE,		/* CURRENT_state */
	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
};

static struct T_info_ack tcp_g_t_info_ack_v6 = {
	T_INFO_ACK,		/* PRIM_type */
	0,			/* TSDU_size */
	T_INFINITE,		/* ETSDU_size */
	T_INVALID,		/* CDATA_size */
	T_INVALID,		/* DDATA_size */
	sizeof (sin6_t),	/* ADDR_size */
	0,			/* OPT_size - not initialized here */
	TIDUSZ,			/* TIDU_size */
	T_COTS_ORD,		/* SERV_type */
	TCPS_IDLE,		/* CURRENT_state */
	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
};

#define	MS	1L
#define	SECONDS	(1000 * MS)
#define	MINUTES	(60 * SECONDS)
#define	HOURS	(60 * MINUTES)
#define	DAYS	(24 * HOURS)

#define	PARAM_MAX (~(uint32_t)0)

/* Max size IP datagram is 64k - 1 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
#define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
/* Max of the above */
#define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4

/* Largest TCP port number */
#define	TCP_MAX_PORT	(64 * 1024 - 1)

/*
 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
 * layer header.  It has to be a multiple of 4.
 */
static tcpparam_t tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
#define	tcp_wroff_xtra	tcp_wroff_xtra_param.tcp_param_val

/*
 * All of these are alterable, within the min/max values given, at run time.
 * Note that the default value of "tcp_time_wait_interval" is four minutes,
 * per the TCP spec.
 */
/* BEGIN CSTYLED */
tcpparam_t	tcp_param_arr[] = {
 /*min		max		value		name */
 { 1*SECONDS,	10*MINUTES,	1*MINUTES,	"tcp_time_wait_interval"},
 { 1,		PARAM_MAX,	128,		"tcp_conn_req_max_q" },
 { 0,		PARAM_MAX,	1024,		"tcp_conn_req_max_q0" },
 { 1,		1024,		1,		"tcp_conn_req_min" },
 { 0*MS,	20*SECONDS,	0*MS,		"tcp_conn_grace_period" },
 { 128,		(1<<30),	1024*1024,	"tcp_cwnd_max" },
 { 0,		10,		0,		"tcp_debug" },
 { 1024,	(32*1024),	1024,		"tcp_smallest_nonpriv_port"},
 { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_cinterval"},
 { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_linterval"},
 { 500*MS,	PARAM_MAX,	8*MINUTES,	"tcp_ip_abort_interval"},
 { 1*SECONDS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_cinterval"},
 { 500*MS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_interval"},
 { 1,		255,		64,		"tcp_ipv4_ttl"},
 { 10*SECONDS,	10*DAYS,	2*HOURS,	"tcp_keepalive_interval"},
 { 0,		100,		10,		"tcp_maxpsz_multiplier" },
 { 1,		TCP_MSS_MAX_IPV4, 536,		"tcp_mss_def_ipv4"},
 { 1,		TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
 { 1,		TCP_MSS_MAX,	108,		"tcp_mss_min"},
 { 1,		(64*1024)-1,	(4*1024)-1,	"tcp_naglim_def"},
 { 1*MS,	20*SECONDS,	3*SECONDS,	"tcp_rexmit_interval_initial"},
 { 1*MS,	2*HOURS,	60*SECONDS,	"tcp_rexmit_interval_max"},
 { 1*MS,	2*HOURS,	400*MS,		"tcp_rexmit_interval_min"},
 { 1*MS,	1*MINUTES,	100*MS,		"tcp_deferred_ack_interval" },
 { 0,		16,		0,		"tcp_snd_lowat_fraction" },
 { 0,		128000,		0,		"tcp_sth_rcv_hiwat" },
 { 0,		128000,		0,		"tcp_sth_rcv_lowat" },
 { 1,		10000,		3,		"tcp_dupack_fast_retransmit" },
 { 0,		1,		0,		"tcp_ignore_path_mtu" },
 { 1024,	TCP_MAX_PORT,	32*1024,	"tcp_smallest_anon_port"},
 { 1024,	TCP_MAX_PORT,	TCP_MAX_PORT,	"tcp_largest_anon_port"},
 { TCP_XMIT_LOWATER, (1<<30),	TCP_XMIT_HIWATER, "tcp_xmit_hiwat"},
 { TCP_XMIT_LOWATER, (1<<30),	TCP_XMIT_LOWATER, "tcp_xmit_lowat"},
 { TCP_RECV_LOWATER, (1<<30),	TCP_RECV_HIWATER, "tcp_recv_hiwat"},
 { 1,		65536,		4,		"tcp_recv_hiwat_minmss"},
 { 1*SECONDS,	PARAM_MAX,	675*SECONDS,	"tcp_fin_wait_2_flush_interval"},
 { 0,		TCP_MSS_MAX,	64,		"tcp_co_min"},
 { 8192,	(1<<30),	1024*1024,	"tcp_max_buf"},
/*
 * Question:  What default value should I set for tcp_strong_iss?
 */
 { 0,		2,		1,		"tcp_strong_iss"},
 { 0,		65536,		20,		"tcp_rtt_updates"},
 { 0,		1,		1,		"tcp_wscale_always"},
 { 0,		1,		0,		"tcp_tstamp_always"},
 { 0,		1,		1,		"tcp_tstamp_if_wscale"},
 { 0*MS,	2*HOURS,	0*MS,		"tcp_rexmit_interval_extra"},
 { 0,		16,		2,		"tcp_deferred_acks_max"},
 { 1,		16384,		4,		"tcp_slow_start_after_idle"},
 { 1,		4,		4,		"tcp_slow_start_initial"},
 { 10*MS,	50*MS,		20*MS,		"tcp_co_timer_interval"},
 { 0,		2,		2,		"tcp_sack_permitted"},
 { 0,		1,		0,		"tcp_trace"},
 { 0,		1,		1,		"tcp_compression_enabled"},
 { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"},
 { 1,		TCP_MSS_MAX_IPV6, 1220,		"tcp_mss_def_ipv6"},
 { 1,		TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
 { 0,		1,		0,		"tcp_rev_src_routes"},
 { 10*MS,	500*MS,		50*MS,		"tcp_local_dack_interval"},
 { 100*MS,	60*SECONDS,	1*SECONDS,	"tcp_ndd_get_info_interval"},
 { 0,		16,		8,		"tcp_local_dacks_max"},
 { 0,		2,		1,		"tcp_ecn_permitted"},
 { 0,		1,		1,		"tcp_rst_sent_rate_enabled"},
 { 0,		PARAM_MAX,	40,		"tcp_rst_sent_rate"},
 { 0,		100*MS,		50*MS,		"tcp_push_timer_interval"},
 { 0,		1,		0,		"tcp_use_smss_as_mss_opt"},
 { 0,		PARAM_MAX,	8*MINUTES,	"tcp_keepalive_abort_interval"},
};
/* END CSTYLED */
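
/*
 * Every entry in tcp_param_arr is exposed as an ndd(1M) tunable on
 * /dev/tcp, subject to the min/max columns above.  An administrative
 * sketch (values shown are only examples):
 *
 *	ndd /dev/tcp tcp_strong_iss		report the current value
 *	ndd -set /dev/tcp tcp_strong_iss 2	switch to RFC 1948 style ISS
 */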

/*
 * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
 * each header fragment in the header buffer.  Each parameter value has
 * to be a multiple of 4 (32-bit aligned).
 */
static tcpparam_t tcp_mdt_head_param = { 32, 256, 32, "tcp_mdt_hdr_head_min" };
static tcpparam_t tcp_mdt_tail_param = { 0, 256, 32, "tcp_mdt_hdr_tail_min" };
#define	tcp_mdt_hdr_head_min	tcp_mdt_head_param.tcp_param_val
#define	tcp_mdt_hdr_tail_min	tcp_mdt_tail_param.tcp_param_val

/*
 * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out
 * the maximum number of payload buffers associated per Multidata.
 */
static tcpparam_t tcp_mdt_max_pbufs_param =
	{ 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" };
#define	tcp_mdt_max_pbufs	tcp_mdt_max_pbufs_param.tcp_param_val

/* Round up the value to the nearest mss. */
#define	MSS_ROUNDUP(value, mss)		((((value) - 1) / (mss) + 1) * (mss))

/*
 * Set ECN capable transport (ECT) code point in IP header.
 *
 * Note that there are 2 ECT code points '01' and '10', which are called
 * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 * point ECT(0) for TCP as described in RFC 2481.
 */
#define	SET_ECT(tcp, iph) \
	if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
		/* We need to clear the code point first. */ \
		((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
		((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
	} else { \
		((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
		((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
	}

/*
 * The format argument to pass to tcp_display().
 * DISP_PORT_ONLY means that the returned string has only port info.
 * DISP_ADDR_AND_PORT means that the returned string also contains the
 * remote and local IP address.
 */
#define	DISP_PORT_ONLY		1
#define	DISP_ADDR_AND_PORT	2

/*
 * This controls the rate at which some ndd info report functions can be used
 * by non-privileged users.  It stores the last time such info was
 * requested.
When those report functions are called again, this 1364 * is checked against the current time and compared with the ndd param 1365 * tcp_ndd_get_info_interval. 1366 */ 1367 static clock_t tcp_last_ndd_get_info_time = 0; 1368 #define NDD_TOO_QUICK_MSG \ 1369 "ndd get info rate too high for non-privileged users, try again " \ 1370 "later.\n" 1371 #define NDD_OUT_OF_BUF_MSG "<< Out of buffer >>\n" 1372 1373 #define IS_VMLOANED_MBLK(mp) \ 1374 (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) 1375 1376 /* 1377 * These two variables control the rate for TCP to generate RSTs in 1378 * response to segments not belonging to any connections. We limit 1379 * TCP to sending out tcp_rst_sent_rate (ndd param) number of RSTs in 1380 * each 1 second interval. This is to protect TCP against DoS attacks. 1381 */ 1382 static clock_t tcp_last_rst_intrvl; 1383 static uint32_t tcp_rst_cnt; 1384 1385 /* The number of RSTs not sent because of the rate limit. */ 1386 static uint32_t tcp_rst_unsent; 1387 1388 /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ 1389 boolean_t tcp_mdt_chain = B_TRUE; 1390 1391 /* 1392 * MDT threshold in the form of effective send MSS multiplier; we take 1393 * the MDT path if the amount of unsent data exceeds the threshold value 1394 * (default threshold is 1*SMSS). 1395 */ 1396 uint_t tcp_mdt_smss_threshold = 1; 1397 1398 uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ 1399 1400 /* 1401 * Forces all connections to obey the value of the tcp_maxpsz_multiplier 1402 * tunable settable via NDD. Otherwise, the per-connection behavior is 1403 * determined dynamically during tcp_adapt_ire(), which is the default. 1404 */ 1405 boolean_t tcp_static_maxpsz = B_FALSE; 1406 1407 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ 1408 uint32_t tcp_random_anon_port = 1; 1409 1410 /* 1411 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more 1412 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent 1413 * data, TCP will not respond with an ACK. RFC 793 requires that 1414 * TCP respond with an ACK for such a bogus ACK. By not following 1415 * the RFC, we prevent TCP from getting into an ACK storm if somehow 1416 * an attacker successfully spoofs an acceptable segment to our 1417 * peer; or when our peer is "confused." 1418 */ 1419 uint32_t tcp_drop_ack_unsent_cnt = 10; 1420 1421 /* 1422 * Hook functions to enable cluster networking. 1423 * On non-clustered systems these vectors must always be NULL.
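/*
 * A small stand-alone sketch of the one-second RST budget described
 * above.  The demo_* names are made up and a wall-clock second stands in
 * for the lbolt-based interval; only the shape of the check is the same:
 * reset the counter when a new interval starts, and once the budget is
 * spent, count the RST as unsent instead of sending it.
 */
#include <stdio.h>
#include <time.h>
#include <stdint.h>
#include <stdbool.h>

static time_t	demo_last_rst_interval;		/* start of current window */
static uint32_t	demo_rst_cnt;			/* RSTs sent in this window */
static uint32_t	demo_rst_unsent;		/* RSTs suppressed so far */
static uint32_t	demo_rst_rate = 40;		/* per-second budget */

static bool
demo_may_send_rst(void)
{
	time_t now = time(NULL);

	if (now != demo_last_rst_interval) {
		demo_last_rst_interval = now;	/* new interval: reset budget */
		demo_rst_cnt = 0;
	}
	if (demo_rst_cnt >= demo_rst_rate) {
		demo_rst_unsent++;		/* over budget: drop this RST */
		return (false);
	}
	demo_rst_cnt++;
	return (true);
}

int
main(void)
{
	int i, sent = 0;

	for (i = 0; i < 100; i++) {
		if (demo_may_send_rst())
			sent++;
	}
	(void) printf("sent %d, suppressed %u\n", sent, demo_rst_unsent);
	return (0);
}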
1424 */ 1425 1426 void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family, 1427 uint8_t *laddrp, in_port_t lport) = NULL; 1428 void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family, 1429 uint8_t *laddrp, in_port_t lport) = NULL; 1430 void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family, 1431 uint8_t *laddrp, in_port_t lport, 1432 uint8_t *faddrp, in_port_t fport) = NULL; 1433 void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family, 1434 uint8_t *laddrp, in_port_t lport, 1435 uint8_t *faddrp, in_port_t fport) = NULL; 1436 1437 /* 1438 * The following are defined in ip.c 1439 */ 1440 extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family, 1441 uint8_t *laddrp); 1442 extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, 1443 uint8_t *laddrp, uint8_t *faddrp); 1444 1445 #define CL_INET_CONNECT(tcp) { \ 1446 if (cl_inet_connect != NULL) { \ 1447 /* \ 1448 * Running in cluster mode - register active connection \ 1449 * information \ 1450 */ \ 1451 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1452 if ((tcp)->tcp_ipha->ipha_src != 0) { \ 1453 (*cl_inet_connect)(IPPROTO_TCP, AF_INET,\ 1454 (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\ 1455 (in_port_t)(tcp)->tcp_lport, \ 1456 (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ 1457 (in_port_t)(tcp)->tcp_fport); \ 1458 } \ 1459 } else { \ 1460 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1461 &(tcp)->tcp_ip6h->ip6_src)) {\ 1462 (*cl_inet_connect)(IPPROTO_TCP, AF_INET6,\ 1463 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\ 1464 (in_port_t)(tcp)->tcp_lport, \ 1465 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ 1466 (in_port_t)(tcp)->tcp_fport); \ 1467 } \ 1468 } \ 1469 } \ 1470 } 1471 1472 #define CL_INET_DISCONNECT(tcp) { \ 1473 if (cl_inet_disconnect != NULL) { \ 1474 /* \ 1475 * Running in cluster mode - deregister active \ 1476 * connection information \ 1477 */ \ 1478 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1479 if ((tcp)->tcp_ip_src != 0) { \ 1480 (*cl_inet_disconnect)(IPPROTO_TCP, \ 1481 AF_INET, \ 1482 (uint8_t *)(&((tcp)->tcp_ip_src)),\ 1483 (in_port_t)(tcp)->tcp_lport, \ 1484 (uint8_t *) \ 1485 (&((tcp)->tcp_ipha->ipha_dst)),\ 1486 (in_port_t)(tcp)->tcp_fport); \ 1487 } \ 1488 } else { \ 1489 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1490 &(tcp)->tcp_ip_src_v6)) { \ 1491 (*cl_inet_disconnect)(IPPROTO_TCP, AF_INET6,\ 1492 (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\ 1493 (in_port_t)(tcp)->tcp_lport, \ 1494 (uint8_t *) \ 1495 (&((tcp)->tcp_ip6h->ip6_dst)),\ 1496 (in_port_t)(tcp)->tcp_fport); \ 1497 } \ 1498 } \ 1499 } \ 1500 } 1501 1502 /* 1503 * Cluster networking hook for traversing current connection list. 1504 * This routine is used to extract the current list of live connections 1505 * which must continue to to be dispatched to this node. 1506 */ 1507 int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg); 1508 1509 /* 1510 * Figure out the value of window scale opton. Note that the rwnd is 1511 * ASSUMED to be rounded up to the nearest MSS before the calculation. 1512 * We cannot find the scale value and then do a round up of tcp_rwnd 1513 * because the scale value may not be correct after that. 1514 * 1515 * Set the compiler flag to make this function inline. 
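/*
 * The cluster hooks above are plain function-pointer vectors that stay
 * NULL on non-clustered systems, so every caller tests for NULL before
 * invoking them.  A minimal user-space illustration of that pattern
 * (hypothetical demo_* names; not the real registration interface):
 */
#include <stdio.h>
#include <netinet/in.h>

static void (*demo_inet_connect_hook)(int, in_port_t, in_port_t) = NULL;

static void
demo_register_hook(void (*fn)(int, in_port_t, in_port_t))
{
	demo_inet_connect_hook = fn;
}

static void
demo_connect(in_port_t lport, in_port_t fport)
{
	/* ... normal connection setup would go here ... */
	if (demo_inet_connect_hook != NULL)
		(*demo_inet_connect_hook)(IPPROTO_TCP, lport, fport);
}

static void
demo_cluster_connect(int proto, in_port_t lport, in_port_t fport)
{
	(void) printf("cluster hook: proto %d, %u -> %u\n", proto,
	    (unsigned int)lport, (unsigned int)fport);
}

int
main(void)
{
	demo_connect(32768, 80);		/* hook NULL: nothing extra */
	demo_register_hook(demo_cluster_connect);
	demo_connect(32769, 80);		/* hook registered: it fires */
	return (0);
}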
1516 */ 1517 static void 1518 tcp_set_ws_value(tcp_t *tcp) 1519 { 1520 int i; 1521 uint32_t rwnd = tcp->tcp_rwnd; 1522 1523 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; 1524 i++, rwnd >>= 1) 1525 ; 1526 tcp->tcp_rcv_ws = i; 1527 } 1528 1529 /* 1530 * Remove a connection from the list of detached TIME_WAIT connections. 1531 */ 1532 static void 1533 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) 1534 { 1535 boolean_t locked = B_FALSE; 1536 1537 if (tcp_time_wait == NULL) { 1538 tcp_time_wait = *((tcp_squeue_priv_t **) 1539 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); 1540 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1541 locked = B_TRUE; 1542 } 1543 1544 if (tcp->tcp_time_wait_expire == 0) { 1545 ASSERT(tcp->tcp_time_wait_next == NULL); 1546 ASSERT(tcp->tcp_time_wait_prev == NULL); 1547 if (locked) 1548 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1549 return; 1550 } 1551 ASSERT(TCP_IS_DETACHED(tcp)); 1552 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1553 1554 if (tcp == tcp_time_wait->tcp_time_wait_head) { 1555 ASSERT(tcp->tcp_time_wait_prev == NULL); 1556 tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; 1557 if (tcp_time_wait->tcp_time_wait_head != NULL) { 1558 tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = 1559 NULL; 1560 } else { 1561 tcp_time_wait->tcp_time_wait_tail = NULL; 1562 } 1563 } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { 1564 ASSERT(tcp != tcp_time_wait->tcp_time_wait_head); 1565 ASSERT(tcp->tcp_time_wait_next == NULL); 1566 tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; 1567 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1568 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; 1569 } else { 1570 ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 1571 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 1572 tcp->tcp_time_wait_prev->tcp_time_wait_next = 1573 tcp->tcp_time_wait_next; 1574 tcp->tcp_time_wait_next->tcp_time_wait_prev = 1575 tcp->tcp_time_wait_prev; 1576 } 1577 tcp->tcp_time_wait_next = NULL; 1578 tcp->tcp_time_wait_prev = NULL; 1579 tcp->tcp_time_wait_expire = 0; 1580 1581 if (locked) 1582 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1583 } 1584 1585 /* 1586 * Add a connection to the list of detached TIME_WAIT connections 1587 * and set its time to expire. 1588 */ 1589 static void 1590 tcp_time_wait_append(tcp_t *tcp) 1591 { 1592 tcp_squeue_priv_t *tcp_time_wait = 1593 *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, 1594 SQPRIVATE_TCP)); 1595 1596 tcp_timers_stop(tcp); 1597 1598 /* Freed above */ 1599 ASSERT(tcp->tcp_timer_tid == 0); 1600 ASSERT(tcp->tcp_ack_tid == 0); 1601 1602 /* must have happened at the time of detaching the tcp */ 1603 ASSERT(tcp->tcp_ptpahn == NULL); 1604 ASSERT(tcp->tcp_flow_stopped == 0); 1605 ASSERT(tcp->tcp_time_wait_next == NULL); 1606 ASSERT(tcp->tcp_time_wait_prev == NULL); 1607 ASSERT(tcp->tcp_time_wait_expire == NULL); 1608 ASSERT(tcp->tcp_listener == NULL); 1609 1610 tcp->tcp_time_wait_expire = ddi_get_lbolt(); 1611 /* 1612 * The value computed below in tcp->tcp_time_wait_expire may 1613 * appear negative or wrap around. That is ok since our 1614 * interest is only in the difference between the current lbolt 1615 * value and tcp->tcp_time_wait_expire. But the value should not 1616 * be zero, since it means the tcp is not in the TIME_WAIT list. 1617 * The corresponding comparison in tcp_time_wait_collector() uses 1618 * modular arithmetic. 
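/*
 * A self-contained version of the shift loop in tcp_set_ws_value() above:
 * keep halving the (already MSS-rounded) receive window until it fits in
 * the 16-bit window field, and use the number of shifts as the window
 * scale.  The two limits are redefined locally so the sketch compiles on
 * its own; the real values come from the TCP headers.
 */
#include <stdio.h>
#include <stdint.h>

#define	DEMO_TCP_MAXWIN		65535U	/* largest unscaled window */
#define	DEMO_TCP_MAX_WINSHIFT	14	/* RFC 1323 cap on the shift */

static int
demo_ws_value(uint32_t rwnd)
{
	int i;

	for (i = 0; rwnd > DEMO_TCP_MAXWIN && i < DEMO_TCP_MAX_WINSHIFT;
	    i++, rwnd >>= 1)
		;
	return (i);
}

int
main(void)
{
	(void) printf("rwnd 48K  -> ws %d\n", demo_ws_value(48 * 1024)); /* 0 */
	(void) printf("rwnd 256K -> ws %d\n", demo_ws_value(256 * 1024)); /* 3 */
	(void) printf("rwnd 4M   -> ws %d\n",
	    demo_ws_value(4U * 1024 * 1024));				/* 7 */
	return (0);
}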
1619 */ 1620 tcp->tcp_time_wait_expire += 1621 drv_usectohz(tcp_time_wait_interval * 1000); 1622 if (tcp->tcp_time_wait_expire == 0) 1623 tcp->tcp_time_wait_expire = 1; 1624 1625 ASSERT(TCP_IS_DETACHED(tcp)); 1626 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1627 ASSERT(tcp->tcp_time_wait_next == NULL); 1628 ASSERT(tcp->tcp_time_wait_prev == NULL); 1629 TCP_DBGSTAT(tcp_time_wait); 1630 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1631 if (tcp_time_wait->tcp_time_wait_head == NULL) { 1632 ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); 1633 tcp_time_wait->tcp_time_wait_head = tcp; 1634 } else { 1635 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1636 ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == 1637 TCPS_TIME_WAIT); 1638 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; 1639 tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; 1640 } 1641 tcp_time_wait->tcp_time_wait_tail = tcp; 1642 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1643 } 1644 1645 /* ARGSUSED */ 1646 void 1647 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) 1648 { 1649 conn_t *connp = (conn_t *)arg; 1650 tcp_t *tcp = connp->conn_tcp; 1651 1652 ASSERT(tcp != NULL); 1653 if (tcp->tcp_state == TCPS_CLOSED) { 1654 return; 1655 } 1656 1657 ASSERT((tcp->tcp_family == AF_INET && 1658 tcp->tcp_ipversion == IPV4_VERSION) || 1659 (tcp->tcp_family == AF_INET6 && 1660 (tcp->tcp_ipversion == IPV4_VERSION || 1661 tcp->tcp_ipversion == IPV6_VERSION))); 1662 ASSERT(!tcp->tcp_listener); 1663 1664 TCP_STAT(tcp_time_wait_reap); 1665 ASSERT(TCP_IS_DETACHED(tcp)); 1666 1667 /* 1668 * Because they have no upstream client to rebind or tcp_close() 1669 * them later, we axe the connection here and now. 1670 */ 1671 tcp_close_detached(tcp); 1672 } 1673 1674 void 1675 tcp_cleanup(tcp_t *tcp) 1676 { 1677 mblk_t *mp; 1678 char *tcp_iphc; 1679 int tcp_iphc_len; 1680 int tcp_hdr_grown; 1681 tcp_sack_info_t *tcp_sack_info; 1682 conn_t *connp = tcp->tcp_connp; 1683 1684 tcp_bind_hash_remove(tcp); 1685 tcp_free(tcp); 1686 1687 /* Release any SSL context */ 1688 if (tcp->tcp_kssl_ent != NULL) { 1689 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 1690 tcp->tcp_kssl_ent = NULL; 1691 } 1692 1693 if (tcp->tcp_kssl_ctx != NULL) { 1694 kssl_release_ctx(tcp->tcp_kssl_ctx); 1695 tcp->tcp_kssl_ctx = NULL; 1696 } 1697 tcp->tcp_kssl_pending = B_FALSE; 1698 1699 conn_delete_ire(connp, NULL); 1700 if (connp->conn_flags & IPCL_TCPCONN) { 1701 if (connp->conn_latch != NULL) 1702 IPLATCH_REFRELE(connp->conn_latch); 1703 if (connp->conn_policy != NULL) 1704 IPPH_REFRELE(connp->conn_policy); 1705 } 1706 1707 /* 1708 * Since we will bzero the entire structure, we need to 1709 * remove it and reinsert it in global hash list. We 1710 * know the walkers can't get to this conn because we 1711 * had set CONDEMNED flag earlier and checked reference 1712 * under conn_lock so walker won't pick it and when we 1713 * go the ipcl_globalhash_remove() below, no walker 1714 * can get to it. 
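/*
 * The TIME_WAIT expiry bookkeeping above, and the comparison in
 * tcp_time_wait_collector() further below, rely on modular arithmetic so
 * that a wrapping tick counter (lbolt) still orders correctly.  A hedged
 * user-space sketch of that comparison, using a 32-bit tick type and
 * made-up names:
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Expiry test: the signed view of the unsigned difference handles wrap. */
static bool
demo_expired(uint32_t now, uint32_t expire)
{
	return ((int32_t)(now - expire) >= 0);
}

int
main(void)
{
	uint32_t now = UINT32_MAX - 5;	/* tick counter about to wrap */
	uint32_t expire = now + 10;	/* wraps past zero to a small value */

	(void) printf("before expiry: %d\n", demo_expired(now, expire));
	(void) printf("after expiry:  %d\n", demo_expired(now + 20, expire));
	return (0);
}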
1715 */ 1716 ipcl_globalhash_remove(connp); 1717 1718 /* Save some state */ 1719 mp = tcp->tcp_timercache; 1720 1721 tcp_sack_info = tcp->tcp_sack_info; 1722 tcp_iphc = tcp->tcp_iphc; 1723 tcp_iphc_len = tcp->tcp_iphc_len; 1724 tcp_hdr_grown = tcp->tcp_hdr_grown; 1725 1726 if (connp->conn_cred != NULL) 1727 crfree(connp->conn_cred); 1728 if (connp->conn_peercred != NULL) 1729 crfree(connp->conn_peercred); 1730 bzero(connp, sizeof (conn_t)); 1731 bzero(tcp, sizeof (tcp_t)); 1732 1733 /* restore the state */ 1734 tcp->tcp_timercache = mp; 1735 1736 tcp->tcp_sack_info = tcp_sack_info; 1737 tcp->tcp_iphc = tcp_iphc; 1738 tcp->tcp_iphc_len = tcp_iphc_len; 1739 tcp->tcp_hdr_grown = tcp_hdr_grown; 1740 1741 1742 tcp->tcp_connp = connp; 1743 1744 connp->conn_tcp = tcp; 1745 connp->conn_flags = IPCL_TCPCONN; 1746 connp->conn_state_flags = CONN_INCIPIENT; 1747 connp->conn_ulp = IPPROTO_TCP; 1748 connp->conn_ref = 1; 1749 1750 ipcl_globalhash_insert(connp); 1751 } 1752 1753 /* 1754 * Blows away all tcps whose TIME_WAIT has expired. List traversal 1755 * is done forwards from the head. 1756 */ 1757 /* ARGSUSED */ 1758 void 1759 tcp_time_wait_collector(void *arg) 1760 { 1761 tcp_t *tcp; 1762 clock_t now; 1763 mblk_t *mp; 1764 conn_t *connp; 1765 kmutex_t *lock; 1766 1767 squeue_t *sqp = (squeue_t *)arg; 1768 tcp_squeue_priv_t *tcp_time_wait = 1769 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 1770 1771 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1772 tcp_time_wait->tcp_time_wait_tid = 0; 1773 1774 if (tcp_time_wait->tcp_free_list != NULL && 1775 tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { 1776 TCP_STAT(tcp_freelist_cleanup); 1777 while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { 1778 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 1779 CONN_DEC_REF(tcp->tcp_connp); 1780 } 1781 tcp_time_wait->tcp_free_list_cnt = 0; 1782 } 1783 1784 /* 1785 * In order to reap time waits reliably, we should use a 1786 * source of time that is not adjustable by the user -- hence 1787 * the call to ddi_get_lbolt(). 1788 */ 1789 now = ddi_get_lbolt(); 1790 while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { 1791 /* 1792 * Compare times using modular arithmetic, since 1793 * lbolt can wrapover. 1794 */ 1795 if ((now - tcp->tcp_time_wait_expire) < 0) { 1796 break; 1797 } 1798 1799 tcp_time_wait_remove(tcp, tcp_time_wait); 1800 1801 connp = tcp->tcp_connp; 1802 ASSERT(connp->conn_fanout != NULL); 1803 lock = &connp->conn_fanout->connf_lock; 1804 /* 1805 * This is essentially a TW reclaim fast path optimization for 1806 * performance where the timewait collector checks under the 1807 * fanout lock (so that no one else can get access to the 1808 * conn_t) that the refcnt is 2 i.e. one for TCP and one for 1809 * the classifier hash list. If ref count is indeed 2, we can 1810 * just remove the conn under the fanout lock and avoid 1811 * cleaning up the conn under the squeue, provided that 1812 * clustering callbacks are not enabled. If clustering is 1813 * enabled, we need to make the clustering callback before 1814 * setting the CONDEMNED flag and after dropping all locks and 1815 * so we forego this optimization and fall back to the slow 1816 * path. Also please see the comments in tcp_closei_local 1817 * regarding the refcnt logic. 1818 * 1819 * Since we are holding the tcp_time_wait_lock, its better 1820 * not to block on the fanout_lock because other connections 1821 * can't add themselves to time_wait list. So we do a 1822 * tryenter instead of mutex_enter. 
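/*
 * tcp_cleanup() above recycles a connection by saving the few fields that
 * must survive (preallocated buffers and the like), bzero()ing the whole
 * structure, and restoring the saved pieces.  A toy version of that
 * save/clear/restore pattern, with invented field names:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct demo_conn {
	char	*iph_buf;	/* preallocated header buffer: survives */
	size_t	iph_len;	/* its size: survives */
	int	state;		/* wiped on reuse */
	int	flags;		/* wiped on reuse */
} demo_conn_t;

static void
demo_conn_recycle(demo_conn_t *c)
{
	/* Save the pieces that are expensive to reallocate. */
	char	*iph_buf = c->iph_buf;
	size_t	iph_len = c->iph_len;

	/* Wipe the whole structure in one shot. */
	(void) memset(c, 0, sizeof (*c));

	/* Restore the saved pieces; everything else starts from zero. */
	c->iph_buf = iph_buf;
	c->iph_len = iph_len;
}

int
main(void)
{
	demo_conn_t c = { malloc(64), 64, 7, 0x13 };

	demo_conn_recycle(&c);
	(void) printf("buf %p kept, len %zu, state %d, flags %d\n",
	    (void *)c.iph_buf, c.iph_len, c.state, c.flags);
	free(c.iph_buf);
	return (0);
}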
1823 */ 1824 if (mutex_tryenter(lock)) { 1825 mutex_enter(&connp->conn_lock); 1826 if ((connp->conn_ref == 2) && 1827 (cl_inet_disconnect == NULL)) { 1828 ipcl_hash_remove_locked(connp, 1829 connp->conn_fanout); 1830 /* 1831 * Set the CONDEMNED flag now itself so that 1832 * the refcnt cannot increase due to any 1833 * walker. But we have still not cleaned up 1834 * conn_ire_cache. This is still ok since 1835 * we are going to clean it up in tcp_cleanup 1836 * immediately and any interface unplumb 1837 * thread will wait till the ire is blown away 1838 */ 1839 connp->conn_state_flags |= CONN_CONDEMNED; 1840 mutex_exit(lock); 1841 mutex_exit(&connp->conn_lock); 1842 if (tcp_time_wait->tcp_free_list_cnt < 1843 tcp_free_list_max_cnt) { 1844 /* Add to head of tcp_free_list */ 1845 mutex_exit( 1846 &tcp_time_wait->tcp_time_wait_lock); 1847 tcp_cleanup(tcp); 1848 mutex_enter( 1849 &tcp_time_wait->tcp_time_wait_lock); 1850 tcp->tcp_time_wait_next = 1851 tcp_time_wait->tcp_free_list; 1852 tcp_time_wait->tcp_free_list = tcp; 1853 tcp_time_wait->tcp_free_list_cnt++; 1854 continue; 1855 } else { 1856 /* Do not add to tcp_free_list */ 1857 mutex_exit( 1858 &tcp_time_wait->tcp_time_wait_lock); 1859 tcp_bind_hash_remove(tcp); 1860 conn_delete_ire(tcp->tcp_connp, NULL); 1861 CONN_DEC_REF(tcp->tcp_connp); 1862 } 1863 } else { 1864 CONN_INC_REF_LOCKED(connp); 1865 mutex_exit(lock); 1866 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1867 mutex_exit(&connp->conn_lock); 1868 /* 1869 * We can reuse the closemp here since conn has 1870 * detached (otherwise we wouldn't even be in 1871 * time_wait list). 1872 */ 1873 mp = &tcp->tcp_closemp; 1874 squeue_fill(connp->conn_sqp, mp, 1875 tcp_timewait_output, connp, 1876 SQTAG_TCP_TIMEWAIT); 1877 } 1878 } else { 1879 mutex_enter(&connp->conn_lock); 1880 CONN_INC_REF_LOCKED(connp); 1881 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1882 mutex_exit(&connp->conn_lock); 1883 /* 1884 * We can reuse the closemp here since conn has 1885 * detached (otherwise we wouldn't even be in 1886 * time_wait list). 1887 */ 1888 mp = &tcp->tcp_closemp; 1889 squeue_fill(connp->conn_sqp, mp, 1890 tcp_timewait_output, connp, 0); 1891 } 1892 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1893 } 1894 1895 if (tcp_time_wait->tcp_free_list != NULL) 1896 tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; 1897 1898 tcp_time_wait->tcp_time_wait_tid = 1899 timeout(tcp_time_wait_collector, sqp, TCP_TIME_WAIT_DELAY); 1900 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1901 } 1902 1903 /* 1904 * Reply to a clients T_CONN_RES TPI message. This function 1905 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES 1906 * on the acceptor STREAM and processed in tcp_wput_accept(). 1907 * Read the block comment on top of tcp_conn_request(). 1908 */ 1909 static void 1910 tcp_accept(tcp_t *listener, mblk_t *mp) 1911 { 1912 tcp_t *acceptor; 1913 tcp_t *eager; 1914 tcp_t *tcp; 1915 struct T_conn_res *tcr; 1916 t_uscalar_t acceptor_id; 1917 t_scalar_t seqnum; 1918 mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ 1919 mblk_t *ok_mp; 1920 mblk_t *mp1; 1921 1922 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1923 tcp_err_ack(listener, mp, TPROTO, 0); 1924 return; 1925 } 1926 tcr = (struct T_conn_res *)mp->b_rptr; 1927 1928 /* 1929 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1930 * read side queue of the streams device underneath us i.e. the 1931 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1932 * look it up in the queue_hash. 
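/*
 * Because the collector above already holds tcp_time_wait_lock, it only
 * ever *tries* the fanout lock and defers the work when the trylock
 * fails.  A minimal pthread illustration of that fast-path/hand-off shape
 * (demo_* names are hypothetical; the squeue hand-off is stubbed out by a
 * print):
 */
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t demo_fanout_lock = PTHREAD_MUTEX_INITIALIZER;

static void
demo_defer_cleanup(int id)
{
	/* Stand-in for queueing the work for later. */
	(void) printf("conn %d: deferred\n", id);
}

static void
demo_reap(int id)
{
	if (pthread_mutex_trylock(&demo_fanout_lock) == 0) {
		/* Fast path: got the lock without blocking. */
		(void) printf("conn %d: reaped inline\n", id);
		(void) pthread_mutex_unlock(&demo_fanout_lock);
	} else {
		/* Lock is busy: do not block while holding our own lock. */
		demo_defer_cleanup(id);
	}
}

int
main(void)
{
	demo_reap(1);				/* lock free: inline */
	(void) pthread_mutex_lock(&demo_fanout_lock);
	demo_reap(2);				/* lock held: deferred */
	(void) pthread_mutex_unlock(&demo_fanout_lock);
	return (0);
}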
Under LP64 it sends down the 1933 * minor_t of the accepting endpoint. 1934 * 1935 * Once the acceptor/eager are modified (in tcp_accept_swap) the 1936 * fanout hash lock is held. 1937 * This prevents any thread from entering the acceptor queue from 1938 * below (since it has not been hard bound yet i.e. any inbound 1939 * packets will arrive on the listener or default tcp queue and 1940 * go through tcp_lookup). 1941 * The CONN_INC_REF will prevent the acceptor from closing. 1942 * 1943 * XXX It is still possible for a tli application to send down data 1944 * on the accepting stream while another thread calls t_accept. 1945 * This should not be a problem for well-behaved applications since 1946 * the T_OK_ACK is sent after the queue swapping is completed. 1947 * 1948 * If the accepting fd is the same as the listening fd, avoid 1949 * queue hash lookup since that will return an eager listener in a 1950 * already established state. 1951 */ 1952 acceptor_id = tcr->ACCEPTOR_id; 1953 mutex_enter(&listener->tcp_eager_lock); 1954 if (listener->tcp_acceptor_id == acceptor_id) { 1955 eager = listener->tcp_eager_next_q; 1956 /* only count how many T_CONN_INDs so don't count q0 */ 1957 if ((listener->tcp_conn_req_cnt_q != 1) || 1958 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 1959 mutex_exit(&listener->tcp_eager_lock); 1960 tcp_err_ack(listener, mp, TBADF, 0); 1961 return; 1962 } 1963 if (listener->tcp_conn_req_cnt_q0 != 0) { 1964 /* Throw away all the eagers on q0. */ 1965 tcp_eager_cleanup(listener, 1); 1966 } 1967 if (listener->tcp_syn_defense) { 1968 listener->tcp_syn_defense = B_FALSE; 1969 if (listener->tcp_ip_addr_cache != NULL) { 1970 kmem_free(listener->tcp_ip_addr_cache, 1971 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1972 listener->tcp_ip_addr_cache = NULL; 1973 } 1974 } 1975 /* 1976 * Transfer tcp_conn_req_max to the eager so that when 1977 * a disconnect occurs we can revert the endpoint to the 1978 * listen state. 1979 */ 1980 eager->tcp_conn_req_max = listener->tcp_conn_req_max; 1981 ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 1982 /* 1983 * Get a reference on the acceptor just like the 1984 * tcp_acceptor_hash_lookup below. 1985 */ 1986 acceptor = listener; 1987 CONN_INC_REF(acceptor->tcp_connp); 1988 } else { 1989 acceptor = tcp_acceptor_hash_lookup(acceptor_id); 1990 if (acceptor == NULL) { 1991 if (listener->tcp_debug) { 1992 (void) strlog(TCP_MOD_ID, 0, 1, 1993 SL_ERROR|SL_TRACE, 1994 "tcp_accept: did not find acceptor 0x%x\n", 1995 acceptor_id); 1996 } 1997 mutex_exit(&listener->tcp_eager_lock); 1998 tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 1999 return; 2000 } 2001 /* 2002 * Verify acceptor state. The acceptable states for an acceptor 2003 * include TCPS_IDLE and TCPS_BOUND. 2004 */ 2005 switch (acceptor->tcp_state) { 2006 case TCPS_IDLE: 2007 /* FALLTHRU */ 2008 case TCPS_BOUND: 2009 break; 2010 default: 2011 CONN_DEC_REF(acceptor->tcp_connp); 2012 mutex_exit(&listener->tcp_eager_lock); 2013 tcp_err_ack(listener, mp, TOUTSTATE, 0); 2014 return; 2015 } 2016 } 2017 2018 /* The listener must be in TCPS_LISTEN */ 2019 if (listener->tcp_state != TCPS_LISTEN) { 2020 CONN_DEC_REF(acceptor->tcp_connp); 2021 mutex_exit(&listener->tcp_eager_lock); 2022 tcp_err_ack(listener, mp, TOUTSTATE, 0); 2023 return; 2024 } 2025 2026 /* 2027 * Rendezvous with an eager connection request packet hanging off 2028 * 'tcp' that has the 'seqnum' tag. We tagged the detached open 2029 * tcp structure when the connection packet arrived in 2030 * tcp_conn_request(). 
2031 */ 2032 seqnum = tcr->SEQ_number; 2033 eager = listener; 2034 do { 2035 eager = eager->tcp_eager_next_q; 2036 if (eager == NULL) { 2037 CONN_DEC_REF(acceptor->tcp_connp); 2038 mutex_exit(&listener->tcp_eager_lock); 2039 tcp_err_ack(listener, mp, TBADSEQ, 0); 2040 return; 2041 } 2042 } while (eager->tcp_conn_req_seqnum != seqnum); 2043 mutex_exit(&listener->tcp_eager_lock); 2044 2045 /* 2046 * At this point, both acceptor and listener have 2 ref 2047 * that they begin with. Acceptor has one additional ref 2048 * we placed in lookup while listener has 3 additional 2049 * ref for being behind the squeue (tcp_accept() is 2050 * done on listener's squeue); being in classifier hash; 2051 * and eager's ref on listener. 2052 */ 2053 ASSERT(listener->tcp_connp->conn_ref >= 5); 2054 ASSERT(acceptor->tcp_connp->conn_ref >= 3); 2055 2056 /* 2057 * The eager at this point is set in its own squeue and 2058 * could easily have been killed (tcp_accept_finish will 2059 * deal with that) because of a TH_RST so we can only 2060 * ASSERT for a single ref. 2061 */ 2062 ASSERT(eager->tcp_connp->conn_ref >= 1); 2063 2064 /* Pre allocate the stroptions mblk also */ 2065 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI); 2066 if (opt_mp == NULL) { 2067 CONN_DEC_REF(acceptor->tcp_connp); 2068 CONN_DEC_REF(eager->tcp_connp); 2069 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2070 return; 2071 } 2072 DB_TYPE(opt_mp) = M_SETOPTS; 2073 opt_mp->b_wptr += sizeof (struct stroptions); 2074 2075 /* 2076 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO 2077 * from listener to acceptor. The message is chained on opt_mp 2078 * which will be sent onto eager's squeue. 2079 */ 2080 if (listener->tcp_bound_if != 0) { 2081 /* allocate optmgmt req */ 2082 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6, 2083 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if, 2084 sizeof (int)); 2085 if (mp1 != NULL) 2086 linkb(opt_mp, mp1); 2087 } 2088 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { 2089 uint_t on = 1; 2090 2091 /* allocate optmgmt req */ 2092 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6, 2093 IPV6_RECVPKTINFO, (char *)&on, sizeof (on)); 2094 if (mp1 != NULL) 2095 linkb(opt_mp, mp1); 2096 } 2097 2098 /* Re-use mp1 to hold a copy of mp, in case reallocb fails */ 2099 if ((mp1 = copymsg(mp)) == NULL) { 2100 CONN_DEC_REF(acceptor->tcp_connp); 2101 CONN_DEC_REF(eager->tcp_connp); 2102 freemsg(opt_mp); 2103 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2104 return; 2105 } 2106 2107 tcr = (struct T_conn_res *)mp1->b_rptr; 2108 2109 /* 2110 * This is an expanded version of mi_tpi_ok_ack_alloc() 2111 * which allocates a larger mblk and appends the new 2112 * local address to the ok_ack. The address is copied by 2113 * soaccept() for getsockname(). 2114 */ 2115 { 2116 int extra; 2117 2118 extra = (eager->tcp_family == AF_INET) ? 2119 sizeof (sin_t) : sizeof (sin6_t); 2120 2121 /* 2122 * Try to re-use mp, if possible. Otherwise, allocate 2123 * an mblk and return it as ok_mp. In any case, mp 2124 * is no longer usable upon return. 
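/*
 * The rendezvous above walks the listener's eager queue until it finds
 * the entry whose sequence number matches the SEQ_number from the
 * T_CONN_RES, answering TBADSEQ when no entry matches.  A stand-alone
 * sketch of that search over a singly linked list (hypothetical types;
 * the real list is threaded through tcp_eager_next_q):
 */
#include <stdio.h>
#include <stddef.h>

typedef struct demo_eager {
	unsigned		seqnum;
	struct demo_eager	*next;
} demo_eager_t;

static demo_eager_t *
demo_find_eager(demo_eager_t *head, unsigned seqnum)
{
	demo_eager_t *e;

	for (e = head; e != NULL; e = e->next) {
		if (e->seqnum == seqnum)
			return (e);
	}
	return (NULL);		/* caller answers with TBADSEQ */
}

int
main(void)
{
	demo_eager_t c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

	(void) printf("found %p\n", (void *)demo_find_eager(&a, 2));
	(void) printf("missing %p\n", (void *)demo_find_eager(&a, 9));
	return (0);
}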
2125 */ 2126 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 2127 CONN_DEC_REF(acceptor->tcp_connp); 2128 CONN_DEC_REF(eager->tcp_connp); 2129 freemsg(opt_mp); 2130 /* Original mp has been freed by now, so use mp1 */ 2131 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 2132 return; 2133 } 2134 2135 mp = NULL; /* We should never use mp after this point */ 2136 2137 switch (extra) { 2138 case sizeof (sin_t): { 2139 sin_t *sin = (sin_t *)ok_mp->b_wptr; 2140 2141 ok_mp->b_wptr += extra; 2142 sin->sin_family = AF_INET; 2143 sin->sin_port = eager->tcp_lport; 2144 sin->sin_addr.s_addr = 2145 eager->tcp_ipha->ipha_src; 2146 break; 2147 } 2148 case sizeof (sin6_t): { 2149 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 2150 2151 ok_mp->b_wptr += extra; 2152 sin6->sin6_family = AF_INET6; 2153 sin6->sin6_port = eager->tcp_lport; 2154 if (eager->tcp_ipversion == IPV4_VERSION) { 2155 sin6->sin6_flowinfo = 0; 2156 IN6_IPADDR_TO_V4MAPPED( 2157 eager->tcp_ipha->ipha_src, 2158 &sin6->sin6_addr); 2159 } else { 2160 ASSERT(eager->tcp_ip6h != NULL); 2161 sin6->sin6_flowinfo = 2162 eager->tcp_ip6h->ip6_vcf & 2163 ~IPV6_VERS_AND_FLOW_MASK; 2164 sin6->sin6_addr = 2165 eager->tcp_ip6h->ip6_src; 2166 } 2167 break; 2168 } 2169 default: 2170 break; 2171 } 2172 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 2173 } 2174 2175 /* 2176 * If there are no options we know that the T_CONN_RES will 2177 * succeed. However, we can't send the T_OK_ACK upstream until 2178 * the tcp_accept_swap is done since it would be dangerous to 2179 * let the application start using the new fd prior to the swap. 2180 */ 2181 tcp_accept_swap(listener, acceptor, eager); 2182 2183 /* 2184 * tcp_accept_swap unlinks eager from listener but does not drop 2185 * the eager's reference on the listener. 2186 */ 2187 ASSERT(eager->tcp_listener == NULL); 2188 ASSERT(listener->tcp_connp->conn_ref >= 5); 2189 2190 /* 2191 * The eager is now associated with its own queue. Insert in 2192 * the hash so that the connection can be reused for a future 2193 * T_CONN_RES. 2194 */ 2195 tcp_acceptor_hash_insert(acceptor_id, eager); 2196 2197 /* 2198 * We now do the processing of options with T_CONN_RES. 2199 * We delay till now since we wanted to have queue to pass to 2200 * option processing routines that points back to the right 2201 * instance structure which does not happen until after 2202 * tcp_accept_swap(). 2203 * 2204 * Note: 2205 * The sanity of the logic here assumes that whatever options 2206 * are appropriate to inherit from listner=>eager are done 2207 * before this point, and whatever were to be overridden (or not) 2208 * in transfer logic from eager=>acceptor in tcp_accept_swap(). 2209 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 2210 * before its ACCEPTOR_id comes down in T_CONN_RES ] 2211 * This may not be true at this point in time but can be fixed 2212 * independently. This option processing code starts with 2213 * the instantiated acceptor instance and the final queue at 2214 * this point. 2215 */ 2216 2217 if (tcr->OPT_length != 0) { 2218 /* Options to process */ 2219 int t_error = 0; 2220 int sys_error = 0; 2221 int do_disconnect = 0; 2222 2223 if (tcp_conprim_opt_process(eager, mp1, 2224 &do_disconnect, &t_error, &sys_error) < 0) { 2225 eager->tcp_accept_error = 1; 2226 if (do_disconnect) { 2227 /* 2228 * An option failed which does not allow 2229 * connection to be accepted. 2230 * 2231 * We allow T_CONN_RES to succeed and 2232 * put a T_DISCON_IND on the eager queue. 
2233 */ 2234 ASSERT(t_error == 0 && sys_error == 0); 2235 eager->tcp_send_discon_ind = 1; 2236 } else { 2237 ASSERT(t_error != 0); 2238 freemsg(ok_mp); 2239 /* 2240 * Original mp was either freed or set 2241 * to ok_mp above, so use mp1 instead. 2242 */ 2243 tcp_err_ack(listener, mp1, t_error, sys_error); 2244 goto finish; 2245 } 2246 } 2247 /* 2248 * Most likely success in setting options (except if 2249 * eager->tcp_send_discon_ind set). 2250 * mp1 option buffer represented by OPT_length/offset 2251 * potentially modified and contains results of setting 2252 * options at this point 2253 */ 2254 } 2255 2256 /* We no longer need mp1, since all options processing has passed */ 2257 freemsg(mp1); 2258 2259 putnext(listener->tcp_rq, ok_mp); 2260 2261 mutex_enter(&listener->tcp_eager_lock); 2262 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 2263 tcp_t *tail; 2264 mblk_t *conn_ind; 2265 2266 /* 2267 * This path should not be executed if listener and 2268 * acceptor streams are the same. 2269 */ 2270 ASSERT(listener != acceptor); 2271 2272 tcp = listener->tcp_eager_prev_q0; 2273 /* 2274 * listener->tcp_eager_prev_q0 points to the TAIL of the 2275 * deferred T_conn_ind queue. We need to get to the head of 2276 * the queue in order to send up T_conn_ind the same order as 2277 * how the 3WHS is completed. 2278 */ 2279 while (tcp != listener) { 2280 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 2281 break; 2282 else 2283 tcp = tcp->tcp_eager_prev_q0; 2284 } 2285 ASSERT(tcp != listener); 2286 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 2287 ASSERT(conn_ind != NULL); 2288 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 2289 2290 /* Move from q0 to q */ 2291 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 2292 listener->tcp_conn_req_cnt_q0--; 2293 listener->tcp_conn_req_cnt_q++; 2294 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 2295 tcp->tcp_eager_prev_q0; 2296 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 2297 tcp->tcp_eager_next_q0; 2298 tcp->tcp_eager_prev_q0 = NULL; 2299 tcp->tcp_eager_next_q0 = NULL; 2300 tcp->tcp_conn_def_q0 = B_FALSE; 2301 2302 /* 2303 * Insert at end of the queue because sockfs sends 2304 * down T_CONN_RES in chronological order. Leaving 2305 * the older conn indications at front of the queue 2306 * helps reducing search time. 2307 */ 2308 tail = listener->tcp_eager_last_q; 2309 if (tail != NULL) 2310 tail->tcp_eager_next_q = tcp; 2311 else 2312 listener->tcp_eager_next_q = tcp; 2313 listener->tcp_eager_last_q = tcp; 2314 tcp->tcp_eager_next_q = NULL; 2315 mutex_exit(&listener->tcp_eager_lock); 2316 putnext(tcp->tcp_rq, conn_ind); 2317 } else { 2318 mutex_exit(&listener->tcp_eager_lock); 2319 } 2320 2321 /* 2322 * Done with the acceptor - free it 2323 * 2324 * Note: from this point on, no access to listener should be made 2325 * as listener can be equal to acceptor. 2326 */ 2327 finish: 2328 ASSERT(acceptor->tcp_detached); 2329 acceptor->tcp_rq = tcp_g_q; 2330 acceptor->tcp_wq = WR(tcp_g_q); 2331 (void) tcp_clean_death(acceptor, 0, 2); 2332 CONN_DEC_REF(acceptor->tcp_connp); 2333 2334 /* 2335 * In case we already received a FIN we have to make tcp_rput send 2336 * the ordrel_ind. This will also send up a window update if the window 2337 * has opened up. 2338 * 2339 * In the normal case of a successful connection acceptance 2340 * we give the O_T_BIND_REQ to the read side put procedure as an 2341 * indication that this was just accepted. This tells tcp_rput to 2342 * pass up any data queued in tcp_rcv_list. 
2343 * 2344 * In the fringe case where options sent with T_CONN_RES failed and 2345 * we required, we would be indicating a T_DISCON_IND to blow 2346 * away this connection. 2347 */ 2348 2349 /* 2350 * XXX: we currently have a problem if XTI application closes the 2351 * acceptor stream in between. This problem exists in on10-gate also 2352 * and is well know but nothing can be done short of major rewrite 2353 * to fix it. Now it is possible to take care of it by assigning TLI/XTI 2354 * eager same squeue as listener (we can distinguish non socket 2355 * listeners at the time of handling a SYN in tcp_conn_request) 2356 * and do most of the work that tcp_accept_finish does here itself 2357 * and then get behind the acceptor squeue to access the acceptor 2358 * queue. 2359 */ 2360 /* 2361 * We already have a ref on tcp so no need to do one before squeue_fill 2362 */ 2363 squeue_fill(eager->tcp_connp->conn_sqp, opt_mp, 2364 tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH); 2365 } 2366 2367 /* 2368 * Swap information between the eager and acceptor for a TLI/XTI client. 2369 * The sockfs accept is done on the acceptor stream and control goes 2370 * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not 2371 * called. In either case, both the eager and listener are in their own 2372 * perimeter (squeue) and the code has to deal with potential race. 2373 * 2374 * See the block comment on top of tcp_accept() and tcp_wput_accept(). 2375 */ 2376 static void 2377 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) 2378 { 2379 conn_t *econnp, *aconnp; 2380 2381 ASSERT(eager->tcp_rq == listener->tcp_rq); 2382 ASSERT(eager->tcp_detached && !acceptor->tcp_detached); 2383 ASSERT(!eager->tcp_hard_bound); 2384 ASSERT(!TCP_IS_SOCKET(acceptor)); 2385 ASSERT(!TCP_IS_SOCKET(eager)); 2386 ASSERT(!TCP_IS_SOCKET(listener)); 2387 2388 acceptor->tcp_detached = B_TRUE; 2389 /* 2390 * To permit stream re-use by TLI/XTI, the eager needs a copy of 2391 * the acceptor id. 2392 */ 2393 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; 2394 2395 /* remove eager from listen list... */ 2396 mutex_enter(&listener->tcp_eager_lock); 2397 tcp_eager_unlink(eager); 2398 ASSERT(eager->tcp_eager_next_q == NULL && 2399 eager->tcp_eager_last_q == NULL); 2400 ASSERT(eager->tcp_eager_next_q0 == NULL && 2401 eager->tcp_eager_prev_q0 == NULL); 2402 mutex_exit(&listener->tcp_eager_lock); 2403 eager->tcp_rq = acceptor->tcp_rq; 2404 eager->tcp_wq = acceptor->tcp_wq; 2405 2406 econnp = eager->tcp_connp; 2407 aconnp = acceptor->tcp_connp; 2408 2409 eager->tcp_rq->q_ptr = econnp; 2410 eager->tcp_wq->q_ptr = econnp; 2411 2412 /* 2413 * In the TLI/XTI loopback case, we are inside the listener's squeue, 2414 * which might be a different squeue from our peer TCP instance. 2415 * For TCP Fusion, the peer expects that whenever tcp_detached is 2416 * clear, our TCP queues point to the acceptor's queues. Thus, use 2417 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq 2418 * above reach global visibility prior to the clearing of tcp_detached. 
2419 */ 2420 membar_producer(); 2421 eager->tcp_detached = B_FALSE; 2422 2423 ASSERT(eager->tcp_ack_tid == 0); 2424 2425 econnp->conn_dev = aconnp->conn_dev; 2426 if (eager->tcp_cred != NULL) 2427 crfree(eager->tcp_cred); 2428 eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred; 2429 econnp->conn_zoneid = aconnp->conn_zoneid; 2430 aconnp->conn_cred = NULL; 2431 2432 econnp->conn_mac_exempt = aconnp->conn_mac_exempt; 2433 aconnp->conn_mac_exempt = B_FALSE; 2434 2435 ASSERT(aconnp->conn_peercred == NULL); 2436 2437 /* Do the IPC initialization */ 2438 CONN_INC_REF(econnp); 2439 2440 econnp->conn_multicast_loop = aconnp->conn_multicast_loop; 2441 econnp->conn_af_isv6 = aconnp->conn_af_isv6; 2442 econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6; 2443 econnp->conn_ulp = aconnp->conn_ulp; 2444 2445 /* Done with old IPC. Drop its ref on its connp */ 2446 CONN_DEC_REF(aconnp); 2447 } 2448 2449 2450 /* 2451 * Adapt to the information, such as rtt and rtt_sd, provided from the 2452 * ire cached in conn_cache_ire. If no ire cached, do a ire lookup. 2453 * 2454 * Checks for multicast and broadcast destination address. 2455 * Returns zero on failure; non-zero if ok. 2456 * 2457 * Note that the MSS calculation here is based on the info given in 2458 * the IRE. We do not do any calculation based on TCP options. They 2459 * will be handled in tcp_rput_other() and tcp_rput_data() when TCP 2460 * knows which options to use. 2461 * 2462 * Note on how TCP gets its parameters for a connection. 2463 * 2464 * When a tcp_t structure is allocated, it gets all the default parameters. 2465 * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd, 2466 * spipe, rpipe, ... from the route metrics. Route metric overrides the 2467 * default. But if there is an associated tcp_host_param, it will override 2468 * the metrics. 2469 * 2470 * An incoming SYN with a multicast or broadcast destination address, is dropped 2471 * in 1 of 2 places. 2472 * 2473 * 1. If the packet was received over the wire it is dropped in 2474 * ip_rput_process_broadcast() 2475 * 2476 * 2. If the packet was received through internal IP loopback, i.e. the packet 2477 * was generated and received on the same machine, it is dropped in 2478 * ip_wput_local() 2479 * 2480 * An incoming SYN with a multicast or broadcast source address is always 2481 * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to 2482 * reject an attempt to connect to a broadcast or multicast (destination) 2483 * address. 2484 */ 2485 static int 2486 tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) 2487 { 2488 tcp_hsp_t *hsp; 2489 ire_t *ire; 2490 ire_t *sire = NULL; 2491 iulp_t *ire_uinfo = NULL; 2492 uint32_t mss_max; 2493 uint32_t mss; 2494 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 2495 conn_t *connp = tcp->tcp_connp; 2496 boolean_t ire_cacheable = B_FALSE; 2497 zoneid_t zoneid = connp->conn_zoneid; 2498 int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 2499 MATCH_IRE_SECATTR; 2500 ts_label_t *tsl = crgetlabel(CONN_CRED(connp)); 2501 ill_t *ill = NULL; 2502 boolean_t incoming = (ire_mp == NULL); 2503 2504 ASSERT(connp->conn_ire_cache == NULL); 2505 2506 if (tcp->tcp_ipversion == IPV4_VERSION) { 2507 2508 if (CLASSD(tcp->tcp_connp->conn_rem)) { 2509 BUMP_MIB(&ip_mib, ipInDiscards); 2510 return (0); 2511 } 2512 /* 2513 * If IP_NEXTHOP is set, then look for an IRE_CACHE 2514 * for the destination with the nexthop as gateway. 2515 * ire_ctable_lookup() is used because this particular 2516 * ire, if it exists, will be marked private. 
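/*
 * tcp_accept_swap() above stores the new queue pointers, issues
 * membar_producer(), and only then clears tcp_detached, so a reader that
 * observes the cleared flag also observes the pointers.  A user-space
 * analogue of that ordering using C11 release/acquire atomics; the names
 * are illustrative and the primitive differs (a release store rather than
 * a bare producer barrier), but the publish-then-flag idea is the same.
 */
#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

static struct {
	int rq;
	int wq;
} demo_q;

static atomic_bool demo_detached;

/* Writer: fill in the data, then release-store the flag. */
static void
demo_publish(int rq, int wq)
{
	demo_q.rq = rq;
	demo_q.wq = wq;
	/* The release store plays the role of membar_producer(). */
	atomic_store_explicit(&demo_detached, false, memory_order_release);
}

/* Reader: acquire-load the flag before trusting the data. */
static void
demo_consume(void)
{
	if (!atomic_load_explicit(&demo_detached, memory_order_acquire))
		(void) printf("queues published: %d/%d\n", demo_q.rq, demo_q.wq);
	else
		(void) printf("still detached\n");
}

int
main(void)
{
	atomic_init(&demo_detached, true);
	demo_consume();			/* still detached */
	demo_publish(4, 5);
	demo_consume();			/* sees both queue values */
	return (0);
}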
2517 * If that is not available, use the interface ire 2518 * for the nexthop. 2519 * 2520 * TSol: tcp_update_label will detect label mismatches based 2521 * only on the destination's label, but that would not 2522 * detect label mismatches based on the security attributes 2523 * of routes or next hop gateway. Hence we need to pass the 2524 * label to ire_ftable_lookup below in order to locate the 2525 * right prefix (and/or) ire cache. Similarly we also need 2526 * pass the label to the ire_cache_lookup below to locate 2527 * the right ire that also matches on the label. 2528 */ 2529 if (tcp->tcp_connp->conn_nexthop_set) { 2530 ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem, 2531 tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid, 2532 tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW); 2533 if (ire == NULL) { 2534 ire = ire_ftable_lookup( 2535 tcp->tcp_connp->conn_nexthop_v4, 2536 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0, 2537 tsl, match_flags); 2538 if (ire == NULL) 2539 return (0); 2540 } else { 2541 ire_uinfo = &ire->ire_uinfo; 2542 } 2543 } else { 2544 ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, 2545 zoneid, tsl); 2546 if (ire != NULL) { 2547 ire_cacheable = B_TRUE; 2548 ire_uinfo = (ire_mp != NULL) ? 2549 &((ire_t *)ire_mp->b_rptr)->ire_uinfo: 2550 &ire->ire_uinfo; 2551 2552 } else { 2553 if (ire_mp == NULL) { 2554 ire = ire_ftable_lookup( 2555 tcp->tcp_connp->conn_rem, 2556 0, 0, 0, NULL, &sire, zoneid, 0, 2557 tsl, (MATCH_IRE_RECURSIVE | 2558 MATCH_IRE_DEFAULT)); 2559 if (ire == NULL) 2560 return (0); 2561 ire_uinfo = (sire != NULL) ? 2562 &sire->ire_uinfo : 2563 &ire->ire_uinfo; 2564 } else { 2565 ire = (ire_t *)ire_mp->b_rptr; 2566 ire_uinfo = 2567 &((ire_t *) 2568 ire_mp->b_rptr)->ire_uinfo; 2569 } 2570 } 2571 } 2572 ASSERT(ire != NULL); 2573 2574 if ((ire->ire_src_addr == INADDR_ANY) || 2575 (ire->ire_type & IRE_BROADCAST)) { 2576 /* 2577 * ire->ire_mp is non null when ire_mp passed in is used 2578 * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 2579 */ 2580 if (ire->ire_mp == NULL) 2581 ire_refrele(ire); 2582 if (sire != NULL) 2583 ire_refrele(sire); 2584 return (0); 2585 } 2586 2587 if (tcp->tcp_ipha->ipha_src == INADDR_ANY) { 2588 ipaddr_t src_addr; 2589 2590 /* 2591 * ip_bind_connected() has stored the correct source 2592 * address in conn_src. 2593 */ 2594 src_addr = tcp->tcp_connp->conn_src; 2595 tcp->tcp_ipha->ipha_src = src_addr; 2596 /* 2597 * Copy of the src addr. in tcp_t is needed 2598 * for the lookup funcs. 2599 */ 2600 IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6); 2601 } 2602 /* 2603 * Set the fragment bit so that IP will tell us if the MTU 2604 * should change. IP tells us the latest setting of 2605 * ip_path_mtu_discovery through ire_frag_flag. 2606 */ 2607 if (ip_path_mtu_discovery) { 2608 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 2609 htons(IPH_DF); 2610 } 2611 /* 2612 * If ire_uinfo is NULL, this is the IRE_INTERFACE case 2613 * for IP_NEXTHOP. No cache ire has been found for the 2614 * destination and we are working with the nexthop's 2615 * interface ire. Since we need to forward all packets 2616 * to the nexthop first, we "blindly" set tcp_localnet 2617 * to false, eventhough the destination may also be 2618 * onlink. 
2619 */ 2620 if (ire_uinfo == NULL) 2621 tcp->tcp_localnet = 0; 2622 else 2623 tcp->tcp_localnet = (ire->ire_gateway_addr == 0); 2624 } else { 2625 /* 2626 * For incoming connection ire_mp = NULL 2627 * For outgoing connection ire_mp != NULL 2628 * Technically we should check conn_incoming_ill 2629 * when ire_mp is NULL and conn_outgoing_ill when 2630 * ire_mp is non-NULL. But this is performance 2631 * critical path and for IPV*_BOUND_IF, outgoing 2632 * and incoming ill are always set to the same value. 2633 */ 2634 ill_t *dst_ill = NULL; 2635 ipif_t *dst_ipif = NULL; 2636 2637 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 2638 2639 if (connp->conn_outgoing_ill != NULL) { 2640 /* Outgoing or incoming path */ 2641 int err; 2642 2643 dst_ill = conn_get_held_ill(connp, 2644 &connp->conn_outgoing_ill, &err); 2645 if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) { 2646 ip1dbg(("tcp_adapt_ire: ill_lookup failed\n")); 2647 return (0); 2648 } 2649 match_flags |= MATCH_IRE_ILL; 2650 dst_ipif = dst_ill->ill_ipif; 2651 } 2652 ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6, 2653 0, 0, dst_ipif, zoneid, tsl, match_flags); 2654 2655 if (ire != NULL) { 2656 ire_cacheable = B_TRUE; 2657 ire_uinfo = (ire_mp != NULL) ? 2658 &((ire_t *)ire_mp->b_rptr)->ire_uinfo: 2659 &ire->ire_uinfo; 2660 } else { 2661 if (ire_mp == NULL) { 2662 ire = ire_ftable_lookup_v6( 2663 &tcp->tcp_connp->conn_remv6, 2664 0, 0, 0, dst_ipif, &sire, zoneid, 2665 0, tsl, match_flags); 2666 if (ire == NULL) { 2667 if (dst_ill != NULL) 2668 ill_refrele(dst_ill); 2669 return (0); 2670 } 2671 ire_uinfo = (sire != NULL) ? &sire->ire_uinfo : 2672 &ire->ire_uinfo; 2673 } else { 2674 ire = (ire_t *)ire_mp->b_rptr; 2675 ire_uinfo = 2676 &((ire_t *)ire_mp->b_rptr)->ire_uinfo; 2677 } 2678 } 2679 if (dst_ill != NULL) 2680 ill_refrele(dst_ill); 2681 2682 ASSERT(ire != NULL); 2683 ASSERT(ire_uinfo != NULL); 2684 2685 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) || 2686 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 2687 /* 2688 * ire->ire_mp is non null when ire_mp passed in is used 2689 * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 2690 */ 2691 if (ire->ire_mp == NULL) 2692 ire_refrele(ire); 2693 if (sire != NULL) 2694 ire_refrele(sire); 2695 return (0); 2696 } 2697 2698 if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { 2699 in6_addr_t src_addr; 2700 2701 /* 2702 * ip_bind_connected_v6() has stored the correct source 2703 * address per IPv6 addr. selection policy in 2704 * conn_src_v6. 2705 */ 2706 src_addr = tcp->tcp_connp->conn_srcv6; 2707 2708 tcp->tcp_ip6h->ip6_src = src_addr; 2709 /* 2710 * Copy of the src addr. in tcp_t is needed 2711 * for the lookup funcs. 2712 */ 2713 tcp->tcp_ip_src_v6 = src_addr; 2714 ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src, 2715 &connp->conn_srcv6)); 2716 } 2717 tcp->tcp_localnet = 2718 IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6); 2719 } 2720 2721 /* 2722 * This allows applications to fail quickly when connections are made 2723 * to dead hosts. Hosts can be labeled dead by adding a reject route 2724 * with both the RTF_REJECT and RTF_PRIVATE flags set. 2725 */ 2726 if ((ire->ire_flags & RTF_REJECT) && 2727 (ire->ire_flags & RTF_PRIVATE)) 2728 goto error; 2729 2730 /* 2731 * Make use of the cached rtt and rtt_sd values to calculate the 2732 * initial RTO. Note that they are already initialized in 2733 * tcp_init_values(). 
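/*
 * The RTO seeding introduced above and carried out just below starts from
 * the rtt/rtt_sd cached in the route metrics, applies
 * (sa >> 3) + sd + extra + (sa >> 5), and clamps the result to the rexmit
 * interval bounds.  A stand-alone sketch of that arithmetic; the
 * constants mirror defaults from tcp_param_arr, and it assumes the usual
 * scaling where sa is kept as 8x the smoothed RTT and sd as 4x the
 * deviation.
 */
#include <stdio.h>
#include <stdint.h>

#define	DEMO_REXMIT_MIN		400		/* ms, cf. tcp_rexmit_interval_min */
#define	DEMO_REXMIT_MAX		(60 * 1000)	/* ms, cf. tcp_rexmit_interval_max */
#define	DEMO_REXMIT_EXTRA	0		/* ms, cf. tcp_rexmit_interval_extra */

static uint32_t
demo_initial_rto(uint32_t rtt_sa, uint32_t rtt_sd)
{
	uint32_t rto;

	rto = (rtt_sa >> 3) + rtt_sd + DEMO_REXMIT_EXTRA + (rtt_sa >> 5);
	if (rto > DEMO_REXMIT_MAX)
		return (DEMO_REXMIT_MAX);
	if (rto < DEMO_REXMIT_MIN)
		return (DEMO_REXMIT_MIN);
	return (rto);
}

int
main(void)
{
	/* 300 ms smoothed RTT (scaled by 8), 50 ms deviation (scaled by 4). */
	(void) printf("rto = %u ms\n", demo_initial_rto(2400, 200));
	/* A tiny cached RTT clamps up to the minimum interval. */
	(void) printf("rto = %u ms\n", demo_initial_rto(8, 0));
	return (0);
}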
2734 * If ire_uinfo is NULL, i.e., we do not have a cache ire for 2735 * IP_NEXTHOP, but instead are using the interface ire for the 2736 * nexthop, then we do not use the ire_uinfo from that ire to 2737 * do any initializations. 2738 */ 2739 if (ire_uinfo != NULL) { 2740 if (ire_uinfo->iulp_rtt != 0) { 2741 clock_t rto; 2742 2743 tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt; 2744 tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd; 2745 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 2746 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5); 2747 2748 if (rto > tcp_rexmit_interval_max) { 2749 tcp->tcp_rto = tcp_rexmit_interval_max; 2750 } else if (rto < tcp_rexmit_interval_min) { 2751 tcp->tcp_rto = tcp_rexmit_interval_min; 2752 } else { 2753 tcp->tcp_rto = rto; 2754 } 2755 } 2756 if (ire_uinfo->iulp_ssthresh != 0) 2757 tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh; 2758 else 2759 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 2760 if (ire_uinfo->iulp_spipe > 0) { 2761 tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe, 2762 tcp_max_buf); 2763 if (tcp_snd_lowat_fraction != 0) 2764 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / 2765 tcp_snd_lowat_fraction; 2766 (void) tcp_maxpsz_set(tcp, B_TRUE); 2767 } 2768 /* 2769 * Note that up till now, acceptor always inherits receive 2770 * window from the listener. But if there is a metrics 2771 * associated with a host, we should use that instead of 2772 * inheriting it from listener. Thus we need to pass this 2773 * info back to the caller. 2774 */ 2775 if (ire_uinfo->iulp_rpipe > 0) { 2776 tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, tcp_max_buf); 2777 } 2778 2779 if (ire_uinfo->iulp_rtomax > 0) { 2780 tcp->tcp_second_timer_threshold = 2781 ire_uinfo->iulp_rtomax; 2782 } 2783 2784 /* 2785 * Use the metric option settings, iulp_tstamp_ok and 2786 * iulp_wscale_ok, only for active open. What this means 2787 * is that if the other side uses timestamp or window 2788 * scale option, TCP will also use those options. That 2789 * is for passive open. If the application sets a 2790 * large window, window scale is enabled regardless of 2791 * the value in iulp_wscale_ok. This is the behavior 2792 * since 2.6. So we keep it. 2793 * The only case left in passive open processing is the 2794 * check for SACK. 2795 * For ECN, it should probably be like SACK. But the 2796 * current value is binary, so we treat it like the other 2797 * cases. The metric only controls active open.For passive 2798 * open, the ndd param, tcp_ecn_permitted, controls the 2799 * behavior. 2800 */ 2801 if (!tcp_detached) { 2802 /* 2803 * The if check means that the following can only 2804 * be turned on by the metrics only IRE, but not off. 2805 */ 2806 if (ire_uinfo->iulp_tstamp_ok) 2807 tcp->tcp_snd_ts_ok = B_TRUE; 2808 if (ire_uinfo->iulp_wscale_ok) 2809 tcp->tcp_snd_ws_ok = B_TRUE; 2810 if (ire_uinfo->iulp_sack == 2) 2811 tcp->tcp_snd_sack_ok = B_TRUE; 2812 if (ire_uinfo->iulp_ecn_ok) 2813 tcp->tcp_ecn_ok = B_TRUE; 2814 } else { 2815 /* 2816 * Passive open. 2817 * 2818 * As above, the if check means that SACK can only be 2819 * turned on by the metric only IRE. 2820 */ 2821 if (ire_uinfo->iulp_sack > 0) { 2822 tcp->tcp_snd_sack_ok = B_TRUE; 2823 } 2824 } 2825 } 2826 2827 2828 /* 2829 * XXX: Note that currently, ire_max_frag can be as small as 68 2830 * because of PMTUd. So tcp_mss may go to negative if combined 2831 * length of all those options exceeds 28 bytes. But because 2832 * of the tcp_mss_min check below, we may not have a problem if 2833 * tcp_mss_min is of a reasonable value. 
The default is 1 so 2834 * the negative problem still exists. And the check defeats PMTUd. 2835 * In fact, if PMTUd finds that the MSS should be smaller than 2836 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min 2837 * value. 2838 * 2839 * We do not deal with that now. All those problems related to 2840 * PMTUd will be fixed later. 2841 */ 2842 ASSERT(ire->ire_max_frag != 0); 2843 mss = tcp->tcp_if_mtu = ire->ire_max_frag; 2844 if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) { 2845 if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) { 2846 mss = MIN(mss, IPV6_MIN_MTU); 2847 } 2848 } 2849 2850 /* Sanity check for MSS value. */ 2851 if (tcp->tcp_ipversion == IPV4_VERSION) 2852 mss_max = tcp_mss_max_ipv4; 2853 else 2854 mss_max = tcp_mss_max_ipv6; 2855 2856 if (tcp->tcp_ipversion == IPV6_VERSION && 2857 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 2858 /* 2859 * After receiving an ICMPv6 "packet too big" message with a 2860 * MTU < 1280, and for multirouted IPv6 packets, the IP layer 2861 * will insert a 8-byte fragment header in every packet; we 2862 * reduce the MSS by that amount here. 2863 */ 2864 mss -= sizeof (ip6_frag_t); 2865 } 2866 2867 if (tcp->tcp_ipsec_overhead == 0) 2868 tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); 2869 2870 mss -= tcp->tcp_ipsec_overhead; 2871 2872 if (mss < tcp_mss_min) 2873 mss = tcp_mss_min; 2874 if (mss > mss_max) 2875 mss = mss_max; 2876 2877 /* Note that this is the maximum MSS, excluding all options. */ 2878 tcp->tcp_mss = mss; 2879 2880 /* 2881 * Initialize the ISS here now that we have the full connection ID. 2882 * The RFC 1948 method of initial sequence number generation requires 2883 * knowledge of the full connection ID before setting the ISS. 2884 */ 2885 2886 tcp_iss_init(tcp); 2887 2888 if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL)) 2889 tcp->tcp_loopback = B_TRUE; 2890 2891 if (tcp->tcp_ipversion == IPV4_VERSION) { 2892 hsp = tcp_hsp_lookup(tcp->tcp_remote); 2893 } else { 2894 hsp = tcp_hsp_lookup_ipv6(&tcp->tcp_remote_v6); 2895 } 2896 2897 if (hsp != NULL) { 2898 /* Only modify if we're going to make them bigger */ 2899 if (hsp->tcp_hsp_sendspace > tcp->tcp_xmit_hiwater) { 2900 tcp->tcp_xmit_hiwater = hsp->tcp_hsp_sendspace; 2901 if (tcp_snd_lowat_fraction != 0) 2902 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / 2903 tcp_snd_lowat_fraction; 2904 } 2905 2906 if (hsp->tcp_hsp_recvspace > tcp->tcp_rwnd) { 2907 tcp->tcp_rwnd = hsp->tcp_hsp_recvspace; 2908 } 2909 2910 /* Copy timestamp flag only for active open */ 2911 if (!tcp_detached) 2912 tcp->tcp_snd_ts_ok = hsp->tcp_hsp_tstamp; 2913 } 2914 2915 if (sire != NULL) 2916 IRE_REFRELE(sire); 2917 2918 /* 2919 * If we got an IRE_CACHE and an ILL, go through their properties; 2920 * otherwise, this is deferred until later when we have an IRE_CACHE. 2921 */ 2922 if (tcp->tcp_loopback || 2923 (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) { 2924 /* 2925 * For incoming, see if this tcp may be MDT-capable. For 2926 * outgoing, this process has been taken care of through 2927 * tcp_rput_other. 2928 */ 2929 tcp_ire_ill_check(tcp, ire, ill, incoming); 2930 tcp->tcp_ire_ill_check_done = B_TRUE; 2931 } 2932 2933 mutex_enter(&connp->conn_lock); 2934 /* 2935 * Make sure that conn is not marked incipient 2936 * for incoming connections. A blind 2937 * removal of incipient flag is cheaper than 2938 * check and removal. 2939 */ 2940 connp->conn_state_flags &= ~CONN_INCIPIENT; 2941 2942 /* Must not cache forwarding table routes. 
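/*
 * A compact sketch of the MSS derivation above: start from the MTU
 * reported by the IRE, subtract the optional IPv6 fragment-header and
 * IPsec overheads, and clamp the result between tcp_mss_min and the
 * per-family maximum.  Header-length and TCP-option adjustments happen
 * elsewhere; the constants below are illustrative stand-ins.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define	DEMO_MSS_MIN		108		/* cf. tcp_mss_min default */
#define	DEMO_MSS_MAX		(65535 - 40)	/* illustrative v4 ceiling */
#define	DEMO_FRAG_HDR_LEN	8		/* sizeof (ip6_frag_t) */

static uint32_t
demo_clamp_mss(uint32_t if_mtu, bool v6_frag_hdr, uint32_t ipsec_overhead)
{
	uint32_t mss = if_mtu;

	if (v6_frag_hdr)
		mss -= DEMO_FRAG_HDR_LEN;	/* room for the frag header */
	mss -= ipsec_overhead;			/* room for AH/ESP */

	if (mss < DEMO_MSS_MIN)
		mss = DEMO_MSS_MIN;
	if (mss > DEMO_MSS_MAX)
		mss = DEMO_MSS_MAX;
	return (mss);
}

int
main(void)
{
	(void) printf("%u\n", demo_clamp_mss(1500, false, 0));	/* ethernet */
	(void) printf("%u\n", demo_clamp_mss(1280, true, 56));	/* v6 + ESP */
	(void) printf("%u\n", demo_clamp_mss(68, false, 0));	/* clamps to min */
	return (0);
}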
*/ 2943 if (ire_cacheable) { 2944 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 2945 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 2946 connp->conn_ire_cache = ire; 2947 IRE_UNTRACE_REF(ire); 2948 rw_exit(&ire->ire_bucket->irb_lock); 2949 mutex_exit(&connp->conn_lock); 2950 return (1); 2951 } 2952 rw_exit(&ire->ire_bucket->irb_lock); 2953 } 2954 mutex_exit(&connp->conn_lock); 2955 2956 if (ire->ire_mp == NULL) 2957 ire_refrele(ire); 2958 return (1); 2959 2960 error: 2961 if (ire->ire_mp == NULL) 2962 ire_refrele(ire); 2963 if (sire != NULL) 2964 ire_refrele(sire); 2965 return (0); 2966 } 2967 2968 /* 2969 * tcp_bind is called (holding the writer lock) by tcp_wput_proto to process a 2970 * O_T_BIND_REQ/T_BIND_REQ message. 2971 */ 2972 static void 2973 tcp_bind(tcp_t *tcp, mblk_t *mp) 2974 { 2975 sin_t *sin; 2976 sin6_t *sin6; 2977 mblk_t *mp1; 2978 in_port_t requested_port; 2979 in_port_t allocated_port; 2980 struct T_bind_req *tbr; 2981 boolean_t bind_to_req_port_only; 2982 boolean_t backlog_update = B_FALSE; 2983 boolean_t user_specified; 2984 in6_addr_t v6addr; 2985 ipaddr_t v4addr; 2986 uint_t origipversion; 2987 int err; 2988 queue_t *q = tcp->tcp_wq; 2989 conn_t *connp; 2990 mlp_type_t addrtype, mlptype; 2991 zone_t *zone; 2992 cred_t *cr; 2993 in_port_t mlp_port; 2994 2995 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 2996 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 2997 if (tcp->tcp_debug) { 2998 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 2999 "tcp_bind: bad req, len %u", 3000 (uint_t)(mp->b_wptr - mp->b_rptr)); 3001 } 3002 tcp_err_ack(tcp, mp, TPROTO, 0); 3003 return; 3004 } 3005 /* Make sure the largest address fits */ 3006 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); 3007 if (mp1 == NULL) { 3008 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3009 return; 3010 } 3011 mp = mp1; 3012 tbr = (struct T_bind_req *)mp->b_rptr; 3013 if (tcp->tcp_state >= TCPS_BOUND) { 3014 if ((tcp->tcp_state == TCPS_BOUND || 3015 tcp->tcp_state == TCPS_LISTEN) && 3016 tcp->tcp_conn_req_max != tbr->CONIND_number && 3017 tbr->CONIND_number > 0) { 3018 /* 3019 * Handle listen() increasing CONIND_number. 3020 * This is more "liberal" then what the TPI spec 3021 * requires but is needed to avoid a t_unbind 3022 * when handling listen() since the port number 3023 * might be "stolen" between the unbind and bind. 
3024 */ 3025 backlog_update = B_TRUE; 3026 goto do_bind; 3027 } 3028 if (tcp->tcp_debug) { 3029 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3030 "tcp_bind: bad state, %d", tcp->tcp_state); 3031 } 3032 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 3033 return; 3034 } 3035 origipversion = tcp->tcp_ipversion; 3036 3037 switch (tbr->ADDR_length) { 3038 case 0: /* request for a generic port */ 3039 tbr->ADDR_offset = sizeof (struct T_bind_req); 3040 if (tcp->tcp_family == AF_INET) { 3041 tbr->ADDR_length = sizeof (sin_t); 3042 sin = (sin_t *)&tbr[1]; 3043 *sin = sin_null; 3044 sin->sin_family = AF_INET; 3045 mp->b_wptr = (uchar_t *)&sin[1]; 3046 tcp->tcp_ipversion = IPV4_VERSION; 3047 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr); 3048 } else { 3049 ASSERT(tcp->tcp_family == AF_INET6); 3050 tbr->ADDR_length = sizeof (sin6_t); 3051 sin6 = (sin6_t *)&tbr[1]; 3052 *sin6 = sin6_null; 3053 sin6->sin6_family = AF_INET6; 3054 mp->b_wptr = (uchar_t *)&sin6[1]; 3055 tcp->tcp_ipversion = IPV6_VERSION; 3056 V6_SET_ZERO(v6addr); 3057 } 3058 requested_port = 0; 3059 break; 3060 3061 case sizeof (sin_t): /* Complete IPv4 address */ 3062 sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset, 3063 sizeof (sin_t)); 3064 if (sin == NULL || !OK_32PTR((char *)sin)) { 3065 if (tcp->tcp_debug) { 3066 (void) strlog(TCP_MOD_ID, 0, 1, 3067 SL_ERROR|SL_TRACE, 3068 "tcp_bind: bad address parameter, " 3069 "offset %d, len %d", 3070 tbr->ADDR_offset, tbr->ADDR_length); 3071 } 3072 tcp_err_ack(tcp, mp, TPROTO, 0); 3073 return; 3074 } 3075 /* 3076 * With sockets sockfs will accept bogus sin_family in 3077 * bind() and replace it with the family used in the socket 3078 * call. 3079 */ 3080 if (sin->sin_family != AF_INET || 3081 tcp->tcp_family != AF_INET) { 3082 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 3083 return; 3084 } 3085 requested_port = ntohs(sin->sin_port); 3086 tcp->tcp_ipversion = IPV4_VERSION; 3087 v4addr = sin->sin_addr.s_addr; 3088 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); 3089 break; 3090 3091 case sizeof (sin6_t): /* Complete IPv6 address */ 3092 sin6 = (sin6_t *)mi_offset_param(mp, 3093 tbr->ADDR_offset, sizeof (sin6_t)); 3094 if (sin6 == NULL || !OK_32PTR((char *)sin6)) { 3095 if (tcp->tcp_debug) { 3096 (void) strlog(TCP_MOD_ID, 0, 1, 3097 SL_ERROR|SL_TRACE, 3098 "tcp_bind: bad IPv6 address parameter, " 3099 "offset %d, len %d", tbr->ADDR_offset, 3100 tbr->ADDR_length); 3101 } 3102 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 3103 return; 3104 } 3105 if (sin6->sin6_family != AF_INET6 || 3106 tcp->tcp_family != AF_INET6) { 3107 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 3108 return; 3109 } 3110 requested_port = ntohs(sin6->sin6_port); 3111 tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 3112 IPV4_VERSION : IPV6_VERSION; 3113 v6addr = sin6->sin6_addr; 3114 break; 3115 3116 default: 3117 if (tcp->tcp_debug) { 3118 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3119 "tcp_bind: bad address length, %d", 3120 tbr->ADDR_length); 3121 } 3122 tcp_err_ack(tcp, mp, TBADADDR, 0); 3123 return; 3124 } 3125 tcp->tcp_bound_source_v6 = v6addr; 3126 3127 /* Check for change in ipversion */ 3128 if (origipversion != tcp->tcp_ipversion) { 3129 ASSERT(tcp->tcp_family == AF_INET6); 3130 err = tcp->tcp_ipversion == IPV6_VERSION ? 3131 tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp); 3132 if (err) { 3133 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3134 return; 3135 } 3136 } 3137 3138 /* 3139 * Initialize family specific fields. Copy of the src addr. 3140 * in tcp_t is needed for the lookup funcs. 
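/*
 * For an AF_INET6 endpoint, tcp_bind above chooses IPv4 or IPv6
 * processing by testing whether the bound address is V4-mapped.  A small
 * stand-alone illustration of that test using the standard macro (the
 * helper name is made up):
 */
#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

static int
demo_ip_version(const struct in6_addr *a)
{
	return (IN6_IS_ADDR_V4MAPPED(a) ? 4 : 6);
}

int
main(void)
{
	struct in6_addr mapped, native;

	(void) inet_pton(AF_INET6, "::ffff:192.0.2.1", &mapped);
	(void) inet_pton(AF_INET6, "2001:db8::1", &native);

	(void) printf("::ffff:192.0.2.1 -> IPv%d path\n",
	    demo_ip_version(&mapped));
	(void) printf("2001:db8::1      -> IPv%d path\n",
	    demo_ip_version(&native));
	return (0);
}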
3141 */ 3142 if (tcp->tcp_ipversion == IPV6_VERSION) { 3143 tcp->tcp_ip6h->ip6_src = v6addr; 3144 } else { 3145 IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src); 3146 } 3147 tcp->tcp_ip_src_v6 = v6addr; 3148 3149 /* 3150 * For O_T_BIND_REQ: 3151 * Verify that the target port/addr is available, or choose 3152 * another. 3153 * For T_BIND_REQ: 3154 * Verify that the target port/addr is available or fail. 3155 * In both cases when it succeeds the tcp is inserted in the 3156 * bind hash table. This ensures that the operation is atomic 3157 * under the lock on the hash bucket. 3158 */ 3159 bind_to_req_port_only = requested_port != 0 && 3160 tbr->PRIM_type != O_T_BIND_REQ; 3161 /* 3162 * Get a valid port (within the anonymous range and should not 3163 * be a privileged one) to use if the user has not given a port. 3164 * If multiple threads are here, they may all start with 3165 * with the same initial port. But, it should be fine as long as 3166 * tcp_bindi will ensure that no two threads will be assigned 3167 * the same port. 3168 * 3169 * NOTE: XXX If a privileged process asks for an anonymous port, we 3170 * still check for ports only in the range > tcp_smallest_non_priv_port, 3171 * unless TCP_ANONPRIVBIND option is set. 3172 */ 3173 mlptype = mlptSingle; 3174 mlp_port = requested_port; 3175 if (requested_port == 0) { 3176 requested_port = tcp->tcp_anon_priv_bind ? 3177 tcp_get_next_priv_port(tcp) : 3178 tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); 3179 if (requested_port == 0) { 3180 tcp_err_ack(tcp, mp, TNOADDR, 0); 3181 return; 3182 } 3183 user_specified = B_FALSE; 3184 3185 /* 3186 * If the user went through one of the RPC interfaces to create 3187 * this socket and RPC is MLP in this zone, then give him an 3188 * anonymous MLP. 3189 */ 3190 cr = DB_CREDDEF(mp, tcp->tcp_cred); 3191 connp = tcp->tcp_connp; 3192 if (connp->conn_anon_mlp && is_system_labeled()) { 3193 zone = crgetzone(cr); 3194 addrtype = tsol_mlp_addr_type(zone->zone_id, 3195 IPV6_VERSION, &v6addr); 3196 if (addrtype == mlptSingle) { 3197 tcp_err_ack(tcp, mp, TNOADDR, 0); 3198 return; 3199 } 3200 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 3201 PMAPPORT, addrtype); 3202 mlp_port = PMAPPORT; 3203 } 3204 } else { 3205 int i; 3206 boolean_t priv = B_FALSE; 3207 3208 /* 3209 * If the requested_port is in the well-known privileged range, 3210 * verify that the stream was opened by a privileged user. 
3211 * Note: No locks are held when inspecting tcp_g_*epriv_ports 3212 * but instead the code relies on: 3213 * - the fact that the address of the array and its size never 3214 * changes 3215 * - the atomic assignment of the elements of the array 3216 */ 3217 cr = DB_CREDDEF(mp, tcp->tcp_cred); 3218 if (requested_port < tcp_smallest_nonpriv_port) { 3219 priv = B_TRUE; 3220 } else { 3221 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 3222 if (requested_port == 3223 tcp_g_epriv_ports[i]) { 3224 priv = B_TRUE; 3225 break; 3226 } 3227 } 3228 } 3229 if (priv) { 3230 if (secpolicy_net_privaddr(cr, requested_port) != 0) { 3231 if (tcp->tcp_debug) { 3232 (void) strlog(TCP_MOD_ID, 0, 1, 3233 SL_ERROR|SL_TRACE, 3234 "tcp_bind: no priv for port %d", 3235 requested_port); 3236 } 3237 tcp_err_ack(tcp, mp, TACCES, 0); 3238 return; 3239 } 3240 } 3241 user_specified = B_TRUE; 3242 3243 connp = tcp->tcp_connp; 3244 if (is_system_labeled()) { 3245 zone = crgetzone(cr); 3246 addrtype = tsol_mlp_addr_type(zone->zone_id, 3247 IPV6_VERSION, &v6addr); 3248 if (addrtype == mlptSingle) { 3249 tcp_err_ack(tcp, mp, TNOADDR, 0); 3250 return; 3251 } 3252 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 3253 requested_port, addrtype); 3254 } 3255 } 3256 3257 if (mlptype != mlptSingle) { 3258 if (secpolicy_net_bindmlp(cr) != 0) { 3259 if (tcp->tcp_debug) { 3260 (void) strlog(TCP_MOD_ID, 0, 1, 3261 SL_ERROR|SL_TRACE, 3262 "tcp_bind: no priv for multilevel port %d", 3263 requested_port); 3264 } 3265 tcp_err_ack(tcp, mp, TACCES, 0); 3266 return; 3267 } 3268 3269 /* 3270 * If we're specifically binding a shared IP address and the 3271 * port is MLP on shared addresses, then check to see if this 3272 * zone actually owns the MLP. Reject if not. 3273 */ 3274 if (mlptype == mlptShared && addrtype == mlptShared) { 3275 zoneid_t mlpzone; 3276 3277 mlpzone = tsol_mlp_findzone(IPPROTO_TCP, 3278 htons(mlp_port)); 3279 if (connp->conn_zoneid != mlpzone) { 3280 if (tcp->tcp_debug) { 3281 (void) strlog(TCP_MOD_ID, 0, 1, 3282 SL_ERROR|SL_TRACE, 3283 "tcp_bind: attempt to bind port " 3284 "%d on shared addr in zone %d " 3285 "(should be %d)", 3286 mlp_port, connp->conn_zoneid, 3287 mlpzone); 3288 } 3289 tcp_err_ack(tcp, mp, TACCES, 0); 3290 return; 3291 } 3292 } 3293 3294 if (!user_specified) { 3295 err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3296 requested_port, B_TRUE); 3297 if (err != 0) { 3298 if (tcp->tcp_debug) { 3299 (void) strlog(TCP_MOD_ID, 0, 1, 3300 SL_ERROR|SL_TRACE, 3301 "tcp_bind: cannot establish anon " 3302 "MLP for port %d", 3303 requested_port); 3304 } 3305 tcp_err_ack(tcp, mp, TSYSERR, err); 3306 return; 3307 } 3308 connp->conn_anon_port = B_TRUE; 3309 } 3310 connp->conn_mlp_type = mlptype; 3311 } 3312 3313 allocated_port = tcp_bindi(tcp, requested_port, &v6addr, 3314 tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified); 3315 3316 if (allocated_port == 0) { 3317 connp->conn_mlp_type = mlptSingle; 3318 if (connp->conn_anon_port) { 3319 connp->conn_anon_port = B_FALSE; 3320 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3321 requested_port, B_FALSE); 3322 } 3323 if (bind_to_req_port_only) { 3324 if (tcp->tcp_debug) { 3325 (void) strlog(TCP_MOD_ID, 0, 1, 3326 SL_ERROR|SL_TRACE, 3327 "tcp_bind: requested addr busy"); 3328 } 3329 tcp_err_ack(tcp, mp, TADDRBUSY, 0); 3330 } else { 3331 /* If we are out of ports, fail the bind. 
*/ 3332 if (tcp->tcp_debug) { 3333 (void) strlog(TCP_MOD_ID, 0, 1, 3334 SL_ERROR|SL_TRACE, 3335 "tcp_bind: out of ports?"); 3336 } 3337 tcp_err_ack(tcp, mp, TNOADDR, 0); 3338 } 3339 return; 3340 } 3341 ASSERT(tcp->tcp_state == TCPS_BOUND); 3342 do_bind: 3343 if (!backlog_update) { 3344 if (tcp->tcp_family == AF_INET) 3345 sin->sin_port = htons(allocated_port); 3346 else 3347 sin6->sin6_port = htons(allocated_port); 3348 } 3349 if (tcp->tcp_family == AF_INET) { 3350 if (tbr->CONIND_number != 0) { 3351 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3352 sizeof (sin_t)); 3353 } else { 3354 /* Just verify the local IP address */ 3355 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, IP_ADDR_LEN); 3356 } 3357 } else { 3358 if (tbr->CONIND_number != 0) { 3359 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3360 sizeof (sin6_t)); 3361 } else { 3362 /* Just verify the local IP address */ 3363 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3364 IPV6_ADDR_LEN); 3365 } 3366 } 3367 if (mp1 == NULL) { 3368 if (connp->conn_anon_port) { 3369 connp->conn_anon_port = B_FALSE; 3370 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3371 requested_port, B_FALSE); 3372 } 3373 connp->conn_mlp_type = mlptSingle; 3374 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3375 return; 3376 } 3377 3378 tbr->PRIM_type = T_BIND_ACK; 3379 mp->b_datap->db_type = M_PCPROTO; 3380 3381 /* Chain in the reply mp for tcp_rput() */ 3382 mp1->b_cont = mp; 3383 mp = mp1; 3384 3385 tcp->tcp_conn_req_max = tbr->CONIND_number; 3386 if (tcp->tcp_conn_req_max) { 3387 if (tcp->tcp_conn_req_max < tcp_conn_req_min) 3388 tcp->tcp_conn_req_max = tcp_conn_req_min; 3389 if (tcp->tcp_conn_req_max > tcp_conn_req_max_q) 3390 tcp->tcp_conn_req_max = tcp_conn_req_max_q; 3391 /* 3392 * If this is a listener, do not reset the eager list 3393 * and other stuffs. Note that we don't check if the 3394 * existing eager list meets the new tcp_conn_req_max 3395 * requirement. 3396 */ 3397 if (tcp->tcp_state != TCPS_LISTEN) { 3398 tcp->tcp_state = TCPS_LISTEN; 3399 /* Initialize the chain. Don't need the eager_lock */ 3400 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 3401 tcp->tcp_second_ctimer_threshold = 3402 tcp_ip_abort_linterval; 3403 } 3404 } 3405 3406 /* 3407 * We can call ip_bind directly which returns a T_BIND_ACK mp. The 3408 * processing continues in tcp_rput_other(). 3409 */ 3410 if (tcp->tcp_family == AF_INET6) { 3411 ASSERT(tcp->tcp_connp->conn_af_isv6); 3412 mp = ip_bind_v6(q, mp, tcp->tcp_connp, &tcp->tcp_sticky_ipp); 3413 } else { 3414 ASSERT(!tcp->tcp_connp->conn_af_isv6); 3415 mp = ip_bind_v4(q, mp, tcp->tcp_connp); 3416 } 3417 /* 3418 * If the bind cannot complete immediately 3419 * IP will arrange to call tcp_rput_other 3420 * when the bind completes. 3421 */ 3422 if (mp != NULL) { 3423 tcp_rput_other(tcp, mp); 3424 } else { 3425 /* 3426 * Bind will be resumed later. Need to ensure 3427 * that conn doesn't disappear when that happens. 3428 * This will be decremented in ip_resume_tcp_bind(). 3429 */ 3430 CONN_INC_REF(tcp->tcp_connp); 3431 } 3432 } 3433 3434 3435 /* 3436 * If the "bind_to_req_port_only" parameter is set, if the requested port 3437 * number is available, return it, If not return 0 3438 * 3439 * If "bind_to_req_port_only" parameter is not set and 3440 * If the requested port number is available, return it. If not, return 3441 * the first anonymous port we happen across. If no anonymous ports are 3442 * available, return 0. addr is the requested local address, if any. 
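 * E.g. (illustrative): a caller with bind_to_req_port_only set gets
 * exactly the requested port or 0 if it is busy, while otherwise the
 * loop below keeps walking the anonymous range (tcp_update_next_port or
 * tcp_get_next_priv_port) until it finds a free port or gives up.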
3443 * 3444 * In either case, when succeeding update the tcp_t to record the port number 3445 * and insert it in the bind hash table. 3446 * 3447 * Note that TCP over IPv4 and IPv6 sockets can use the same port number 3448 * without setting SO_REUSEADDR. This is needed so that they 3449 * can be viewed as two independent transport protocols. 3450 */ 3451 static in_port_t 3452 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 3453 int reuseaddr, boolean_t quick_connect, 3454 boolean_t bind_to_req_port_only, boolean_t user_specified) 3455 { 3456 /* number of times we have run around the loop */ 3457 int count = 0; 3458 /* maximum number of times to run around the loop */ 3459 int loopmax; 3460 conn_t *connp = tcp->tcp_connp; 3461 zoneid_t zoneid = connp->conn_zoneid; 3462 3463 /* 3464 * Lookup for free addresses is done in a loop and "loopmax" 3465 * influences how long we spin in the loop 3466 */ 3467 if (bind_to_req_port_only) { 3468 /* 3469 * If the requested port is busy, don't bother to look 3470 * for a new one. Setting loop maximum count to 1 has 3471 * that effect. 3472 */ 3473 loopmax = 1; 3474 } else { 3475 /* 3476 * If the requested port is busy, look for a free one 3477 * in the anonymous port range. 3478 * Set loopmax appropriately so that one does not look 3479 * forever in the case all of the anonymous ports are in use. 3480 */ 3481 if (tcp->tcp_anon_priv_bind) { 3482 /* 3483 * loopmax = 3484 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 3485 */ 3486 loopmax = IPPORT_RESERVED - tcp_min_anonpriv_port; 3487 } else { 3488 loopmax = (tcp_largest_anon_port - 3489 tcp_smallest_anon_port + 1); 3490 } 3491 } 3492 do { 3493 uint16_t lport; 3494 tf_t *tbf; 3495 tcp_t *ltcp; 3496 conn_t *lconnp; 3497 3498 lport = htons(port); 3499 3500 /* 3501 * Ensure that the tcp_t is not currently in the bind hash. 3502 * Hold the lock on the hash bucket to ensure that 3503 * the duplicate check plus the insertion is an atomic 3504 * operation. 3505 * 3506 * This function does an inline lookup on the bind hash list 3507 * Make sure that we access only members of tcp_t 3508 * and that we don't look at tcp_tcp, since we are not 3509 * doing a CONN_INC_REF. 3510 */ 3511 tcp_bind_hash_remove(tcp); 3512 tbf = &tcp_bind_fanout[TCP_BIND_HASH(lport)]; 3513 mutex_enter(&tbf->tf_lock); 3514 for (ltcp = tbf->tf_tcp; ltcp != NULL; 3515 ltcp = ltcp->tcp_bind_hash) { 3516 boolean_t not_socket; 3517 boolean_t exclbind; 3518 3519 if (lport != ltcp->tcp_lport) 3520 continue; 3521 3522 lconnp = ltcp->tcp_connp; 3523 3524 /* 3525 * On a labeled system, we must treat bindings to ports 3526 * on shared IP addresses by sockets with MAC exemption 3527 * privilege as being in all zones, as there's 3528 * otherwise no way to identify the right receiver. 3529 */ 3530 if (!IPCL_ZONE_MATCH(ltcp->tcp_connp, zoneid) && 3531 !lconnp->conn_mac_exempt && 3532 !connp->conn_mac_exempt) 3533 continue; 3534 3535 /* 3536 * If TCP_EXCLBIND is set for either the bound or 3537 * binding endpoint, the semantics of bind 3538 * is changed according to the following. 3539 * 3540 * spec = specified address (v4 or v6) 3541 * unspec = unspecified address (v4 or v6) 3542 * A = specified addresses are different for endpoints 3543 * 3544 * bound bind to allowed 3545 * ------------------------------------- 3546 * unspec unspec no 3547 * unspec spec no 3548 * spec unspec no 3549 * spec spec yes if A 3550 * 3551 * For labeled systems, SO_MAC_EXEMPT behaves the same 3552 * as TCP_EXCLBIND, except that zoneid is ignored. 
3553 * 3554 * Note: 3555 * 3556 * 1. Because of TLI semantics, an endpoint can go 3557 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or 3558 * TCPS_BOUND, depending on whether it is originally 3559 * a listener or not. That is why we need to check 3560 * for states greater than or equal to TCPS_BOUND 3561 * here. 3562 * 3563 * 2. Ideally, we should only check for state equals 3564 * to TCPS_LISTEN. And the following check should be 3565 * added. 3566 * 3567 * if (ltcp->tcp_state == TCPS_LISTEN || 3568 * !reuseaddr || !ltcp->tcp_reuseaddr) { 3569 * ... 3570 * } 3571 * 3572 * The semantics will be changed to this. If the 3573 * endpoint on the list is in state not equal to 3574 * TCPS_LISTEN and both endpoints have SO_REUSEADDR 3575 * set, let the bind succeed. 3576 * 3577 * Because of (1), we cannot do that for TLI 3578 * endpoints. But we can do that for socket endpoints. 3579 * If in future, we can change this going back 3580 * semantics, we can use the above check for TLI also. 3581 */ 3582 not_socket = !(TCP_IS_SOCKET(ltcp) && 3583 TCP_IS_SOCKET(tcp)); 3584 exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind; 3585 3586 if (lconnp->conn_mac_exempt || connp->conn_mac_exempt || 3587 (exclbind && (not_socket || 3588 ltcp->tcp_state <= TCPS_ESTABLISHED))) { 3589 if (V6_OR_V4_INADDR_ANY( 3590 ltcp->tcp_bound_source_v6) || 3591 V6_OR_V4_INADDR_ANY(*laddr) || 3592 IN6_ARE_ADDR_EQUAL(laddr, 3593 <cp->tcp_bound_source_v6)) { 3594 break; 3595 } 3596 continue; 3597 } 3598 3599 /* 3600 * Check ipversion to allow IPv4 and IPv6 sockets to 3601 * have disjoint port number spaces, if *_EXCLBIND 3602 * is not set and only if the application binds to a 3603 * specific port. We use the same autoassigned port 3604 * number space for IPv4 and IPv6 sockets. 3605 */ 3606 if (tcp->tcp_ipversion != ltcp->tcp_ipversion && 3607 bind_to_req_port_only) 3608 continue; 3609 3610 /* 3611 * Ideally, we should make sure that the source 3612 * address, remote address, and remote port in the 3613 * four tuple for this tcp-connection is unique. 3614 * However, trying to find out the local source 3615 * address would require too much code duplication 3616 * with IP, since IP needs needs to have that code 3617 * to support userland TCP implementations. 3618 */ 3619 if (quick_connect && 3620 (ltcp->tcp_state > TCPS_LISTEN) && 3621 ((tcp->tcp_fport != ltcp->tcp_fport) || 3622 !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, 3623 <cp->tcp_remote_v6))) 3624 continue; 3625 3626 if (!reuseaddr) { 3627 /* 3628 * No socket option SO_REUSEADDR. 3629 * If existing port is bound to 3630 * a non-wildcard IP address 3631 * and the requesting stream is 3632 * bound to a distinct 3633 * different IP addresses 3634 * (non-wildcard, also), keep 3635 * going. 3636 */ 3637 if (!V6_OR_V4_INADDR_ANY(*laddr) && 3638 !V6_OR_V4_INADDR_ANY( 3639 ltcp->tcp_bound_source_v6) && 3640 !IN6_ARE_ADDR_EQUAL(laddr, 3641 <cp->tcp_bound_source_v6)) 3642 continue; 3643 if (ltcp->tcp_state >= TCPS_BOUND) { 3644 /* 3645 * This port is being used and 3646 * its state is >= TCPS_BOUND, 3647 * so we can't bind to it. 3648 */ 3649 break; 3650 } 3651 } else { 3652 /* 3653 * socket option SO_REUSEADDR is set on the 3654 * binding tcp_t. 3655 * 3656 * If two streams are bound to 3657 * same IP address or both addr 3658 * and bound source are wildcards 3659 * (INADDR_ANY), we want to stop 3660 * searching. 3661 * We have found a match of IP source 3662 * address and source port, which is 3663 * refused regardless of the 3664 * SO_REUSEADDR setting, so we break. 
3665 */ 3666 if (IN6_ARE_ADDR_EQUAL(laddr, 3667 <cp->tcp_bound_source_v6) && 3668 (ltcp->tcp_state == TCPS_LISTEN || 3669 ltcp->tcp_state == TCPS_BOUND)) 3670 break; 3671 } 3672 } 3673 if (ltcp != NULL) { 3674 /* The port number is busy */ 3675 mutex_exit(&tbf->tf_lock); 3676 } else { 3677 /* 3678 * This port is ours. Insert in fanout and mark as 3679 * bound to prevent others from getting the port 3680 * number. 3681 */ 3682 tcp->tcp_state = TCPS_BOUND; 3683 tcp->tcp_lport = htons(port); 3684 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 3685 3686 ASSERT(&tcp_bind_fanout[TCP_BIND_HASH( 3687 tcp->tcp_lport)] == tbf); 3688 tcp_bind_hash_insert(tbf, tcp, 1); 3689 3690 mutex_exit(&tbf->tf_lock); 3691 3692 /* 3693 * We don't want tcp_next_port_to_try to "inherit" 3694 * a port number supplied by the user in a bind. 3695 */ 3696 if (user_specified) 3697 return (port); 3698 3699 /* 3700 * This is the only place where tcp_next_port_to_try 3701 * is updated. After the update, it may or may not 3702 * be in the valid range. 3703 */ 3704 if (!tcp->tcp_anon_priv_bind) 3705 tcp_next_port_to_try = port + 1; 3706 return (port); 3707 } 3708 3709 if (tcp->tcp_anon_priv_bind) { 3710 port = tcp_get_next_priv_port(tcp); 3711 } else { 3712 if (count == 0 && user_specified) { 3713 /* 3714 * We may have to return an anonymous port. So 3715 * get one to start with. 3716 */ 3717 port = 3718 tcp_update_next_port(tcp_next_port_to_try, 3719 tcp, B_TRUE); 3720 user_specified = B_FALSE; 3721 } else { 3722 port = tcp_update_next_port(port + 1, tcp, 3723 B_FALSE); 3724 } 3725 } 3726 if (port == 0) 3727 break; 3728 3729 /* 3730 * Don't let this loop run forever in the case where 3731 * all of the anonymous ports are in use. 3732 */ 3733 } while (++count < loopmax); 3734 return (0); 3735 } 3736 3737 /* 3738 * We are dying for some reason. Try to do it gracefully. (May be called 3739 * as writer.) 3740 * 3741 * Return -1 if the structure was not cleaned up (if the cleanup had to be 3742 * done by a service procedure). 3743 * TBD - Should the return value distinguish between the tcp_t being 3744 * freed and it being reinitialized? 3745 */ 3746 static int 3747 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) 3748 { 3749 mblk_t *mp; 3750 queue_t *q; 3751 3752 TCP_CLD_STAT(tag); 3753 3754 #if TCP_TAG_CLEAN_DEATH 3755 tcp->tcp_cleandeathtag = tag; 3756 #endif 3757 3758 if (tcp->tcp_fused) 3759 tcp_unfuse(tcp); 3760 3761 if (tcp->tcp_linger_tid != 0 && 3762 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 3763 tcp_stop_lingering(tcp); 3764 } 3765 3766 ASSERT(tcp != NULL); 3767 ASSERT((tcp->tcp_family == AF_INET && 3768 tcp->tcp_ipversion == IPV4_VERSION) || 3769 (tcp->tcp_family == AF_INET6 && 3770 (tcp->tcp_ipversion == IPV4_VERSION || 3771 tcp->tcp_ipversion == IPV6_VERSION))); 3772 3773 if (TCP_IS_DETACHED(tcp)) { 3774 if (tcp->tcp_hard_binding) { 3775 /* 3776 * Its an eager that we are dealing with. We close the 3777 * eager but in case a conn_ind has already gone to the 3778 * listener, let tcp_accept_finish() send a discon_ind 3779 * to the listener and drop the last reference. If the 3780 * listener doesn't even know about the eager i.e. the 3781 * conn_ind hasn't gone up, blow away the eager and drop 3782 * the last reference as well. If the conn_ind has gone 3783 * up, state should be BOUND. tcp_accept_finish 3784 * will figure out that the connection has received a 3785 * RST and will send a DISCON_IND to the application. 
3786 */ 3787 tcp_closei_local(tcp); 3788 if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) { 3789 CONN_DEC_REF(tcp->tcp_connp); 3790 } else { 3791 tcp->tcp_state = TCPS_BOUND; 3792 } 3793 } else { 3794 tcp_close_detached(tcp); 3795 } 3796 return (0); 3797 } 3798 3799 TCP_STAT(tcp_clean_death_nondetached); 3800 3801 /* 3802 * If T_ORDREL_IND has not been sent yet (done when service routine 3803 * is run) postpone cleaning up the endpoint until service routine 3804 * has sent up the T_ORDREL_IND. Avoid clearing out an existing 3805 * client_errno since tcp_close uses the client_errno field. 3806 */ 3807 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 3808 if (err != 0) 3809 tcp->tcp_client_errno = err; 3810 3811 tcp->tcp_deferred_clean_death = B_TRUE; 3812 return (-1); 3813 } 3814 3815 q = tcp->tcp_rq; 3816 3817 /* Trash all inbound data */ 3818 flushq(q, FLUSHALL); 3819 3820 /* 3821 * If we are at least part way open and there is error 3822 * (err==0 implies no error) 3823 * notify our client by a T_DISCON_IND. 3824 */ 3825 if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) { 3826 if (tcp->tcp_state >= TCPS_ESTABLISHED && 3827 !TCP_IS_SOCKET(tcp)) { 3828 /* 3829 * Send M_FLUSH according to TPI. Because sockets will 3830 * (and must) ignore FLUSHR we do that only for TPI 3831 * endpoints and sockets in STREAMS mode. 3832 */ 3833 (void) putnextctl1(q, M_FLUSH, FLUSHR); 3834 } 3835 if (tcp->tcp_debug) { 3836 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 3837 "tcp_clean_death: discon err %d", err); 3838 } 3839 mp = mi_tpi_discon_ind(NULL, err, 0); 3840 if (mp != NULL) { 3841 putnext(q, mp); 3842 } else { 3843 if (tcp->tcp_debug) { 3844 (void) strlog(TCP_MOD_ID, 0, 1, 3845 SL_ERROR|SL_TRACE, 3846 "tcp_clean_death, sending M_ERROR"); 3847 } 3848 (void) putnextctl1(q, M_ERROR, EPROTO); 3849 } 3850 if (tcp->tcp_state <= TCPS_SYN_RCVD) { 3851 /* SYN_SENT or SYN_RCVD */ 3852 BUMP_MIB(&tcp_mib, tcpAttemptFails); 3853 } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) { 3854 /* ESTABLISHED or CLOSE_WAIT */ 3855 BUMP_MIB(&tcp_mib, tcpEstabResets); 3856 } 3857 } 3858 3859 tcp_reinit(tcp); 3860 return (-1); 3861 } 3862 3863 /* 3864 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout 3865 * to expire, stop the wait and finish the close. 3866 */ 3867 static void 3868 tcp_stop_lingering(tcp_t *tcp) 3869 { 3870 clock_t delta = 0; 3871 3872 tcp->tcp_linger_tid = 0; 3873 if (tcp->tcp_state > TCPS_LISTEN) { 3874 tcp_acceptor_hash_remove(tcp); 3875 if (tcp->tcp_flow_stopped) { 3876 tcp_clrqfull(tcp); 3877 } 3878 3879 if (tcp->tcp_timer_tid != 0) { 3880 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 3881 tcp->tcp_timer_tid = 0; 3882 } 3883 /* 3884 * Need to cancel those timers which will not be used when 3885 * TCP is detached. This has to be done before the tcp_wq 3886 * is set to the global queue. 3887 */ 3888 tcp_timers_stop(tcp); 3889 3890 3891 tcp->tcp_detached = B_TRUE; 3892 tcp->tcp_rq = tcp_g_q; 3893 tcp->tcp_wq = WR(tcp_g_q); 3894 3895 if (tcp->tcp_state == TCPS_TIME_WAIT) { 3896 tcp_time_wait_append(tcp); 3897 TCP_DBGSTAT(tcp_detach_time_wait); 3898 goto finish; 3899 } 3900 3901 /* 3902 * If delta is zero the timer event wasn't executed and was 3903 * successfully canceled. In this case we need to restart it 3904 * with the minimal delta possible. 3905 */ 3906 if (delta >= 0) { 3907 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 3908 delta ? 
delta : 1); 3909 } 3910 } else { 3911 tcp_closei_local(tcp); 3912 CONN_DEC_REF(tcp->tcp_connp); 3913 } 3914 finish: 3915 /* Signal closing thread that it can complete close */ 3916 mutex_enter(&tcp->tcp_closelock); 3917 tcp->tcp_detached = B_TRUE; 3918 tcp->tcp_rq = tcp_g_q; 3919 tcp->tcp_wq = WR(tcp_g_q); 3920 tcp->tcp_closed = 1; 3921 cv_signal(&tcp->tcp_closecv); 3922 mutex_exit(&tcp->tcp_closelock); 3923 } 3924 3925 /* 3926 * Handle lingering timeouts. This function is called when the SO_LINGER timeout 3927 * expires. 3928 */ 3929 static void 3930 tcp_close_linger_timeout(void *arg) 3931 { 3932 conn_t *connp = (conn_t *)arg; 3933 tcp_t *tcp = connp->conn_tcp; 3934 3935 tcp->tcp_client_errno = ETIMEDOUT; 3936 tcp_stop_lingering(tcp); 3937 } 3938 3939 static int 3940 tcp_close(queue_t *q, int flags) 3941 { 3942 conn_t *connp = Q_TO_CONN(q); 3943 tcp_t *tcp = connp->conn_tcp; 3944 mblk_t *mp = &tcp->tcp_closemp; 3945 boolean_t conn_ioctl_cleanup_reqd = B_FALSE; 3946 3947 ASSERT(WR(q)->q_next == NULL); 3948 ASSERT(connp->conn_ref >= 2); 3949 ASSERT((connp->conn_flags & IPCL_TCPMOD) == 0); 3950 3951 /* 3952 * We are being closed as /dev/tcp or /dev/tcp6. 3953 * 3954 * Mark the conn as closing. ill_pending_mp_add will not 3955 * add any mp to the pending mp list, after this conn has 3956 * started closing. Same for sq_pending_mp_add 3957 */ 3958 mutex_enter(&connp->conn_lock); 3959 connp->conn_state_flags |= CONN_CLOSING; 3960 if (connp->conn_oper_pending_ill != NULL) 3961 conn_ioctl_cleanup_reqd = B_TRUE; 3962 CONN_INC_REF_LOCKED(connp); 3963 mutex_exit(&connp->conn_lock); 3964 tcp->tcp_closeflags = (uint8_t)flags; 3965 ASSERT(connp->conn_ref >= 3); 3966 3967 (*tcp_squeue_close_proc)(connp->conn_sqp, mp, 3968 tcp_close_output, connp, SQTAG_IP_TCP_CLOSE); 3969 3970 mutex_enter(&tcp->tcp_closelock); 3971 3972 while (!tcp->tcp_closed) 3973 cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock); 3974 mutex_exit(&tcp->tcp_closelock); 3975 /* 3976 * In the case of listener streams that have eagers in the q or q0 3977 * we wait for the eagers to drop their reference to us. tcp_rq and 3978 * tcp_wq of the eagers point to our queues. By waiting for the 3979 * refcnt to drop to 1, we are sure that the eagers have cleaned 3980 * up their queue pointers and also dropped their references to us. 3981 */ 3982 if (tcp->tcp_wait_for_eagers) { 3983 mutex_enter(&connp->conn_lock); 3984 while (connp->conn_ref != 1) { 3985 cv_wait(&connp->conn_cv, &connp->conn_lock); 3986 } 3987 mutex_exit(&connp->conn_lock); 3988 } 3989 /* 3990 * ioctl cleanup. The mp is queued in the 3991 * ill_pending_mp or in the sq_pending_mp. 3992 */ 3993 if (conn_ioctl_cleanup_reqd) 3994 conn_ioctl_cleanup(connp); 3995 3996 qprocsoff(q); 3997 inet_minor_free(ip_minor_arena, connp->conn_dev); 3998 3999 tcp->tcp_cpid = -1; 4000 4001 /* 4002 * Drop IP's reference on the conn. This is the last reference 4003 * on the connp if the state was less than established. If the 4004 * connection has gone into timewait state, then we will have 4005 * one ref for the TCP and one more ref (total of two) for the 4006 * classifier connected hash list (a timewait connections stays 4007 * in connected hash till closed). 4008 * 4009 * We can't assert the references because there might be other 4010 * transient reference places because of some walkers or queued 4011 * packets in squeue for the timewait state. 
4012 */ 4013 CONN_DEC_REF(connp); 4014 q->q_ptr = WR(q)->q_ptr = NULL; 4015 return (0); 4016 } 4017 4018 static int 4019 tcpclose_accept(queue_t *q) 4020 { 4021 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); 4022 4023 /* 4024 * We had opened an acceptor STREAM for sockfs which is 4025 * now being closed due to some error. 4026 */ 4027 qprocsoff(q); 4028 inet_minor_free(ip_minor_arena, (dev_t)q->q_ptr); 4029 q->q_ptr = WR(q)->q_ptr = NULL; 4030 return (0); 4031 } 4032 4033 4034 /* 4035 * Called by streams close routine via squeues when our client blows off her 4036 * descriptor, we take this to mean: "close the stream state NOW, close the tcp 4037 * connection politely" When SO_LINGER is set (with a non-zero linger time and 4038 * it is not a nonblocking socket) then this routine sleeps until the FIN is 4039 * acked. 4040 * 4041 * NOTE: tcp_close potentially returns error when lingering. 4042 * However, the stream head currently does not pass these errors 4043 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK 4044 * errors to the application (from tsleep()) and not errors 4045 * like ECONNRESET caused by receiving a reset packet. 4046 */ 4047 4048 /* ARGSUSED */ 4049 static void 4050 tcp_close_output(void *arg, mblk_t *mp, void *arg2) 4051 { 4052 char *msg; 4053 conn_t *connp = (conn_t *)arg; 4054 tcp_t *tcp = connp->conn_tcp; 4055 clock_t delta = 0; 4056 4057 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 4058 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 4059 4060 /* Cancel any pending timeout */ 4061 if (tcp->tcp_ordrelid != 0) { 4062 if (tcp->tcp_timeout) { 4063 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ordrelid); 4064 } 4065 tcp->tcp_ordrelid = 0; 4066 tcp->tcp_timeout = B_FALSE; 4067 } 4068 4069 mutex_enter(&tcp->tcp_eager_lock); 4070 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 4071 /* Cleanup for listener */ 4072 tcp_eager_cleanup(tcp, 0); 4073 tcp->tcp_wait_for_eagers = 1; 4074 } 4075 mutex_exit(&tcp->tcp_eager_lock); 4076 4077 connp->conn_mdt_ok = B_FALSE; 4078 tcp->tcp_mdt = B_FALSE; 4079 4080 msg = NULL; 4081 switch (tcp->tcp_state) { 4082 case TCPS_CLOSED: 4083 case TCPS_IDLE: 4084 case TCPS_BOUND: 4085 case TCPS_LISTEN: 4086 break; 4087 case TCPS_SYN_SENT: 4088 msg = "tcp_close, during connect"; 4089 break; 4090 case TCPS_SYN_RCVD: 4091 /* 4092 * Close during the connect 3-way handshake 4093 * but here there may or may not be pending data 4094 * already on queue. Process almost same as in 4095 * the ESTABLISHED state. 4096 */ 4097 /* FALLTHRU */ 4098 default: 4099 if (tcp->tcp_fused) 4100 tcp_unfuse(tcp); 4101 4102 /* 4103 * If SO_LINGER has set a zero linger time, abort the 4104 * connection with a reset. 4105 */ 4106 if (tcp->tcp_linger && tcp->tcp_lingertime == 0) { 4107 msg = "tcp_close, zero lingertime"; 4108 break; 4109 } 4110 4111 ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding); 4112 /* 4113 * Abort connection if there is unread data queued. 4114 */ 4115 if (tcp->tcp_rcv_list || tcp->tcp_reass_head) { 4116 msg = "tcp_close, unread data"; 4117 break; 4118 } 4119 /* 4120 * tcp_hard_bound is now cleared thus all packets go through 4121 * tcp_lookup. This fact is used by tcp_detach below. 4122 * 4123 * We have done a qwait() above which could have possibly 4124 * drained more messages in turn causing transition to a 4125 * different state. Check whether we have to do the rest 4126 * of the processing or not. 
4127 */ 4128 if (tcp->tcp_state <= TCPS_LISTEN) 4129 break; 4130 4131 /* 4132 * Transmit the FIN before detaching the tcp_t. 4133 * After tcp_detach returns this queue/perimeter 4134 * no longer owns the tcp_t thus others can modify it. 4135 */ 4136 (void) tcp_xmit_end(tcp); 4137 4138 /* 4139 * If lingering on close then wait until the fin is acked, 4140 * the SO_LINGER time passes, or a reset is sent/received. 4141 */ 4142 if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && 4143 !(tcp->tcp_fin_acked) && 4144 tcp->tcp_state >= TCPS_ESTABLISHED) { 4145 if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { 4146 tcp->tcp_client_errno = EWOULDBLOCK; 4147 } else if (tcp->tcp_client_errno == 0) { 4148 4149 ASSERT(tcp->tcp_linger_tid == 0); 4150 4151 tcp->tcp_linger_tid = TCP_TIMER(tcp, 4152 tcp_close_linger_timeout, 4153 tcp->tcp_lingertime * hz); 4154 4155 /* tcp_close_linger_timeout will finish close */ 4156 if (tcp->tcp_linger_tid == 0) 4157 tcp->tcp_client_errno = ENOSR; 4158 else 4159 return; 4160 } 4161 4162 /* 4163 * Check if we need to detach or just close 4164 * the instance. 4165 */ 4166 if (tcp->tcp_state <= TCPS_LISTEN) 4167 break; 4168 } 4169 4170 /* 4171 * Make sure that no other thread will access the tcp_rq of 4172 * this instance (through lookups etc.) as tcp_rq will go 4173 * away shortly. 4174 */ 4175 tcp_acceptor_hash_remove(tcp); 4176 4177 if (tcp->tcp_flow_stopped) { 4178 tcp_clrqfull(tcp); 4179 } 4180 4181 if (tcp->tcp_timer_tid != 0) { 4182 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 4183 tcp->tcp_timer_tid = 0; 4184 } 4185 /* 4186 * Need to cancel those timers which will not be used when 4187 * TCP is detached. This has to be done before the tcp_wq 4188 * is set to the global queue. 4189 */ 4190 tcp_timers_stop(tcp); 4191 4192 tcp->tcp_detached = B_TRUE; 4193 if (tcp->tcp_state == TCPS_TIME_WAIT) { 4194 tcp_time_wait_append(tcp); 4195 TCP_DBGSTAT(tcp_detach_time_wait); 4196 ASSERT(connp->conn_ref >= 3); 4197 goto finish; 4198 } 4199 4200 /* 4201 * If delta is zero the timer event wasn't executed and was 4202 * successfully canceled. In this case we need to restart it 4203 * with the minimal delta possible. 4204 */ 4205 if (delta >= 0) 4206 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 4207 delta ? delta : 1); 4208 4209 ASSERT(connp->conn_ref >= 3); 4210 goto finish; 4211 } 4212 4213 /* Detach did not complete. Still need to remove q from stream. */ 4214 if (msg) { 4215 if (tcp->tcp_state == TCPS_ESTABLISHED || 4216 tcp->tcp_state == TCPS_CLOSE_WAIT) 4217 BUMP_MIB(&tcp_mib, tcpEstabResets); 4218 if (tcp->tcp_state == TCPS_SYN_SENT || 4219 tcp->tcp_state == TCPS_SYN_RCVD) 4220 BUMP_MIB(&tcp_mib, tcpAttemptFails); 4221 tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST); 4222 } 4223 4224 tcp_closei_local(tcp); 4225 CONN_DEC_REF(connp); 4226 ASSERT(connp->conn_ref >= 2); 4227 4228 finish: 4229 /* 4230 * Although packets are always processed on the correct 4231 * tcp's perimeter and access is serialized via squeue's, 4232 * IP still needs a queue when sending packets in time_wait 4233 * state so use WR(tcp_g_q) till ip_output() can be 4234 * changed to deal with just connp. For read side, we 4235 * could have set tcp_rq to NULL but there are some cases 4236 * in tcp_rput_data() from early days of this code which 4237 * do a putnext without checking if tcp is closed. Those 4238 * need to be identified before both tcp_rq and tcp_wq 4239 * can be set to NULL and tcp_q_q can disappear forever. 
4240 */ 4241 mutex_enter(&tcp->tcp_closelock); 4242 /* 4243 * Don't change the queues in the case of a listener that has 4244 * eagers in its q or q0. It could surprise the eagers. 4245 * Instead wait for the eagers outside the squeue. 4246 */ 4247 if (!tcp->tcp_wait_for_eagers) { 4248 tcp->tcp_detached = B_TRUE; 4249 tcp->tcp_rq = tcp_g_q; 4250 tcp->tcp_wq = WR(tcp_g_q); 4251 } 4252 4253 /* Signal tcp_close() to finish closing. */ 4254 tcp->tcp_closed = 1; 4255 cv_signal(&tcp->tcp_closecv); 4256 mutex_exit(&tcp->tcp_closelock); 4257 } 4258 4259 4260 /* 4261 * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp. 4262 * Some stream heads get upset if they see these later on as anything but NULL. 4263 */ 4264 static void 4265 tcp_close_mpp(mblk_t **mpp) 4266 { 4267 mblk_t *mp; 4268 4269 if ((mp = *mpp) != NULL) { 4270 do { 4271 mp->b_next = NULL; 4272 mp->b_prev = NULL; 4273 } while ((mp = mp->b_cont) != NULL); 4274 4275 mp = *mpp; 4276 *mpp = NULL; 4277 freemsg(mp); 4278 } 4279 } 4280 4281 /* Do detached close. */ 4282 static void 4283 tcp_close_detached(tcp_t *tcp) 4284 { 4285 if (tcp->tcp_fused) 4286 tcp_unfuse(tcp); 4287 4288 /* 4289 * Clustering code serializes TCP disconnect callbacks and 4290 * cluster tcp list walks by blocking a TCP disconnect callback 4291 * if a cluster tcp list walk is in progress. This ensures 4292 * accurate accounting of TCPs in the cluster code even though 4293 * the TCP list walk itself is not atomic. 4294 */ 4295 tcp_closei_local(tcp); 4296 CONN_DEC_REF(tcp->tcp_connp); 4297 } 4298 4299 /* 4300 * Stop all TCP timers, and free the timer mblks if requested. 4301 */ 4302 void 4303 tcp_timers_stop(tcp_t *tcp) 4304 { 4305 if (tcp->tcp_timer_tid != 0) { 4306 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 4307 tcp->tcp_timer_tid = 0; 4308 } 4309 if (tcp->tcp_ka_tid != 0) { 4310 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); 4311 tcp->tcp_ka_tid = 0; 4312 } 4313 if (tcp->tcp_ack_tid != 0) { 4314 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 4315 tcp->tcp_ack_tid = 0; 4316 } 4317 if (tcp->tcp_push_tid != 0) { 4318 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 4319 tcp->tcp_push_tid = 0; 4320 } 4321 } 4322 4323 /* 4324 * The tcp_t is going away. Remove it from all lists and set it 4325 * to TCPS_CLOSED. The freeing up of memory is deferred until 4326 * tcp_inactive. This is needed since a thread in tcp_rput might have 4327 * done a CONN_INC_REF on this structure before it was removed from the 4328 * hashes. 4329 */ 4330 static void 4331 tcp_closei_local(tcp_t *tcp) 4332 { 4333 ire_t *ire; 4334 conn_t *connp = tcp->tcp_connp; 4335 4336 if (!TCP_IS_SOCKET(tcp)) 4337 tcp_acceptor_hash_remove(tcp); 4338 4339 UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs); 4340 tcp->tcp_ibsegs = 0; 4341 UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs); 4342 tcp->tcp_obsegs = 0; 4343 4344 /* 4345 * If we are an eager connection hanging off a listener that 4346 * hasn't formally accepted the connection yet, get off his 4347 * list and blow off any data that we have accumulated. 4348 */ 4349 if (tcp->tcp_listener != NULL) { 4350 tcp_t *listener = tcp->tcp_listener; 4351 mutex_enter(&listener->tcp_eager_lock); 4352 /* 4353 * tcp_eager_conn_ind == NULL means that the 4354 * conn_ind has already gone to listener. At 4355 * this point, eager will be closed but we 4356 * leave it in listeners eager list so that 4357 * if listener decides to close without doing 4358 * accept, we can clean this up. 
In tcp_wput_accept 4359 * we take case of the case of accept on closed 4360 * eager. 4361 */ 4362 if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) { 4363 tcp_eager_unlink(tcp); 4364 mutex_exit(&listener->tcp_eager_lock); 4365 /* 4366 * We don't want to have any pointers to the 4367 * listener queue, after we have released our 4368 * reference on the listener 4369 */ 4370 tcp->tcp_rq = tcp_g_q; 4371 tcp->tcp_wq = WR(tcp_g_q); 4372 CONN_DEC_REF(listener->tcp_connp); 4373 } else { 4374 mutex_exit(&listener->tcp_eager_lock); 4375 } 4376 } 4377 4378 /* Stop all the timers */ 4379 tcp_timers_stop(tcp); 4380 4381 if (tcp->tcp_state == TCPS_LISTEN) { 4382 if (tcp->tcp_ip_addr_cache) { 4383 kmem_free((void *)tcp->tcp_ip_addr_cache, 4384 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 4385 tcp->tcp_ip_addr_cache = NULL; 4386 } 4387 } 4388 if (tcp->tcp_flow_stopped) 4389 tcp_clrqfull(tcp); 4390 4391 tcp_bind_hash_remove(tcp); 4392 /* 4393 * If the tcp_time_wait_collector (which runs outside the squeue) 4394 * is trying to remove this tcp from the time wait list, we will 4395 * block in tcp_time_wait_remove while trying to acquire the 4396 * tcp_time_wait_lock. The logic in tcp_time_wait_collector also 4397 * requires the ipcl_hash_remove to be ordered after the 4398 * tcp_time_wait_remove for the refcnt checks to work correctly. 4399 */ 4400 if (tcp->tcp_state == TCPS_TIME_WAIT) 4401 tcp_time_wait_remove(tcp, NULL); 4402 CL_INET_DISCONNECT(tcp); 4403 ipcl_hash_remove(connp); 4404 4405 /* 4406 * Delete the cached ire in conn_ire_cache and also mark 4407 * the conn as CONDEMNED 4408 */ 4409 mutex_enter(&connp->conn_lock); 4410 connp->conn_state_flags |= CONN_CONDEMNED; 4411 ire = connp->conn_ire_cache; 4412 connp->conn_ire_cache = NULL; 4413 mutex_exit(&connp->conn_lock); 4414 if (ire != NULL) 4415 IRE_REFRELE_NOTR(ire); 4416 4417 /* Need to cleanup any pending ioctls */ 4418 ASSERT(tcp->tcp_time_wait_next == NULL); 4419 ASSERT(tcp->tcp_time_wait_prev == NULL); 4420 ASSERT(tcp->tcp_time_wait_expire == 0); 4421 tcp->tcp_state = TCPS_CLOSED; 4422 4423 /* Release any SSL context */ 4424 if (tcp->tcp_kssl_ent != NULL) { 4425 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 4426 tcp->tcp_kssl_ent = NULL; 4427 } 4428 if (tcp->tcp_kssl_ctx != NULL) { 4429 kssl_release_ctx(tcp->tcp_kssl_ctx); 4430 tcp->tcp_kssl_ctx = NULL; 4431 } 4432 tcp->tcp_kssl_pending = B_FALSE; 4433 } 4434 4435 /* 4436 * tcp is dying (called from ipcl_conn_destroy and error cases). 4437 * Free the tcp_t in either case. 
4438 */ 4439 void 4440 tcp_free(tcp_t *tcp) 4441 { 4442 mblk_t *mp; 4443 ip6_pkt_t *ipp; 4444 4445 ASSERT(tcp != NULL); 4446 ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL); 4447 4448 tcp->tcp_rq = NULL; 4449 tcp->tcp_wq = NULL; 4450 4451 tcp_close_mpp(&tcp->tcp_xmit_head); 4452 tcp_close_mpp(&tcp->tcp_reass_head); 4453 if (tcp->tcp_rcv_list != NULL) { 4454 /* Free b_next chain */ 4455 tcp_close_mpp(&tcp->tcp_rcv_list); 4456 } 4457 if ((mp = tcp->tcp_urp_mp) != NULL) { 4458 freemsg(mp); 4459 } 4460 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 4461 freemsg(mp); 4462 } 4463 4464 if (tcp->tcp_fused_sigurg_mp != NULL) { 4465 freeb(tcp->tcp_fused_sigurg_mp); 4466 tcp->tcp_fused_sigurg_mp = NULL; 4467 } 4468 4469 if (tcp->tcp_sack_info != NULL) { 4470 if (tcp->tcp_notsack_list != NULL) { 4471 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 4472 } 4473 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 4474 } 4475 4476 if (tcp->tcp_hopopts != NULL) { 4477 mi_free(tcp->tcp_hopopts); 4478 tcp->tcp_hopopts = NULL; 4479 tcp->tcp_hopoptslen = 0; 4480 } 4481 ASSERT(tcp->tcp_hopoptslen == 0); 4482 if (tcp->tcp_dstopts != NULL) { 4483 mi_free(tcp->tcp_dstopts); 4484 tcp->tcp_dstopts = NULL; 4485 tcp->tcp_dstoptslen = 0; 4486 } 4487 ASSERT(tcp->tcp_dstoptslen == 0); 4488 if (tcp->tcp_rtdstopts != NULL) { 4489 mi_free(tcp->tcp_rtdstopts); 4490 tcp->tcp_rtdstopts = NULL; 4491 tcp->tcp_rtdstoptslen = 0; 4492 } 4493 ASSERT(tcp->tcp_rtdstoptslen == 0); 4494 if (tcp->tcp_rthdr != NULL) { 4495 mi_free(tcp->tcp_rthdr); 4496 tcp->tcp_rthdr = NULL; 4497 tcp->tcp_rthdrlen = 0; 4498 } 4499 ASSERT(tcp->tcp_rthdrlen == 0); 4500 4501 ipp = &tcp->tcp_sticky_ipp; 4502 if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | 4503 IPPF_RTHDR)) 4504 ip6_pkt_free(ipp); 4505 4506 /* 4507 * Free memory associated with the tcp/ip header template. 4508 */ 4509 4510 if (tcp->tcp_iphc != NULL) 4511 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 4512 4513 /* 4514 * Following is really a blowing away a union. 4515 * It happens to have exactly two members of identical size 4516 * the following code is enough. 4517 */ 4518 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 4519 4520 if (tcp->tcp_tracebuf != NULL) { 4521 kmem_free(tcp->tcp_tracebuf, sizeof (tcptrch_t)); 4522 tcp->tcp_tracebuf = NULL; 4523 } 4524 } 4525 4526 4527 /* 4528 * Put a connection confirmation message upstream built from the 4529 * address information within 'iph' and 'tcph'. Report our success or failure. 4530 */ 4531 static boolean_t 4532 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, 4533 mblk_t **defermp) 4534 { 4535 sin_t sin; 4536 sin6_t sin6; 4537 mblk_t *mp; 4538 char *optp = NULL; 4539 int optlen = 0; 4540 cred_t *cr; 4541 4542 if (defermp != NULL) 4543 *defermp = NULL; 4544 4545 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { 4546 /* 4547 * Return in T_CONN_CON results of option negotiation through 4548 * the T_CONN_REQ. Note: If there is an real end-to-end option 4549 * negotiation, then what is received from remote end needs 4550 * to be taken into account but there is no such thing (yet?) 4551 * in our TCP/IP. 4552 * Note: We do not use mi_offset_param() here as 4553 * tcp_opts_conn_req contents do not directly come from 4554 * an application and are either generated in kernel or 4555 * from user input that was already verified. 
4556 */ 4557 mp = tcp->tcp_conn.tcp_opts_conn_req; 4558 optp = (char *)(mp->b_rptr + 4559 ((struct T_conn_req *)mp->b_rptr)->OPT_offset); 4560 optlen = (int) 4561 ((struct T_conn_req *)mp->b_rptr)->OPT_length; 4562 } 4563 4564 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { 4565 ipha_t *ipha = (ipha_t *)iphdr; 4566 4567 /* packet is IPv4 */ 4568 if (tcp->tcp_family == AF_INET) { 4569 sin = sin_null; 4570 sin.sin_addr.s_addr = ipha->ipha_src; 4571 sin.sin_port = *(uint16_t *)tcph->th_lport; 4572 sin.sin_family = AF_INET; 4573 mp = mi_tpi_conn_con(NULL, (char *)&sin, 4574 (int)sizeof (sin_t), optp, optlen); 4575 } else { 4576 sin6 = sin6_null; 4577 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); 4578 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4579 sin6.sin6_family = AF_INET6; 4580 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 4581 (int)sizeof (sin6_t), optp, optlen); 4582 4583 } 4584 } else { 4585 ip6_t *ip6h = (ip6_t *)iphdr; 4586 4587 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); 4588 ASSERT(tcp->tcp_family == AF_INET6); 4589 sin6 = sin6_null; 4590 sin6.sin6_addr = ip6h->ip6_src; 4591 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4592 sin6.sin6_family = AF_INET6; 4593 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 4594 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 4595 (int)sizeof (sin6_t), optp, optlen); 4596 } 4597 4598 if (!mp) 4599 return (B_FALSE); 4600 4601 if ((cr = DB_CRED(idmp)) != NULL) { 4602 mblk_setcred(mp, cr); 4603 DB_CPID(mp) = DB_CPID(idmp); 4604 } 4605 4606 if (defermp == NULL) 4607 putnext(tcp->tcp_rq, mp); 4608 else 4609 *defermp = mp; 4610 4611 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 4612 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 4613 return (B_TRUE); 4614 } 4615 4616 /* 4617 * Defense for the SYN attack - 4618 * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest 4619 * one that doesn't have the dontdrop bit set. 4620 * 2. Don't drop a SYN request before its first timeout. This gives every 4621 * request at least til the first timeout to complete its 3-way handshake. 4622 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many 4623 * requests currently on the queue that has timed out. This will be used 4624 * as an indicator of whether an attack is under way, so that appropriate 4625 * actions can be taken. (It's incremented in tcp_timer() and decremented 4626 * either when eager goes into ESTABLISHED, or gets freed up.) 4627 * 4. 
The current threshold is - # of timeout > q0len/4 => SYN alert on 4628 * # of timeout drops back to <= q0len/32 => SYN alert off 4629 */ 4630 static boolean_t 4631 tcp_drop_q0(tcp_t *tcp) 4632 { 4633 tcp_t *eager; 4634 4635 ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); 4636 ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); 4637 /* 4638 * New one is added after next_q0 so prev_q0 points to the oldest 4639 * Also do not drop any established connections that are deferred on 4640 * q0 due to q being full 4641 */ 4642 4643 eager = tcp->tcp_eager_prev_q0; 4644 while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) { 4645 eager = eager->tcp_eager_prev_q0; 4646 if (eager == tcp) { 4647 eager = tcp->tcp_eager_prev_q0; 4648 break; 4649 } 4650 } 4651 if (eager->tcp_syn_rcvd_timeout == 0) 4652 return (B_FALSE); 4653 4654 if (tcp->tcp_debug) { 4655 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 4656 "tcp_drop_q0: listen half-open queue (max=%d) overflow" 4657 " (%d pending) on %s, drop one", tcp_conn_req_max_q0, 4658 tcp->tcp_conn_req_cnt_q0, 4659 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 4660 } 4661 4662 BUMP_MIB(&tcp_mib, tcpHalfOpenDrop); 4663 4664 /* 4665 * need to do refhold here because the selected eager could 4666 * be removed by someone else if we release the eager lock. 4667 */ 4668 CONN_INC_REF(eager->tcp_connp); 4669 mutex_exit(&tcp->tcp_eager_lock); 4670 4671 /* Mark the IRE created for this SYN request temporary */ 4672 tcp_ip_ire_mark_advice(eager); 4673 (void) tcp_clean_death(eager, ETIMEDOUT, 5); 4674 CONN_DEC_REF(eager->tcp_connp); 4675 4676 mutex_enter(&tcp->tcp_eager_lock); 4677 return (B_TRUE); 4678 } 4679 4680 int 4681 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, 4682 tcph_t *tcph, uint_t ipvers, mblk_t *idmp) 4683 { 4684 tcp_t *ltcp = lconnp->conn_tcp; 4685 tcp_t *tcp = connp->conn_tcp; 4686 mblk_t *tpi_mp; 4687 ipha_t *ipha; 4688 ip6_t *ip6h; 4689 sin6_t sin6; 4690 in6_addr_t v6dst; 4691 int err; 4692 int ifindex = 0; 4693 cred_t *cr; 4694 4695 if (ipvers == IPV4_VERSION) { 4696 ipha = (ipha_t *)mp->b_rptr; 4697 4698 connp->conn_send = ip_output; 4699 connp->conn_recv = tcp_input; 4700 4701 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); 4702 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); 4703 4704 sin6 = sin6_null; 4705 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); 4706 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 4707 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4708 sin6.sin6_family = AF_INET6; 4709 sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst, 4710 lconnp->conn_zoneid); 4711 if (tcp->tcp_recvdstaddr) { 4712 sin6_t sin6d; 4713 4714 sin6d = sin6_null; 4715 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, 4716 &sin6d.sin6_addr); 4717 sin6d.sin6_port = *(uint16_t *)tcph->th_fport; 4718 sin6d.sin6_family = AF_INET; 4719 tpi_mp = mi_tpi_extconn_ind(NULL, 4720 (char *)&sin6d, sizeof (sin6_t), 4721 (char *)&tcp, 4722 (t_scalar_t)sizeof (intptr_t), 4723 (char *)&sin6d, sizeof (sin6_t), 4724 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4725 } else { 4726 tpi_mp = mi_tpi_conn_ind(NULL, 4727 (char *)&sin6, sizeof (sin6_t), 4728 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4729 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4730 } 4731 } else { 4732 ip6h = (ip6_t *)mp->b_rptr; 4733 4734 connp->conn_send = ip_output_v6; 4735 connp->conn_recv = tcp_input; 4736 4737 connp->conn_srcv6 = ip6h->ip6_dst; 4738 connp->conn_remv6 = ip6h->ip6_src; 4739 4740 /* db_cksumstuff is set at ip_fanout_tcp_v6 */ 4741 ifindex = (int)DB_CKSUMSTUFF(mp); 4742 DB_CKSUMSTUFF(mp) 
= 0; 4743 4744 sin6 = sin6_null; 4745 sin6.sin6_addr = ip6h->ip6_src; 4746 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4747 sin6.sin6_family = AF_INET6; 4748 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 4749 sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 4750 lconnp->conn_zoneid); 4751 4752 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 4753 /* Pass up the scope_id of remote addr */ 4754 sin6.sin6_scope_id = ifindex; 4755 } else { 4756 sin6.sin6_scope_id = 0; 4757 } 4758 if (tcp->tcp_recvdstaddr) { 4759 sin6_t sin6d; 4760 4761 sin6d = sin6_null; 4762 sin6.sin6_addr = ip6h->ip6_dst; 4763 sin6d.sin6_port = *(uint16_t *)tcph->th_fport; 4764 sin6d.sin6_family = AF_INET; 4765 tpi_mp = mi_tpi_extconn_ind(NULL, 4766 (char *)&sin6d, sizeof (sin6_t), 4767 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4768 (char *)&sin6d, sizeof (sin6_t), 4769 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4770 } else { 4771 tpi_mp = mi_tpi_conn_ind(NULL, 4772 (char *)&sin6, sizeof (sin6_t), 4773 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4774 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4775 } 4776 } 4777 4778 if (tpi_mp == NULL) 4779 return (ENOMEM); 4780 4781 connp->conn_fport = *(uint16_t *)tcph->th_lport; 4782 connp->conn_lport = *(uint16_t *)tcph->th_fport; 4783 connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER); 4784 connp->conn_fully_bound = B_FALSE; 4785 4786 if (tcp_trace) 4787 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP); 4788 4789 /* Inherit information from the "parent" */ 4790 tcp->tcp_ipversion = ltcp->tcp_ipversion; 4791 tcp->tcp_family = ltcp->tcp_family; 4792 tcp->tcp_wq = ltcp->tcp_wq; 4793 tcp->tcp_rq = ltcp->tcp_rq; 4794 tcp->tcp_mss = tcp_mss_def_ipv6; 4795 tcp->tcp_detached = B_TRUE; 4796 if ((err = tcp_init_values(tcp)) != 0) { 4797 freemsg(tpi_mp); 4798 return (err); 4799 } 4800 4801 if (ipvers == IPV4_VERSION) { 4802 if ((err = tcp_header_init_ipv4(tcp)) != 0) { 4803 freemsg(tpi_mp); 4804 return (err); 4805 } 4806 ASSERT(tcp->tcp_ipha != NULL); 4807 } else { 4808 /* ifindex must be already set */ 4809 ASSERT(ifindex != 0); 4810 4811 if (ltcp->tcp_bound_if != 0) { 4812 /* 4813 * Set newtcp's bound_if equal to 4814 * listener's value. If ifindex is 4815 * not the same as ltcp->tcp_bound_if, 4816 * it must be a packet for the ipmp group 4817 * of interfaces 4818 */ 4819 tcp->tcp_bound_if = ltcp->tcp_bound_if; 4820 } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 4821 tcp->tcp_bound_if = ifindex; 4822 } 4823 4824 tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary; 4825 tcp->tcp_recvifindex = 0; 4826 tcp->tcp_recvhops = 0xffffffffU; 4827 ASSERT(tcp->tcp_ip6h != NULL); 4828 } 4829 4830 tcp->tcp_lport = ltcp->tcp_lport; 4831 4832 if (ltcp->tcp_ipversion == tcp->tcp_ipversion) { 4833 if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) { 4834 /* 4835 * Listener had options of some sort; eager inherits. 4836 * Free up the eager template and allocate one 4837 * of the right size. 
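 * (Sketch of the template layout being copied: an optional ip6i_t,
 * then the ip6_t or ipha_t, then the tcph_t; tcp_ip6h/tcp_ipha and
 * tcp_tcph are re-pointed into tcp_iphc accordingly right after the
 * bcopy below.)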
4838 */ 4839 if (tcp->tcp_hdr_grown) { 4840 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 4841 } else { 4842 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 4843 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 4844 } 4845 tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len, 4846 KM_NOSLEEP); 4847 if (tcp->tcp_iphc == NULL) { 4848 tcp->tcp_iphc_len = 0; 4849 freemsg(tpi_mp); 4850 return (ENOMEM); 4851 } 4852 tcp->tcp_iphc_len = ltcp->tcp_iphc_len; 4853 tcp->tcp_hdr_grown = B_TRUE; 4854 } 4855 tcp->tcp_hdr_len = ltcp->tcp_hdr_len; 4856 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; 4857 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 4858 tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops; 4859 tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf; 4860 4861 /* 4862 * Copy the IP+TCP header template from listener to eager 4863 */ 4864 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); 4865 if (tcp->tcp_ipversion == IPV6_VERSION) { 4866 if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt == 4867 IPPROTO_RAW) { 4868 tcp->tcp_ip6h = 4869 (ip6_t *)(tcp->tcp_iphc + 4870 sizeof (ip6i_t)); 4871 } else { 4872 tcp->tcp_ip6h = 4873 (ip6_t *)(tcp->tcp_iphc); 4874 } 4875 tcp->tcp_ipha = NULL; 4876 } else { 4877 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 4878 tcp->tcp_ip6h = NULL; 4879 } 4880 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + 4881 tcp->tcp_ip_hdr_len); 4882 } else { 4883 /* 4884 * only valid case when ipversion of listener and 4885 * eager differ is when listener is IPv6 and 4886 * eager is IPv4. 4887 * Eager header template has been initialized to the 4888 * maximum v4 header sizes, which includes space for 4889 * TCP and IP options. 4890 */ 4891 ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) && 4892 (tcp->tcp_ipversion == IPV4_VERSION)); 4893 ASSERT(tcp->tcp_iphc_len >= 4894 TCP_MAX_COMBINED_HEADER_LENGTH); 4895 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 4896 /* copy IP header fields individually */ 4897 tcp->tcp_ipha->ipha_ttl = 4898 ltcp->tcp_ip6h->ip6_hops; 4899 bcopy(ltcp->tcp_tcph->th_lport, 4900 tcp->tcp_tcph->th_lport, sizeof (ushort_t)); 4901 } 4902 4903 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); 4904 bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport, 4905 sizeof (in_port_t)); 4906 4907 if (ltcp->tcp_lport == 0) { 4908 tcp->tcp_lport = *(in_port_t *)tcph->th_fport; 4909 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, 4910 sizeof (in_port_t)); 4911 } 4912 4913 if (tcp->tcp_ipversion == IPV4_VERSION) { 4914 ASSERT(ipha != NULL); 4915 tcp->tcp_ipha->ipha_dst = ipha->ipha_src; 4916 tcp->tcp_ipha->ipha_src = ipha->ipha_dst; 4917 4918 /* Source routing option copyover (reverse it) */ 4919 if (tcp_rev_src_routes) 4920 tcp_opt_reverse(tcp, ipha); 4921 } else { 4922 ASSERT(ip6h != NULL); 4923 tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src; 4924 tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst; 4925 } 4926 4927 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 4928 /* 4929 * If the SYN contains a credential, it's a loopback packet; attach 4930 * the credential to the TPI message. 
4931 */ 4932 if ((cr = DB_CRED(idmp)) != NULL) { 4933 mblk_setcred(tpi_mp, cr); 4934 DB_CPID(tpi_mp) = DB_CPID(idmp); 4935 } 4936 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; 4937 4938 /* Inherit the listener's SSL protection state */ 4939 4940 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { 4941 kssl_hold_ent(tcp->tcp_kssl_ent); 4942 tcp->tcp_kssl_pending = B_TRUE; 4943 } 4944 4945 return (0); 4946 } 4947 4948 4949 int 4950 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, 4951 tcph_t *tcph, mblk_t *idmp) 4952 { 4953 tcp_t *ltcp = lconnp->conn_tcp; 4954 tcp_t *tcp = connp->conn_tcp; 4955 sin_t sin; 4956 mblk_t *tpi_mp = NULL; 4957 int err; 4958 cred_t *cr; 4959 4960 sin = sin_null; 4961 sin.sin_addr.s_addr = ipha->ipha_src; 4962 sin.sin_port = *(uint16_t *)tcph->th_lport; 4963 sin.sin_family = AF_INET; 4964 if (ltcp->tcp_recvdstaddr) { 4965 sin_t sind; 4966 4967 sind = sin_null; 4968 sind.sin_addr.s_addr = ipha->ipha_dst; 4969 sind.sin_port = *(uint16_t *)tcph->th_fport; 4970 sind.sin_family = AF_INET; 4971 tpi_mp = mi_tpi_extconn_ind(NULL, 4972 (char *)&sind, sizeof (sin_t), (char *)&tcp, 4973 (t_scalar_t)sizeof (intptr_t), (char *)&sind, 4974 sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4975 } else { 4976 tpi_mp = mi_tpi_conn_ind(NULL, 4977 (char *)&sin, sizeof (sin_t), 4978 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4979 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4980 } 4981 4982 if (tpi_mp == NULL) { 4983 return (ENOMEM); 4984 } 4985 4986 connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER); 4987 connp->conn_send = ip_output; 4988 connp->conn_recv = tcp_input; 4989 connp->conn_fully_bound = B_FALSE; 4990 4991 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); 4992 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); 4993 connp->conn_fport = *(uint16_t *)tcph->th_lport; 4994 connp->conn_lport = *(uint16_t *)tcph->th_fport; 4995 4996 if (tcp_trace) { 4997 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP); 4998 } 4999 5000 /* Inherit information from the "parent" */ 5001 tcp->tcp_ipversion = ltcp->tcp_ipversion; 5002 tcp->tcp_family = ltcp->tcp_family; 5003 tcp->tcp_wq = ltcp->tcp_wq; 5004 tcp->tcp_rq = ltcp->tcp_rq; 5005 tcp->tcp_mss = tcp_mss_def_ipv4; 5006 tcp->tcp_detached = B_TRUE; 5007 if ((err = tcp_init_values(tcp)) != 0) { 5008 freemsg(tpi_mp); 5009 return (err); 5010 } 5011 5012 /* 5013 * Let's make sure that eager tcp template has enough space to 5014 * copy IPv4 listener's tcp template. Since the conn_t structure is 5015 * preserved and tcp_iphc_len is also preserved, an eager conn_t may 5016 * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or 5017 * more (in case of re-allocation of conn_t with tcp-IPv6 template with 5018 * extension headers or with ip6i_t struct). Note that bcopy() below 5019 * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_ 5020 * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener. 
5021 */ 5022 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 5023 ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH); 5024 5025 tcp->tcp_hdr_len = ltcp->tcp_hdr_len; 5026 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; 5027 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 5028 tcp->tcp_ttl = ltcp->tcp_ttl; 5029 tcp->tcp_tos = ltcp->tcp_tos; 5030 5031 /* Copy the IP+TCP header template from listener to eager */ 5032 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); 5033 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 5034 tcp->tcp_ip6h = NULL; 5035 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + 5036 tcp->tcp_ip_hdr_len); 5037 5038 /* Initialize the IP addresses and Ports */ 5039 tcp->tcp_ipha->ipha_dst = ipha->ipha_src; 5040 tcp->tcp_ipha->ipha_src = ipha->ipha_dst; 5041 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); 5042 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t)); 5043 5044 /* Source routing option copyover (reverse it) */ 5045 if (tcp_rev_src_routes) 5046 tcp_opt_reverse(tcp, ipha); 5047 5048 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 5049 5050 /* 5051 * If the SYN contains a credential, it's a loopback packet; attach 5052 * the credential to the TPI message. 5053 */ 5054 if ((cr = DB_CRED(idmp)) != NULL) { 5055 mblk_setcred(tpi_mp, cr); 5056 DB_CPID(tpi_mp) = DB_CPID(idmp); 5057 } 5058 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; 5059 5060 /* Inherit the listener's SSL protection state */ 5061 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { 5062 kssl_hold_ent(tcp->tcp_kssl_ent); 5063 tcp->tcp_kssl_pending = B_TRUE; 5064 } 5065 5066 return (0); 5067 } 5068 5069 /* 5070 * sets up conn for ipsec. 5071 * if the first mblk is M_CTL it is consumed and mpp is updated. 5072 * in case of error mpp is freed. 
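 *
 * Illustrative caller sketch (this mirrors how tcp_conn_request() below
 * uses the routine; it is shown here only to make the *mpp contract
 * explicit, not as a new interface):
 *
 *	econnp = tcp_get_ipsec_conn(tcp, sqp, &mp);
 *	if (econnp == NULL)
 *		return;			(mp has already been freed)
 *	... on success, mp now points at the M_DATA mblk ...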
5073 */ 5074 conn_t * 5075 tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) 5076 { 5077 conn_t *connp = tcp->tcp_connp; 5078 conn_t *econnp; 5079 squeue_t *new_sqp; 5080 mblk_t *first_mp = *mpp; 5081 mblk_t *mp = *mpp; 5082 boolean_t mctl_present = B_FALSE; 5083 uint_t ipvers; 5084 5085 econnp = tcp_get_conn(sqp); 5086 if (econnp == NULL) { 5087 freemsg(first_mp); 5088 return (NULL); 5089 } 5090 if (DB_TYPE(mp) == M_CTL) { 5091 if (mp->b_cont == NULL || 5092 mp->b_cont->b_datap->db_type != M_DATA) { 5093 freemsg(first_mp); 5094 return (NULL); 5095 } 5096 mp = mp->b_cont; 5097 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) { 5098 freemsg(first_mp); 5099 return (NULL); 5100 } 5101 5102 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 5103 first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY; 5104 mctl_present = B_TRUE; 5105 } else { 5106 ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY); 5107 mp->b_datap->db_struioflag &= ~STRUIO_POLICY; 5108 } 5109 5110 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5111 DB_CKSUMSTART(mp) = 0; 5112 5113 ASSERT(OK_32PTR(mp->b_rptr)); 5114 ipvers = IPH_HDR_VERSION(mp->b_rptr); 5115 if (ipvers == IPV4_VERSION) { 5116 uint16_t *up; 5117 uint32_t ports; 5118 ipha_t *ipha; 5119 5120 ipha = (ipha_t *)mp->b_rptr; 5121 up = (uint16_t *)((uchar_t *)ipha + 5122 IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET); 5123 ports = *(uint32_t *)up; 5124 IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP, 5125 ipha->ipha_dst, ipha->ipha_src, ports); 5126 } else { 5127 uint16_t *up; 5128 uint32_t ports; 5129 uint16_t ip_hdr_len; 5130 uint8_t *nexthdrp; 5131 ip6_t *ip6h; 5132 tcph_t *tcph; 5133 5134 ip6h = (ip6_t *)mp->b_rptr; 5135 if (ip6h->ip6_nxt == IPPROTO_TCP) { 5136 ip_hdr_len = IPV6_HDR_LEN; 5137 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len, 5138 &nexthdrp) || *nexthdrp != IPPROTO_TCP) { 5139 CONN_DEC_REF(econnp); 5140 freemsg(first_mp); 5141 return (NULL); 5142 } 5143 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5144 up = (uint16_t *)tcph->th_lport; 5145 ports = *(uint32_t *)up; 5146 IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP, 5147 ip6h->ip6_dst, ip6h->ip6_src, ports); 5148 } 5149 5150 /* 5151 * The caller already ensured that there is a sqp present. 5152 */ 5153 econnp->conn_sqp = new_sqp; 5154 5155 if (connp->conn_policy != NULL) { 5156 ipsec_in_t *ii; 5157 ii = (ipsec_in_t *)(first_mp->b_rptr); 5158 ASSERT(ii->ipsec_in_policy == NULL); 5159 IPPH_REFHOLD(connp->conn_policy); 5160 ii->ipsec_in_policy = connp->conn_policy; 5161 5162 first_mp->b_datap->db_type = IPSEC_POLICY_SET; 5163 if (!ip_bind_ipsec_policy_set(econnp, first_mp)) { 5164 CONN_DEC_REF(econnp); 5165 freemsg(first_mp); 5166 return (NULL); 5167 } 5168 } 5169 5170 if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) { 5171 CONN_DEC_REF(econnp); 5172 freemsg(first_mp); 5173 return (NULL); 5174 } 5175 5176 /* 5177 * If we know we have some policy, pass the "IPSEC" 5178 * options size TCP uses this adjust the MSS. 5179 */ 5180 econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp); 5181 if (mctl_present) { 5182 freeb(first_mp); 5183 *mpp = mp; 5184 } 5185 5186 return (econnp); 5187 } 5188 5189 /* 5190 * tcp_get_conn/tcp_free_conn 5191 * 5192 * tcp_get_conn is used to get a clean tcp connection structure. 5193 * It tries to reuse the connections put on the freelist by the 5194 * time_wait_collector failing which it goes to kmem_cache. This 5195 * way has two benefits compared to just allocating from and 5196 * freeing to kmem_cache. 
5197 * 1) The time_wait_collector can free (which includes the cleanup) 5198 * outside the squeue. So when the interrupt comes, we have a clean 5199 * connection sitting in the freelist. Obviously, this buys us 5200 * performance. 5201 * 5202 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_conn_request 5203 * has multiple disadvantages - tying up the squeue during alloc, and the 5204 * fact that IPSec policy initialization has to happen here which 5205 * requires us sending a M_CTL and checking for it i.e. real ugliness. 5206 * But allocating the conn/tcp in IP land is also not the best since 5207 * we can't check the 'q' and 'q0' which are protected by squeue and 5208 * blindly allocate memory which might have to be freed here if we are 5209 * not allowed to accept the connection. By using the freelist and 5210 * putting the conn/tcp back in freelist, we don't pay a penalty for 5211 * allocating memory without checking 'q/q0' and freeing it if we can't 5212 * accept the connection. 5213 * 5214 * Care should be taken to put the conn back in the same squeue's freelist 5215 * from which it was allocated. Best results are obtained if conn is 5216 * allocated from listener's squeue and freed to the same. Time wait 5217 * collector will free up the freelist is the connection ends up sitting 5218 * there for too long. 5219 */ 5220 void * 5221 tcp_get_conn(void *arg) 5222 { 5223 tcp_t *tcp = NULL; 5224 conn_t *connp = NULL; 5225 squeue_t *sqp = (squeue_t *)arg; 5226 tcp_squeue_priv_t *tcp_time_wait; 5227 5228 tcp_time_wait = 5229 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 5230 5231 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 5232 tcp = tcp_time_wait->tcp_free_list; 5233 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0)); 5234 if (tcp != NULL) { 5235 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 5236 tcp_time_wait->tcp_free_list_cnt--; 5237 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 5238 tcp->tcp_time_wait_next = NULL; 5239 connp = tcp->tcp_connp; 5240 connp->conn_flags |= IPCL_REUSED; 5241 return ((void *)connp); 5242 } 5243 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 5244 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) 5245 return (NULL); 5246 return ((void *)connp); 5247 } 5248 5249 /* 5250 * Update the cached label for the given tcp_t. This should be called once per 5251 * connection, and before any packets are sent or tcp_process_options is 5252 * invoked. Returns B_FALSE if the correct label could not be constructed. 
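 *
 * Typical use (a sketch of the call made from tcp_conn_request() below
 * when the system is labeled; shown only as an example):
 *
 *	if (is_system_labeled() && !tcp_update_label(eager, cr)) {
 *		... the label could not be built; drop the eager ...
 *	}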
5253 */ 5254 static boolean_t 5255 tcp_update_label(tcp_t *tcp, const cred_t *cr) 5256 { 5257 conn_t *connp = tcp->tcp_connp; 5258 5259 if (tcp->tcp_ipversion == IPV4_VERSION) { 5260 uchar_t optbuf[IP_MAX_OPT_LENGTH]; 5261 int added; 5262 5263 if (tsol_compute_label(cr, tcp->tcp_remote, optbuf, 5264 connp->conn_mac_exempt) != 0) 5265 return (B_FALSE); 5266 5267 added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len); 5268 if (added == -1) 5269 return (B_FALSE); 5270 tcp->tcp_hdr_len += added; 5271 tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added); 5272 tcp->tcp_ip_hdr_len += added; 5273 if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) { 5274 tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3; 5275 added = tsol_prepend_option(optbuf, tcp->tcp_ipha, 5276 tcp->tcp_hdr_len); 5277 if (added == -1) 5278 return (B_FALSE); 5279 tcp->tcp_hdr_len += added; 5280 tcp->tcp_tcph = (tcph_t *) 5281 ((uchar_t *)tcp->tcp_tcph + added); 5282 tcp->tcp_ip_hdr_len += added; 5283 } 5284 } else { 5285 uchar_t optbuf[TSOL_MAX_IPV6_OPTION]; 5286 5287 if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf, 5288 connp->conn_mac_exempt) != 0) 5289 return (B_FALSE); 5290 if (tsol_update_sticky(&tcp->tcp_sticky_ipp, 5291 &tcp->tcp_label_len, optbuf) != 0) 5292 return (B_FALSE); 5293 if (tcp_build_hdrs(tcp->tcp_rq, tcp) != 0) 5294 return (B_FALSE); 5295 } 5296 5297 connp->conn_ulp_labeled = 1; 5298 5299 return (B_TRUE); 5300 } 5301 5302 /* BEGIN CSTYLED */ 5303 /* 5304 * 5305 * The sockfs ACCEPT path: 5306 * ======================= 5307 * 5308 * The eager is now established in its own perimeter as soon as SYN is 5309 * received in tcp_conn_request(). When sockfs receives conn_ind, it 5310 * completes the accept processing on the acceptor STREAM. The sending 5311 * of conn_ind part is common for both sockfs listener and a TLI/XTI 5312 * listener but a TLI/XTI listener completes the accept processing 5313 * on the listener perimeter. 5314 * 5315 * Common control flow for 3 way handshake: 5316 * ---------------------------------------- 5317 * 5318 * incoming SYN (listener perimeter) -> tcp_rput_data() 5319 * -> tcp_conn_request() 5320 * 5321 * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data() 5322 * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() 5323 * 5324 * Sockfs ACCEPT Path: 5325 * ------------------- 5326 * 5327 * open acceptor stream (ip_tcpopen allocates tcp_wput_accept() 5328 * as STREAM entry point) 5329 * 5330 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept() 5331 * 5332 * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager 5333 * association (we are not behind eager's squeue but sockfs is protecting us 5334 * and no one knows about this stream yet). The STREAMS entry point q->q_info 5335 * is changed to point at tcp_wput(). 5336 * 5337 * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to 5338 * the listener (done on the listener's perimeter). 5339 * 5340 * tcp_wput_accept() calls tcp_accept_finish() on the eager's perimeter to 5341 * finish the accept. 5342 * 5343 * TLI/XTI client ACCEPT path: 5344 * --------------------------- 5345 * 5346 * soaccept() sends T_CONN_RES on the listener STREAM. 5347 * 5348 * tcp_accept() -> tcp_accept_swap() complete the processing and send 5349 * the bind_mp to the eager's perimeter to finish the accept (tcp_rput_other()). 5350 * 5351 * Locks: 5352 * ====== 5353 * 5354 * listener->tcp_eager_lock protects the listener's tcp_eager_next_q0 and 5355 * tcp_eager_next_q lists.
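 *
 * The usual pattern for walking the eager lists under this lock (the same
 * pattern used by tcp_eager_blowoff() and tcp_eager_cleanup() later in
 * this file) looks like:
 *
 *	mutex_enter(&listener->tcp_eager_lock);
 *	for (eager = listener->tcp_eager_next_q; eager != NULL;
 *	    eager = eager->tcp_eager_next_q) {
 *		... refhold or examine the eager ...
 *	}
 *	mutex_exit(&listener->tcp_eager_lock);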
5356 * 5357 * Referencing: 5358 * ============ 5359 * 5360 * 1) We start out in tcp_conn_request by eager placing a ref on 5361 * listener and listener adding eager to listeners->tcp_eager_next_q0. 5362 * 5363 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before 5364 * doing so we place a ref on the eager. This ref is finally dropped at the 5365 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the 5366 * reference is dropped by the squeue framework. 5367 * 5368 * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish 5369 * 5370 * The reference must be released by the same entity that added the reference 5371 * In the above scheme, the eager is the entity that adds and releases the 5372 * references. Note that tcp_accept_finish executes in the squeue of the eager 5373 * (albeit after it is attached to the acceptor stream). Though 1. executes 5374 * in the listener's squeue, the eager is nascent at this point and the 5375 * reference can be considered to have been added on behalf of the eager. 5376 * 5377 * Eager getting a Reset or listener closing: 5378 * ========================================== 5379 * 5380 * Once the listener and eager are linked, the listener never does the unlink. 5381 * If the listener needs to close, tcp_eager_cleanup() is called which queues 5382 * a message on all eager perimeter. The eager then does the unlink, clears 5383 * any pointers to the listener's queue and drops the reference to the 5384 * listener. The listener waits in tcp_close outside the squeue until its 5385 * refcount has dropped to 1. This ensures that the listener has waited for 5386 * all eagers to clear their association with the listener. 5387 * 5388 * Similarly, if eager decides to go away, it can unlink itself and close. 5389 * When the T_CONN_RES comes down, we check if eager has closed. Note that 5390 * the reference to eager is still valid because of the extra ref we put 5391 * in tcp_send_conn_ind. 5392 * 5393 * Listener can always locate the eager under the protection 5394 * of the listener->tcp_eager_lock, and then do a refhold 5395 * on the eager during the accept processing. 5396 * 5397 * The acceptor stream accesses the eager in the accept processing 5398 * based on the ref placed on eager before sending T_conn_ind. 5399 * The only entity that can negate this refhold is a listener close 5400 * which is mutually exclusive with an active acceptor stream. 5401 * 5402 * Eager's reference on the listener 5403 * =================================== 5404 * 5405 * If the accept happens (even on a closed eager) the eager drops its 5406 * reference on the listener at the start of tcp_accept_finish. If the 5407 * eager is killed due to an incoming RST before the T_conn_ind is sent up, 5408 * the reference is dropped in tcp_closei_local. If the listener closes, 5409 * the reference is dropped in tcp_eager_kill. In all cases the reference 5410 * is dropped while executing in the eager's context (squeue). 5411 */ 5412 /* END CSTYLED */ 5413 5414 /* Process the SYN packet, mp, directed at the listener 'tcp' */ 5415 5416 /* 5417 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. 5418 * tcp_rput_data will not see any SYN packets. 
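 *
 * Dispatch sketch (for illustration only; the actual enqueue is done by
 * IP/squeue code outside this file, but it follows the same pattern used
 * by tcp_conn_request_unbound() below):
 *
 *	CONN_INC_REF(connp);
 *	squeue_fill(connp->conn_sqp, mp, tcp_conn_request, connp,
 *	    SQTAG_TCP_CONN_REQ);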
5419 */ 5420 /* ARGSUSED */ 5421 void 5422 tcp_conn_request(void *arg, mblk_t *mp, void *arg2) 5423 { 5424 tcph_t *tcph; 5425 uint32_t seg_seq; 5426 tcp_t *eager; 5427 uint_t ipvers; 5428 ipha_t *ipha; 5429 ip6_t *ip6h; 5430 int err; 5431 conn_t *econnp = NULL; 5432 squeue_t *new_sqp; 5433 mblk_t *mp1; 5434 uint_t ip_hdr_len; 5435 conn_t *connp = (conn_t *)arg; 5436 tcp_t *tcp = connp->conn_tcp; 5437 ire_t *ire; 5438 cred_t *credp; 5439 5440 if (tcp->tcp_state != TCPS_LISTEN) 5441 goto error2; 5442 5443 ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0); 5444 5445 mutex_enter(&tcp->tcp_eager_lock); 5446 if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { 5447 mutex_exit(&tcp->tcp_eager_lock); 5448 TCP_STAT(tcp_listendrop); 5449 BUMP_MIB(&tcp_mib, tcpListenDrop); 5450 if (tcp->tcp_debug) { 5451 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 5452 "tcp_conn_request: listen backlog (max=%d) " 5453 "overflow (%d pending) on %s", 5454 tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, 5455 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 5456 } 5457 goto error2; 5458 } 5459 5460 if (tcp->tcp_conn_req_cnt_q0 >= 5461 tcp->tcp_conn_req_max + tcp_conn_req_max_q0) { 5462 /* 5463 * Q0 is full. Drop a pending half-open req from the queue 5464 * to make room for the new SYN req. Also mark the time we 5465 * drop a SYN. 5466 * 5467 * A more aggressive defense against SYN attack will 5468 * be to set the "tcp_syn_defense" flag now. 5469 */ 5470 TCP_STAT(tcp_listendropq0); 5471 tcp->tcp_last_rcv_lbolt = lbolt64; 5472 if (!tcp_drop_q0(tcp)) { 5473 mutex_exit(&tcp->tcp_eager_lock); 5474 BUMP_MIB(&tcp_mib, tcpListenDropQ0); 5475 if (tcp->tcp_debug) { 5476 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 5477 "tcp_conn_request: listen half-open queue " 5478 "(max=%d) full (%d pending) on %s", 5479 tcp_conn_req_max_q0, 5480 tcp->tcp_conn_req_cnt_q0, 5481 tcp_display(tcp, NULL, 5482 DISP_PORT_ONLY)); 5483 } 5484 goto error2; 5485 } 5486 } 5487 mutex_exit(&tcp->tcp_eager_lock); 5488 5489 /* 5490 * IP adds STRUIO_EAGER and ensures that the received packet is 5491 * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6 5492 * link local address. If IPSec is enabled, db_struioflag has 5493 * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER); 5494 * otherwise an error case if neither of them is set. 5495 */ 5496 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 5497 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5498 DB_CKSUMSTART(mp) = 0; 5499 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 5500 econnp = (conn_t *)tcp_get_conn(arg2); 5501 if (econnp == NULL) 5502 goto error2; 5503 econnp->conn_sqp = new_sqp; 5504 } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) { 5505 /* 5506 * mp is updated in tcp_get_ipsec_conn(). 5507 */ 5508 econnp = tcp_get_ipsec_conn(tcp, arg2, &mp); 5509 if (econnp == NULL) { 5510 /* 5511 * mp freed by tcp_get_ipsec_conn. 
5512 */ 5513 return; 5514 } 5515 } else { 5516 goto error2; 5517 } 5518 5519 ASSERT(DB_TYPE(mp) == M_DATA); 5520 5521 ipvers = IPH_HDR_VERSION(mp->b_rptr); 5522 ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION); 5523 ASSERT(OK_32PTR(mp->b_rptr)); 5524 if (ipvers == IPV4_VERSION) { 5525 ipha = (ipha_t *)mp->b_rptr; 5526 ip_hdr_len = IPH_HDR_LENGTH(ipha); 5527 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5528 } else { 5529 ip6h = (ip6_t *)mp->b_rptr; 5530 ip_hdr_len = ip_hdr_length_v6(mp, ip6h); 5531 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5532 } 5533 5534 if (tcp->tcp_family == AF_INET) { 5535 ASSERT(ipvers == IPV4_VERSION); 5536 err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp); 5537 } else { 5538 err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp); 5539 } 5540 5541 if (err) 5542 goto error3; 5543 5544 eager = econnp->conn_tcp; 5545 5546 /* Inherit various TCP parameters from the listener */ 5547 eager->tcp_naglim = tcp->tcp_naglim; 5548 eager->tcp_first_timer_threshold = 5549 tcp->tcp_first_timer_threshold; 5550 eager->tcp_second_timer_threshold = 5551 tcp->tcp_second_timer_threshold; 5552 5553 eager->tcp_first_ctimer_threshold = 5554 tcp->tcp_first_ctimer_threshold; 5555 eager->tcp_second_ctimer_threshold = 5556 tcp->tcp_second_ctimer_threshold; 5557 5558 /* 5559 * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics. 5560 * If it does not, the eager's receive window will be set to the 5561 * listener's receive window later in this function. 5562 */ 5563 eager->tcp_rwnd = 0; 5564 5565 /* 5566 * Inherit listener's tcp_init_cwnd. Need to do this before 5567 * calling tcp_process_options() where tcp_mss_set() is called 5568 * to set the initial cwnd. 5569 */ 5570 eager->tcp_init_cwnd = tcp->tcp_init_cwnd; 5571 5572 /* 5573 * Zones: tcp_adapt_ire() and tcp_send_data() both need the 5574 * zone id before the accept is completed in tcp_wput_accept(). 5575 */ 5576 econnp->conn_zoneid = connp->conn_zoneid; 5577 5578 /* Copy nexthop information from listener to eager */ 5579 if (connp->conn_nexthop_set) { 5580 econnp->conn_nexthop_set = connp->conn_nexthop_set; 5581 econnp->conn_nexthop_v4 = connp->conn_nexthop_v4; 5582 } 5583 5584 /* 5585 * TSOL: tsol_input_proc() needs the eager's cred before the 5586 * eager is accepted 5587 */ 5588 econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred; 5589 crhold(credp); 5590 5591 /* 5592 * If the caller has the process-wide flag set, then default to MAC 5593 * exempt mode. This allows read-down to unlabeled hosts. 
5594 */ 5595 if (getpflags(NET_MAC_AWARE, credp) != 0) 5596 econnp->conn_mac_exempt = B_TRUE; 5597 5598 if (is_system_labeled()) { 5599 cred_t *cr; 5600 5601 if (connp->conn_mlp_type != mlptSingle) { 5602 cr = econnp->conn_peercred = DB_CRED(mp); 5603 if (cr != NULL) 5604 crhold(cr); 5605 else 5606 cr = econnp->conn_cred; 5607 DTRACE_PROBE2(mlp_syn_accept, conn_t *, 5608 econnp, cred_t *, cr) 5609 } else { 5610 cr = econnp->conn_cred; 5611 DTRACE_PROBE2(syn_accept, conn_t *, 5612 econnp, cred_t *, cr) 5613 } 5614 5615 if (!tcp_update_label(eager, cr)) { 5616 DTRACE_PROBE3( 5617 tx__ip__log__error__connrequest__tcp, 5618 char *, "eager connp(1) label on SYN mp(2) failed", 5619 conn_t *, econnp, mblk_t *, mp); 5620 goto error3; 5621 } 5622 } 5623 5624 eager->tcp_hard_binding = B_TRUE; 5625 5626 tcp_bind_hash_insert(&tcp_bind_fanout[ 5627 TCP_BIND_HASH(eager->tcp_lport)], eager, 0); 5628 5629 CL_INET_CONNECT(eager); 5630 5631 /* 5632 * No need to check for multicast destination since ip will only pass 5633 * up multicasts to those that have expressed interest 5634 * TODO: what about rejecting broadcasts? 5635 * Also check that source is not a multicast or broadcast address. 5636 */ 5637 eager->tcp_state = TCPS_SYN_RCVD; 5638 5639 5640 /* 5641 * There should be no ire in the mp as we are being called after 5642 * receiving the SYN. 5643 */ 5644 ASSERT(tcp_ire_mp(mp) == NULL); 5645 5646 /* 5647 * Adapt our mss, ttl, ... according to information provided in IRE. 5648 */ 5649 5650 if (tcp_adapt_ire(eager, NULL) == 0) { 5651 /* Undo the bind_hash_insert */ 5652 tcp_bind_hash_remove(eager); 5653 goto error3; 5654 } 5655 5656 /* Process all TCP options. */ 5657 tcp_process_options(eager, tcph); 5658 5659 /* Is the other end ECN capable? */ 5660 if (tcp_ecn_permitted >= 1 && 5661 (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 5662 eager->tcp_ecn_ok = B_TRUE; 5663 } 5664 5665 /* 5666 * listener->tcp_rq->q_hiwat should be the default window size or a 5667 * window size changed via SO_RCVBUF option. First round up the 5668 * eager's tcp_rwnd to the nearest MSS. Then find out the window 5669 * scale option value if needed. Call tcp_rwnd_set() to finish the 5670 * setting. 5671 * 5672 * Note if there is a rpipe metric associated with the remote host, 5673 * we should not inherit receive window size from listener. 5674 */ 5675 eager->tcp_rwnd = MSS_ROUNDUP( 5676 (eager->tcp_rwnd == 0 ? tcp->tcp_rq->q_hiwat : 5677 eager->tcp_rwnd), eager->tcp_mss); 5678 if (eager->tcp_snd_ws_ok) 5679 tcp_set_ws_value(eager); 5680 /* 5681 * Note that this is the only place tcp_rwnd_set() is called for 5682 * accepting a connection. We need to call it here instead of 5683 * after the 3-way handshake because we need to tell the other 5684 * side our rwnd in the SYN-ACK segment. 5685 */ 5686 (void) tcp_rwnd_set(eager, eager->tcp_rwnd); 5687 5688 /* 5689 * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ 5690 * via soaccept()->soinheritoptions() which essentially applies 5691 * all the listener options to the new STREAM. The options that we 5692 * need to take care of are: 5693 * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST, 5694 * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER, 5695 * SO_SNDBUF, SO_RCVBUF. 5696 * 5697 * SO_RCVBUF: tcp_rwnd_set() above takes care of it. 5698 * SO_SNDBUF: Set the tcp_xmit_hiwater for the eager. When 5699 * tcp_maxpsz_set() gets called later from 5700 * tcp_accept_finish(), the option takes effect. 
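 *
 * Worked example of the receive window setup above (illustrative numbers
 * only, and assuming MSS_ROUNDUP() rounds up to the next multiple of the
 * MSS): with a listener q_hiwat of 49152 bytes and an eager MSS of 1460,
 * the eager's tcp_rwnd becomes 49640 (34 full segments), and that is the
 * window advertised in the SYN-ACK via tcp_rwnd_set().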
5701 * 5702 */ 5703 /* Set the TCP options */ 5704 eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater; 5705 eager->tcp_dgram_errind = tcp->tcp_dgram_errind; 5706 eager->tcp_oobinline = tcp->tcp_oobinline; 5707 eager->tcp_reuseaddr = tcp->tcp_reuseaddr; 5708 eager->tcp_broadcast = tcp->tcp_broadcast; 5709 eager->tcp_useloopback = tcp->tcp_useloopback; 5710 eager->tcp_dontroute = tcp->tcp_dontroute; 5711 eager->tcp_linger = tcp->tcp_linger; 5712 eager->tcp_lingertime = tcp->tcp_lingertime; 5713 if (tcp->tcp_ka_enabled) 5714 eager->tcp_ka_enabled = 1; 5715 5716 /* Set the IP options */ 5717 econnp->conn_broadcast = connp->conn_broadcast; 5718 econnp->conn_loopback = connp->conn_loopback; 5719 econnp->conn_dontroute = connp->conn_dontroute; 5720 econnp->conn_reuseaddr = connp->conn_reuseaddr; 5721 5722 /* Put a ref on the listener for the eager. */ 5723 CONN_INC_REF(connp); 5724 mutex_enter(&tcp->tcp_eager_lock); 5725 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 5726 eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 5727 tcp->tcp_eager_next_q0 = eager; 5728 eager->tcp_eager_prev_q0 = tcp; 5729 5730 /* Set tcp_listener before adding it to tcp_conn_fanout */ 5731 eager->tcp_listener = tcp; 5732 eager->tcp_saved_listener = tcp; 5733 5734 /* 5735 * Tag this detached tcp vector for later retrieval 5736 * by our listener client in tcp_accept(). 5737 */ 5738 eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum; 5739 tcp->tcp_conn_req_cnt_q0++; 5740 if (++tcp->tcp_conn_req_seqnum == -1) { 5741 /* 5742 * -1 is "special" and defined in TPI as something 5743 * that should never be used in T_CONN_IND 5744 */ 5745 ++tcp->tcp_conn_req_seqnum; 5746 } 5747 mutex_exit(&tcp->tcp_eager_lock); 5748 5749 if (tcp->tcp_syn_defense) { 5750 /* Don't drop the SYN that comes from a good IP source */ 5751 ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache); 5752 if (addr_cache != NULL && eager->tcp_remote == 5753 addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) { 5754 eager->tcp_dontdrop = B_TRUE; 5755 } 5756 } 5757 5758 /* 5759 * We need to insert the eager in its own perimeter but as soon 5760 * as we do that, we expose the eager to the classifier and 5761 * should not touch any field outside the eager's perimeter. 5762 * So do all the work necessary before inserting the eager 5763 * in its own perimeter. Be optimistic that ipcl_conn_insert() 5764 * will succeed but undo everything if it fails. 5765 */ 5766 seg_seq = ABE32_TO_U32(tcph->th_seq); 5767 eager->tcp_irs = seg_seq; 5768 eager->tcp_rack = seg_seq; 5769 eager->tcp_rnxt = seg_seq + 1; 5770 U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack); 5771 BUMP_MIB(&tcp_mib, tcpPassiveOpens); 5772 eager->tcp_state = TCPS_SYN_RCVD; 5773 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, 5774 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); 5775 if (mp1 == NULL) 5776 goto error1; 5777 DB_CPID(mp1) = tcp->tcp_cpid; 5778 5779 /* 5780 * We need to start the rto timer. In normal case, we start 5781 * the timer after sending the packet on the wire (or at 5782 * least believing that packet was sent by waiting for 5783 * CALL_IP_WPUT() to return). Since this is the first packet 5784 * being sent on the wire for the eager, our initial tcp_rto 5785 * is at least tcp_rexmit_interval_min which is a fairly 5786 * large value to allow the algorithm to adjust slowly to large 5787 * fluctuations of RTT during first few transmissions. 
5788 * 5789 * Starting the timer first and then sending the packet in this 5790 * case shouldn't make much difference since tcp_rexmit_interval_min 5791 * is on the order of several hundred milliseconds, and starting the timer 5792 * first and then sending the packet will result in a difference 5793 * of only a few microseconds. 5794 * 5795 * Without this optimization, we are forced to hold the fanout 5796 * lock across the ipcl_bind_insert() and sending the packet 5797 * so that we don't race against an incoming packet (maybe RST) 5798 * for this eager. 5799 */ 5800 5801 TCP_RECORD_TRACE(eager, mp1, TCP_TRACE_SEND_PKT); 5802 TCP_TIMER_RESTART(eager, eager->tcp_rto); 5803 5804 5805 /* 5806 * Insert the eager in its own perimeter now. We are ready to deal 5807 * with any packets on eager. 5808 */ 5809 if (eager->tcp_ipversion == IPV4_VERSION) { 5810 if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) { 5811 goto error; 5812 } 5813 } else { 5814 if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) { 5815 goto error; 5816 } 5817 } 5818 5819 /* mark conn as fully-bound */ 5820 econnp->conn_fully_bound = B_TRUE; 5821 5822 /* Send the SYN-ACK */ 5823 tcp_send_data(eager, eager->tcp_wq, mp1); 5824 freemsg(mp); 5825 5826 return; 5827 error: 5828 (void) TCP_TIMER_CANCEL(eager, eager->tcp_timer_tid); 5829 freemsg(mp1); 5830 error1: 5831 /* Undo what we did above */ 5832 mutex_enter(&tcp->tcp_eager_lock); 5833 tcp_eager_unlink(eager); 5834 mutex_exit(&tcp->tcp_eager_lock); 5835 /* Drop eager's reference on the listener */ 5836 CONN_DEC_REF(connp); 5837 5838 /* 5839 * Delete the cached ire in conn_ire_cache and also mark 5840 * the conn as CONDEMNED 5841 */ 5842 mutex_enter(&econnp->conn_lock); 5843 econnp->conn_state_flags |= CONN_CONDEMNED; 5844 ire = econnp->conn_ire_cache; 5845 econnp->conn_ire_cache = NULL; 5846 mutex_exit(&econnp->conn_lock); 5847 if (ire != NULL) 5848 IRE_REFRELE_NOTR(ire); 5849 5850 /* 5851 * tcp_accept_comm inserts the eager into the bind_hash; 5852 * we need to remove it from the hash if ipcl_conn_insert 5853 * fails. 5854 */ 5855 tcp_bind_hash_remove(eager); 5856 /* Drop the eager ref placed in tcp_open_detached */ 5857 CONN_DEC_REF(econnp); 5858 5859 /* 5860 * If a connection already exists, send the mp to that connection so 5861 * that it can be appropriately dealt with. 5862 */ 5863 if ((econnp = ipcl_classify(mp, connp->conn_zoneid)) != NULL) { 5864 if (!IPCL_IS_CONNECTED(econnp)) { 5865 /* 5866 * Something bad happened. ipcl_conn_insert() 5867 * failed because a connection already existed 5868 * in connected hash but we can't find it 5869 * anymore (someone blew it away). Just 5870 * free this message and hopefully the remote 5871 * will retransmit, at which time the SYN can be 5872 * treated as a new connection or answered with 5873 * a TH_RST if a connection already exists. 5874 */ 5875 freemsg(mp); 5876 } else { 5877 squeue_fill(econnp->conn_sqp, mp, tcp_input, 5878 econnp, SQTAG_TCP_CONN_REQ); 5879 } 5880 } else { 5881 /* Nobody wants this packet */ 5882 freemsg(mp); 5883 } 5884 return; 5885 error2: 5886 freemsg(mp); 5887 return; 5888 error3: 5889 CONN_DEC_REF(econnp); 5890 freemsg(mp); 5891 } 5892 5893 /* 5894 * In an ideal case of vertical partition in a NUMA architecture, it's 5895 * beneficial to have the listener and all the incoming connections 5896 * tied to the same squeue.
The other constraint is that incoming 5897 * connections should be tied to the squeue attached to the interrupted 5898 * CPU for obvious locality reasons, so this leaves the listener to 5899 * be tied to the same squeue. Our only problem is that when the listener 5900 * is binding, the CPU that will get interrupted by the NIC whose 5901 * IP address the listener is binding to is not even known. So 5902 * the code below allows us to change that binding at the time the 5903 * CPU is interrupted, by virtue of the incoming connection's squeue. 5904 * 5905 * This is useful only in the case of a listener bound to a specific IP 5906 * address. Other kinds of listeners get bound the 5907 * very first time and there is no attempt to rebind them. 5908 */ 5909 void 5910 tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) 5911 { 5912 conn_t *connp = (conn_t *)arg; 5913 squeue_t *sqp = (squeue_t *)arg2; 5914 squeue_t *new_sqp; 5915 uint32_t conn_flags; 5916 5917 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 5918 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5919 } else { 5920 goto done; 5921 } 5922 5923 if (connp->conn_fanout == NULL) 5924 goto done; 5925 5926 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) { 5927 mutex_enter(&connp->conn_fanout->connf_lock); 5928 mutex_enter(&connp->conn_lock); 5929 /* 5930 * No one from read or write side can access us now 5931 * except for already queued packets on this squeue. 5932 * But since we haven't changed the squeue yet, they 5933 * can't execute. If they are processed after we have 5934 * changed the squeue, they are sent back to the 5935 * correct squeue down below. 5936 */ 5937 if (connp->conn_sqp != new_sqp) { 5938 while (connp->conn_sqp != new_sqp) 5939 (void) casptr(&connp->conn_sqp, sqp, new_sqp); 5940 } 5941 5942 do { 5943 conn_flags = connp->conn_flags; 5944 conn_flags |= IPCL_FULLY_BOUND; 5945 (void) cas32(&connp->conn_flags, connp->conn_flags, 5946 conn_flags); 5947 } while (!(connp->conn_flags & IPCL_FULLY_BOUND)); 5948 5949 mutex_exit(&connp->conn_fanout->connf_lock); 5950 mutex_exit(&connp->conn_lock); 5951 } 5952 5953 done: 5954 if (connp->conn_sqp != sqp) { 5955 CONN_INC_REF(connp); 5956 squeue_fill(connp->conn_sqp, mp, 5957 connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND); 5958 } else { 5959 tcp_conn_request(connp, mp, sqp); 5960 } 5961 } 5962 5963 /* 5964 * Successful connect request processing begins when our client passes 5965 * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes 5966 * our T_OK_ACK reply message upstream. The control flow looks like this: 5967 * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_connect() -> IP 5968 * upstream <- tcp_rput() <- IP 5969 * After various error checks are completed, tcp_connect() lays 5970 * the target address and port into the composite header template, 5971 * preallocates the T_OK_ACK reply message, constructs a full 12-byte bind 5972 * request followed by an IRE request, and passes the three mblk message 5973 * down to IP looking like this: 5974 * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client 5975 * Processing continues in tcp_rput() when we receive the following message: 5976 * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client 5977 * After consuming the first two mblks, tcp_rput() calls tcp_timer() 5978 * to fire off the connection request, and then passes the T_OK_ACK mblk 5979 * upstream that we filled in below. There are, of course, numerous 5980 * error conditions along the way which truncate the processing described 5981 * above.
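 *
 * In terms of the code below, the three-mblk message is built roughly
 * like this (a sketch; see tcp_connect_ipv4() and tcp_connect_ipv6() for
 * the real sequence and error handling):
 *
 *	mp = mi_tpi_ok_ack_alloc(mp);			(T_OK_ACK)
 *	mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa_conn_t));
 *	linkb(mp1, mp);			(bind req --> ... --> ok ack)
 *	mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp);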
5982 */ 5983 static void 5984 tcp_connect(tcp_t *tcp, mblk_t *mp) 5985 { 5986 sin_t *sin; 5987 sin6_t *sin6; 5988 queue_t *q = tcp->tcp_wq; 5989 struct T_conn_req *tcr; 5990 ipaddr_t *dstaddrp; 5991 in_port_t dstport; 5992 uint_t srcid; 5993 5994 tcr = (struct T_conn_req *)mp->b_rptr; 5995 5996 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 5997 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 5998 tcp_err_ack(tcp, mp, TPROTO, 0); 5999 return; 6000 } 6001 6002 /* 6003 * Determine packet type based on type of address passed in 6004 * the request should contain an IPv4 or IPv6 address. 6005 * Make sure that address family matches the type of 6006 * family of the the address passed down 6007 */ 6008 switch (tcr->DEST_length) { 6009 default: 6010 tcp_err_ack(tcp, mp, TBADADDR, 0); 6011 return; 6012 6013 case (sizeof (sin_t) - sizeof (sin->sin_zero)): { 6014 /* 6015 * XXX: The check for valid DEST_length was not there 6016 * in earlier releases and some buggy 6017 * TLI apps (e.g Sybase) got away with not feeding 6018 * in sin_zero part of address. 6019 * We allow that bug to keep those buggy apps humming. 6020 * Test suites require the check on DEST_length. 6021 * We construct a new mblk with valid DEST_length 6022 * free the original so the rest of the code does 6023 * not have to keep track of this special shorter 6024 * length address case. 6025 */ 6026 mblk_t *nmp; 6027 struct T_conn_req *ntcr; 6028 sin_t *nsin; 6029 6030 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + 6031 tcr->OPT_length, BPRI_HI); 6032 if (nmp == NULL) { 6033 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 6034 return; 6035 } 6036 ntcr = (struct T_conn_req *)nmp->b_rptr; 6037 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ 6038 ntcr->PRIM_type = T_CONN_REQ; 6039 ntcr->DEST_length = sizeof (sin_t); 6040 ntcr->DEST_offset = sizeof (struct T_conn_req); 6041 6042 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); 6043 *nsin = sin_null; 6044 /* Get pointer to shorter address to copy from original mp */ 6045 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 6046 tcr->DEST_length); /* extract DEST_length worth of sin_t */ 6047 if (sin == NULL || !OK_32PTR((char *)sin)) { 6048 freemsg(nmp); 6049 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6050 return; 6051 } 6052 nsin->sin_family = sin->sin_family; 6053 nsin->sin_port = sin->sin_port; 6054 nsin->sin_addr = sin->sin_addr; 6055 /* Note:nsin->sin_zero zero-fill with sin_null assign above */ 6056 nmp->b_wptr = (uchar_t *)&nsin[1]; 6057 if (tcr->OPT_length != 0) { 6058 ntcr->OPT_length = tcr->OPT_length; 6059 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; 6060 bcopy((uchar_t *)tcr + tcr->OPT_offset, 6061 (uchar_t *)ntcr + ntcr->OPT_offset, 6062 tcr->OPT_length); 6063 nmp->b_wptr += tcr->OPT_length; 6064 } 6065 freemsg(mp); /* original mp freed */ 6066 mp = nmp; /* re-initialize original variables */ 6067 tcr = ntcr; 6068 } 6069 /* FALLTHRU */ 6070 6071 case sizeof (sin_t): 6072 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 6073 sizeof (sin_t)); 6074 if (sin == NULL || !OK_32PTR((char *)sin)) { 6075 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6076 return; 6077 } 6078 if (tcp->tcp_family != AF_INET || 6079 sin->sin_family != AF_INET) { 6080 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6081 return; 6082 } 6083 if (sin->sin_port == 0) { 6084 tcp_err_ack(tcp, mp, TBADADDR, 0); 6085 return; 6086 } 6087 if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) { 6088 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6089 return; 6090 } 6091 6092 break; 6093 6094 
case sizeof (sin6_t): 6095 sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset, 6096 sizeof (sin6_t)); 6097 if (sin6 == NULL || !OK_32PTR((char *)sin6)) { 6098 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6099 return; 6100 } 6101 if (tcp->tcp_family != AF_INET6 || 6102 sin6->sin6_family != AF_INET6) { 6103 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6104 return; 6105 } 6106 if (sin6->sin6_port == 0) { 6107 tcp_err_ack(tcp, mp, TBADADDR, 0); 6108 return; 6109 } 6110 break; 6111 } 6112 /* 6113 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we 6114 * should key on their sequence number and cut them loose. 6115 */ 6116 6117 /* 6118 * If options were passed in, feed them in for verification and handling 6119 */ 6120 if (tcr->OPT_length != 0) { 6121 mblk_t *ok_mp; 6122 mblk_t *discon_mp; 6123 mblk_t *conn_opts_mp; 6124 int t_error, sys_error, do_disconnect; 6125 6126 conn_opts_mp = NULL; 6127 6128 if (tcp_conprim_opt_process(tcp, mp, 6129 &do_disconnect, &t_error, &sys_error) < 0) { 6130 if (do_disconnect) { 6131 ASSERT(t_error == 0 && sys_error == 0); 6132 discon_mp = mi_tpi_discon_ind(NULL, 6133 ECONNREFUSED, 0); 6134 if (!discon_mp) { 6135 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 6136 TSYSERR, ENOMEM); 6137 return; 6138 } 6139 ok_mp = mi_tpi_ok_ack_alloc(mp); 6140 if (!ok_mp) { 6141 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6142 TSYSERR, ENOMEM); 6143 return; 6144 } 6145 qreply(q, ok_mp); 6146 qreply(q, discon_mp); /* no flush! */ 6147 } else { 6148 ASSERT(t_error != 0); 6149 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, 6150 sys_error); 6151 } 6152 return; 6153 } 6154 /* 6155 * Options were set successfully; the mp option buffer represented 6156 * by OPT_length/offset may have been modified and now 6157 * contains the results of option processing. We copy it into 6158 * another mp to save it, since it may influence what is returned 6159 * in the T_CONN_CON. 6160 */ 6161 if (tcr->OPT_length != 0) { /* there are resulting options */ 6162 conn_opts_mp = copyb(mp); 6163 if (!conn_opts_mp) { 6164 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 6165 TSYSERR, ENOMEM); 6166 return; 6167 } 6168 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); 6169 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; 6170 /* 6171 * Note: 6172 * The resulting option negotiation could include any 6173 * end-to-end negotiation options, but there is no such 6174 * thing (yet?) in our TCP/IP. 6175 */ 6176 } 6177 } 6178 6179 /* 6180 * If we're connecting to an IPv4-mapped IPv6 address, we need to 6181 * make sure that the template IP header in the tcp structure is an 6182 * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We 6183 * need to do this before we call tcp_bindi() so that the port lookup 6184 * code will look for ports in the correct port space (IPv4 and 6185 * IPv6 have separate port spaces).
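 *
 * For reference, an IPv4-mapped IPv6 address has the form ::ffff:a.b.c.d
 * (for example ::ffff:192.0.2.1). IN6_IS_ADDR_V4MAPPED() is the test used
 * below, and V4_PART_OF_V6() extracts the embedded IPv4 address that is
 * then handed to tcp_connect_ipv4().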
6186 */ 6187 if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION && 6188 IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 6189 int err = 0; 6190 6191 err = tcp_header_init_ipv4(tcp); 6192 if (err != 0) { 6193 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6194 goto connect_failed; 6195 } 6196 if (tcp->tcp_lport != 0) 6197 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 6198 } 6199 6200 switch (tcp->tcp_state) { 6201 case TCPS_IDLE: 6202 /* 6203 * We support quick connect, refer to comments in 6204 * tcp_connect_*() 6205 */ 6206 /* FALLTHRU */ 6207 case TCPS_BOUND: 6208 case TCPS_LISTEN: 6209 if (tcp->tcp_family == AF_INET6) { 6210 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 6211 tcp_connect_ipv6(tcp, mp, 6212 &sin6->sin6_addr, 6213 sin6->sin6_port, sin6->sin6_flowinfo, 6214 sin6->__sin6_src_id, sin6->sin6_scope_id); 6215 return; 6216 } 6217 /* 6218 * Destination adress is mapped IPv6 address. 6219 * Source bound address should be unspecified or 6220 * IPv6 mapped address as well. 6221 */ 6222 if (!IN6_IS_ADDR_UNSPECIFIED( 6223 &tcp->tcp_bound_source_v6) && 6224 !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) { 6225 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, 6226 EADDRNOTAVAIL); 6227 break; 6228 } 6229 dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); 6230 dstport = sin6->sin6_port; 6231 srcid = sin6->__sin6_src_id; 6232 } else { 6233 dstaddrp = &sin->sin_addr.s_addr; 6234 dstport = sin->sin_port; 6235 srcid = 0; 6236 } 6237 6238 tcp_connect_ipv4(tcp, mp, dstaddrp, dstport, srcid); 6239 return; 6240 default: 6241 mp = mi_tpi_err_ack_alloc(mp, TOUTSTATE, 0); 6242 break; 6243 } 6244 /* 6245 * Note: Code below is the "failure" case 6246 */ 6247 /* return error ack and blow away saved option results if any */ 6248 connect_failed: 6249 if (mp != NULL) 6250 putnext(tcp->tcp_rq, mp); 6251 else { 6252 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6253 TSYSERR, ENOMEM); 6254 } 6255 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6256 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6257 } 6258 6259 /* 6260 * Handle connect to IPv4 destinations, including connections for AF_INET6 6261 * sockets connecting to IPv4 mapped IPv6 destinations. 6262 */ 6263 static void 6264 tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, 6265 uint_t srcid) 6266 { 6267 tcph_t *tcph; 6268 mblk_t *mp1; 6269 ipaddr_t dstaddr = *dstaddrp; 6270 int32_t oldstate; 6271 uint16_t lport; 6272 6273 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 6274 6275 /* Check for attempt to connect to INADDR_ANY */ 6276 if (dstaddr == INADDR_ANY) { 6277 /* 6278 * SunOS 4.x and 4.3 BSD allow an application 6279 * to connect a TCP socket to INADDR_ANY. 6280 * When they do this, the kernel picks the 6281 * address of one interface and uses it 6282 * instead. The kernel usually ends up 6283 * picking the address of the loopback 6284 * interface. This is an undocumented feature. 6285 * However, we provide the same thing here 6286 * in order to have source and binary 6287 * compatibility with SunOS 4.x. 6288 * Update the T_CONN_REQ (sin/sin6) since it is used to 6289 * generate the T_CONN_CON. 
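 *
 * Example (illustrative only): an application that does
 *
 *	sin.sin_addr.s_addr = INADDR_ANY;	(i.e. 0.0.0.0)
 *	connect(fd, (struct sockaddr *)&sin, sizeof (sin));
 *
 * ends up connected to 127.0.0.1, exactly as it would on SunOS 4.x.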
6290 */ 6291 dstaddr = htonl(INADDR_LOOPBACK); 6292 *dstaddrp = dstaddr; 6293 } 6294 6295 /* Handle __sin6_src_id if socket not bound to an IP address */ 6296 if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) { 6297 ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6, 6298 tcp->tcp_connp->conn_zoneid); 6299 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6, 6300 tcp->tcp_ipha->ipha_src); 6301 } 6302 6303 /* 6304 * Don't let an endpoint connect to itself. Note that 6305 * the test here does not catch the case where the 6306 * source IP addr was left unspecified by the user. In 6307 * this case, the source addr is set in tcp_adapt_ire() 6308 * using the reply to the T_BIND message that we send 6309 * down to IP here and the check is repeated in tcp_rput_other. 6310 */ 6311 if (dstaddr == tcp->tcp_ipha->ipha_src && 6312 dstport == tcp->tcp_lport) { 6313 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6314 goto failed; 6315 } 6316 6317 tcp->tcp_ipha->ipha_dst = dstaddr; 6318 IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6); 6319 6320 /* 6321 * Massage a source route if any putting the first hop 6322 * in iph_dst. Compute a starting value for the checksum which 6323 * takes into account that the original iph_dst should be 6324 * included in the checksum but that ip will include the 6325 * first hop in the source route in the tcp checksum. 6326 */ 6327 tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha); 6328 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 6329 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + 6330 (tcp->tcp_ipha->ipha_dst & 0xffff)); 6331 if ((int)tcp->tcp_sum < 0) 6332 tcp->tcp_sum--; 6333 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 6334 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 6335 (tcp->tcp_sum >> 16)); 6336 tcph = tcp->tcp_tcph; 6337 *(uint16_t *)tcph->th_fport = dstport; 6338 tcp->tcp_fport = dstport; 6339 6340 oldstate = tcp->tcp_state; 6341 /* 6342 * At this point the remote destination address and remote port fields 6343 * in the tcp-four-tuple have been filled in the tcp structure. Now we 6344 * have to see which state tcp was in so we can take apropriate action. 6345 */ 6346 if (oldstate == TCPS_IDLE) { 6347 /* 6348 * We support a quick connect capability here, allowing 6349 * clients to transition directly from IDLE to SYN_SENT 6350 * tcp_bindi will pick an unused port, insert the connection 6351 * in the bind hash and transition to BOUND state. 6352 */ 6353 lport = tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); 6354 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, 6355 B_FALSE, B_FALSE); 6356 if (lport == 0) { 6357 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0); 6358 goto failed; 6359 } 6360 } 6361 tcp->tcp_state = TCPS_SYN_SENT; 6362 6363 /* 6364 * TODO: allow data with connect requests 6365 * by unlinking M_DATA trailers here and 6366 * linking them in behind the T_OK_ACK mblk. 6367 * The tcp_rput() bind ack handler would then 6368 * feed them to tcp_wput_data() rather than call 6369 * tcp_timer(). 6370 */ 6371 mp = mi_tpi_ok_ack_alloc(mp); 6372 if (!mp) { 6373 tcp->tcp_state = oldstate; 6374 goto failed; 6375 } 6376 if (tcp->tcp_family == AF_INET) { 6377 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 6378 sizeof (ipa_conn_t)); 6379 } else { 6380 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 6381 sizeof (ipa6_conn_t)); 6382 } 6383 if (mp1) { 6384 /* Hang onto the T_OK_ACK for later. 
*/ 6385 linkb(mp1, mp); 6386 mblk_setcred(mp1, tcp->tcp_cred); 6387 if (tcp->tcp_family == AF_INET) 6388 mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp); 6389 else { 6390 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, 6391 &tcp->tcp_sticky_ipp); 6392 } 6393 BUMP_MIB(&tcp_mib, tcpActiveOpens); 6394 tcp->tcp_active_open = 1; 6395 /* 6396 * If the bind cannot complete immediately 6397 * IP will arrange to call tcp_rput_other 6398 * when the bind completes. 6399 */ 6400 if (mp1 != NULL) 6401 tcp_rput_other(tcp, mp1); 6402 return; 6403 } 6404 /* Error case */ 6405 tcp->tcp_state = oldstate; 6406 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6407 6408 failed: 6409 /* return error ack and blow away saved option results if any */ 6410 if (mp != NULL) 6411 putnext(tcp->tcp_rq, mp); 6412 else { 6413 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6414 TSYSERR, ENOMEM); 6415 } 6416 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6417 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6418 6419 } 6420 6421 /* 6422 * Handle connect to IPv6 destinations. 6423 */ 6424 static void 6425 tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, 6426 in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id) 6427 { 6428 tcph_t *tcph; 6429 mblk_t *mp1; 6430 ip6_rthdr_t *rth; 6431 int32_t oldstate; 6432 uint16_t lport; 6433 6434 ASSERT(tcp->tcp_family == AF_INET6); 6435 6436 /* 6437 * If we're here, it means that the destination address is a native 6438 * IPv6 address. Return an error if tcp_ipversion is not IPv6. A 6439 * reason why it might not be IPv6 is if the socket was bound to an 6440 * IPv4-mapped IPv6 address. 6441 */ 6442 if (tcp->tcp_ipversion != IPV6_VERSION) { 6443 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6444 goto failed; 6445 } 6446 6447 /* 6448 * Interpret a zero destination to mean loopback. 6449 * Update the T_CONN_REQ (sin/sin6) since it is used to 6450 * generate the T_CONN_CON. 6451 */ 6452 if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) { 6453 *dstaddrp = ipv6_loopback; 6454 } 6455 6456 /* Handle __sin6_src_id if socket not bound to an IP address */ 6457 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { 6458 ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src, 6459 tcp->tcp_connp->conn_zoneid); 6460 tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src; 6461 } 6462 6463 /* 6464 * Take care of the scope_id now and add ip6i_t 6465 * if ip6i_t is not already allocated through TCP 6466 * sticky options. At this point tcp_ip6h does not 6467 * have dst info, thus use dstaddrp. 6468 */ 6469 if (scope_id != 0 && 6470 IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { 6471 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 6472 ip6i_t *ip6i; 6473 6474 ipp->ipp_ifindex = scope_id; 6475 ip6i = (ip6i_t *)tcp->tcp_iphc; 6476 6477 if ((ipp->ipp_fields & IPPF_HAS_IP6I) && 6478 ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) { 6479 /* Already allocated */ 6480 ip6i->ip6i_flags |= IP6I_IFINDEX; 6481 ip6i->ip6i_ifindex = ipp->ipp_ifindex; 6482 ipp->ipp_fields |= IPPF_SCOPE_ID; 6483 } else { 6484 int reterr; 6485 6486 ipp->ipp_fields |= IPPF_SCOPE_ID; 6487 if (ipp->ipp_fields & IPPF_HAS_IP6I) 6488 ip2dbg(("tcp_connect_v6: SCOPE_ID set\n")); 6489 reterr = tcp_build_hdrs(tcp->tcp_rq, tcp); 6490 if (reterr != 0) 6491 goto failed; 6492 ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n")); 6493 } 6494 } 6495 6496 /* 6497 * Don't let an endpoint connect to itself. Note that 6498 * the test here does not catch the case where the 6499 * source IP addr was left unspecified by the user. 
In 6500 * this case, the source addr is set in tcp_adapt_ire() 6501 * using the reply to the T_BIND message that we send 6502 * down to IP here and the check is repeated in tcp_rput_other. 6503 */ 6504 if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) && 6505 (dstport == tcp->tcp_lport)) { 6506 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6507 goto failed; 6508 } 6509 6510 tcp->tcp_ip6h->ip6_dst = *dstaddrp; 6511 tcp->tcp_remote_v6 = *dstaddrp; 6512 tcp->tcp_ip6h->ip6_vcf = 6513 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 6514 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 6515 6516 6517 /* 6518 * Massage a routing header (if present) putting the first hop 6519 * in ip6_dst. Compute a starting value for the checksum which 6520 * takes into account that the original ip6_dst should be 6521 * included in the checksum but that ip will include the 6522 * first hop in the source route in the tcp checksum. 6523 */ 6524 rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph); 6525 if (rth != NULL) { 6526 6527 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth); 6528 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 6529 (tcp->tcp_sum >> 16)); 6530 } else { 6531 tcp->tcp_sum = 0; 6532 } 6533 6534 tcph = tcp->tcp_tcph; 6535 *(uint16_t *)tcph->th_fport = dstport; 6536 tcp->tcp_fport = dstport; 6537 6538 oldstate = tcp->tcp_state; 6539 /* 6540 * At this point the remote destination address and remote port fields 6541 * in the tcp-four-tuple have been filled in the tcp structure. Now we 6542 * have to see which state tcp was in so we can take apropriate action. 6543 */ 6544 if (oldstate == TCPS_IDLE) { 6545 /* 6546 * We support a quick connect capability here, allowing 6547 * clients to transition directly from IDLE to SYN_SENT 6548 * tcp_bindi will pick an unused port, insert the connection 6549 * in the bind hash and transition to BOUND state. 6550 */ 6551 lport = tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); 6552 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, 6553 B_FALSE, B_FALSE); 6554 if (lport == 0) { 6555 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0); 6556 goto failed; 6557 } 6558 } 6559 tcp->tcp_state = TCPS_SYN_SENT; 6560 /* 6561 * TODO: allow data with connect requests 6562 * by unlinking M_DATA trailers here and 6563 * linking them in behind the T_OK_ACK mblk. 6564 * The tcp_rput() bind ack handler would then 6565 * feed them to tcp_wput_data() rather than call 6566 * tcp_timer(). 6567 */ 6568 mp = mi_tpi_ok_ack_alloc(mp); 6569 if (!mp) { 6570 tcp->tcp_state = oldstate; 6571 goto failed; 6572 } 6573 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t)); 6574 if (mp1) { 6575 /* Hang onto the T_OK_ACK for later. 
*/ 6576 linkb(mp1, mp); 6577 mblk_setcred(mp1, tcp->tcp_cred); 6578 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, 6579 &tcp->tcp_sticky_ipp); 6580 BUMP_MIB(&tcp_mib, tcpActiveOpens); 6581 tcp->tcp_active_open = 1; 6582 /* ip_bind_v6() may return ACK or ERROR */ 6583 if (mp1 != NULL) 6584 tcp_rput_other(tcp, mp1); 6585 return; 6586 } 6587 /* Error case */ 6588 tcp->tcp_state = oldstate; 6589 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6590 6591 failed: 6592 /* return error ack and blow away saved option results if any */ 6593 if (mp != NULL) 6594 putnext(tcp->tcp_rq, mp); 6595 else { 6596 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6597 TSYSERR, ENOMEM); 6598 } 6599 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6600 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6601 } 6602 6603 /* 6604 * We need a stream q for detached closing tcp connections 6605 * to use. Our client hereby indicates that this q is the 6606 * one to use. 6607 */ 6608 static void 6609 tcp_def_q_set(tcp_t *tcp, mblk_t *mp) 6610 { 6611 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 6612 queue_t *q = tcp->tcp_wq; 6613 6614 mp->b_datap->db_type = M_IOCACK; 6615 iocp->ioc_count = 0; 6616 mutex_enter(&tcp_g_q_lock); 6617 if (tcp_g_q != NULL) { 6618 mutex_exit(&tcp_g_q_lock); 6619 iocp->ioc_error = EALREADY; 6620 } else { 6621 mblk_t *mp1; 6622 6623 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0); 6624 if (mp1 == NULL) { 6625 mutex_exit(&tcp_g_q_lock); 6626 iocp->ioc_error = ENOMEM; 6627 } else { 6628 tcp_g_q = tcp->tcp_rq; 6629 mutex_exit(&tcp_g_q_lock); 6630 iocp->ioc_error = 0; 6631 iocp->ioc_rval = 0; 6632 /* 6633 * We are passing tcp_sticky_ipp as NULL 6634 * as it is not useful for tcp_default queue 6635 */ 6636 mp1 = ip_bind_v6(q, mp1, tcp->tcp_connp, NULL); 6637 if (mp1 != NULL) 6638 tcp_rput_other(tcp, mp1); 6639 } 6640 } 6641 qreply(q, mp); 6642 } 6643 6644 /* 6645 * Our client hereby directs us to reject the connection request 6646 * that tcp_conn_request() marked with 'seqnum'. Rejection consists 6647 * of sending the appropriate RST, not an ICMP error. 6648 */ 6649 static void 6650 tcp_disconnect(tcp_t *tcp, mblk_t *mp) 6651 { 6652 tcp_t *ltcp = NULL; 6653 t_scalar_t seqnum; 6654 conn_t *connp; 6655 6656 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 6657 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { 6658 tcp_err_ack(tcp, mp, TPROTO, 0); 6659 return; 6660 } 6661 6662 /* 6663 * Right now, upper modules pass down a T_DISCON_REQ to TCP, 6664 * when the stream is in BOUND state. Do not send a reset, 6665 * since the destination IP address is not valid, and it can 6666 * be the initialized value of all zeros (broadcast address). 6667 * 6668 * If TCP has sent down a bind request to IP and has not 6669 * received the reply, reject the request. Otherwise, TCP 6670 * will be confused. 6671 */ 6672 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) { 6673 if (tcp->tcp_debug) { 6674 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 6675 "tcp_disconnect: bad state, %d", tcp->tcp_state); 6676 } 6677 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 6678 return; 6679 } 6680 6681 seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number; 6682 6683 if (seqnum == -1 || tcp->tcp_conn_req_max == 0) { 6684 6685 /* 6686 * According to TPI, for non-listeners, ignore seqnum 6687 * and disconnect. 6688 * Following interpretation of -1 seqnum is historical 6689 * and implied TPI ? (TPI only states that for T_CONN_IND, 6690 * a valid seqnum should not be -1). 
6691 * 6692 * -1 means disconnect everything 6693 * regardless even on a listener. 6694 */ 6695 6696 int old_state = tcp->tcp_state; 6697 6698 /* 6699 * The connection can't be on the tcp_time_wait_head list 6700 * since it is not detached. 6701 */ 6702 ASSERT(tcp->tcp_time_wait_next == NULL); 6703 ASSERT(tcp->tcp_time_wait_prev == NULL); 6704 ASSERT(tcp->tcp_time_wait_expire == 0); 6705 ltcp = NULL; 6706 /* 6707 * If it used to be a listener, check to make sure no one else 6708 * has taken the port before switching back to LISTEN state. 6709 */ 6710 if (tcp->tcp_ipversion == IPV4_VERSION) { 6711 connp = ipcl_lookup_listener_v4(tcp->tcp_lport, 6712 tcp->tcp_ipha->ipha_src, 6713 tcp->tcp_connp->conn_zoneid); 6714 if (connp != NULL) 6715 ltcp = connp->conn_tcp; 6716 } else { 6717 /* Allow tcp_bound_if listeners? */ 6718 connp = ipcl_lookup_listener_v6(tcp->tcp_lport, 6719 &tcp->tcp_ip6h->ip6_src, 0, 6720 tcp->tcp_connp->conn_zoneid); 6721 if (connp != NULL) 6722 ltcp = connp->conn_tcp; 6723 } 6724 if (tcp->tcp_conn_req_max && ltcp == NULL) { 6725 tcp->tcp_state = TCPS_LISTEN; 6726 } else if (old_state > TCPS_BOUND) { 6727 tcp->tcp_conn_req_max = 0; 6728 tcp->tcp_state = TCPS_BOUND; 6729 } 6730 if (ltcp != NULL) 6731 CONN_DEC_REF(ltcp->tcp_connp); 6732 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) { 6733 BUMP_MIB(&tcp_mib, tcpAttemptFails); 6734 } else if (old_state == TCPS_ESTABLISHED || 6735 old_state == TCPS_CLOSE_WAIT) { 6736 BUMP_MIB(&tcp_mib, tcpEstabResets); 6737 } 6738 6739 if (tcp->tcp_fused) 6740 tcp_unfuse(tcp); 6741 6742 mutex_enter(&tcp->tcp_eager_lock); 6743 if ((tcp->tcp_conn_req_cnt_q0 != 0) || 6744 (tcp->tcp_conn_req_cnt_q != 0)) { 6745 tcp_eager_cleanup(tcp, 0); 6746 } 6747 mutex_exit(&tcp->tcp_eager_lock); 6748 6749 tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt, 6750 tcp->tcp_rnxt, TH_RST | TH_ACK); 6751 6752 tcp_reinit(tcp); 6753 6754 if (old_state >= TCPS_ESTABLISHED) { 6755 /* Send M_FLUSH according to TPI */ 6756 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 6757 } 6758 mp = mi_tpi_ok_ack_alloc(mp); 6759 if (mp) 6760 putnext(tcp->tcp_rq, mp); 6761 return; 6762 } else if (!tcp_eager_blowoff(tcp, seqnum)) { 6763 tcp_err_ack(tcp, mp, TBADSEQ, 0); 6764 return; 6765 } 6766 if (tcp->tcp_state >= TCPS_ESTABLISHED) { 6767 /* Send M_FLUSH according to TPI */ 6768 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 6769 } 6770 mp = mi_tpi_ok_ack_alloc(mp); 6771 if (mp) 6772 putnext(tcp->tcp_rq, mp); 6773 } 6774 6775 /* 6776 * Diagnostic routine used to return a string associated with the tcp state. 6777 * Note that if the caller does not supply a buffer, it will use an internal 6778 * static string. This means that if multiple threads call this function at 6779 * the same time, output can be corrupted... Note also that this function 6780 * does not check the size of the supplied buffer. The caller has to make 6781 * sure that it is big enough. 
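 *
 * Example use (a sketch; DISP_ADDR_AND_PORT output needs a buffer at
 * least as large as the internal one, i.e. INET6_ADDRSTRLEN * 2 + 80
 * bytes):
 *
 *	char dbuf[INET6_ADDRSTRLEN * 2 + 80];
 *
 *	(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "conn: %s",
 *	    tcp_display(tcp, dbuf, DISP_ADDR_AND_PORT));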
6782 */ 6783 static char * 6784 tcp_display(tcp_t *tcp, char *sup_buf, char format) 6785 { 6786 char buf1[30]; 6787 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; 6788 char *buf; 6789 char *cp; 6790 in6_addr_t local, remote; 6791 char local_addrbuf[INET6_ADDRSTRLEN]; 6792 char remote_addrbuf[INET6_ADDRSTRLEN]; 6793 6794 if (sup_buf != NULL) 6795 buf = sup_buf; 6796 else 6797 buf = priv_buf; 6798 6799 if (tcp == NULL) 6800 return ("NULL_TCP"); 6801 switch (tcp->tcp_state) { 6802 case TCPS_CLOSED: 6803 cp = "TCP_CLOSED"; 6804 break; 6805 case TCPS_IDLE: 6806 cp = "TCP_IDLE"; 6807 break; 6808 case TCPS_BOUND: 6809 cp = "TCP_BOUND"; 6810 break; 6811 case TCPS_LISTEN: 6812 cp = "TCP_LISTEN"; 6813 break; 6814 case TCPS_SYN_SENT: 6815 cp = "TCP_SYN_SENT"; 6816 break; 6817 case TCPS_SYN_RCVD: 6818 cp = "TCP_SYN_RCVD"; 6819 break; 6820 case TCPS_ESTABLISHED: 6821 cp = "TCP_ESTABLISHED"; 6822 break; 6823 case TCPS_CLOSE_WAIT: 6824 cp = "TCP_CLOSE_WAIT"; 6825 break; 6826 case TCPS_FIN_WAIT_1: 6827 cp = "TCP_FIN_WAIT_1"; 6828 break; 6829 case TCPS_CLOSING: 6830 cp = "TCP_CLOSING"; 6831 break; 6832 case TCPS_LAST_ACK: 6833 cp = "TCP_LAST_ACK"; 6834 break; 6835 case TCPS_FIN_WAIT_2: 6836 cp = "TCP_FIN_WAIT_2"; 6837 break; 6838 case TCPS_TIME_WAIT: 6839 cp = "TCP_TIME_WAIT"; 6840 break; 6841 default: 6842 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 6843 cp = buf1; 6844 break; 6845 } 6846 switch (format) { 6847 case DISP_ADDR_AND_PORT: 6848 if (tcp->tcp_ipversion == IPV4_VERSION) { 6849 /* 6850 * Note that we use the remote address in the tcp_b 6851 * structure. This means that it will print out 6852 * the real destination address, not the next hop's 6853 * address if source routing is used. 6854 */ 6855 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local); 6856 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote); 6857 6858 } else { 6859 local = tcp->tcp_ip_src_v6; 6860 remote = tcp->tcp_remote_v6; 6861 } 6862 (void) inet_ntop(AF_INET6, &local, local_addrbuf, 6863 sizeof (local_addrbuf)); 6864 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, 6865 sizeof (remote_addrbuf)); 6866 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", 6867 local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, 6868 ntohs(tcp->tcp_fport), cp); 6869 break; 6870 case DISP_PORT_ONLY: 6871 default: 6872 (void) mi_sprintf(buf, "[%u, %u] %s", 6873 ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); 6874 break; 6875 } 6876 6877 return (buf); 6878 } 6879 6880 /* 6881 * Called via squeue to get on to eager's perimeter to send a 6882 * TH_RST. The listener wants the eager to disappear either 6883 * by means of tcp_eager_blowoff() or tcp_eager_cleanup() 6884 * being called. 6885 */ 6886 /* ARGSUSED */ 6887 void 6888 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) 6889 { 6890 conn_t *econnp = (conn_t *)arg; 6891 tcp_t *eager = econnp->conn_tcp; 6892 tcp_t *listener = eager->tcp_listener; 6893 6894 /* 6895 * We could be called because listener is closing. Since 6896 * the eager is using listener's queue's, its not safe. 6897 * Better use the default queue just to send the TH_RST 6898 * out. 
6899 */ 6900 eager->tcp_rq = tcp_g_q; 6901 eager->tcp_wq = WR(tcp_g_q); 6902 6903 if (eager->tcp_state > TCPS_LISTEN) { 6904 tcp_xmit_ctl("tcp_eager_kill, can't wait", 6905 eager, eager->tcp_snxt, 0, TH_RST); 6906 } 6907 6908 /* We are here because listener wants this eager gone */ 6909 if (listener != NULL) { 6910 mutex_enter(&listener->tcp_eager_lock); 6911 tcp_eager_unlink(eager); 6912 if (eager->tcp_conn.tcp_eager_conn_ind == NULL) { 6913 /* 6914 * The eager has sent a conn_ind up to the 6915 * listener but listener decides to close 6916 * instead. We need to drop the extra ref 6917 * placed on eager in tcp_rput_data() before 6918 * sending the conn_ind to listener. 6919 */ 6920 CONN_DEC_REF(econnp); 6921 } 6922 mutex_exit(&listener->tcp_eager_lock); 6923 CONN_DEC_REF(listener->tcp_connp); 6924 } 6925 6926 if (eager->tcp_state > TCPS_BOUND) 6927 tcp_close_detached(eager); 6928 } 6929 6930 /* 6931 * Reset any eager connection hanging off this listener marked 6932 * with 'seqnum' and then reclaim it's resources. 6933 */ 6934 static boolean_t 6935 tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) 6936 { 6937 tcp_t *eager; 6938 mblk_t *mp; 6939 6940 TCP_STAT(tcp_eager_blowoff_calls); 6941 eager = listener; 6942 mutex_enter(&listener->tcp_eager_lock); 6943 do { 6944 eager = eager->tcp_eager_next_q; 6945 if (eager == NULL) { 6946 mutex_exit(&listener->tcp_eager_lock); 6947 return (B_FALSE); 6948 } 6949 } while (eager->tcp_conn_req_seqnum != seqnum); 6950 CONN_INC_REF(eager->tcp_connp); 6951 mutex_exit(&listener->tcp_eager_lock); 6952 mp = &eager->tcp_closemp; 6953 squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, 6954 eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF); 6955 return (B_TRUE); 6956 } 6957 6958 /* 6959 * Reset any eager connection hanging off this listener 6960 * and then reclaim it's resources. 6961 */ 6962 static void 6963 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) 6964 { 6965 tcp_t *eager; 6966 mblk_t *mp; 6967 6968 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 6969 6970 if (!q0_only) { 6971 /* First cleanup q */ 6972 TCP_STAT(tcp_eager_blowoff_q); 6973 eager = listener->tcp_eager_next_q; 6974 while (eager != NULL) { 6975 CONN_INC_REF(eager->tcp_connp); 6976 mp = &eager->tcp_closemp; 6977 squeue_fill(eager->tcp_connp->conn_sqp, mp, 6978 tcp_eager_kill, eager->tcp_connp, 6979 SQTAG_TCP_EAGER_CLEANUP); 6980 eager = eager->tcp_eager_next_q; 6981 } 6982 } 6983 /* Then cleanup q0 */ 6984 TCP_STAT(tcp_eager_blowoff_q0); 6985 eager = listener->tcp_eager_next_q0; 6986 while (eager != listener) { 6987 CONN_INC_REF(eager->tcp_connp); 6988 mp = &eager->tcp_closemp; 6989 squeue_fill(eager->tcp_connp->conn_sqp, mp, 6990 tcp_eager_kill, eager->tcp_connp, 6991 SQTAG_TCP_EAGER_CLEANUP_Q0); 6992 eager = eager->tcp_eager_next_q0; 6993 } 6994 } 6995 6996 /* 6997 * If we are an eager connection hanging off a listener that hasn't 6998 * formally accepted the connection yet, get off his list and blow off 6999 * any data that we have accumulated. 
7000 */ 7001 static void 7002 tcp_eager_unlink(tcp_t *tcp) 7003 { 7004 tcp_t *listener = tcp->tcp_listener; 7005 7006 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 7007 ASSERT(listener != NULL); 7008 if (tcp->tcp_eager_next_q0 != NULL) { 7009 ASSERT(tcp->tcp_eager_prev_q0 != NULL); 7010 7011 /* Remove the eager tcp from q0 */ 7012 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 7013 tcp->tcp_eager_prev_q0; 7014 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 7015 tcp->tcp_eager_next_q0; 7016 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 7017 listener->tcp_conn_req_cnt_q0--; 7018 7019 tcp->tcp_eager_next_q0 = NULL; 7020 tcp->tcp_eager_prev_q0 = NULL; 7021 7022 if (tcp->tcp_syn_rcvd_timeout != 0) { 7023 /* we have timed out before */ 7024 ASSERT(listener->tcp_syn_rcvd_timeout > 0); 7025 listener->tcp_syn_rcvd_timeout--; 7026 } 7027 } else { 7028 tcp_t **tcpp = &listener->tcp_eager_next_q; 7029 tcp_t *prev = NULL; 7030 7031 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 7032 if (tcpp[0] == tcp) { 7033 if (listener->tcp_eager_last_q == tcp) { 7034 /* 7035 * If we are unlinking the last 7036 * element on the list, adjust 7037 * tail pointer. Set tail pointer 7038 * to nil when list is empty. 7039 */ 7040 ASSERT(tcp->tcp_eager_next_q == NULL); 7041 if (listener->tcp_eager_last_q == 7042 listener->tcp_eager_next_q) { 7043 listener->tcp_eager_last_q = 7044 NULL; 7045 } else { 7046 /* 7047 * We won't get here if there 7048 * is only one eager in the 7049 * list. 7050 */ 7051 ASSERT(prev != NULL); 7052 listener->tcp_eager_last_q = 7053 prev; 7054 } 7055 } 7056 tcpp[0] = tcp->tcp_eager_next_q; 7057 tcp->tcp_eager_next_q = NULL; 7058 tcp->tcp_eager_last_q = NULL; 7059 ASSERT(listener->tcp_conn_req_cnt_q > 0); 7060 listener->tcp_conn_req_cnt_q--; 7061 break; 7062 } 7063 prev = tcpp[0]; 7064 } 7065 } 7066 tcp->tcp_listener = NULL; 7067 } 7068 7069 /* Shorthand to generate and send TPI error acks to our client */ 7070 static void 7071 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) 7072 { 7073 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 7074 putnext(tcp->tcp_rq, mp); 7075 } 7076 7077 /* Shorthand to generate and send TPI error acks to our client */ 7078 static void 7079 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 7080 int t_error, int sys_error) 7081 { 7082 struct T_error_ack *teackp; 7083 7084 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 7085 M_PCPROTO, T_ERROR_ACK)) != NULL) { 7086 teackp = (struct T_error_ack *)mp->b_rptr; 7087 teackp->ERROR_prim = primitive; 7088 teackp->TLI_error = t_error; 7089 teackp->UNIX_error = sys_error; 7090 putnext(tcp->tcp_rq, mp); 7091 } 7092 } 7093 7094 /* 7095 * Note: No locks are held when inspecting tcp_g_*epriv_ports 7096 * but instead the code relies on: 7097 * - the fact that the address of the array and its size never changes 7098 * - the atomic assignment of the elements of the array 7099 */ 7100 /* ARGSUSED */ 7101 static int 7102 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 7103 { 7104 int i; 7105 7106 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7107 if (tcp_g_epriv_ports[i] != 0) 7108 (void) mi_mpprintf(mp, "%d ", tcp_g_epriv_ports[i]); 7109 } 7110 return (0); 7111 } 7112 7113 /* 7114 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple 7115 * threads from changing it at the same time. 
7116 */ 7117 /* ARGSUSED */ 7118 static int 7119 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 7120 cred_t *cr) 7121 { 7122 long new_value; 7123 int i; 7124 7125 /* 7126 * Fail the request if the new value does not lie within the 7127 * port number limits. 7128 */ 7129 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 7130 new_value <= 0 || new_value >= 65536) { 7131 return (EINVAL); 7132 } 7133 7134 mutex_enter(&tcp_epriv_port_lock); 7135 /* Check if the value is already in the list */ 7136 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7137 if (new_value == tcp_g_epriv_ports[i]) { 7138 mutex_exit(&tcp_epriv_port_lock); 7139 return (EEXIST); 7140 } 7141 } 7142 /* Find an empty slot */ 7143 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7144 if (tcp_g_epriv_ports[i] == 0) 7145 break; 7146 } 7147 if (i == tcp_g_num_epriv_ports) { 7148 mutex_exit(&tcp_epriv_port_lock); 7149 return (EOVERFLOW); 7150 } 7151 /* Set the new value */ 7152 tcp_g_epriv_ports[i] = (uint16_t)new_value; 7153 mutex_exit(&tcp_epriv_port_lock); 7154 return (0); 7155 } 7156 7157 /* 7158 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple 7159 * threads from changing it at the same time. 7160 */ 7161 /* ARGSUSED */ 7162 static int 7163 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 7164 cred_t *cr) 7165 { 7166 long new_value; 7167 int i; 7168 7169 /* 7170 * Fail the request if the new value does not lie within the 7171 * port number limits. 7172 */ 7173 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 || 7174 new_value >= 65536) { 7175 return (EINVAL); 7176 } 7177 7178 mutex_enter(&tcp_epriv_port_lock); 7179 /* Check that the value is already in the list */ 7180 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7181 if (tcp_g_epriv_ports[i] == new_value) 7182 break; 7183 } 7184 if (i == tcp_g_num_epriv_ports) { 7185 mutex_exit(&tcp_epriv_port_lock); 7186 return (ESRCH); 7187 } 7188 /* Clear the value */ 7189 tcp_g_epriv_ports[i] = 0; 7190 mutex_exit(&tcp_epriv_port_lock); 7191 return (0); 7192 } 7193 7194 /* Return the TPI/TLI equivalent of our current tcp_state */ 7195 static int 7196 tcp_tpistate(tcp_t *tcp) 7197 { 7198 switch (tcp->tcp_state) { 7199 case TCPS_IDLE: 7200 return (TS_UNBND); 7201 case TCPS_LISTEN: 7202 /* 7203 * Return whether there are outstanding T_CONN_IND waiting 7204 * for the matching T_CONN_RES. Therefore don't count q0. 7205 */ 7206 if (tcp->tcp_conn_req_cnt_q > 0) 7207 return (TS_WRES_CIND); 7208 else 7209 return (TS_IDLE); 7210 case TCPS_BOUND: 7211 return (TS_IDLE); 7212 case TCPS_SYN_SENT: 7213 return (TS_WCON_CREQ); 7214 case TCPS_SYN_RCVD: 7215 /* 7216 * Note: assumption: this has to the active open SYN_RCVD. 7217 * The passive instance is detached in SYN_RCVD stage of 7218 * incoming connection processing so we cannot get request 7219 * for T_info_ack on it. 7220 */ 7221 return (TS_WACK_CRES); 7222 case TCPS_ESTABLISHED: 7223 return (TS_DATA_XFER); 7224 case TCPS_CLOSE_WAIT: 7225 return (TS_WREQ_ORDREL); 7226 case TCPS_FIN_WAIT_1: 7227 return (TS_WIND_ORDREL); 7228 case TCPS_FIN_WAIT_2: 7229 return (TS_WIND_ORDREL); 7230 7231 case TCPS_CLOSING: 7232 case TCPS_LAST_ACK: 7233 case TCPS_TIME_WAIT: 7234 case TCPS_CLOSED: 7235 /* 7236 * Following TS_WACK_DREQ7 is a rendition of "not 7237 * yet TS_IDLE" TPI state. 
There is no best match to any 7238 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we 7239 * choose a value that will map to TLI/XTI level 7240 * state of TSTATECHNG (state is in the process of changing) which 7241 * captures what this dummy state represents. 7242 */ 7243 return (TS_WACK_DREQ7); 7244 default: 7245 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", 7246 tcp->tcp_state, tcp_display(tcp, NULL, 7247 DISP_PORT_ONLY)); 7248 return (TS_UNBND); 7249 } 7250 } 7251 7252 static void 7253 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) 7254 { 7255 if (tcp->tcp_family == AF_INET6) 7256 *tia = tcp_g_t_info_ack_v6; 7257 else 7258 *tia = tcp_g_t_info_ack; 7259 tia->CURRENT_state = tcp_tpistate(tcp); 7260 tia->OPT_size = tcp_max_optsize; 7261 if (tcp->tcp_mss == 0) { 7262 /* Not yet set - tcp_open does not set mss */ 7263 if (tcp->tcp_ipversion == IPV4_VERSION) 7264 tia->TIDU_size = tcp_mss_def_ipv4; 7265 else 7266 tia->TIDU_size = tcp_mss_def_ipv6; 7267 } else { 7268 tia->TIDU_size = tcp->tcp_mss; 7269 } 7270 /* TODO: Default ETSDU is 1. Is that correct for tcp? */ 7271 } 7272 7273 /* 7274 * This routine responds to T_CAPABILITY_REQ messages. It is called by 7275 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from 7276 * tcp_g_t_info_ack. The current state of the stream is copied from 7277 * tcp_state. 7278 */ 7279 static void 7280 tcp_capability_req(tcp_t *tcp, mblk_t *mp) 7281 { 7282 t_uscalar_t cap_bits1; 7283 struct T_capability_ack *tcap; 7284 7285 if (MBLKL(mp) < sizeof (struct T_capability_req)) { 7286 freemsg(mp); 7287 return; 7288 } 7289 7290 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 7291 7292 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 7293 mp->b_datap->db_type, T_CAPABILITY_ACK); 7294 if (mp == NULL) 7295 return; 7296 7297 tcap = (struct T_capability_ack *)mp->b_rptr; 7298 tcap->CAP_bits1 = 0; 7299 7300 if (cap_bits1 & TC1_INFO) { 7301 tcp_copy_info(&tcap->INFO_ack, tcp); 7302 tcap->CAP_bits1 |= TC1_INFO; 7303 } 7304 7305 if (cap_bits1 & TC1_ACCEPTOR_ID) { 7306 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; 7307 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; 7308 } 7309 7310 putnext(tcp->tcp_rq, mp); 7311 } 7312 7313 /* 7314 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. 7315 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. 7316 * The current state of the stream is copied from tcp_state.
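 * (For context, and as an illustration rather than a statement about every
 * caller: a TLI/XTI client typically reaches this path indirectly, e.g. a
 * t_getinfo(3NSL) call is turned into a downstream T_INFO_REQ by timod,
 * and the T_INFO_ACK built below supplies the fields of the caller's
 * struct t_info.)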
7317 */ 7318 static void 7319 tcp_info_req(tcp_t *tcp, mblk_t *mp) 7320 { 7321 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 7322 T_INFO_ACK); 7323 if (!mp) { 7324 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 7325 return; 7326 } 7327 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); 7328 putnext(tcp->tcp_rq, mp); 7329 } 7330 7331 /* Respond to the TPI addr request */ 7332 static void 7333 tcp_addr_req(tcp_t *tcp, mblk_t *mp) 7334 { 7335 sin_t *sin; 7336 mblk_t *ackmp; 7337 struct T_addr_ack *taa; 7338 7339 /* Make it large enough for worst case */ 7340 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 7341 2 * sizeof (sin6_t), 1); 7342 if (ackmp == NULL) { 7343 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 7344 return; 7345 } 7346 7347 if (tcp->tcp_ipversion == IPV6_VERSION) { 7348 tcp_addr_req_ipv6(tcp, ackmp); 7349 return; 7350 } 7351 taa = (struct T_addr_ack *)ackmp->b_rptr; 7352 7353 bzero(taa, sizeof (struct T_addr_ack)); 7354 ackmp->b_wptr = (uchar_t *)&taa[1]; 7355 7356 taa->PRIM_type = T_ADDR_ACK; 7357 ackmp->b_datap->db_type = M_PCPROTO; 7358 7359 /* 7360 * Note: Following code assumes 32 bit alignment of basic 7361 * data structures like sin_t and struct T_addr_ack. 7362 */ 7363 if (tcp->tcp_state >= TCPS_BOUND) { 7364 /* 7365 * Fill in local address 7366 */ 7367 taa->LOCADDR_length = sizeof (sin_t); 7368 taa->LOCADDR_offset = sizeof (*taa); 7369 7370 sin = (sin_t *)&taa[1]; 7371 7372 /* Fill zeroes and then intialize non-zero fields */ 7373 *sin = sin_null; 7374 7375 sin->sin_family = AF_INET; 7376 7377 sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; 7378 sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport; 7379 7380 ackmp->b_wptr = (uchar_t *)&sin[1]; 7381 7382 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 7383 /* 7384 * Fill in Remote address 7385 */ 7386 taa->REMADDR_length = sizeof (sin_t); 7387 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + 7388 taa->LOCADDR_length); 7389 7390 sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset); 7391 *sin = sin_null; 7392 sin->sin_family = AF_INET; 7393 sin->sin_addr.s_addr = tcp->tcp_remote; 7394 sin->sin_port = tcp->tcp_fport; 7395 7396 ackmp->b_wptr = (uchar_t *)&sin[1]; 7397 } 7398 } 7399 putnext(tcp->tcp_rq, ackmp); 7400 } 7401 7402 /* Assumes that tcp_addr_req gets enough space and alignment */ 7403 static void 7404 tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp) 7405 { 7406 sin6_t *sin6; 7407 struct T_addr_ack *taa; 7408 7409 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 7410 ASSERT(OK_32PTR(ackmp->b_rptr)); 7411 ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) + 7412 2 * sizeof (sin6_t)); 7413 7414 taa = (struct T_addr_ack *)ackmp->b_rptr; 7415 7416 bzero(taa, sizeof (struct T_addr_ack)); 7417 ackmp->b_wptr = (uchar_t *)&taa[1]; 7418 7419 taa->PRIM_type = T_ADDR_ACK; 7420 ackmp->b_datap->db_type = M_PCPROTO; 7421 7422 /* 7423 * Note: Following code assumes 32 bit alignment of basic 7424 * data structures like sin6_t and struct T_addr_ack. 
7425 */ 7426 if (tcp->tcp_state >= TCPS_BOUND) { 7427 /* 7428 * Fill in local address 7429 */ 7430 taa->LOCADDR_length = sizeof (sin6_t); 7431 taa->LOCADDR_offset = sizeof (*taa); 7432 7433 sin6 = (sin6_t *)&taa[1]; 7434 *sin6 = sin6_null; 7435 7436 sin6->sin6_family = AF_INET6; 7437 sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; 7438 sin6->sin6_port = tcp->tcp_lport; 7439 7440 ackmp->b_wptr = (uchar_t *)&sin6[1]; 7441 7442 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 7443 /* 7444 * Fill in Remote address 7445 */ 7446 taa->REMADDR_length = sizeof (sin6_t); 7447 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + 7448 taa->LOCADDR_length); 7449 7450 sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset); 7451 *sin6 = sin6_null; 7452 sin6->sin6_family = AF_INET6; 7453 sin6->sin6_flowinfo = 7454 tcp->tcp_ip6h->ip6_vcf & 7455 ~IPV6_VERS_AND_FLOW_MASK; 7456 sin6->sin6_addr = tcp->tcp_remote_v6; 7457 sin6->sin6_port = tcp->tcp_fport; 7458 7459 ackmp->b_wptr = (uchar_t *)&sin6[1]; 7460 } 7461 } 7462 putnext(tcp->tcp_rq, ackmp); 7463 } 7464 7465 /* 7466 * Handle reinitialization of a tcp structure. 7467 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE. 7468 */ 7469 static void 7470 tcp_reinit(tcp_t *tcp) 7471 { 7472 mblk_t *mp; 7473 int err; 7474 7475 TCP_STAT(tcp_reinit_calls); 7476 7477 /* tcp_reinit should never be called for detached tcp_t's */ 7478 ASSERT(tcp->tcp_listener == NULL); 7479 ASSERT((tcp->tcp_family == AF_INET && 7480 tcp->tcp_ipversion == IPV4_VERSION) || 7481 (tcp->tcp_family == AF_INET6 && 7482 (tcp->tcp_ipversion == IPV4_VERSION || 7483 tcp->tcp_ipversion == IPV6_VERSION))); 7484 7485 /* Cancel outstanding timers */ 7486 tcp_timers_stop(tcp); 7487 7488 /* 7489 * Reset everything in the state vector, after updating global 7490 * MIB data from instance counters. 7491 */ 7492 UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs); 7493 tcp->tcp_ibsegs = 0; 7494 UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs); 7495 tcp->tcp_obsegs = 0; 7496 7497 tcp_close_mpp(&tcp->tcp_xmit_head); 7498 if (tcp->tcp_snd_zcopy_aware) 7499 tcp_zcopy_notify(tcp); 7500 tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL; 7501 tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; 7502 if (tcp->tcp_flow_stopped && 7503 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 7504 tcp_clrqfull(tcp); 7505 } 7506 tcp_close_mpp(&tcp->tcp_reass_head); 7507 tcp->tcp_reass_tail = NULL; 7508 if (tcp->tcp_rcv_list != NULL) { 7509 /* Free b_next chain */ 7510 tcp_close_mpp(&tcp->tcp_rcv_list); 7511 tcp->tcp_rcv_last_head = NULL; 7512 tcp->tcp_rcv_last_tail = NULL; 7513 tcp->tcp_rcv_cnt = 0; 7514 } 7515 tcp->tcp_rcv_last_tail = NULL; 7516 7517 if ((mp = tcp->tcp_urp_mp) != NULL) { 7518 freemsg(mp); 7519 tcp->tcp_urp_mp = NULL; 7520 } 7521 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 7522 freemsg(mp); 7523 tcp->tcp_urp_mark_mp = NULL; 7524 } 7525 if (tcp->tcp_fused_sigurg_mp != NULL) { 7526 freeb(tcp->tcp_fused_sigurg_mp); 7527 tcp->tcp_fused_sigurg_mp = NULL; 7528 } 7529 7530 /* 7531 * Following is a union with two members which are 7532 * identical types and size so the following cleanup 7533 * is enough. 7534 */ 7535 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 7536 7537 CL_INET_DISCONNECT(tcp); 7538 7539 /* 7540 * The connection can't be on the tcp_time_wait_head list 7541 * since it is not detached. 
7542 */ 7543 ASSERT(tcp->tcp_time_wait_next == NULL); 7544 ASSERT(tcp->tcp_time_wait_prev == NULL); 7545 ASSERT(tcp->tcp_time_wait_expire == 0); 7546 7547 if (tcp->tcp_kssl_pending) { 7548 tcp->tcp_kssl_pending = B_FALSE; 7549 7550 /* Don't reset if the initialized by bind. */ 7551 if (tcp->tcp_kssl_ent != NULL) { 7552 kssl_release_ent(tcp->tcp_kssl_ent, NULL, 7553 KSSL_NO_PROXY); 7554 } 7555 } 7556 if (tcp->tcp_kssl_ctx != NULL) { 7557 kssl_release_ctx(tcp->tcp_kssl_ctx); 7558 tcp->tcp_kssl_ctx = NULL; 7559 } 7560 7561 /* 7562 * Reset/preserve other values 7563 */ 7564 tcp_reinit_values(tcp); 7565 ipcl_hash_remove(tcp->tcp_connp); 7566 conn_delete_ire(tcp->tcp_connp, NULL); 7567 7568 if (tcp->tcp_conn_req_max != 0) { 7569 /* 7570 * This is the case when a TLI program uses the same 7571 * transport end point to accept a connection. This 7572 * makes the TCP both a listener and acceptor. When 7573 * this connection is closed, we need to set the state 7574 * back to TCPS_LISTEN. Make sure that the eager list 7575 * is reinitialized. 7576 * 7577 * Note that this stream is still bound to the four 7578 * tuples of the previous connection in IP. If a new 7579 * SYN with different foreign address comes in, IP will 7580 * not find it and will send it to the global queue. In 7581 * the global queue, TCP will do a tcp_lookup_listener() 7582 * to find this stream. This works because this stream 7583 * is only removed from connected hash. 7584 * 7585 */ 7586 tcp->tcp_state = TCPS_LISTEN; 7587 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 7588 tcp->tcp_connp->conn_recv = tcp_conn_request; 7589 if (tcp->tcp_family == AF_INET6) { 7590 ASSERT(tcp->tcp_connp->conn_af_isv6); 7591 (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP, 7592 &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport); 7593 } else { 7594 ASSERT(!tcp->tcp_connp->conn_af_isv6); 7595 (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP, 7596 tcp->tcp_ipha->ipha_src, tcp->tcp_lport); 7597 } 7598 } else { 7599 tcp->tcp_state = TCPS_BOUND; 7600 } 7601 7602 /* 7603 * Initialize to default values 7604 * Can't fail since enough header template space already allocated 7605 * at open(). 7606 */ 7607 err = tcp_init_values(tcp); 7608 ASSERT(err == 0); 7609 /* Restore state in tcp_tcph */ 7610 bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN); 7611 if (tcp->tcp_ipversion == IPV4_VERSION) 7612 tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source; 7613 else 7614 tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6; 7615 /* 7616 * Copy of the src addr. in tcp_t is needed in tcp_t 7617 * since the lookup funcs can only lookup on tcp_t 7618 */ 7619 tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6; 7620 7621 ASSERT(tcp->tcp_ptpbhn != NULL); 7622 tcp->tcp_rq->q_hiwat = tcp_recv_hiwat; 7623 tcp->tcp_rwnd = tcp_recv_hiwat; 7624 tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ? 7625 tcp_mss_def_ipv6 : tcp_mss_def_ipv4; 7626 } 7627 7628 /* 7629 * Force values to zero that need be zero. 7630 * Do not touch values asociated with the BOUND or LISTEN state 7631 * since the connection will end up in that state after the reinit. 7632 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t 7633 * structure! 
7634 */ 7635 static void 7636 tcp_reinit_values(tcp) 7637 tcp_t *tcp; 7638 { 7639 #ifndef lint 7640 #define DONTCARE(x) 7641 #define PRESERVE(x) 7642 #else 7643 #define DONTCARE(x) ((x) = (x)) 7644 #define PRESERVE(x) ((x) = (x)) 7645 #endif /* lint */ 7646 7647 PRESERVE(tcp->tcp_bind_hash); 7648 PRESERVE(tcp->tcp_ptpbhn); 7649 PRESERVE(tcp->tcp_acceptor_hash); 7650 PRESERVE(tcp->tcp_ptpahn); 7651 7652 /* Should be ASSERT NULL on these with new code! */ 7653 ASSERT(tcp->tcp_time_wait_next == NULL); 7654 ASSERT(tcp->tcp_time_wait_prev == NULL); 7655 ASSERT(tcp->tcp_time_wait_expire == 0); 7656 PRESERVE(tcp->tcp_state); 7657 PRESERVE(tcp->tcp_rq); 7658 PRESERVE(tcp->tcp_wq); 7659 7660 ASSERT(tcp->tcp_xmit_head == NULL); 7661 ASSERT(tcp->tcp_xmit_last == NULL); 7662 ASSERT(tcp->tcp_unsent == 0); 7663 ASSERT(tcp->tcp_xmit_tail == NULL); 7664 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 7665 7666 tcp->tcp_snxt = 0; /* Displayed in mib */ 7667 tcp->tcp_suna = 0; /* Displayed in mib */ 7668 tcp->tcp_swnd = 0; 7669 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */ 7670 7671 ASSERT(tcp->tcp_ibsegs == 0); 7672 ASSERT(tcp->tcp_obsegs == 0); 7673 7674 if (tcp->tcp_iphc != NULL) { 7675 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 7676 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 7677 } 7678 7679 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */ 7680 DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */ 7681 DONTCARE(tcp->tcp_ipha); 7682 DONTCARE(tcp->tcp_ip6h); 7683 DONTCARE(tcp->tcp_ip_hdr_len); 7684 DONTCARE(tcp->tcp_tcph); 7685 DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */ 7686 tcp->tcp_valid_bits = 0; 7687 7688 DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */ 7689 DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */ 7690 DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */ 7691 tcp->tcp_last_rcv_lbolt = 0; 7692 7693 tcp->tcp_init_cwnd = 0; 7694 7695 tcp->tcp_urp_last_valid = 0; 7696 tcp->tcp_hard_binding = 0; 7697 tcp->tcp_hard_bound = 0; 7698 PRESERVE(tcp->tcp_cred); 7699 PRESERVE(tcp->tcp_cpid); 7700 PRESERVE(tcp->tcp_exclbind); 7701 7702 tcp->tcp_fin_acked = 0; 7703 tcp->tcp_fin_rcvd = 0; 7704 tcp->tcp_fin_sent = 0; 7705 tcp->tcp_ordrel_done = 0; 7706 7707 tcp->tcp_debug = 0; 7708 tcp->tcp_dontroute = 0; 7709 tcp->tcp_broadcast = 0; 7710 7711 tcp->tcp_useloopback = 0; 7712 tcp->tcp_reuseaddr = 0; 7713 tcp->tcp_oobinline = 0; 7714 tcp->tcp_dgram_errind = 0; 7715 7716 tcp->tcp_detached = 0; 7717 tcp->tcp_bind_pending = 0; 7718 tcp->tcp_unbind_pending = 0; 7719 tcp->tcp_deferred_clean_death = 0; 7720 7721 tcp->tcp_snd_ws_ok = B_FALSE; 7722 tcp->tcp_snd_ts_ok = B_FALSE; 7723 tcp->tcp_linger = 0; 7724 tcp->tcp_ka_enabled = 0; 7725 tcp->tcp_zero_win_probe = 0; 7726 7727 tcp->tcp_loopback = 0; 7728 tcp->tcp_localnet = 0; 7729 tcp->tcp_syn_defense = 0; 7730 tcp->tcp_set_timer = 0; 7731 7732 tcp->tcp_active_open = 0; 7733 ASSERT(tcp->tcp_timeout == B_FALSE); 7734 tcp->tcp_rexmit = B_FALSE; 7735 tcp->tcp_xmit_zc_clean = B_FALSE; 7736 7737 tcp->tcp_snd_sack_ok = B_FALSE; 7738 PRESERVE(tcp->tcp_recvdstaddr); 7739 tcp->tcp_hwcksum = B_FALSE; 7740 7741 tcp->tcp_ire_ill_check_done = B_FALSE; 7742 DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */ 7743 7744 tcp->tcp_mdt = B_FALSE; 7745 tcp->tcp_mdt_hdr_head = 0; 7746 tcp->tcp_mdt_hdr_tail = 0; 7747 7748 tcp->tcp_conn_def_q0 = 0; 7749 tcp->tcp_ip_forward_progress = B_FALSE; 7750 tcp->tcp_anon_priv_bind = 0; 7751 tcp->tcp_ecn_ok = B_FALSE; 7752 7753 tcp->tcp_cwr = B_FALSE; 7754 
tcp->tcp_ecn_echo_on = B_FALSE; 7755 7756 if (tcp->tcp_sack_info != NULL) { 7757 if (tcp->tcp_notsack_list != NULL) { 7758 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 7759 } 7760 kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info); 7761 tcp->tcp_sack_info = NULL; 7762 } 7763 7764 tcp->tcp_rcv_ws = 0; 7765 tcp->tcp_snd_ws = 0; 7766 tcp->tcp_ts_recent = 0; 7767 tcp->tcp_rnxt = 0; /* Displayed in mib */ 7768 DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */ 7769 tcp->tcp_if_mtu = 0; 7770 7771 ASSERT(tcp->tcp_reass_head == NULL); 7772 ASSERT(tcp->tcp_reass_tail == NULL); 7773 7774 tcp->tcp_cwnd_cnt = 0; 7775 7776 ASSERT(tcp->tcp_rcv_list == NULL); 7777 ASSERT(tcp->tcp_rcv_last_head == NULL); 7778 ASSERT(tcp->tcp_rcv_last_tail == NULL); 7779 ASSERT(tcp->tcp_rcv_cnt == 0); 7780 7781 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */ 7782 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */ 7783 tcp->tcp_csuna = 0; 7784 7785 tcp->tcp_rto = 0; /* Displayed in MIB */ 7786 DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */ 7787 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */ 7788 tcp->tcp_rtt_update = 0; 7789 7790 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 7791 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 7792 7793 tcp->tcp_rack = 0; /* Displayed in mib */ 7794 tcp->tcp_rack_cnt = 0; 7795 tcp->tcp_rack_cur_max = 0; 7796 tcp->tcp_rack_abs_max = 0; 7797 7798 tcp->tcp_max_swnd = 0; 7799 7800 ASSERT(tcp->tcp_listener == NULL); 7801 7802 DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */ 7803 7804 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */ 7805 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */ 7806 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */ 7807 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */ 7808 7809 ASSERT(tcp->tcp_conn_req_cnt_q == 0); 7810 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0); 7811 PRESERVE(tcp->tcp_conn_req_max); 7812 PRESERVE(tcp->tcp_conn_req_seqnum); 7813 7814 DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */ 7815 DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */ 7816 DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */ 7817 DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */ 7818 DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */ 7819 7820 tcp->tcp_lingertime = 0; 7821 7822 DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */ 7823 ASSERT(tcp->tcp_urp_mp == NULL); 7824 ASSERT(tcp->tcp_urp_mark_mp == NULL); 7825 ASSERT(tcp->tcp_fused_sigurg_mp == NULL); 7826 7827 ASSERT(tcp->tcp_eager_next_q == NULL); 7828 ASSERT(tcp->tcp_eager_last_q == NULL); 7829 ASSERT((tcp->tcp_eager_next_q0 == NULL && 7830 tcp->tcp_eager_prev_q0 == NULL) || 7831 tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0); 7832 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 7833 7834 tcp->tcp_client_errno = 0; 7835 7836 DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */ 7837 7838 tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */ 7839 7840 PRESERVE(tcp->tcp_bound_source_v6); 7841 tcp->tcp_last_sent_len = 0; 7842 tcp->tcp_dupack_cnt = 0; 7843 7844 tcp->tcp_fport = 0; /* Displayed in MIB */ 7845 PRESERVE(tcp->tcp_lport); 7846 7847 PRESERVE(tcp->tcp_acceptor_lockp); 7848 7849 ASSERT(tcp->tcp_ordrelid == 0); 7850 PRESERVE(tcp->tcp_acceptor_id); 7851 DONTCARE(tcp->tcp_ipsec_overhead); 7852 7853 /* 7854 * If tcp_tracing flag is ON (i.e. 
We have a trace buffer 7855 * in tcp structure and now tracing), Re-initialize all 7856 * members of tcp_traceinfo. 7857 */ 7858 if (tcp->tcp_tracebuf != NULL) { 7859 bzero(tcp->tcp_tracebuf, sizeof (tcptrch_t)); 7860 } 7861 7862 PRESERVE(tcp->tcp_family); 7863 if (tcp->tcp_family == AF_INET6) { 7864 tcp->tcp_ipversion = IPV6_VERSION; 7865 tcp->tcp_mss = tcp_mss_def_ipv6; 7866 } else { 7867 tcp->tcp_ipversion = IPV4_VERSION; 7868 tcp->tcp_mss = tcp_mss_def_ipv4; 7869 } 7870 7871 tcp->tcp_bound_if = 0; 7872 tcp->tcp_ipv6_recvancillary = 0; 7873 tcp->tcp_recvifindex = 0; 7874 tcp->tcp_recvhops = 0; 7875 tcp->tcp_closed = 0; 7876 tcp->tcp_cleandeathtag = 0; 7877 if (tcp->tcp_hopopts != NULL) { 7878 mi_free(tcp->tcp_hopopts); 7879 tcp->tcp_hopopts = NULL; 7880 tcp->tcp_hopoptslen = 0; 7881 } 7882 ASSERT(tcp->tcp_hopoptslen == 0); 7883 if (tcp->tcp_dstopts != NULL) { 7884 mi_free(tcp->tcp_dstopts); 7885 tcp->tcp_dstopts = NULL; 7886 tcp->tcp_dstoptslen = 0; 7887 } 7888 ASSERT(tcp->tcp_dstoptslen == 0); 7889 if (tcp->tcp_rtdstopts != NULL) { 7890 mi_free(tcp->tcp_rtdstopts); 7891 tcp->tcp_rtdstopts = NULL; 7892 tcp->tcp_rtdstoptslen = 0; 7893 } 7894 ASSERT(tcp->tcp_rtdstoptslen == 0); 7895 if (tcp->tcp_rthdr != NULL) { 7896 mi_free(tcp->tcp_rthdr); 7897 tcp->tcp_rthdr = NULL; 7898 tcp->tcp_rthdrlen = 0; 7899 } 7900 ASSERT(tcp->tcp_rthdrlen == 0); 7901 PRESERVE(tcp->tcp_drop_opt_ack_cnt); 7902 7903 /* Reset fusion-related fields */ 7904 tcp->tcp_fused = B_FALSE; 7905 tcp->tcp_unfusable = B_FALSE; 7906 tcp->tcp_fused_sigurg = B_FALSE; 7907 tcp->tcp_direct_sockfs = B_FALSE; 7908 tcp->tcp_fuse_syncstr_stopped = B_FALSE; 7909 tcp->tcp_fuse_syncstr_plugged = B_FALSE; 7910 tcp->tcp_loopback_peer = NULL; 7911 tcp->tcp_fuse_rcv_hiwater = 0; 7912 tcp->tcp_fuse_rcv_unread_hiwater = 0; 7913 tcp->tcp_fuse_rcv_unread_cnt = 0; 7914 7915 tcp->tcp_in_ack_unsent = 0; 7916 tcp->tcp_cork = B_FALSE; 7917 7918 PRESERVE(tcp->tcp_squeue_bytes); 7919 7920 ASSERT(tcp->tcp_kssl_ctx == NULL); 7921 ASSERT(!tcp->tcp_kssl_pending); 7922 PRESERVE(tcp->tcp_kssl_ent); 7923 7924 #undef DONTCARE 7925 #undef PRESERVE 7926 } 7927 7928 /* 7929 * Allocate necessary resources and initialize state vector. 7930 * Guaranteed not to fail so that when an error is returned, 7931 * the caller doesn't need to do any additional cleanup. 7932 */ 7933 int 7934 tcp_init(tcp_t *tcp, queue_t *q) 7935 { 7936 int err; 7937 7938 tcp->tcp_rq = q; 7939 tcp->tcp_wq = WR(q); 7940 tcp->tcp_state = TCPS_IDLE; 7941 if ((err = tcp_init_values(tcp)) != 0) 7942 tcp_timers_stop(tcp); 7943 return (err); 7944 } 7945 7946 static int 7947 tcp_init_values(tcp_t *tcp) 7948 { 7949 int err; 7950 7951 ASSERT((tcp->tcp_family == AF_INET && 7952 tcp->tcp_ipversion == IPV4_VERSION) || 7953 (tcp->tcp_family == AF_INET6 && 7954 (tcp->tcp_ipversion == IPV4_VERSION || 7955 tcp->tcp_ipversion == IPV6_VERSION))); 7956 7957 /* 7958 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 7959 * will be close to tcp_rexmit_interval_initial. By doing this, we 7960 * allow the algorithm to adjust slowly to large fluctuations of RTT 7961 * during first few transmissions of a connection as seen in slow 7962 * links. 
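 * Concretely, with tcp_rtt_sa = 4 * interval and tcp_rtt_sd = interval / 2
 * as set below, the RTO expression
 * (sa >> 3) + sd + tcp_rexmit_interval_extra + (sa >> 5) +
 * tcp_conn_grace_period works out to roughly
 *
 *	1.125 * tcp_rexmit_interval_initial +
 *	    tcp_rexmit_interval_extra + tcp_conn_grace_period
 *
 * i.e. a bit above, but close to, tcp_rexmit_interval_initial.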
7963 */ 7964 tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2; 7965 tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1; 7966 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 7967 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 7968 tcp_conn_grace_period; 7969 if (tcp->tcp_rto < tcp_rexmit_interval_min) 7970 tcp->tcp_rto = tcp_rexmit_interval_min; 7971 tcp->tcp_timer_backoff = 0; 7972 tcp->tcp_ms_we_have_waited = 0; 7973 tcp->tcp_last_recv_time = lbolt; 7974 tcp->tcp_cwnd_max = tcp_cwnd_max_; 7975 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 7976 tcp->tcp_snd_burst = TCP_CWND_INFINITE; 7977 7978 tcp->tcp_maxpsz = tcp_maxpsz_multiplier; 7979 7980 tcp->tcp_first_timer_threshold = tcp_ip_notify_interval; 7981 tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval; 7982 tcp->tcp_second_timer_threshold = tcp_ip_abort_interval; 7983 /* 7984 * Fix it to tcp_ip_abort_linterval later if it turns out to be a 7985 * passive open. 7986 */ 7987 tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval; 7988 7989 tcp->tcp_naglim = tcp_naglim_def; 7990 7991 /* NOTE: ISS is now set in tcp_adapt_ire(). */ 7992 7993 tcp->tcp_mdt_hdr_head = 0; 7994 tcp->tcp_mdt_hdr_tail = 0; 7995 7996 /* Reset fusion-related fields */ 7997 tcp->tcp_fused = B_FALSE; 7998 tcp->tcp_unfusable = B_FALSE; 7999 tcp->tcp_fused_sigurg = B_FALSE; 8000 tcp->tcp_direct_sockfs = B_FALSE; 8001 tcp->tcp_fuse_syncstr_stopped = B_FALSE; 8002 tcp->tcp_fuse_syncstr_plugged = B_FALSE; 8003 tcp->tcp_loopback_peer = NULL; 8004 tcp->tcp_fuse_rcv_hiwater = 0; 8005 tcp->tcp_fuse_rcv_unread_hiwater = 0; 8006 tcp->tcp_fuse_rcv_unread_cnt = 0; 8007 8008 /* Initialize the header template */ 8009 if (tcp->tcp_ipversion == IPV4_VERSION) { 8010 err = tcp_header_init_ipv4(tcp); 8011 } else { 8012 err = tcp_header_init_ipv6(tcp); 8013 } 8014 if (err) 8015 return (err); 8016 8017 /* 8018 * Init the window scale to the max so tcp_rwnd_set() won't pare 8019 * down tcp_rwnd. tcp_adapt_ire() will set the right value later. 8020 */ 8021 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 8022 tcp->tcp_xmit_lowater = tcp_xmit_lowat; 8023 tcp->tcp_xmit_hiwater = tcp_xmit_hiwat; 8024 8025 tcp->tcp_cork = B_FALSE; 8026 /* 8027 * Init the tcp_debug option. This value determines whether TCP 8028 * calls strlog() to print out debug messages. Doing this 8029 * initialization here means that this value is not inherited thru 8030 * tcp_reinit(). 8031 */ 8032 tcp->tcp_debug = tcp_dbg; 8033 8034 tcp->tcp_ka_interval = tcp_keepalive_interval; 8035 tcp->tcp_ka_abort_thres = tcp_keepalive_abort_interval; 8036 8037 return (0); 8038 } 8039 8040 /* 8041 * Initialize the IPv4 header. Loses any record of any IP options. 8042 */ 8043 static int 8044 tcp_header_init_ipv4(tcp_t *tcp) 8045 { 8046 tcph_t *tcph; 8047 uint32_t sum; 8048 conn_t *connp; 8049 8050 /* 8051 * This is a simple initialization. If there's 8052 * already a template, it should never be too small, 8053 * so reuse it. Otherwise, allocate space for the new one. 
8054 */ 8055 if (tcp->tcp_iphc == NULL) { 8056 ASSERT(tcp->tcp_iphc_len == 0); 8057 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 8058 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); 8059 if (tcp->tcp_iphc == NULL) { 8060 tcp->tcp_iphc_len = 0; 8061 return (ENOMEM); 8062 } 8063 } 8064 8065 /* options are gone; may need a new label */ 8066 connp = tcp->tcp_connp; 8067 connp->conn_mlp_type = mlptSingle; 8068 connp->conn_ulp_labeled = !is_system_labeled(); 8069 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 8070 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 8071 tcp->tcp_ip6h = NULL; 8072 tcp->tcp_ipversion = IPV4_VERSION; 8073 tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t); 8074 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 8075 tcp->tcp_ip_hdr_len = sizeof (ipha_t); 8076 tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t)); 8077 tcp->tcp_ipha->ipha_version_and_hdr_length 8078 = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS; 8079 tcp->tcp_ipha->ipha_ident = 0; 8080 8081 tcp->tcp_ttl = (uchar_t)tcp_ipv4_ttl; 8082 tcp->tcp_tos = 0; 8083 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; 8084 tcp->tcp_ipha->ipha_ttl = (uchar_t)tcp_ipv4_ttl; 8085 tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP; 8086 8087 tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t)); 8088 tcp->tcp_tcph = tcph; 8089 tcph->th_offset_and_rsrvd[0] = (5 << 4); 8090 /* 8091 * IP wants our header length in the checksum field to 8092 * allow it to perform a single pseudo-header+checksum 8093 * calculation on behalf of TCP. 8094 * Include the adjustment for a source route once IP_OPTIONS is set. 8095 */ 8096 sum = sizeof (tcph_t) + tcp->tcp_sum; 8097 sum = (sum >> 16) + (sum & 0xFFFF); 8098 U16_TO_ABE16(sum, tcph->th_sum); 8099 return (0); 8100 } 8101 8102 /* 8103 * Initialize the IPv6 header. Loses any record of any IPv6 extension headers. 8104 */ 8105 static int 8106 tcp_header_init_ipv6(tcp_t *tcp) 8107 { 8108 tcph_t *tcph; 8109 uint32_t sum; 8110 conn_t *connp; 8111 8112 /* 8113 * This is a simple initialization. If there's 8114 * already a template, it should never be too small, 8115 * so reuse it. Otherwise, allocate space for the new one. 8116 * Ensure that there is enough space to "downgrade" the tcp_t 8117 * to an IPv4 tcp_t. This requires having space for a full load 8118 * of IPv4 options, as well as a full load of TCP options 8119 * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space 8120 * than a v6 header and a TCP header with a full load of TCP options 8121 * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes). 8122 * We want to avoid reallocation in the "downgraded" case when 8123 * processing outbound IPv4 options. 
8124 */ 8125 if (tcp->tcp_iphc == NULL) { 8126 ASSERT(tcp->tcp_iphc_len == 0); 8127 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 8128 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); 8129 if (tcp->tcp_iphc == NULL) { 8130 tcp->tcp_iphc_len = 0; 8131 return (ENOMEM); 8132 } 8133 } 8134 8135 /* options are gone; may need a new label */ 8136 connp = tcp->tcp_connp; 8137 connp->conn_mlp_type = mlptSingle; 8138 connp->conn_ulp_labeled = !is_system_labeled(); 8139 8140 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 8141 tcp->tcp_ipversion = IPV6_VERSION; 8142 tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t); 8143 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 8144 tcp->tcp_ip_hdr_len = IPV6_HDR_LEN; 8145 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; 8146 tcp->tcp_ipha = NULL; 8147 8148 /* Initialize the header template */ 8149 8150 tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 8151 tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t)); 8152 tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP; 8153 tcp->tcp_ip6h->ip6_hops = (uint8_t)tcp_ipv6_hoplimit; 8154 8155 tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN); 8156 tcp->tcp_tcph = tcph; 8157 tcph->th_offset_and_rsrvd[0] = (5 << 4); 8158 /* 8159 * IP wants our header length in the checksum field to 8160 * allow it to perform a single psuedo-header+checksum 8161 * calculation on behalf of TCP. 8162 * Include the adjustment for a source route when IPV6_RTHDR is set. 8163 */ 8164 sum = sizeof (tcph_t) + tcp->tcp_sum; 8165 sum = (sum >> 16) + (sum & 0xFFFF); 8166 U16_TO_ABE16(sum, tcph->th_sum); 8167 return (0); 8168 } 8169 8170 /* At minimum we need 4 bytes in the TCP header for the lookup */ 8171 #define ICMP_MIN_TCP_HDR 12 8172 8173 /* 8174 * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages 8175 * passed up by IP. The message is always received on the correct tcp_t. 8176 * Assumes that IP has pulled up everything up to and including the ICMP header. 8177 */ 8178 void 8179 tcp_icmp_error(tcp_t *tcp, mblk_t *mp) 8180 { 8181 icmph_t *icmph; 8182 ipha_t *ipha; 8183 int iph_hdr_length; 8184 tcph_t *tcph; 8185 boolean_t ipsec_mctl = B_FALSE; 8186 boolean_t secure; 8187 mblk_t *first_mp = mp; 8188 uint32_t new_mss; 8189 uint32_t ratio; 8190 size_t mp_size = MBLKL(mp); 8191 uint32_t seg_ack; 8192 uint32_t seg_seq; 8193 8194 /* Assume IP provides aligned packets - otherwise toss */ 8195 if (!OK_32PTR(mp->b_rptr)) { 8196 freemsg(mp); 8197 return; 8198 } 8199 8200 /* 8201 * Since ICMP errors are normal data marked with M_CTL when sent 8202 * to TCP or UDP, we have to look for a IPSEC_IN value to identify 8203 * packets starting with an ipsec_info_t, see ipsec_info.h. 8204 */ 8205 if ((mp_size == sizeof (ipsec_info_t)) && 8206 (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) { 8207 ASSERT(mp->b_cont != NULL); 8208 mp = mp->b_cont; 8209 /* IP should have done this */ 8210 ASSERT(OK_32PTR(mp->b_rptr)); 8211 mp_size = MBLKL(mp); 8212 ipsec_mctl = B_TRUE; 8213 } 8214 8215 /* 8216 * Verify that we have a complete outer IP header. If not, drop it. 8217 */ 8218 if (mp_size < sizeof (ipha_t)) { 8219 noticmpv4: 8220 freemsg(first_mp); 8221 return; 8222 } 8223 8224 ipha = (ipha_t *)mp->b_rptr; 8225 /* 8226 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent 8227 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. 
8228 */ 8229 switch (IPH_HDR_VERSION(ipha)) { 8230 case IPV6_VERSION: 8231 tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl); 8232 return; 8233 case IPV4_VERSION: 8234 break; 8235 default: 8236 goto noticmpv4; 8237 } 8238 8239 /* Skip past the outer IP and ICMP headers */ 8240 iph_hdr_length = IPH_HDR_LENGTH(ipha); 8241 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 8242 /* 8243 * If we don't have the correct outer IP header length or if the ULP 8244 * is not IPPROTO_ICMP or if we don't have a complete inner IP header 8245 * send it upstream. 8246 */ 8247 if (iph_hdr_length < sizeof (ipha_t) || 8248 ipha->ipha_protocol != IPPROTO_ICMP || 8249 (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { 8250 goto noticmpv4; 8251 } 8252 ipha = (ipha_t *)&icmph[1]; 8253 8254 /* Skip past the inner IP and find the ULP header */ 8255 iph_hdr_length = IPH_HDR_LENGTH(ipha); 8256 tcph = (tcph_t *)((char *)ipha + iph_hdr_length); 8257 /* 8258 * If we don't have the correct inner IP header length or if the ULP 8259 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR 8260 * bytes of TCP header, drop it. 8261 */ 8262 if (iph_hdr_length < sizeof (ipha_t) || 8263 ipha->ipha_protocol != IPPROTO_TCP || 8264 (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) { 8265 goto noticmpv4; 8266 } 8267 8268 if (TCP_IS_DETACHED_NONEAGER(tcp)) { 8269 if (ipsec_mctl) { 8270 secure = ipsec_in_is_secure(first_mp); 8271 } else { 8272 secure = B_FALSE; 8273 } 8274 if (secure) { 8275 /* 8276 * If we are willing to accept this in clear 8277 * we don't have to verify policy. 8278 */ 8279 if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) { 8280 if (!tcp_check_policy(tcp, first_mp, 8281 ipha, NULL, secure, ipsec_mctl)) { 8282 /* 8283 * tcp_check_policy called 8284 * ip_drop_packet() on failure. 8285 */ 8286 return; 8287 } 8288 } 8289 } 8290 } else if (ipsec_mctl) { 8291 /* 8292 * This is a hard_bound connection. IP has already 8293 * verified policy. We don't have to do it again. 8294 */ 8295 freeb(first_mp); 8296 first_mp = mp; 8297 ipsec_mctl = B_FALSE; 8298 } 8299 8300 seg_ack = ABE32_TO_U32(tcph->th_ack); 8301 seg_seq = ABE32_TO_U32(tcph->th_seq); 8302 /* 8303 * TCP SHOULD check that the TCP sequence number contained in 8304 * payload of the ICMP error message is within the range 8305 * SND.UNA <= SEG.SEQ < SND.NXT. and also SEG.ACK <= RECV.NXT 8306 */ 8307 if (SEQ_LT(seg_seq, tcp->tcp_suna) || 8308 SEQ_GEQ(seg_seq, tcp->tcp_snxt) || 8309 SEQ_GT(seg_ack, tcp->tcp_rnxt)) { 8310 /* 8311 * If the ICMP message is bogus, should we kill the 8312 * connection, or should we just drop the bogus ICMP 8313 * message? It would probably make more sense to just 8314 * drop the message so that if this one managed to get 8315 * in, the real connection should not suffer. 8316 */ 8317 goto noticmpv4; 8318 } 8319 8320 switch (icmph->icmph_type) { 8321 case ICMP_DEST_UNREACHABLE: 8322 switch (icmph->icmph_code) { 8323 case ICMP_FRAGMENTATION_NEEDED: 8324 /* 8325 * Reduce the MSS based on the new MTU. This will 8326 * eliminate any fragmentation locally. 8327 * N.B. There may well be some funny side-effects on 8328 * the local send policy and the remote receive policy. 8329 * Pending further research, we provide 8330 * tcp_ignore_path_mtu just in case this proves 8331 * disastrous somewhere. 8332 * 8333 * After updating the MSS, retransmit part of the 8334 * dropped segment using the new mss by calling 8335 * tcp_wput_data(). Need to adjust all those 8336 * params to make sure tcp_wput_data() work properly. 
8337 */ 8338 if (tcp_ignore_path_mtu) 8339 break; 8340 8341 /* 8342 * Decrease the MSS by time stamp options 8343 * IP options and IPSEC options. tcp_hdr_len 8344 * includes time stamp option and IP option 8345 * length. 8346 */ 8347 8348 new_mss = ntohs(icmph->icmph_du_mtu) - 8349 tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead; 8350 8351 /* 8352 * Only update the MSS if the new one is 8353 * smaller than the previous one. This is 8354 * to avoid problems when getting multiple 8355 * ICMP errors for the same MTU. 8356 */ 8357 if (new_mss >= tcp->tcp_mss) 8358 break; 8359 8360 /* 8361 * Stop doing PMTU if new_mss is less than 68 8362 * or less than tcp_mss_min. 8363 * The value 68 comes from rfc 1191. 8364 */ 8365 if (new_mss < MAX(68, tcp_mss_min)) 8366 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 8367 0; 8368 8369 ratio = tcp->tcp_cwnd / tcp->tcp_mss; 8370 ASSERT(ratio >= 1); 8371 tcp_mss_set(tcp, new_mss); 8372 8373 /* 8374 * Make sure we have something to 8375 * send. 8376 */ 8377 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && 8378 (tcp->tcp_xmit_head != NULL)) { 8379 /* 8380 * Shrink tcp_cwnd in 8381 * proportion to the old MSS/new MSS. 8382 */ 8383 tcp->tcp_cwnd = ratio * tcp->tcp_mss; 8384 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 8385 (tcp->tcp_unsent == 0)) { 8386 tcp->tcp_rexmit_max = tcp->tcp_fss; 8387 } else { 8388 tcp->tcp_rexmit_max = tcp->tcp_snxt; 8389 } 8390 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 8391 tcp->tcp_rexmit = B_TRUE; 8392 tcp->tcp_dupack_cnt = 0; 8393 tcp->tcp_snd_burst = TCP_CWND_SS; 8394 tcp_ss_rexmit(tcp); 8395 } 8396 break; 8397 case ICMP_PORT_UNREACHABLE: 8398 case ICMP_PROTOCOL_UNREACHABLE: 8399 switch (tcp->tcp_state) { 8400 case TCPS_SYN_SENT: 8401 case TCPS_SYN_RCVD: 8402 /* 8403 * ICMP can snipe away incipient 8404 * TCP connections as long as 8405 * seq number is same as initial 8406 * send seq number. 8407 */ 8408 if (seg_seq == tcp->tcp_iss) { 8409 (void) tcp_clean_death(tcp, 8410 ECONNREFUSED, 6); 8411 } 8412 break; 8413 } 8414 break; 8415 case ICMP_HOST_UNREACHABLE: 8416 case ICMP_NET_UNREACHABLE: 8417 /* Record the error in case we finally time out. */ 8418 if (icmph->icmph_code == ICMP_HOST_UNREACHABLE) 8419 tcp->tcp_client_errno = EHOSTUNREACH; 8420 else 8421 tcp->tcp_client_errno = ENETUNREACH; 8422 if (tcp->tcp_state == TCPS_SYN_RCVD) { 8423 if (tcp->tcp_listener != NULL && 8424 tcp->tcp_listener->tcp_syn_defense) { 8425 /* 8426 * Ditch the half-open connection if we 8427 * suspect a SYN attack is under way. 8428 */ 8429 tcp_ip_ire_mark_advice(tcp); 8430 (void) tcp_clean_death(tcp, 8431 tcp->tcp_client_errno, 7); 8432 } 8433 } 8434 break; 8435 default: 8436 break; 8437 } 8438 break; 8439 case ICMP_SOURCE_QUENCH: { 8440 /* 8441 * use a global boolean to control 8442 * whether TCP should respond to ICMP_SOURCE_QUENCH. 8443 * The default is false. 8444 */ 8445 if (tcp_icmp_source_quench) { 8446 /* 8447 * Reduce the sending rate as if we got a 8448 * retransmit timeout 8449 */ 8450 uint32_t npkt; 8451 8452 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / 8453 tcp->tcp_mss; 8454 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss; 8455 tcp->tcp_cwnd = tcp->tcp_mss; 8456 tcp->tcp_cwnd_cnt = 0; 8457 } 8458 break; 8459 } 8460 } 8461 freemsg(first_mp); 8462 } 8463 8464 /* 8465 * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6 8466 * error messages passed up by IP. 8467 * Assumes that IP has pulled up all the extension headers as well 8468 * as the ICMPv6 header. 
8469 */ 8470 static void 8471 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl) 8472 { 8473 icmp6_t *icmp6; 8474 ip6_t *ip6h; 8475 uint16_t iph_hdr_length; 8476 tcpha_t *tcpha; 8477 uint8_t *nexthdrp; 8478 uint32_t new_mss; 8479 uint32_t ratio; 8480 boolean_t secure; 8481 mblk_t *first_mp = mp; 8482 size_t mp_size; 8483 uint32_t seg_ack; 8484 uint32_t seg_seq; 8485 8486 /* 8487 * The caller has determined if this is an IPSEC_IN packet and 8488 * set ipsec_mctl appropriately (see tcp_icmp_error). 8489 */ 8490 if (ipsec_mctl) 8491 mp = mp->b_cont; 8492 8493 mp_size = MBLKL(mp); 8494 8495 /* 8496 * Verify that we have a complete IP header. If not, send it upstream. 8497 */ 8498 if (mp_size < sizeof (ip6_t)) { 8499 noticmpv6: 8500 freemsg(first_mp); 8501 return; 8502 } 8503 8504 /* 8505 * Verify this is an ICMPV6 packet, else send it upstream. 8506 */ 8507 ip6h = (ip6_t *)mp->b_rptr; 8508 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 8509 iph_hdr_length = IPV6_HDR_LEN; 8510 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, 8511 &nexthdrp) || 8512 *nexthdrp != IPPROTO_ICMPV6) { 8513 goto noticmpv6; 8514 } 8515 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 8516 ip6h = (ip6_t *)&icmp6[1]; 8517 /* 8518 * Verify if we have a complete ICMP and inner IP header. 8519 */ 8520 if ((uchar_t *)&ip6h[1] > mp->b_wptr) 8521 goto noticmpv6; 8522 8523 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) 8524 goto noticmpv6; 8525 tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length); 8526 /* 8527 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't 8528 * have at least ICMP_MIN_TCP_HDR bytes of TCP header drop the 8529 * packet. 8530 */ 8531 if ((*nexthdrp != IPPROTO_TCP) || 8532 ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) { 8533 goto noticmpv6; 8534 } 8535 8536 /* 8537 * ICMP errors come on the right queue or come on 8538 * listener/global queue for detached connections and 8539 * get switched to the right queue. If it comes on the 8540 * right queue, policy check has already been done by IP 8541 * and thus free the first_mp without verifying the policy. 8542 * If it has come for a non-hard bound connection, we need 8543 * to verify policy as IP may not have done it. 8544 */ 8545 if (!tcp->tcp_hard_bound) { 8546 if (ipsec_mctl) { 8547 secure = ipsec_in_is_secure(first_mp); 8548 } else { 8549 secure = B_FALSE; 8550 } 8551 if (secure) { 8552 /* 8553 * If we are willing to accept this in clear 8554 * we don't have to verify policy. 8555 */ 8556 if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) { 8557 if (!tcp_check_policy(tcp, first_mp, 8558 NULL, ip6h, secure, ipsec_mctl)) { 8559 /* 8560 * tcp_check_policy called 8561 * ip_drop_packet() on failure. 8562 */ 8563 return; 8564 } 8565 } 8566 } 8567 } else if (ipsec_mctl) { 8568 /* 8569 * This is a hard_bound connection. IP has already 8570 * verified policy. We don't have to do it again. 8571 */ 8572 freeb(first_mp); 8573 first_mp = mp; 8574 ipsec_mctl = B_FALSE; 8575 } 8576 8577 seg_ack = ntohl(tcpha->tha_ack); 8578 seg_seq = ntohl(tcpha->tha_seq); 8579 /* 8580 * TCP SHOULD check that the TCP sequence number contained in 8581 * payload of the ICMP error message is within the range 8582 * SND.UNA <= SEG.SEQ < SND.NXT. and also SEG.ACK <= RECV.NXT 8583 */ 8584 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt) || 8585 SEQ_GT(seg_ack, tcp->tcp_rnxt)) { 8586 /* 8587 * If the ICMP message is bogus, should we kill the 8588 * connection, or should we just drop the bogus ICMP 8589 * message? 
It would probably make more sense to just 8590 * drop the message so that if this one managed to get 8591 * in, the real connection should not suffer. 8592 */ 8593 goto noticmpv6; 8594 } 8595 8596 switch (icmp6->icmp6_type) { 8597 case ICMP6_PACKET_TOO_BIG: 8598 /* 8599 * Reduce the MSS based on the new MTU. This will 8600 * eliminate any fragmentation locally. 8601 * N.B. There may well be some funny side-effects on 8602 * the local send policy and the remote receive policy. 8603 * Pending further research, we provide 8604 * tcp_ignore_path_mtu just in case this proves 8605 * disastrous somewhere. 8606 * 8607 * After updating the MSS, retransmit part of the 8608 * dropped segment using the new mss by calling 8609 * tcp_wput_data(). Need to adjust all those 8610 * params to make sure tcp_wput_data() work properly. 8611 */ 8612 if (tcp_ignore_path_mtu) 8613 break; 8614 8615 /* 8616 * Decrease the MSS by time stamp options 8617 * IP options and IPSEC options. tcp_hdr_len 8618 * includes time stamp option and IP option 8619 * length. 8620 */ 8621 new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len - 8622 tcp->tcp_ipsec_overhead; 8623 8624 /* 8625 * Only update the MSS if the new one is 8626 * smaller than the previous one. This is 8627 * to avoid problems when getting multiple 8628 * ICMP errors for the same MTU. 8629 */ 8630 if (new_mss >= tcp->tcp_mss) 8631 break; 8632 8633 ratio = tcp->tcp_cwnd / tcp->tcp_mss; 8634 ASSERT(ratio >= 1); 8635 tcp_mss_set(tcp, new_mss); 8636 8637 /* 8638 * Make sure we have something to 8639 * send. 8640 */ 8641 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && 8642 (tcp->tcp_xmit_head != NULL)) { 8643 /* 8644 * Shrink tcp_cwnd in 8645 * proportion to the old MSS/new MSS. 8646 */ 8647 tcp->tcp_cwnd = ratio * tcp->tcp_mss; 8648 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 8649 (tcp->tcp_unsent == 0)) { 8650 tcp->tcp_rexmit_max = tcp->tcp_fss; 8651 } else { 8652 tcp->tcp_rexmit_max = tcp->tcp_snxt; 8653 } 8654 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 8655 tcp->tcp_rexmit = B_TRUE; 8656 tcp->tcp_dupack_cnt = 0; 8657 tcp->tcp_snd_burst = TCP_CWND_SS; 8658 tcp_ss_rexmit(tcp); 8659 } 8660 break; 8661 8662 case ICMP6_DST_UNREACH: 8663 switch (icmp6->icmp6_code) { 8664 case ICMP6_DST_UNREACH_NOPORT: 8665 if (((tcp->tcp_state == TCPS_SYN_SENT) || 8666 (tcp->tcp_state == TCPS_SYN_RCVD)) && 8667 (seg_seq == tcp->tcp_iss)) { 8668 (void) tcp_clean_death(tcp, 8669 ECONNREFUSED, 8); 8670 } 8671 break; 8672 8673 case ICMP6_DST_UNREACH_ADMIN: 8674 case ICMP6_DST_UNREACH_NOROUTE: 8675 case ICMP6_DST_UNREACH_BEYONDSCOPE: 8676 case ICMP6_DST_UNREACH_ADDR: 8677 /* Record the error in case we finally time out. */ 8678 tcp->tcp_client_errno = EHOSTUNREACH; 8679 if (((tcp->tcp_state == TCPS_SYN_SENT) || 8680 (tcp->tcp_state == TCPS_SYN_RCVD)) && 8681 (seg_seq == tcp->tcp_iss)) { 8682 if (tcp->tcp_listener != NULL && 8683 tcp->tcp_listener->tcp_syn_defense) { 8684 /* 8685 * Ditch the half-open connection if we 8686 * suspect a SYN attack is under way. 
8687 */
8688 tcp_ip_ire_mark_advice(tcp);
8689 (void) tcp_clean_death(tcp,
8690 tcp->tcp_client_errno, 9);
8691 }
8692 }
8693
8694
8695 break;
8696 default:
8697 break;
8698 }
8699 break;
8700
8701 case ICMP6_PARAM_PROB:
8702 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
8703 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
8704 (uchar_t *)ip6h + icmp6->icmp6_pptr ==
8705 (uchar_t *)nexthdrp) {
8706 if (tcp->tcp_state == TCPS_SYN_SENT ||
8707 tcp->tcp_state == TCPS_SYN_RCVD) {
8708 (void) tcp_clean_death(tcp,
8709 ECONNREFUSED, 10);
8710 }
8711 break;
8712 }
8713 break;
8714
8715 case ICMP6_TIME_EXCEEDED:
8716 default:
8717 break;
8718 }
8719 freemsg(first_mp);
8720 }
8721
8722 /*
8723 * IP recognizes seven kinds of bind requests:
8724 *
8725 * - A zero-length address binds only to the protocol number.
8726 *
8727 * - A 4-byte address is treated as a request to
8728 * validate that the address is a valid local IPv4
8729 * address, appropriate for an application to bind to.
8730 * IP does the verification, but does not make any note
8731 * of the address at this time.
8732 *
8733 * - A 16-byte address is treated as a request
8734 * to validate a local IPv6 address, as in the 4-byte
8735 * address case above.
8736 *
8737 * - A 16-byte sockaddr_in to validate the local IPv4 address and also
8738 * use it for the inbound fanout of packets.
8739 *
8740 * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also
8741 * use it for the inbound fanout of packets.
8742 *
8743 * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout
8744 * information consisting of local and remote addresses
8745 * and ports. In this case, the addresses are both
8746 * validated as appropriate for this operation, and, if
8747 * so, the information is retained for use in the
8748 * inbound fanout.
8749 *
8750 * - A 36-byte address (ipa6_conn_t) containing complete IPv6
8751 * fanout information, like the 12-byte case above.
8752 *
8753 * IP will also fill in the IRE request mblk with information
8754 * regarding our peer. In all cases, we notify IP of our protocol
8755 * type by appending a single protocol byte to the bind request.
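 *
 * For illustration (based on the switch in tcp_ip_bind_mp() below): a
 * fully specified IPv4 connect passes addr_length == sizeof (ipa_conn_t),
 * fills in the local and remote addresses and ports, and chains on an
 * IRE_DB_REQ_TYPE mblk so that IP can return information about the peer,
 * whereas a sockaddr_in bind passes sizeof (sin_t) carrying only the
 * bound address and port.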
8756 */ 8757 static mblk_t * 8758 tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, t_scalar_t addr_length) 8759 { 8760 char *cp; 8761 mblk_t *mp; 8762 struct T_bind_req *tbr; 8763 ipa_conn_t *ac; 8764 ipa6_conn_t *ac6; 8765 sin_t *sin; 8766 sin6_t *sin6; 8767 8768 ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ); 8769 ASSERT((tcp->tcp_family == AF_INET && 8770 tcp->tcp_ipversion == IPV4_VERSION) || 8771 (tcp->tcp_family == AF_INET6 && 8772 (tcp->tcp_ipversion == IPV4_VERSION || 8773 tcp->tcp_ipversion == IPV6_VERSION))); 8774 8775 mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI); 8776 if (!mp) 8777 return (mp); 8778 mp->b_datap->db_type = M_PROTO; 8779 tbr = (struct T_bind_req *)mp->b_rptr; 8780 tbr->PRIM_type = bind_prim; 8781 tbr->ADDR_offset = sizeof (*tbr); 8782 tbr->CONIND_number = 0; 8783 tbr->ADDR_length = addr_length; 8784 cp = (char *)&tbr[1]; 8785 switch (addr_length) { 8786 case sizeof (ipa_conn_t): 8787 ASSERT(tcp->tcp_family == AF_INET); 8788 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 8789 8790 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); 8791 if (mp->b_cont == NULL) { 8792 freemsg(mp); 8793 return (NULL); 8794 } 8795 mp->b_cont->b_wptr += sizeof (ire_t); 8796 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; 8797 8798 /* cp known to be 32 bit aligned */ 8799 ac = (ipa_conn_t *)cp; 8800 ac->ac_laddr = tcp->tcp_ipha->ipha_src; 8801 ac->ac_faddr = tcp->tcp_remote; 8802 ac->ac_fport = tcp->tcp_fport; 8803 ac->ac_lport = tcp->tcp_lport; 8804 tcp->tcp_hard_binding = 1; 8805 break; 8806 8807 case sizeof (ipa6_conn_t): 8808 ASSERT(tcp->tcp_family == AF_INET6); 8809 8810 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); 8811 if (mp->b_cont == NULL) { 8812 freemsg(mp); 8813 return (NULL); 8814 } 8815 mp->b_cont->b_wptr += sizeof (ire_t); 8816 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; 8817 8818 /* cp known to be 32 bit aligned */ 8819 ac6 = (ipa6_conn_t *)cp; 8820 if (tcp->tcp_ipversion == IPV4_VERSION) { 8821 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 8822 &ac6->ac6_laddr); 8823 } else { 8824 ac6->ac6_laddr = tcp->tcp_ip6h->ip6_src; 8825 } 8826 ac6->ac6_faddr = tcp->tcp_remote_v6; 8827 ac6->ac6_fport = tcp->tcp_fport; 8828 ac6->ac6_lport = tcp->tcp_lport; 8829 tcp->tcp_hard_binding = 1; 8830 break; 8831 8832 case sizeof (sin_t): 8833 /* 8834 * NOTE: IPV6_ADDR_LEN also has same size. 8835 * Use family to discriminate. 8836 */ 8837 if (tcp->tcp_family == AF_INET) { 8838 sin = (sin_t *)cp; 8839 8840 *sin = sin_null; 8841 sin->sin_family = AF_INET; 8842 sin->sin_addr.s_addr = tcp->tcp_bound_source; 8843 sin->sin_port = tcp->tcp_lport; 8844 break; 8845 } else { 8846 *(in6_addr_t *)cp = tcp->tcp_bound_source_v6; 8847 } 8848 break; 8849 8850 case sizeof (sin6_t): 8851 ASSERT(tcp->tcp_family == AF_INET6); 8852 sin6 = (sin6_t *)cp; 8853 8854 *sin6 = sin6_null; 8855 sin6->sin6_family = AF_INET6; 8856 sin6->sin6_addr = tcp->tcp_bound_source_v6; 8857 sin6->sin6_port = tcp->tcp_lport; 8858 break; 8859 8860 case IP_ADDR_LEN: 8861 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 8862 *(uint32_t *)cp = tcp->tcp_ipha->ipha_src; 8863 break; 8864 8865 } 8866 /* Add protocol number to end */ 8867 cp[addr_length] = (char)IPPROTO_TCP; 8868 mp->b_wptr = (uchar_t *)&cp[addr_length + 1]; 8869 return (mp); 8870 } 8871 8872 /* 8873 * Notify IP that we are having trouble with this connection. IP should 8874 * blow the IRE away and start over. 
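 * This is only needed for IPv4; IPv6 has neighbor unreachability
 * detection, so the routine simply returns for IPv6 connections
 * (see the check below).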
8875 */ 8876 static void 8877 tcp_ip_notify(tcp_t *tcp) 8878 { 8879 struct iocblk *iocp; 8880 ipid_t *ipid; 8881 mblk_t *mp; 8882 8883 /* IPv6 has NUD thus notification to delete the IRE is not needed */ 8884 if (tcp->tcp_ipversion == IPV6_VERSION) 8885 return; 8886 8887 mp = mkiocb(IP_IOCTL); 8888 if (mp == NULL) 8889 return; 8890 8891 iocp = (struct iocblk *)mp->b_rptr; 8892 iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst); 8893 8894 mp->b_cont = allocb(iocp->ioc_count, BPRI_HI); 8895 if (!mp->b_cont) { 8896 freeb(mp); 8897 return; 8898 } 8899 8900 ipid = (ipid_t *)mp->b_cont->b_rptr; 8901 mp->b_cont->b_wptr += iocp->ioc_count; 8902 bzero(ipid, sizeof (*ipid)); 8903 ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY; 8904 ipid->ipid_ire_type = IRE_CACHE; 8905 ipid->ipid_addr_offset = sizeof (ipid_t); 8906 ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst); 8907 /* 8908 * Note: in the case of source routing we want to blow away the 8909 * route to the first source route hop. 8910 */ 8911 bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1], 8912 sizeof (tcp->tcp_ipha->ipha_dst)); 8913 8914 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 8915 } 8916 8917 /* Unlink and return any mblk that looks like it contains an ire */ 8918 static mblk_t * 8919 tcp_ire_mp(mblk_t *mp) 8920 { 8921 mblk_t *prev_mp; 8922 8923 for (;;) { 8924 prev_mp = mp; 8925 mp = mp->b_cont; 8926 if (mp == NULL) 8927 break; 8928 switch (DB_TYPE(mp)) { 8929 case IRE_DB_TYPE: 8930 case IRE_DB_REQ_TYPE: 8931 if (prev_mp != NULL) 8932 prev_mp->b_cont = mp->b_cont; 8933 mp->b_cont = NULL; 8934 return (mp); 8935 default: 8936 break; 8937 } 8938 } 8939 return (mp); 8940 } 8941 8942 /* 8943 * Timer callback routine for keepalive probe. We do a fake resend of 8944 * last ACKed byte. Then set a timer using RTO. When the timer expires, 8945 * check to see if we have heard anything from the other end for the last 8946 * RTO period. If we have, set the timer to expire for another 8947 * tcp_keepalive_intrvl and check again. If we have not, set a timer using 8948 * RTO << 1 and check again when it expires. Keep exponentially increasing 8949 * the timeout if we have not heard from the other side. If for more than 8950 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything, 8951 * kill the connection unless the keepalive abort threshold is 0. In 8952 * that case, we will probe "forever." 8953 */ 8954 static void 8955 tcp_keepalive_killer(void *arg) 8956 { 8957 mblk_t *mp; 8958 conn_t *connp = (conn_t *)arg; 8959 tcp_t *tcp = connp->conn_tcp; 8960 int32_t firetime; 8961 int32_t idletime; 8962 int32_t ka_intrvl; 8963 8964 tcp->tcp_ka_tid = 0; 8965 8966 if (tcp->tcp_fused) 8967 return; 8968 8969 BUMP_MIB(&tcp_mib, tcpTimKeepalive); 8970 ka_intrvl = tcp->tcp_ka_interval; 8971 8972 /* 8973 * Keepalive probe should only be sent if the application has not 8974 * done a close on the connection. 8975 */ 8976 if (tcp->tcp_state > TCPS_CLOSE_WAIT) { 8977 return; 8978 } 8979 /* Timer fired too early, restart it. */ 8980 if (tcp->tcp_state < TCPS_ESTABLISHED) { 8981 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 8982 MSEC_TO_TICK(ka_intrvl)); 8983 return; 8984 } 8985 8986 idletime = TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time); 8987 /* 8988 * If we have not heard from the other side for a long 8989 * time, kill the connection unless the keepalive abort 8990 * threshold is 0. In that case, we will probe "forever." 
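 *
 * As a purely illustrative example (assuming a 2 hour keepalive
 * interval and an 8 minute abort threshold), the first probe would be
 * sent after 2 idle hours and the connection would be aborted once
 * nothing has been heard for roughly 2 hours and 8 minutes.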
8991 */ 8992 if (tcp->tcp_ka_abort_thres != 0 && 8993 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { 8994 BUMP_MIB(&tcp_mib, tcpTimKeepaliveDrop); 8995 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? 8996 tcp->tcp_client_errno : ETIMEDOUT, 11); 8997 return; 8998 } 8999 9000 if (tcp->tcp_snxt == tcp->tcp_suna && 9001 idletime >= ka_intrvl) { 9002 /* Fake resend of last ACKed byte. */ 9003 mblk_t *mp1 = allocb(1, BPRI_LO); 9004 9005 if (mp1 != NULL) { 9006 *mp1->b_wptr++ = '\0'; 9007 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL, 9008 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE); 9009 freeb(mp1); 9010 /* 9011 * if allocation failed, fall through to start the 9012 * timer back. 9013 */ 9014 if (mp != NULL) { 9015 TCP_RECORD_TRACE(tcp, mp, 9016 TCP_TRACE_SEND_PKT); 9017 tcp_send_data(tcp, tcp->tcp_wq, mp); 9018 BUMP_MIB(&tcp_mib, tcpTimKeepaliveProbe); 9019 if (tcp->tcp_ka_last_intrvl != 0) { 9020 /* 9021 * We should probe again at least 9022 * in ka_intrvl, but not more than 9023 * tcp_rexmit_interval_max. 9024 */ 9025 firetime = MIN(ka_intrvl - 1, 9026 tcp->tcp_ka_last_intrvl << 1); 9027 if (firetime > tcp_rexmit_interval_max) 9028 firetime = 9029 tcp_rexmit_interval_max; 9030 } else { 9031 firetime = tcp->tcp_rto; 9032 } 9033 tcp->tcp_ka_tid = TCP_TIMER(tcp, 9034 tcp_keepalive_killer, 9035 MSEC_TO_TICK(firetime)); 9036 tcp->tcp_ka_last_intrvl = firetime; 9037 return; 9038 } 9039 } 9040 } else { 9041 tcp->tcp_ka_last_intrvl = 0; 9042 } 9043 9044 /* firetime can be negative if (mp1 == NULL || mp == NULL) */ 9045 if ((firetime = ka_intrvl - idletime) < 0) { 9046 firetime = ka_intrvl; 9047 } 9048 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 9049 MSEC_TO_TICK(firetime)); 9050 } 9051 9052 int 9053 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) 9054 { 9055 queue_t *q = tcp->tcp_rq; 9056 int32_t mss = tcp->tcp_mss; 9057 int maxpsz; 9058 9059 if (TCP_IS_DETACHED(tcp)) 9060 return (mss); 9061 9062 if (tcp->tcp_fused) { 9063 maxpsz = tcp_fuse_maxpsz_set(tcp); 9064 mss = INFPSZ; 9065 } else if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) { 9066 /* 9067 * Set the sd_qn_maxpsz according to the socket send buffer 9068 * size, and sd_maxblk to INFPSZ (-1). This will essentially 9069 * instruct the stream head to copyin user data into contiguous 9070 * kernel-allocated buffers without breaking it up into smaller 9071 * chunks. We round up the buffer size to the nearest SMSS. 9072 */ 9073 maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss); 9074 if (tcp->tcp_kssl_ctx == NULL) 9075 mss = INFPSZ; 9076 else 9077 mss = SSL3_MAX_RECORD_LEN; 9078 } else { 9079 /* 9080 * Set sd_qn_maxpsz to approx half the (receivers) buffer 9081 * (and a multiple of the mss). This instructs the stream 9082 * head to break down larger than SMSS writes into SMSS- 9083 * size mblks, up to tcp_maxpsz_multiplier mblks at a time. 9084 */ 9085 maxpsz = tcp->tcp_maxpsz * mss; 9086 if (maxpsz > tcp->tcp_xmit_hiwater/2) { 9087 maxpsz = tcp->tcp_xmit_hiwater/2; 9088 /* Round up to nearest mss */ 9089 maxpsz = MSS_ROUNDUP(maxpsz, mss); 9090 } 9091 } 9092 (void) setmaxps(q, maxpsz); 9093 tcp->tcp_wq->q_maxpsz = maxpsz; 9094 9095 if (set_maxblk) 9096 (void) mi_set_sth_maxblk(q, mss); 9097 9098 return (mss); 9099 } 9100 9101 /* 9102 * Extract option values from a tcp header. We put any found values into the 9103 * tcpopt struct and return a bitmask saying which options were found. 
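 * The bits used are TCP_OPT_MSS_PRESENT, TCP_OPT_WSCALE_PRESENT,
 * TCP_OPT_SACK_OK_PRESENT, TCP_OPT_SACK_PRESENT and
 * TCP_OPT_TSTAMP_PRESENT. SACK blocks are only recorded when
 * tcpopt->tcp points at a connection that is interested in them
 * (i.e. one with tcp_sack_info allocated).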
9104 */ 9105 static int 9106 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) 9107 { 9108 uchar_t *endp; 9109 int len; 9110 uint32_t mss; 9111 uchar_t *up = (uchar_t *)tcph; 9112 int found = 0; 9113 int32_t sack_len; 9114 tcp_seq sack_begin, sack_end; 9115 tcp_t *tcp; 9116 9117 endp = up + TCP_HDR_LENGTH(tcph); 9118 up += TCP_MIN_HEADER_LENGTH; 9119 while (up < endp) { 9120 len = endp - up; 9121 switch (*up) { 9122 case TCPOPT_EOL: 9123 break; 9124 9125 case TCPOPT_NOP: 9126 up++; 9127 continue; 9128 9129 case TCPOPT_MAXSEG: 9130 if (len < TCPOPT_MAXSEG_LEN || 9131 up[1] != TCPOPT_MAXSEG_LEN) 9132 break; 9133 9134 mss = BE16_TO_U16(up+2); 9135 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 9136 tcpopt->tcp_opt_mss = mss; 9137 found |= TCP_OPT_MSS_PRESENT; 9138 9139 up += TCPOPT_MAXSEG_LEN; 9140 continue; 9141 9142 case TCPOPT_WSCALE: 9143 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 9144 break; 9145 9146 if (up[2] > TCP_MAX_WINSHIFT) 9147 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 9148 else 9149 tcpopt->tcp_opt_wscale = up[2]; 9150 found |= TCP_OPT_WSCALE_PRESENT; 9151 9152 up += TCPOPT_WS_LEN; 9153 continue; 9154 9155 case TCPOPT_SACK_PERMITTED: 9156 if (len < TCPOPT_SACK_OK_LEN || 9157 up[1] != TCPOPT_SACK_OK_LEN) 9158 break; 9159 found |= TCP_OPT_SACK_OK_PRESENT; 9160 up += TCPOPT_SACK_OK_LEN; 9161 continue; 9162 9163 case TCPOPT_SACK: 9164 if (len <= 2 || up[1] <= 2 || len < up[1]) 9165 break; 9166 9167 /* If TCP is not interested in SACK blks... */ 9168 if ((tcp = tcpopt->tcp) == NULL) { 9169 up += up[1]; 9170 continue; 9171 } 9172 sack_len = up[1] - TCPOPT_HEADER_LEN; 9173 up += TCPOPT_HEADER_LEN; 9174 9175 /* 9176 * If the list is empty, allocate one and assume 9177 * nothing is sack'ed. 9178 */ 9179 ASSERT(tcp->tcp_sack_info != NULL); 9180 if (tcp->tcp_notsack_list == NULL) { 9181 tcp_notsack_update(&(tcp->tcp_notsack_list), 9182 tcp->tcp_suna, tcp->tcp_snxt, 9183 &(tcp->tcp_num_notsack_blk), 9184 &(tcp->tcp_cnt_notsack_list)); 9185 9186 /* 9187 * Make sure tcp_notsack_list is not NULL. 9188 * This happens when kmem_alloc(KM_NOSLEEP) 9189 * returns NULL. 9190 */ 9191 if (tcp->tcp_notsack_list == NULL) { 9192 up += sack_len; 9193 continue; 9194 } 9195 tcp->tcp_fack = tcp->tcp_suna; 9196 } 9197 9198 while (sack_len > 0) { 9199 if (up + 8 > endp) { 9200 up = endp; 9201 break; 9202 } 9203 sack_begin = BE32_TO_U32(up); 9204 up += 4; 9205 sack_end = BE32_TO_U32(up); 9206 up += 4; 9207 sack_len -= 8; 9208 /* 9209 * Bounds checking. Make sure the SACK 9210 * info is within tcp_suna and tcp_snxt. 9211 * If this SACK blk is out of bound, ignore 9212 * it but continue to parse the following 9213 * blks. 
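 * In other words, a block is accepted only when
 * tcp_suna <= sack_begin < sack_end <= tcp_snxt.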
9214 */ 9215 if (SEQ_LEQ(sack_end, sack_begin) || 9216 SEQ_LT(sack_begin, tcp->tcp_suna) || 9217 SEQ_GT(sack_end, tcp->tcp_snxt)) { 9218 continue; 9219 } 9220 tcp_notsack_insert(&(tcp->tcp_notsack_list), 9221 sack_begin, sack_end, 9222 &(tcp->tcp_num_notsack_blk), 9223 &(tcp->tcp_cnt_notsack_list)); 9224 if (SEQ_GT(sack_end, tcp->tcp_fack)) { 9225 tcp->tcp_fack = sack_end; 9226 } 9227 } 9228 found |= TCP_OPT_SACK_PRESENT; 9229 continue; 9230 9231 case TCPOPT_TSTAMP: 9232 if (len < TCPOPT_TSTAMP_LEN || 9233 up[1] != TCPOPT_TSTAMP_LEN) 9234 break; 9235 9236 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); 9237 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); 9238 9239 found |= TCP_OPT_TSTAMP_PRESENT; 9240 9241 up += TCPOPT_TSTAMP_LEN; 9242 continue; 9243 9244 default: 9245 if (len <= 1 || len < (int)up[1] || up[1] == 0) 9246 break; 9247 up += up[1]; 9248 continue; 9249 } 9250 break; 9251 } 9252 return (found); 9253 } 9254 9255 /* 9256 * Set the mss associated with a particular tcp based on its current value, 9257 * and a new one passed in. Observe minimums and maximums, and reset 9258 * other state variables that we want to view as multiples of mss. 9259 * 9260 * This function is called in various places mainly because 9261 * 1) Various stuffs, tcp_mss, tcp_cwnd, ... need to be adjusted when the 9262 * other side's SYN/SYN-ACK packet arrives. 9263 * 2) PMTUd may get us a new MSS. 9264 * 3) If the other side stops sending us timestamp option, we need to 9265 * increase the MSS size to use the extra bytes available. 9266 */ 9267 static void 9268 tcp_mss_set(tcp_t *tcp, uint32_t mss) 9269 { 9270 uint32_t mss_max; 9271 9272 if (tcp->tcp_ipversion == IPV4_VERSION) 9273 mss_max = tcp_mss_max_ipv4; 9274 else 9275 mss_max = tcp_mss_max_ipv6; 9276 9277 if (mss < tcp_mss_min) 9278 mss = tcp_mss_min; 9279 if (mss > mss_max) 9280 mss = mss_max; 9281 /* 9282 * Unless naglim has been set by our client to 9283 * a non-mss value, force naglim to track mss. 9284 * This can help to aggregate small writes. 9285 */ 9286 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) 9287 tcp->tcp_naglim = mss; 9288 /* 9289 * TCP should be able to buffer at least 4 MSS data for obvious 9290 * performance reason. 9291 */ 9292 if ((mss << 2) > tcp->tcp_xmit_hiwater) 9293 tcp->tcp_xmit_hiwater = mss << 2; 9294 9295 /* 9296 * Check if we need to apply the tcp_init_cwnd here. If 9297 * it is set and the MSS gets bigger (should not happen 9298 * normally), we need to adjust the resulting tcp_cwnd properly. 9299 * The new tcp_cwnd should not get bigger. 9300 */ 9301 if (tcp->tcp_init_cwnd == 0) { 9302 tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss, 9303 MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))); 9304 } else { 9305 if (tcp->tcp_mss < mss) { 9306 tcp->tcp_cwnd = MAX(1, 9307 (tcp->tcp_init_cwnd * tcp->tcp_mss / mss)) * mss; 9308 } else { 9309 tcp->tcp_cwnd = tcp->tcp_init_cwnd * mss; 9310 } 9311 } 9312 tcp->tcp_mss = mss; 9313 tcp->tcp_cwnd_cnt = 0; 9314 (void) tcp_maxpsz_set(tcp, B_TRUE); 9315 } 9316 9317 static int 9318 tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9319 { 9320 tcp_t *tcp = NULL; 9321 conn_t *connp; 9322 int err; 9323 dev_t conn_dev; 9324 zoneid_t zoneid = getzoneid(); 9325 9326 /* 9327 * Special case for install: miniroot needs to be able to access files 9328 * via NFS as though it were always in the global zone. 
9329 */ 9330 if (credp == kcred && nfs_global_client_only != 0) 9331 zoneid = GLOBAL_ZONEID; 9332 9333 if (q->q_ptr != NULL) 9334 return (0); 9335 9336 if (sflag == MODOPEN) { 9337 /* 9338 * This is a special case. The purpose of a modopen 9339 * is to allow just the T_SVR4_OPTMGMT_REQ to pass 9340 * through for MIB browsers. Everything else is failed. 9341 */ 9342 connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt)); 9343 9344 if (connp == NULL) 9345 return (ENOMEM); 9346 9347 connp->conn_flags |= IPCL_TCPMOD; 9348 connp->conn_cred = credp; 9349 connp->conn_zoneid = zoneid; 9350 q->q_ptr = WR(q)->q_ptr = connp; 9351 crhold(credp); 9352 q->q_qinfo = &tcp_mod_rinit; 9353 WR(q)->q_qinfo = &tcp_mod_winit; 9354 qprocson(q); 9355 return (0); 9356 } 9357 9358 if ((conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) 9359 return (EBUSY); 9360 9361 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 9362 9363 if (flag & SO_ACCEPTOR) { 9364 q->q_qinfo = &tcp_acceptor_rinit; 9365 q->q_ptr = (void *)conn_dev; 9366 WR(q)->q_qinfo = &tcp_acceptor_winit; 9367 WR(q)->q_ptr = (void *)conn_dev; 9368 qprocson(q); 9369 return (0); 9370 } 9371 9372 connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt)); 9373 if (connp == NULL) { 9374 inet_minor_free(ip_minor_arena, conn_dev); 9375 q->q_ptr = NULL; 9376 return (ENOSR); 9377 } 9378 connp->conn_sqp = IP_SQUEUE_GET(lbolt); 9379 tcp = connp->conn_tcp; 9380 9381 q->q_ptr = WR(q)->q_ptr = connp; 9382 if (getmajor(*devp) == TCP6_MAJ) { 9383 connp->conn_flags |= (IPCL_TCP6|IPCL_ISV6); 9384 connp->conn_send = ip_output_v6; 9385 connp->conn_af_isv6 = B_TRUE; 9386 connp->conn_pkt_isv6 = B_TRUE; 9387 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; 9388 tcp->tcp_ipversion = IPV6_VERSION; 9389 tcp->tcp_family = AF_INET6; 9390 tcp->tcp_mss = tcp_mss_def_ipv6; 9391 } else { 9392 connp->conn_flags |= IPCL_TCP4; 9393 connp->conn_send = ip_output; 9394 connp->conn_af_isv6 = B_FALSE; 9395 connp->conn_pkt_isv6 = B_FALSE; 9396 tcp->tcp_ipversion = IPV4_VERSION; 9397 tcp->tcp_family = AF_INET; 9398 tcp->tcp_mss = tcp_mss_def_ipv4; 9399 } 9400 9401 /* 9402 * TCP keeps a copy of cred for cache locality reasons but 9403 * we put a reference only once. If connp->conn_cred 9404 * becomes invalid, tcp_cred should also be set to NULL. 9405 */ 9406 tcp->tcp_cred = connp->conn_cred = credp; 9407 crhold(connp->conn_cred); 9408 tcp->tcp_cpid = curproc->p_pid; 9409 connp->conn_zoneid = zoneid; 9410 connp->conn_mlp_type = mlptSingle; 9411 connp->conn_ulp_labeled = !is_system_labeled(); 9412 9413 /* 9414 * If the caller has the process-wide flag set, then default to MAC 9415 * exempt mode. This allows read-down to unlabeled hosts. 9416 */ 9417 if (getpflags(NET_MAC_AWARE, credp) != 0) 9418 connp->conn_mac_exempt = B_TRUE; 9419 9420 connp->conn_dev = conn_dev; 9421 9422 ASSERT(q->q_qinfo == &tcp_rinit); 9423 ASSERT(WR(q)->q_qinfo == &tcp_winit); 9424 9425 if (flag & SO_SOCKSTR) { 9426 /* 9427 * No need to insert a socket in tcp acceptor hash. 9428 * If it was a socket acceptor stream, we dealt with 9429 * it above. A socket listener can never accept a 9430 * connection and doesn't need acceptor_id. 
9431 */ 9432 connp->conn_flags |= IPCL_SOCKET; 9433 tcp->tcp_issocket = 1; 9434 WR(q)->q_qinfo = &tcp_sock_winit; 9435 } else { 9436 #ifdef _ILP32 9437 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); 9438 #else 9439 tcp->tcp_acceptor_id = conn_dev; 9440 #endif /* _ILP32 */ 9441 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 9442 } 9443 9444 if (tcp_trace) 9445 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_SLEEP); 9446 9447 err = tcp_init(tcp, q); 9448 if (err != 0) { 9449 inet_minor_free(ip_minor_arena, connp->conn_dev); 9450 tcp_acceptor_hash_remove(tcp); 9451 CONN_DEC_REF(connp); 9452 q->q_ptr = WR(q)->q_ptr = NULL; 9453 return (err); 9454 } 9455 9456 RD(q)->q_hiwat = tcp_recv_hiwat; 9457 tcp->tcp_rwnd = tcp_recv_hiwat; 9458 9459 /* Non-zero default values */ 9460 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 9461 /* 9462 * Put the ref for TCP. Ref for IP was already put 9463 * by ipcl_conn_create. Also Make the conn_t globally 9464 * visible to walkers 9465 */ 9466 mutex_enter(&connp->conn_lock); 9467 CONN_INC_REF_LOCKED(connp); 9468 ASSERT(connp->conn_ref == 2); 9469 connp->conn_state_flags &= ~CONN_INCIPIENT; 9470 mutex_exit(&connp->conn_lock); 9471 9472 qprocson(q); 9473 return (0); 9474 } 9475 9476 /* 9477 * Some TCP options can be "set" by requesting them in the option 9478 * buffer. This is needed for XTI feature test though we do not 9479 * allow it in general. We interpret that this mechanism is more 9480 * applicable to OSI protocols and need not be allowed in general. 9481 * This routine filters out options for which it is not allowed (most) 9482 * and lets through those (few) for which it is. [ The XTI interface 9483 * test suite specifics will imply that any XTI_GENERIC level XTI_* if 9484 * ever implemented will have to be allowed here ]. 9485 */ 9486 static boolean_t 9487 tcp_allow_connopt_set(int level, int name) 9488 { 9489 9490 switch (level) { 9491 case IPPROTO_TCP: 9492 switch (name) { 9493 case TCP_NODELAY: 9494 return (B_TRUE); 9495 default: 9496 return (B_FALSE); 9497 } 9498 /*NOTREACHED*/ 9499 default: 9500 return (B_FALSE); 9501 } 9502 /*NOTREACHED*/ 9503 } 9504 9505 /* 9506 * This routine gets default values of certain options whose default 9507 * values are maintained by protocol specific code 9508 */ 9509 /* ARGSUSED */ 9510 int 9511 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 9512 { 9513 int32_t *i1 = (int32_t *)ptr; 9514 9515 switch (level) { 9516 case IPPROTO_TCP: 9517 switch (name) { 9518 case TCP_NOTIFY_THRESHOLD: 9519 *i1 = tcp_ip_notify_interval; 9520 break; 9521 case TCP_ABORT_THRESHOLD: 9522 *i1 = tcp_ip_abort_interval; 9523 break; 9524 case TCP_CONN_NOTIFY_THRESHOLD: 9525 *i1 = tcp_ip_notify_cinterval; 9526 break; 9527 case TCP_CONN_ABORT_THRESHOLD: 9528 *i1 = tcp_ip_abort_cinterval; 9529 break; 9530 default: 9531 return (-1); 9532 } 9533 break; 9534 case IPPROTO_IP: 9535 switch (name) { 9536 case IP_TTL: 9537 *i1 = tcp_ipv4_ttl; 9538 break; 9539 default: 9540 return (-1); 9541 } 9542 break; 9543 case IPPROTO_IPV6: 9544 switch (name) { 9545 case IPV6_UNICAST_HOPS: 9546 *i1 = tcp_ipv6_hoplimit; 9547 break; 9548 default: 9549 return (-1); 9550 } 9551 break; 9552 default: 9553 return (-1); 9554 } 9555 return (sizeof (int)); 9556 } 9557 9558 9559 /* 9560 * TCP routine to get the values of options. 
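 * Returns the length of the value placed at ptr, 0 when there is no
 * value to return (e.g. an unset IPV6_NEXTHOP), -1 if the option is
 * not supported at this level, or a negative errno (e.g. -EINVAL)
 * when the option is handled at the IP level instead.
 *
 * A purely illustrative user-level caller would reach this code via
 * getsockopt(3SOCKET), e.g.:
 *
 *	int nodelay;
 *	socklen_t len = sizeof (nodelay);
 *	(void) getsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &nodelay, &len);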
9561 */ 9562 int 9563 tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 9564 { 9565 int *i1 = (int *)ptr; 9566 conn_t *connp = Q_TO_CONN(q); 9567 tcp_t *tcp = connp->conn_tcp; 9568 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 9569 9570 switch (level) { 9571 case SOL_SOCKET: 9572 switch (name) { 9573 case SO_LINGER: { 9574 struct linger *lgr = (struct linger *)ptr; 9575 9576 lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0; 9577 lgr->l_linger = tcp->tcp_lingertime; 9578 } 9579 return (sizeof (struct linger)); 9580 case SO_DEBUG: 9581 *i1 = tcp->tcp_debug ? SO_DEBUG : 0; 9582 break; 9583 case SO_KEEPALIVE: 9584 *i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0; 9585 break; 9586 case SO_DONTROUTE: 9587 *i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0; 9588 break; 9589 case SO_USELOOPBACK: 9590 *i1 = tcp->tcp_useloopback ? SO_USELOOPBACK : 0; 9591 break; 9592 case SO_BROADCAST: 9593 *i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0; 9594 break; 9595 case SO_REUSEADDR: 9596 *i1 = tcp->tcp_reuseaddr ? SO_REUSEADDR : 0; 9597 break; 9598 case SO_OOBINLINE: 9599 *i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0; 9600 break; 9601 case SO_DGRAM_ERRIND: 9602 *i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0; 9603 break; 9604 case SO_TYPE: 9605 *i1 = SOCK_STREAM; 9606 break; 9607 case SO_SNDBUF: 9608 *i1 = tcp->tcp_xmit_hiwater; 9609 break; 9610 case SO_RCVBUF: 9611 *i1 = RD(q)->q_hiwat; 9612 break; 9613 case SO_SND_COPYAVOID: 9614 *i1 = tcp->tcp_snd_zcopy_on ? 9615 SO_SND_COPYAVOID : 0; 9616 break; 9617 case SO_ALLZONES: 9618 *i1 = connp->conn_allzones ? 1 : 0; 9619 break; 9620 case SO_ANON_MLP: 9621 *i1 = connp->conn_anon_mlp; 9622 break; 9623 case SO_MAC_EXEMPT: 9624 *i1 = connp->conn_mac_exempt; 9625 break; 9626 case SO_EXCLBIND: 9627 *i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0; 9628 break; 9629 default: 9630 return (-1); 9631 } 9632 break; 9633 case IPPROTO_TCP: 9634 switch (name) { 9635 case TCP_NODELAY: 9636 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; 9637 break; 9638 case TCP_MAXSEG: 9639 *i1 = tcp->tcp_mss; 9640 break; 9641 case TCP_NOTIFY_THRESHOLD: 9642 *i1 = (int)tcp->tcp_first_timer_threshold; 9643 break; 9644 case TCP_ABORT_THRESHOLD: 9645 *i1 = tcp->tcp_second_timer_threshold; 9646 break; 9647 case TCP_CONN_NOTIFY_THRESHOLD: 9648 *i1 = tcp->tcp_first_ctimer_threshold; 9649 break; 9650 case TCP_CONN_ABORT_THRESHOLD: 9651 *i1 = tcp->tcp_second_ctimer_threshold; 9652 break; 9653 case TCP_RECVDSTADDR: 9654 *i1 = tcp->tcp_recvdstaddr; 9655 break; 9656 case TCP_ANONPRIVBIND: 9657 *i1 = tcp->tcp_anon_priv_bind; 9658 break; 9659 case TCP_EXCLBIND: 9660 *i1 = tcp->tcp_exclbind ? TCP_EXCLBIND : 0; 9661 break; 9662 case TCP_INIT_CWND: 9663 *i1 = tcp->tcp_init_cwnd; 9664 break; 9665 case TCP_KEEPALIVE_THRESHOLD: 9666 *i1 = tcp->tcp_ka_interval; 9667 break; 9668 case TCP_KEEPALIVE_ABORT_THRESHOLD: 9669 *i1 = tcp->tcp_ka_abort_thres; 9670 break; 9671 case TCP_CORK: 9672 *i1 = tcp->tcp_cork; 9673 break; 9674 default: 9675 return (-1); 9676 } 9677 break; 9678 case IPPROTO_IP: 9679 if (tcp->tcp_family != AF_INET) 9680 return (-1); 9681 switch (name) { 9682 case IP_OPTIONS: 9683 case T_IP_OPTIONS: { 9684 /* 9685 * This is compatible with BSD in that in only return 9686 * the reverse source route with the final destination 9687 * as the last entry. The first 4 bytes of the option 9688 * will contain the final destination. 
9689 */ 9690 int opt_len; 9691 9692 opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha; 9693 opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH; 9694 ASSERT(opt_len >= 0); 9695 /* Caller ensures enough space */ 9696 if (opt_len > 0) { 9697 /* 9698 * TODO: Do we have to handle getsockopt on an 9699 * initiator as well? 9700 */ 9701 return (ip_opt_get_user(tcp->tcp_ipha, ptr)); 9702 } 9703 return (0); 9704 } 9705 case IP_TOS: 9706 case T_IP_TOS: 9707 *i1 = (int)tcp->tcp_ipha->ipha_type_of_service; 9708 break; 9709 case IP_TTL: 9710 *i1 = (int)tcp->tcp_ipha->ipha_ttl; 9711 break; 9712 case IP_NEXTHOP: 9713 /* Handled at IP level */ 9714 return (-EINVAL); 9715 default: 9716 return (-1); 9717 } 9718 break; 9719 case IPPROTO_IPV6: 9720 /* 9721 * IPPROTO_IPV6 options are only supported for sockets 9722 * that are using IPv6 on the wire. 9723 */ 9724 if (tcp->tcp_ipversion != IPV6_VERSION) { 9725 return (-1); 9726 } 9727 switch (name) { 9728 case IPV6_UNICAST_HOPS: 9729 *i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops; 9730 break; /* goto sizeof (int) option return */ 9731 case IPV6_BOUND_IF: 9732 /* Zero if not set */ 9733 *i1 = tcp->tcp_bound_if; 9734 break; /* goto sizeof (int) option return */ 9735 case IPV6_RECVPKTINFO: 9736 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) 9737 *i1 = 1; 9738 else 9739 *i1 = 0; 9740 break; /* goto sizeof (int) option return */ 9741 case IPV6_RECVTCLASS: 9742 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS) 9743 *i1 = 1; 9744 else 9745 *i1 = 0; 9746 break; /* goto sizeof (int) option return */ 9747 case IPV6_RECVHOPLIMIT: 9748 if (tcp->tcp_ipv6_recvancillary & 9749 TCP_IPV6_RECVHOPLIMIT) 9750 *i1 = 1; 9751 else 9752 *i1 = 0; 9753 break; /* goto sizeof (int) option return */ 9754 case IPV6_RECVHOPOPTS: 9755 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) 9756 *i1 = 1; 9757 else 9758 *i1 = 0; 9759 break; /* goto sizeof (int) option return */ 9760 case IPV6_RECVDSTOPTS: 9761 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS) 9762 *i1 = 1; 9763 else 9764 *i1 = 0; 9765 break; /* goto sizeof (int) option return */ 9766 case _OLD_IPV6_RECVDSTOPTS: 9767 if (tcp->tcp_ipv6_recvancillary & 9768 TCP_OLD_IPV6_RECVDSTOPTS) 9769 *i1 = 1; 9770 else 9771 *i1 = 0; 9772 break; /* goto sizeof (int) option return */ 9773 case IPV6_RECVRTHDR: 9774 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) 9775 *i1 = 1; 9776 else 9777 *i1 = 0; 9778 break; /* goto sizeof (int) option return */ 9779 case IPV6_RECVRTHDRDSTOPTS: 9780 if (tcp->tcp_ipv6_recvancillary & 9781 TCP_IPV6_RECVRTDSTOPTS) 9782 *i1 = 1; 9783 else 9784 *i1 = 0; 9785 break; /* goto sizeof (int) option return */ 9786 case IPV6_PKTINFO: { 9787 /* XXX assumes that caller has room for max size! 
*/ 9788 struct in6_pktinfo *pkti; 9789 9790 pkti = (struct in6_pktinfo *)ptr; 9791 if (ipp->ipp_fields & IPPF_IFINDEX) 9792 pkti->ipi6_ifindex = ipp->ipp_ifindex; 9793 else 9794 pkti->ipi6_ifindex = 0; 9795 if (ipp->ipp_fields & IPPF_ADDR) 9796 pkti->ipi6_addr = ipp->ipp_addr; 9797 else 9798 pkti->ipi6_addr = ipv6_all_zeros; 9799 return (sizeof (struct in6_pktinfo)); 9800 } 9801 case IPV6_TCLASS: 9802 if (ipp->ipp_fields & IPPF_TCLASS) 9803 *i1 = ipp->ipp_tclass; 9804 else 9805 *i1 = IPV6_FLOW_TCLASS( 9806 IPV6_DEFAULT_VERS_AND_FLOW); 9807 break; /* goto sizeof (int) option return */ 9808 case IPV6_NEXTHOP: { 9809 sin6_t *sin6 = (sin6_t *)ptr; 9810 9811 if (!(ipp->ipp_fields & IPPF_NEXTHOP)) 9812 return (0); 9813 *sin6 = sin6_null; 9814 sin6->sin6_family = AF_INET6; 9815 sin6->sin6_addr = ipp->ipp_nexthop; 9816 return (sizeof (sin6_t)); 9817 } 9818 case IPV6_HOPOPTS: 9819 if (!(ipp->ipp_fields & IPPF_HOPOPTS)) 9820 return (0); 9821 if (ipp->ipp_hopoptslen <= tcp->tcp_label_len) 9822 return (0); 9823 bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len, 9824 ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len); 9825 if (tcp->tcp_label_len > 0) { 9826 ptr[0] = ((char *)ipp->ipp_hopopts)[0]; 9827 ptr[1] = (ipp->ipp_hopoptslen - 9828 tcp->tcp_label_len + 7) / 8 - 1; 9829 } 9830 return (ipp->ipp_hopoptslen - tcp->tcp_label_len); 9831 case IPV6_RTHDRDSTOPTS: 9832 if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) 9833 return (0); 9834 bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); 9835 return (ipp->ipp_rtdstoptslen); 9836 case IPV6_RTHDR: 9837 if (!(ipp->ipp_fields & IPPF_RTHDR)) 9838 return (0); 9839 bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); 9840 return (ipp->ipp_rthdrlen); 9841 case IPV6_DSTOPTS: 9842 if (!(ipp->ipp_fields & IPPF_DSTOPTS)) 9843 return (0); 9844 bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); 9845 return (ipp->ipp_dstoptslen); 9846 case IPV6_SRC_PREFERENCES: 9847 return (ip6_get_src_preferences(connp, 9848 (uint32_t *)ptr)); 9849 case IPV6_PATHMTU: { 9850 struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr; 9851 9852 if (tcp->tcp_state < TCPS_ESTABLISHED) 9853 return (-1); 9854 9855 return (ip_fill_mtuinfo(&connp->conn_remv6, 9856 connp->conn_fport, mtuinfo)); 9857 } 9858 default: 9859 return (-1); 9860 } 9861 break; 9862 default: 9863 return (-1); 9864 } 9865 return (sizeof (int)); 9866 } 9867 9868 /* 9869 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 9870 * Parameters are assumed to be verified by the caller. 9871 */ 9872 /* ARGSUSED */ 9873 int 9874 tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, 9875 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 9876 void *thisdg_attrs, cred_t *cr, mblk_t *mblk) 9877 { 9878 conn_t *connp = Q_TO_CONN(q); 9879 tcp_t *tcp = connp->conn_tcp; 9880 int *i1 = (int *)invalp; 9881 boolean_t onoff = (*i1 == 0) ? 0 : 1; 9882 boolean_t checkonly; 9883 int reterr; 9884 9885 switch (optset_context) { 9886 case SETFN_OPTCOM_CHECKONLY: 9887 checkonly = B_TRUE; 9888 /* 9889 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 9890 * inlen != 0 implies value supplied and 9891 * we have to "pretend" to set it. 9892 * inlen == 0 implies that there is no 9893 * value part in T_CHECK request and just validation 9894 * done elsewhere should be enough, we just return here. 
9895 */ 9896 if (inlen == 0) { 9897 *outlenp = 0; 9898 return (0); 9899 } 9900 break; 9901 case SETFN_OPTCOM_NEGOTIATE: 9902 checkonly = B_FALSE; 9903 break; 9904 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 9905 case SETFN_CONN_NEGOTIATE: 9906 checkonly = B_FALSE; 9907 /* 9908 * Negotiating local and "association-related" options 9909 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 9910 * primitives is allowed by XTI, but we choose 9911 * to not implement this style negotiation for Internet 9912 * protocols (We interpret it is a must for OSI world but 9913 * optional for Internet protocols) for all options. 9914 * [ Will do only for the few options that enable test 9915 * suites that our XTI implementation of this feature 9916 * works for transports that do allow it ] 9917 */ 9918 if (!tcp_allow_connopt_set(level, name)) { 9919 *outlenp = 0; 9920 return (EINVAL); 9921 } 9922 break; 9923 default: 9924 /* 9925 * We should never get here 9926 */ 9927 *outlenp = 0; 9928 return (EINVAL); 9929 } 9930 9931 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 9932 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 9933 9934 /* 9935 * For TCP, we should have no ancillary data sent down 9936 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 9937 * has to be zero. 9938 */ 9939 ASSERT(thisdg_attrs == NULL); 9940 9941 /* 9942 * For fixed length options, no sanity check 9943 * of passed in length is done. It is assumed *_optcom_req() 9944 * routines do the right thing. 9945 */ 9946 9947 switch (level) { 9948 case SOL_SOCKET: 9949 switch (name) { 9950 case SO_LINGER: { 9951 struct linger *lgr = (struct linger *)invalp; 9952 9953 if (!checkonly) { 9954 if (lgr->l_onoff) { 9955 tcp->tcp_linger = 1; 9956 tcp->tcp_lingertime = lgr->l_linger; 9957 } else { 9958 tcp->tcp_linger = 0; 9959 tcp->tcp_lingertime = 0; 9960 } 9961 /* struct copy */ 9962 *(struct linger *)outvalp = *lgr; 9963 } else { 9964 if (!lgr->l_onoff) { 9965 ((struct linger *)outvalp)->l_onoff = 0; 9966 ((struct linger *)outvalp)->l_linger = 0; 9967 } else { 9968 /* struct copy */ 9969 *(struct linger *)outvalp = *lgr; 9970 } 9971 } 9972 *outlenp = sizeof (struct linger); 9973 return (0); 9974 } 9975 case SO_DEBUG: 9976 if (!checkonly) 9977 tcp->tcp_debug = onoff; 9978 break; 9979 case SO_KEEPALIVE: 9980 if (checkonly) { 9981 /* T_CHECK case */ 9982 break; 9983 } 9984 9985 if (!onoff) { 9986 if (tcp->tcp_ka_enabled) { 9987 if (tcp->tcp_ka_tid != 0) { 9988 (void) TCP_TIMER_CANCEL(tcp, 9989 tcp->tcp_ka_tid); 9990 tcp->tcp_ka_tid = 0; 9991 } 9992 tcp->tcp_ka_enabled = 0; 9993 } 9994 break; 9995 } 9996 if (!tcp->tcp_ka_enabled) { 9997 /* Crank up the keepalive timer */ 9998 tcp->tcp_ka_last_intrvl = 0; 9999 tcp->tcp_ka_tid = TCP_TIMER(tcp, 10000 tcp_keepalive_killer, 10001 MSEC_TO_TICK(tcp->tcp_ka_interval)); 10002 tcp->tcp_ka_enabled = 1; 10003 } 10004 break; 10005 case SO_DONTROUTE: 10006 /* 10007 * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are 10008 * only of interest to IP. We track them here only so 10009 * that we can report their current value. 
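 * The corresponding conn_t fields are updated below so that IP sees
 * the new settings as well.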
10010 */ 10011 if (!checkonly) { 10012 tcp->tcp_dontroute = onoff; 10013 tcp->tcp_connp->conn_dontroute = onoff; 10014 } 10015 break; 10016 case SO_USELOOPBACK: 10017 if (!checkonly) { 10018 tcp->tcp_useloopback = onoff; 10019 tcp->tcp_connp->conn_loopback = onoff; 10020 } 10021 break; 10022 case SO_BROADCAST: 10023 if (!checkonly) { 10024 tcp->tcp_broadcast = onoff; 10025 tcp->tcp_connp->conn_broadcast = onoff; 10026 } 10027 break; 10028 case SO_REUSEADDR: 10029 if (!checkonly) { 10030 tcp->tcp_reuseaddr = onoff; 10031 tcp->tcp_connp->conn_reuseaddr = onoff; 10032 } 10033 break; 10034 case SO_OOBINLINE: 10035 if (!checkonly) 10036 tcp->tcp_oobinline = onoff; 10037 break; 10038 case SO_DGRAM_ERRIND: 10039 if (!checkonly) 10040 tcp->tcp_dgram_errind = onoff; 10041 break; 10042 case SO_SNDBUF: { 10043 tcp_t *peer_tcp; 10044 10045 if (*i1 > tcp_max_buf) { 10046 *outlenp = 0; 10047 return (ENOBUFS); 10048 } 10049 if (checkonly) 10050 break; 10051 10052 tcp->tcp_xmit_hiwater = *i1; 10053 if (tcp_snd_lowat_fraction != 0) 10054 tcp->tcp_xmit_lowater = 10055 tcp->tcp_xmit_hiwater / 10056 tcp_snd_lowat_fraction; 10057 (void) tcp_maxpsz_set(tcp, B_TRUE); 10058 /* 10059 * If we are flow-controlled, recheck the condition. 10060 * There are apps that increase SO_SNDBUF size when 10061 * flow-controlled (EWOULDBLOCK), and expect the flow 10062 * control condition to be lifted right away. 10063 * 10064 * For the fused tcp loopback case, in order to avoid 10065 * a race with the peer's tcp_fuse_rrw() we need to 10066 * hold its fuse_lock while accessing tcp_flow_stopped. 10067 */ 10068 peer_tcp = tcp->tcp_loopback_peer; 10069 ASSERT(!tcp->tcp_fused || peer_tcp != NULL); 10070 if (tcp->tcp_fused) 10071 mutex_enter(&peer_tcp->tcp_fuse_lock); 10072 10073 if (tcp->tcp_flow_stopped && 10074 TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) { 10075 tcp_clrqfull(tcp); 10076 } 10077 if (tcp->tcp_fused) 10078 mutex_exit(&peer_tcp->tcp_fuse_lock); 10079 break; 10080 } 10081 case SO_RCVBUF: 10082 if (*i1 > tcp_max_buf) { 10083 *outlenp = 0; 10084 return (ENOBUFS); 10085 } 10086 /* Silently ignore zero */ 10087 if (!checkonly && *i1 != 0) { 10088 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 10089 (void) tcp_rwnd_set(tcp, *i1); 10090 } 10091 /* 10092 * XXX should we return the rwnd here 10093 * and tcp_opt_get ? 10094 */ 10095 break; 10096 case SO_SND_COPYAVOID: 10097 if (!checkonly) { 10098 /* we only allow enable at most once for now */ 10099 if (tcp->tcp_loopback || 10100 (!tcp->tcp_snd_zcopy_aware && 10101 (onoff != 1 || !tcp_zcopy_check(tcp)))) { 10102 *outlenp = 0; 10103 return (EOPNOTSUPP); 10104 } 10105 tcp->tcp_snd_zcopy_aware = 1; 10106 } 10107 break; 10108 case SO_ALLZONES: 10109 /* Handled at the IP level */ 10110 return (-EINVAL); 10111 case SO_ANON_MLP: 10112 if (!checkonly) { 10113 mutex_enter(&connp->conn_lock); 10114 connp->conn_anon_mlp = onoff; 10115 mutex_exit(&connp->conn_lock); 10116 } 10117 break; 10118 case SO_MAC_EXEMPT: 10119 if (secpolicy_net_mac_aware(cr) != 0 || 10120 IPCL_IS_BOUND(connp)) 10121 return (EACCES); 10122 if (!checkonly) { 10123 mutex_enter(&connp->conn_lock); 10124 connp->conn_mac_exempt = onoff; 10125 mutex_exit(&connp->conn_lock); 10126 } 10127 break; 10128 case SO_EXCLBIND: 10129 if (!checkonly) 10130 tcp->tcp_exclbind = onoff; 10131 break; 10132 default: 10133 *outlenp = 0; 10134 return (EINVAL); 10135 } 10136 break; 10137 case IPPROTO_TCP: 10138 switch (name) { 10139 case TCP_NODELAY: 10140 if (!checkonly) 10141 tcp->tcp_naglim = *i1 ? 
1 : tcp->tcp_mss; 10142 break; 10143 case TCP_NOTIFY_THRESHOLD: 10144 if (!checkonly) 10145 tcp->tcp_first_timer_threshold = *i1; 10146 break; 10147 case TCP_ABORT_THRESHOLD: 10148 if (!checkonly) 10149 tcp->tcp_second_timer_threshold = *i1; 10150 break; 10151 case TCP_CONN_NOTIFY_THRESHOLD: 10152 if (!checkonly) 10153 tcp->tcp_first_ctimer_threshold = *i1; 10154 break; 10155 case TCP_CONN_ABORT_THRESHOLD: 10156 if (!checkonly) 10157 tcp->tcp_second_ctimer_threshold = *i1; 10158 break; 10159 case TCP_RECVDSTADDR: 10160 if (tcp->tcp_state > TCPS_LISTEN) 10161 return (EOPNOTSUPP); 10162 if (!checkonly) 10163 tcp->tcp_recvdstaddr = onoff; 10164 break; 10165 case TCP_ANONPRIVBIND: 10166 if ((reterr = secpolicy_net_privaddr(cr, 0)) != 0) { 10167 *outlenp = 0; 10168 return (reterr); 10169 } 10170 if (!checkonly) { 10171 tcp->tcp_anon_priv_bind = onoff; 10172 } 10173 break; 10174 case TCP_EXCLBIND: 10175 if (!checkonly) 10176 tcp->tcp_exclbind = onoff; 10177 break; /* goto sizeof (int) option return */ 10178 case TCP_INIT_CWND: { 10179 uint32_t init_cwnd = *((uint32_t *)invalp); 10180 10181 if (checkonly) 10182 break; 10183 10184 /* 10185 * Only allow socket with network configuration 10186 * privilege to set the initial cwnd to be larger 10187 * than allowed by RFC 3390. 10188 */ 10189 if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 10190 tcp->tcp_init_cwnd = init_cwnd; 10191 break; 10192 } 10193 if ((reterr = secpolicy_net_config(cr, B_TRUE)) != 0) { 10194 *outlenp = 0; 10195 return (reterr); 10196 } 10197 if (init_cwnd > TCP_MAX_INIT_CWND) { 10198 *outlenp = 0; 10199 return (EINVAL); 10200 } 10201 tcp->tcp_init_cwnd = init_cwnd; 10202 break; 10203 } 10204 case TCP_KEEPALIVE_THRESHOLD: 10205 if (checkonly) 10206 break; 10207 10208 if (*i1 < tcp_keepalive_interval_low || 10209 *i1 > tcp_keepalive_interval_high) { 10210 *outlenp = 0; 10211 return (EINVAL); 10212 } 10213 if (*i1 != tcp->tcp_ka_interval) { 10214 tcp->tcp_ka_interval = *i1; 10215 /* 10216 * Check if we need to restart the 10217 * keepalive timer. 10218 */ 10219 if (tcp->tcp_ka_tid != 0) { 10220 ASSERT(tcp->tcp_ka_enabled); 10221 (void) TCP_TIMER_CANCEL(tcp, 10222 tcp->tcp_ka_tid); 10223 tcp->tcp_ka_last_intrvl = 0; 10224 tcp->tcp_ka_tid = TCP_TIMER(tcp, 10225 tcp_keepalive_killer, 10226 MSEC_TO_TICK(tcp->tcp_ka_interval)); 10227 } 10228 } 10229 break; 10230 case TCP_KEEPALIVE_ABORT_THRESHOLD: 10231 if (!checkonly) { 10232 if (*i1 < tcp_keepalive_abort_interval_low || 10233 *i1 > tcp_keepalive_abort_interval_high) { 10234 *outlenp = 0; 10235 return (EINVAL); 10236 } 10237 tcp->tcp_ka_abort_thres = *i1; 10238 } 10239 break; 10240 case TCP_CORK: 10241 if (!checkonly) { 10242 /* 10243 * if tcp->tcp_cork was set and is now 10244 * being unset, we have to make sure that 10245 * the remaining data gets sent out. 
Also 10246 * unset tcp->tcp_cork so that tcp_wput_data() 10247 * can send data even if it is less than mss 10248 */ 10249 if (tcp->tcp_cork && onoff == 0 && 10250 tcp->tcp_unsent > 0) { 10251 tcp->tcp_cork = B_FALSE; 10252 tcp_wput_data(tcp, NULL, B_FALSE); 10253 } 10254 tcp->tcp_cork = onoff; 10255 } 10256 break; 10257 default: 10258 *outlenp = 0; 10259 return (EINVAL); 10260 } 10261 break; 10262 case IPPROTO_IP: 10263 if (tcp->tcp_family != AF_INET) { 10264 *outlenp = 0; 10265 return (ENOPROTOOPT); 10266 } 10267 switch (name) { 10268 case IP_OPTIONS: 10269 case T_IP_OPTIONS: 10270 reterr = tcp_opt_set_header(tcp, checkonly, 10271 invalp, inlen); 10272 if (reterr) { 10273 *outlenp = 0; 10274 return (reterr); 10275 } 10276 /* OK return - copy input buffer into output buffer */ 10277 if (invalp != outvalp) { 10278 /* don't trust bcopy for identical src/dst */ 10279 bcopy(invalp, outvalp, inlen); 10280 } 10281 *outlenp = inlen; 10282 return (0); 10283 case IP_TOS: 10284 case T_IP_TOS: 10285 if (!checkonly) { 10286 tcp->tcp_ipha->ipha_type_of_service = 10287 (uchar_t)*i1; 10288 tcp->tcp_tos = (uchar_t)*i1; 10289 } 10290 break; 10291 case IP_TTL: 10292 if (!checkonly) { 10293 tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1; 10294 tcp->tcp_ttl = (uchar_t)*i1; 10295 } 10296 break; 10297 case IP_BOUND_IF: 10298 case IP_NEXTHOP: 10299 /* Handled at the IP level */ 10300 return (-EINVAL); 10301 case IP_SEC_OPT: 10302 /* 10303 * We should not allow policy setting after 10304 * we start listening for connections. 10305 */ 10306 if (tcp->tcp_state == TCPS_LISTEN) { 10307 return (EINVAL); 10308 } else { 10309 /* Handled at the IP level */ 10310 return (-EINVAL); 10311 } 10312 default: 10313 *outlenp = 0; 10314 return (EINVAL); 10315 } 10316 break; 10317 case IPPROTO_IPV6: { 10318 ip6_pkt_t *ipp; 10319 10320 /* 10321 * IPPROTO_IPV6 options are only supported for sockets 10322 * that are using IPv6 on the wire. 10323 */ 10324 if (tcp->tcp_ipversion != IPV6_VERSION) { 10325 *outlenp = 0; 10326 return (ENOPROTOOPT); 10327 } 10328 /* 10329 * Only sticky options; no ancillary data 10330 */ 10331 ASSERT(thisdg_attrs == NULL); 10332 ipp = &tcp->tcp_sticky_ipp; 10333 10334 switch (name) { 10335 case IPV6_UNICAST_HOPS: 10336 /* -1 means use default */ 10337 if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { 10338 *outlenp = 0; 10339 return (EINVAL); 10340 } 10341 if (!checkonly) { 10342 if (*i1 == -1) { 10343 tcp->tcp_ip6h->ip6_hops = 10344 ipp->ipp_unicast_hops = 10345 (uint8_t)tcp_ipv6_hoplimit; 10346 ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; 10347 /* Pass modified value to IP. 
*/ 10348 *i1 = tcp->tcp_ip6h->ip6_hops; 10349 } else { 10350 tcp->tcp_ip6h->ip6_hops = 10351 ipp->ipp_unicast_hops = 10352 (uint8_t)*i1; 10353 ipp->ipp_fields |= IPPF_UNICAST_HOPS; 10354 } 10355 reterr = tcp_build_hdrs(q, tcp); 10356 if (reterr != 0) 10357 return (reterr); 10358 } 10359 break; 10360 case IPV6_BOUND_IF: 10361 if (!checkonly) { 10362 int error = 0; 10363 10364 tcp->tcp_bound_if = *i1; 10365 error = ip_opt_set_ill(tcp->tcp_connp, *i1, 10366 B_TRUE, checkonly, level, name, mblk); 10367 if (error != 0) { 10368 *outlenp = 0; 10369 return (error); 10370 } 10371 } 10372 break; 10373 /* 10374 * Set boolean switches for ancillary data delivery 10375 */ 10376 case IPV6_RECVPKTINFO: 10377 if (!checkonly) { 10378 if (onoff) 10379 tcp->tcp_ipv6_recvancillary |= 10380 TCP_IPV6_RECVPKTINFO; 10381 else 10382 tcp->tcp_ipv6_recvancillary &= 10383 ~TCP_IPV6_RECVPKTINFO; 10384 /* Force it to be sent up with the next msg */ 10385 tcp->tcp_recvifindex = 0; 10386 } 10387 break; 10388 case IPV6_RECVTCLASS: 10389 if (!checkonly) { 10390 if (onoff) 10391 tcp->tcp_ipv6_recvancillary |= 10392 TCP_IPV6_RECVTCLASS; 10393 else 10394 tcp->tcp_ipv6_recvancillary &= 10395 ~TCP_IPV6_RECVTCLASS; 10396 } 10397 break; 10398 case IPV6_RECVHOPLIMIT: 10399 if (!checkonly) { 10400 if (onoff) 10401 tcp->tcp_ipv6_recvancillary |= 10402 TCP_IPV6_RECVHOPLIMIT; 10403 else 10404 tcp->tcp_ipv6_recvancillary &= 10405 ~TCP_IPV6_RECVHOPLIMIT; 10406 /* Force it to be sent up with the next msg */ 10407 tcp->tcp_recvhops = 0xffffffffU; 10408 } 10409 break; 10410 case IPV6_RECVHOPOPTS: 10411 if (!checkonly) { 10412 if (onoff) 10413 tcp->tcp_ipv6_recvancillary |= 10414 TCP_IPV6_RECVHOPOPTS; 10415 else 10416 tcp->tcp_ipv6_recvancillary &= 10417 ~TCP_IPV6_RECVHOPOPTS; 10418 } 10419 break; 10420 case IPV6_RECVDSTOPTS: 10421 if (!checkonly) { 10422 if (onoff) 10423 tcp->tcp_ipv6_recvancillary |= 10424 TCP_IPV6_RECVDSTOPTS; 10425 else 10426 tcp->tcp_ipv6_recvancillary &= 10427 ~TCP_IPV6_RECVDSTOPTS; 10428 } 10429 break; 10430 case _OLD_IPV6_RECVDSTOPTS: 10431 if (!checkonly) { 10432 if (onoff) 10433 tcp->tcp_ipv6_recvancillary |= 10434 TCP_OLD_IPV6_RECVDSTOPTS; 10435 else 10436 tcp->tcp_ipv6_recvancillary &= 10437 ~TCP_OLD_IPV6_RECVDSTOPTS; 10438 } 10439 break; 10440 case IPV6_RECVRTHDR: 10441 if (!checkonly) { 10442 if (onoff) 10443 tcp->tcp_ipv6_recvancillary |= 10444 TCP_IPV6_RECVRTHDR; 10445 else 10446 tcp->tcp_ipv6_recvancillary &= 10447 ~TCP_IPV6_RECVRTHDR; 10448 } 10449 break; 10450 case IPV6_RECVRTHDRDSTOPTS: 10451 if (!checkonly) { 10452 if (onoff) 10453 tcp->tcp_ipv6_recvancillary |= 10454 TCP_IPV6_RECVRTDSTOPTS; 10455 else 10456 tcp->tcp_ipv6_recvancillary &= 10457 ~TCP_IPV6_RECVRTDSTOPTS; 10458 } 10459 break; 10460 case IPV6_PKTINFO: 10461 if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) 10462 return (EINVAL); 10463 if (checkonly) 10464 break; 10465 10466 if (inlen == 0) { 10467 ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); 10468 } else { 10469 struct in6_pktinfo *pkti; 10470 10471 pkti = (struct in6_pktinfo *)invalp; 10472 /* 10473 * RFC 3542 states that ipi6_addr must be 10474 * the unspecified address when setting the 10475 * IPV6_PKTINFO sticky socket option on a 10476 * TCP socket. 10477 */ 10478 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 10479 return (EINVAL); 10480 /* 10481 * ip6_set_pktinfo() validates the source 10482 * address and interface index. 
10483 */ 10484 reterr = ip6_set_pktinfo(cr, tcp->tcp_connp, 10485 pkti, mblk); 10486 if (reterr != 0) 10487 return (reterr); 10488 ipp->ipp_ifindex = pkti->ipi6_ifindex; 10489 ipp->ipp_addr = pkti->ipi6_addr; 10490 if (ipp->ipp_ifindex != 0) 10491 ipp->ipp_fields |= IPPF_IFINDEX; 10492 else 10493 ipp->ipp_fields &= ~IPPF_IFINDEX; 10494 if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) 10495 ipp->ipp_fields |= IPPF_ADDR; 10496 else 10497 ipp->ipp_fields &= ~IPPF_ADDR; 10498 } 10499 reterr = tcp_build_hdrs(q, tcp); 10500 if (reterr != 0) 10501 return (reterr); 10502 break; 10503 case IPV6_TCLASS: 10504 if (inlen != 0 && inlen != sizeof (int)) 10505 return (EINVAL); 10506 if (checkonly) 10507 break; 10508 10509 if (inlen == 0) { 10510 ipp->ipp_fields &= ~IPPF_TCLASS; 10511 } else { 10512 if (*i1 > 255 || *i1 < -1) 10513 return (EINVAL); 10514 if (*i1 == -1) { 10515 ipp->ipp_tclass = 0; 10516 *i1 = 0; 10517 } else { 10518 ipp->ipp_tclass = *i1; 10519 } 10520 ipp->ipp_fields |= IPPF_TCLASS; 10521 } 10522 reterr = tcp_build_hdrs(q, tcp); 10523 if (reterr != 0) 10524 return (reterr); 10525 break; 10526 case IPV6_NEXTHOP: 10527 /* 10528 * IP will verify that the nexthop is reachable 10529 * and fail for sticky options. 10530 */ 10531 if (inlen != 0 && inlen != sizeof (sin6_t)) 10532 return (EINVAL); 10533 if (checkonly) 10534 break; 10535 10536 if (inlen == 0) { 10537 ipp->ipp_fields &= ~IPPF_NEXTHOP; 10538 } else { 10539 sin6_t *sin6 = (sin6_t *)invalp; 10540 10541 if (sin6->sin6_family != AF_INET6) 10542 return (EAFNOSUPPORT); 10543 if (IN6_IS_ADDR_V4MAPPED( 10544 &sin6->sin6_addr)) 10545 return (EADDRNOTAVAIL); 10546 ipp->ipp_nexthop = sin6->sin6_addr; 10547 if (!IN6_IS_ADDR_UNSPECIFIED( 10548 &ipp->ipp_nexthop)) 10549 ipp->ipp_fields |= IPPF_NEXTHOP; 10550 else 10551 ipp->ipp_fields &= ~IPPF_NEXTHOP; 10552 } 10553 reterr = tcp_build_hdrs(q, tcp); 10554 if (reterr != 0) 10555 return (reterr); 10556 break; 10557 case IPV6_HOPOPTS: { 10558 ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; 10559 10560 /* 10561 * Sanity checks - minimum size, size a multiple of 10562 * eight bytes, and matching size passed in. 10563 */ 10564 if (inlen != 0 && 10565 inlen != (8 * (hopts->ip6h_len + 1))) 10566 return (EINVAL); 10567 10568 if (checkonly) 10569 break; 10570 10571 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10572 (uchar_t **)&ipp->ipp_hopopts, 10573 &ipp->ipp_hopoptslen, tcp->tcp_label_len); 10574 if (reterr != 0) 10575 return (reterr); 10576 if (ipp->ipp_hopoptslen == 0) 10577 ipp->ipp_fields &= ~IPPF_HOPOPTS; 10578 else 10579 ipp->ipp_fields |= IPPF_HOPOPTS; 10580 reterr = tcp_build_hdrs(q, tcp); 10581 if (reterr != 0) 10582 return (reterr); 10583 break; 10584 } 10585 case IPV6_RTHDRDSTOPTS: { 10586 ip6_dest_t *dopts = (ip6_dest_t *)invalp; 10587 10588 /* 10589 * Sanity checks - minimum size, size a multiple of 10590 * eight bytes, and matching size passed in. 
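 * (An IPv6 extension header length field counts 8-octet units beyond
 * the first 8 octets, so, for instance, ip6d_len == 1 describes a
 * 16-byte header and must be matched by inlen == 16.)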
10591 */ 10592 if (inlen != 0 && 10593 inlen != (8 * (dopts->ip6d_len + 1))) 10594 return (EINVAL); 10595 10596 if (checkonly) 10597 break; 10598 10599 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10600 (uchar_t **)&ipp->ipp_rtdstopts, 10601 &ipp->ipp_rtdstoptslen, 0); 10602 if (reterr != 0) 10603 return (reterr); 10604 if (ipp->ipp_rtdstoptslen == 0) 10605 ipp->ipp_fields &= ~IPPF_RTDSTOPTS; 10606 else 10607 ipp->ipp_fields |= IPPF_RTDSTOPTS; 10608 reterr = tcp_build_hdrs(q, tcp); 10609 if (reterr != 0) 10610 return (reterr); 10611 break; 10612 } 10613 case IPV6_DSTOPTS: { 10614 ip6_dest_t *dopts = (ip6_dest_t *)invalp; 10615 10616 /* 10617 * Sanity checks - minimum size, size a multiple of 10618 * eight bytes, and matching size passed in. 10619 */ 10620 if (inlen != 0 && 10621 inlen != (8 * (dopts->ip6d_len + 1))) 10622 return (EINVAL); 10623 10624 if (checkonly) 10625 break; 10626 10627 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10628 (uchar_t **)&ipp->ipp_dstopts, 10629 &ipp->ipp_dstoptslen, 0); 10630 if (reterr != 0) 10631 return (reterr); 10632 if (ipp->ipp_dstoptslen == 0) 10633 ipp->ipp_fields &= ~IPPF_DSTOPTS; 10634 else 10635 ipp->ipp_fields |= IPPF_DSTOPTS; 10636 reterr = tcp_build_hdrs(q, tcp); 10637 if (reterr != 0) 10638 return (reterr); 10639 break; 10640 } 10641 case IPV6_RTHDR: { 10642 ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; 10643 10644 /* 10645 * Sanity checks - minimum size, size a multiple of 10646 * eight bytes, and matching size passed in. 10647 */ 10648 if (inlen != 0 && 10649 inlen != (8 * (rt->ip6r_len + 1))) 10650 return (EINVAL); 10651 10652 if (checkonly) 10653 break; 10654 10655 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10656 (uchar_t **)&ipp->ipp_rthdr, 10657 &ipp->ipp_rthdrlen, 0); 10658 if (reterr != 0) 10659 return (reterr); 10660 if (ipp->ipp_rthdrlen == 0) 10661 ipp->ipp_fields &= ~IPPF_RTHDR; 10662 else 10663 ipp->ipp_fields |= IPPF_RTHDR; 10664 reterr = tcp_build_hdrs(q, tcp); 10665 if (reterr != 0) 10666 return (reterr); 10667 break; 10668 } 10669 case IPV6_V6ONLY: 10670 if (!checkonly) 10671 tcp->tcp_connp->conn_ipv6_v6only = onoff; 10672 break; 10673 case IPV6_USE_MIN_MTU: 10674 if (inlen != sizeof (int)) 10675 return (EINVAL); 10676 10677 if (*i1 < -1 || *i1 > 1) 10678 return (EINVAL); 10679 10680 if (checkonly) 10681 break; 10682 10683 ipp->ipp_fields |= IPPF_USE_MIN_MTU; 10684 ipp->ipp_use_min_mtu = *i1; 10685 break; 10686 case IPV6_BOUND_PIF: 10687 /* Handled at the IP level */ 10688 return (-EINVAL); 10689 case IPV6_SEC_OPT: 10690 /* 10691 * We should not allow policy setting after 10692 * we start listening for connections. 
10693 */ 10694 if (tcp->tcp_state == TCPS_LISTEN) { 10695 return (EINVAL); 10696 } else { 10697 /* Handled at the IP level */ 10698 return (-EINVAL); 10699 } 10700 case IPV6_SRC_PREFERENCES: 10701 if (inlen != sizeof (uint32_t)) 10702 return (EINVAL); 10703 reterr = ip6_set_src_preferences(tcp->tcp_connp, 10704 *(uint32_t *)invalp); 10705 if (reterr != 0) { 10706 *outlenp = 0; 10707 return (reterr); 10708 } 10709 break; 10710 default: 10711 *outlenp = 0; 10712 return (EINVAL); 10713 } 10714 break; 10715 } /* end IPPROTO_IPV6 */ 10716 default: 10717 *outlenp = 0; 10718 return (EINVAL); 10719 } 10720 /* 10721 * Common case of OK return with outval same as inval 10722 */ 10723 if (invalp != outvalp) { 10724 /* don't trust bcopy for identical src/dst */ 10725 (void) bcopy(invalp, outvalp, inlen); 10726 } 10727 *outlenp = inlen; 10728 return (0); 10729 } 10730 10731 /* 10732 * Update tcp_sticky_hdrs based on tcp_sticky_ipp. 10733 * The headers include ip6i_t (if needed), ip6_t, any sticky extension 10734 * headers, and the maximum size tcp header (to avoid reallocation 10735 * on the fly for additional tcp options). 10736 * Returns failure if can't allocate memory. 10737 */ 10738 static int 10739 tcp_build_hdrs(queue_t *q, tcp_t *tcp) 10740 { 10741 char *hdrs; 10742 uint_t hdrs_len; 10743 ip6i_t *ip6i; 10744 char buf[TCP_MAX_HDR_LENGTH]; 10745 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 10746 in6_addr_t src, dst; 10747 10748 /* 10749 * save the existing tcp header and source/dest IP addresses 10750 */ 10751 bcopy(tcp->tcp_tcph, buf, tcp->tcp_tcp_hdr_len); 10752 src = tcp->tcp_ip6h->ip6_src; 10753 dst = tcp->tcp_ip6h->ip6_dst; 10754 hdrs_len = ip_total_hdrs_len_v6(ipp) + TCP_MAX_HDR_LENGTH; 10755 ASSERT(hdrs_len != 0); 10756 if (hdrs_len > tcp->tcp_iphc_len) { 10757 /* Need to reallocate */ 10758 hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP); 10759 if (hdrs == NULL) 10760 return (ENOMEM); 10761 if (tcp->tcp_iphc != NULL) { 10762 if (tcp->tcp_hdr_grown) { 10763 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 10764 } else { 10765 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 10766 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 10767 } 10768 tcp->tcp_iphc_len = 0; 10769 } 10770 ASSERT(tcp->tcp_iphc_len == 0); 10771 tcp->tcp_iphc = hdrs; 10772 tcp->tcp_iphc_len = hdrs_len; 10773 tcp->tcp_hdr_grown = B_TRUE; 10774 } 10775 ip_build_hdrs_v6((uchar_t *)tcp->tcp_iphc, 10776 hdrs_len - TCP_MAX_HDR_LENGTH, ipp, IPPROTO_TCP); 10777 10778 /* Set header fields not in ipp */ 10779 if (ipp->ipp_fields & IPPF_HAS_IP6I) { 10780 ip6i = (ip6i_t *)tcp->tcp_iphc; 10781 tcp->tcp_ip6h = (ip6_t *)&ip6i[1]; 10782 } else { 10783 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; 10784 } 10785 /* 10786 * tcp->tcp_ip_hdr_len will include ip6i_t if there is one. 10787 * 10788 * tcp->tcp_tcp_hdr_len doesn't change here. 10789 */ 10790 tcp->tcp_ip_hdr_len = hdrs_len - TCP_MAX_HDR_LENGTH; 10791 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + tcp->tcp_ip_hdr_len); 10792 tcp->tcp_hdr_len = tcp->tcp_ip_hdr_len + tcp->tcp_tcp_hdr_len; 10793 10794 bcopy(buf, tcp->tcp_tcph, tcp->tcp_tcp_hdr_len); 10795 10796 tcp->tcp_ip6h->ip6_src = src; 10797 tcp->tcp_ip6h->ip6_dst = dst; 10798 10799 /* 10800 * If the hop limit was not set by ip_build_hdrs_v6(), set it to 10801 * the default value for TCP. 
10802 */ 10803 if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS)) 10804 tcp->tcp_ip6h->ip6_hops = tcp_ipv6_hoplimit; 10805 10806 /* 10807 * If we're setting extension headers after a connection 10808 * has been established, and if we have a routing header 10809 * among the extension headers, call ip_massage_options_v6 to 10810 * manipulate the routing header/ip6_dst set the checksum 10811 * difference in the tcp header template. 10812 * (This happens in tcp_connect_ipv6 if the routing header 10813 * is set prior to the connect.) 10814 * Set the tcp_sum to zero first in case we've cleared a 10815 * routing header or don't have one at all. 10816 */ 10817 tcp->tcp_sum = 0; 10818 if ((tcp->tcp_state >= TCPS_SYN_SENT) && 10819 (tcp->tcp_ipp_fields & IPPF_RTHDR)) { 10820 ip6_rthdr_t *rth = ip_find_rthdr_v6(tcp->tcp_ip6h, 10821 (uint8_t *)tcp->tcp_tcph); 10822 if (rth != NULL) { 10823 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, 10824 rth); 10825 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 10826 (tcp->tcp_sum >> 16)); 10827 } 10828 } 10829 10830 /* Try to get everything in a single mblk */ 10831 (void) mi_set_sth_wroff(RD(q), hdrs_len + tcp_wroff_xtra); 10832 return (0); 10833 } 10834 10835 /* 10836 * Transfer any source route option from ipha to buf/dst in reversed form. 10837 */ 10838 static int 10839 tcp_opt_rev_src_route(ipha_t *ipha, char *buf, uchar_t *dst) 10840 { 10841 ipoptp_t opts; 10842 uchar_t *opt; 10843 uint8_t optval; 10844 uint8_t optlen; 10845 uint32_t len = 0; 10846 10847 for (optval = ipoptp_first(&opts, ipha); 10848 optval != IPOPT_EOL; 10849 optval = ipoptp_next(&opts)) { 10850 opt = opts.ipoptp_cur; 10851 optlen = opts.ipoptp_len; 10852 switch (optval) { 10853 int off1, off2; 10854 case IPOPT_SSRR: 10855 case IPOPT_LSRR: 10856 10857 /* Reverse source route */ 10858 /* 10859 * First entry should be the next to last one in the 10860 * current source route (the last entry is our 10861 * address.) 10862 * The last entry should be the final destination. 10863 */ 10864 buf[IPOPT_OPTVAL] = (uint8_t)optval; 10865 buf[IPOPT_OLEN] = (uint8_t)optlen; 10866 off1 = IPOPT_MINOFF_SR - 1; 10867 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; 10868 if (off2 < 0) { 10869 /* No entries in source route */ 10870 break; 10871 } 10872 bcopy(opt + off2, dst, IP_ADDR_LEN); 10873 /* 10874 * Note: use src since ipha has not had its src 10875 * and dst reversed (it is in the state it was 10876 * received. 10877 */ 10878 bcopy(&ipha->ipha_src, buf + off2, 10879 IP_ADDR_LEN); 10880 off2 -= IP_ADDR_LEN; 10881 10882 while (off2 > 0) { 10883 bcopy(opt + off2, buf + off1, 10884 IP_ADDR_LEN); 10885 off1 += IP_ADDR_LEN; 10886 off2 -= IP_ADDR_LEN; 10887 } 10888 buf[IPOPT_OFFSET] = IPOPT_MINOFF_SR; 10889 buf += optlen; 10890 len += optlen; 10891 break; 10892 } 10893 } 10894 done: 10895 /* Pad the resulting options */ 10896 while (len & 0x3) { 10897 *buf++ = IPOPT_EOL; 10898 len++; 10899 } 10900 return (len); 10901 } 10902 10903 10904 /* 10905 * Extract and revert a source route from ipha (if any) 10906 * and then update the relevant fields in both tcp_t and the standard header. 
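 * For illustration (hypothetical addresses): a segment from source S
 * that was loose-source-routed S -> G1 -> G2 -> us arrives with the
 * recorded route [G1, G2].  The headers built from it use
 * ipha_dst = G2 and the reversed option [G1, S], so replies retrace
 * us -> G2 -> G1 -> S.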
10907 */ 10908 static void 10909 tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha) 10910 { 10911 char buf[TCP_MAX_HDR_LENGTH]; 10912 uint_t tcph_len; 10913 int len; 10914 10915 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 10916 len = IPH_HDR_LENGTH(ipha); 10917 if (len == IP_SIMPLE_HDR_LENGTH) 10918 /* Nothing to do */ 10919 return; 10920 if (len > IP_SIMPLE_HDR_LENGTH + TCP_MAX_IP_OPTIONS_LENGTH || 10921 (len & 0x3)) 10922 return; 10923 10924 tcph_len = tcp->tcp_tcp_hdr_len; 10925 bcopy(tcp->tcp_tcph, buf, tcph_len); 10926 tcp->tcp_sum = (tcp->tcp_ipha->ipha_dst >> 16) + 10927 (tcp->tcp_ipha->ipha_dst & 0xffff); 10928 len = tcp_opt_rev_src_route(ipha, (char *)tcp->tcp_ipha + 10929 IP_SIMPLE_HDR_LENGTH, (uchar_t *)&tcp->tcp_ipha->ipha_dst); 10930 len += IP_SIMPLE_HDR_LENGTH; 10931 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + 10932 (tcp->tcp_ipha->ipha_dst & 0xffff)); 10933 if ((int)tcp->tcp_sum < 0) 10934 tcp->tcp_sum--; 10935 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 10936 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16)); 10937 tcp->tcp_tcph = (tcph_t *)((char *)tcp->tcp_ipha + len); 10938 bcopy(buf, tcp->tcp_tcph, tcph_len); 10939 tcp->tcp_ip_hdr_len = len; 10940 tcp->tcp_ipha->ipha_version_and_hdr_length = 10941 (IP_VERSION << 4) | (len >> 2); 10942 len += tcph_len; 10943 tcp->tcp_hdr_len = len; 10944 } 10945 10946 /* 10947 * Copy the standard header into its new location, 10948 * lay in the new options and then update the relevant 10949 * fields in both tcp_t and the standard header. 10950 */ 10951 static int 10952 tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) 10953 { 10954 uint_t tcph_len; 10955 uint8_t *ip_optp; 10956 tcph_t *new_tcph; 10957 10958 if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) 10959 return (EINVAL); 10960 10961 if (len > IP_MAX_OPT_LENGTH - tcp->tcp_label_len) 10962 return (EINVAL); 10963 10964 if (checkonly) { 10965 /* 10966 * do not really set, just pretend to - T_CHECK 10967 */ 10968 return (0); 10969 } 10970 10971 ip_optp = (uint8_t *)tcp->tcp_ipha + IP_SIMPLE_HDR_LENGTH; 10972 if (tcp->tcp_label_len > 0) { 10973 int padlen; 10974 uint8_t opt; 10975 10976 /* convert list termination to no-ops */ 10977 padlen = tcp->tcp_label_len - ip_optp[IPOPT_OLEN]; 10978 ip_optp += ip_optp[IPOPT_OLEN]; 10979 opt = len > 0 ? IPOPT_NOP : IPOPT_EOL; 10980 while (--padlen >= 0) 10981 *ip_optp++ = opt; 10982 } 10983 tcph_len = tcp->tcp_tcp_hdr_len; 10984 new_tcph = (tcph_t *)(ip_optp + len); 10985 ovbcopy(tcp->tcp_tcph, new_tcph, tcph_len); 10986 tcp->tcp_tcph = new_tcph; 10987 bcopy(ptr, ip_optp, len); 10988 10989 len += IP_SIMPLE_HDR_LENGTH + tcp->tcp_label_len; 10990 10991 tcp->tcp_ip_hdr_len = len; 10992 tcp->tcp_ipha->ipha_version_and_hdr_length = 10993 (IP_VERSION << 4) | (len >> 2); 10994 tcp->tcp_hdr_len = len + tcph_len; 10995 if (!TCP_IS_DETACHED(tcp)) { 10996 /* Always allocate room for all options. */ 10997 (void) mi_set_sth_wroff(tcp->tcp_rq, 10998 TCP_MAX_COMBINED_HEADER_LENGTH + tcp_wroff_xtra); 10999 } 11000 return (0); 11001 } 11002 11003 /* Get callback routine passed to nd_load by tcp_param_register */ 11004 /* ARGSUSED */ 11005 static int 11006 tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 11007 { 11008 tcpparam_t *tcppa = (tcpparam_t *)cp; 11009 11010 (void) mi_mpprintf(mp, "%u", tcppa->tcp_param_val); 11011 return (0); 11012 } 11013 11014 /* 11015 * Walk through the param array specified registering each element with the 11016 * named dispatch handler. 
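 * Each tunable registered here becomes visible through ndd(1M) on
 * /dev/tcp, e.g. (illustrative commands):
 *
 *	ndd /dev/tcp tcp_wroff_xtra
 *	ndd -set /dev/tcp tcp_wroff_xtra 32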
11017 */ 11018 static boolean_t 11019 tcp_param_register(tcpparam_t *tcppa, int cnt) 11020 { 11021 for (; cnt-- > 0; tcppa++) { 11022 if (tcppa->tcp_param_name && tcppa->tcp_param_name[0]) { 11023 if (!nd_load(&tcp_g_nd, tcppa->tcp_param_name, 11024 tcp_param_get, tcp_param_set, 11025 (caddr_t)tcppa)) { 11026 nd_free(&tcp_g_nd); 11027 return (B_FALSE); 11028 } 11029 } 11030 } 11031 if (!nd_load(&tcp_g_nd, tcp_wroff_xtra_param.tcp_param_name, 11032 tcp_param_get, tcp_param_set_aligned, 11033 (caddr_t)&tcp_wroff_xtra_param)) { 11034 nd_free(&tcp_g_nd); 11035 return (B_FALSE); 11036 } 11037 if (!nd_load(&tcp_g_nd, tcp_mdt_head_param.tcp_param_name, 11038 tcp_param_get, tcp_param_set_aligned, 11039 (caddr_t)&tcp_mdt_head_param)) { 11040 nd_free(&tcp_g_nd); 11041 return (B_FALSE); 11042 } 11043 if (!nd_load(&tcp_g_nd, tcp_mdt_tail_param.tcp_param_name, 11044 tcp_param_get, tcp_param_set_aligned, 11045 (caddr_t)&tcp_mdt_tail_param)) { 11046 nd_free(&tcp_g_nd); 11047 return (B_FALSE); 11048 } 11049 if (!nd_load(&tcp_g_nd, tcp_mdt_max_pbufs_param.tcp_param_name, 11050 tcp_param_get, tcp_param_set, 11051 (caddr_t)&tcp_mdt_max_pbufs_param)) { 11052 nd_free(&tcp_g_nd); 11053 return (B_FALSE); 11054 } 11055 if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports", 11056 tcp_extra_priv_ports_get, NULL, NULL)) { 11057 nd_free(&tcp_g_nd); 11058 return (B_FALSE); 11059 } 11060 if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports_add", 11061 NULL, tcp_extra_priv_ports_add, NULL)) { 11062 nd_free(&tcp_g_nd); 11063 return (B_FALSE); 11064 } 11065 if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports_del", 11066 NULL, tcp_extra_priv_ports_del, NULL)) { 11067 nd_free(&tcp_g_nd); 11068 return (B_FALSE); 11069 } 11070 if (!nd_load(&tcp_g_nd, "tcp_status", tcp_status_report, NULL, 11071 NULL)) { 11072 nd_free(&tcp_g_nd); 11073 return (B_FALSE); 11074 } 11075 if (!nd_load(&tcp_g_nd, "tcp_bind_hash", tcp_bind_hash_report, 11076 NULL, NULL)) { 11077 nd_free(&tcp_g_nd); 11078 return (B_FALSE); 11079 } 11080 if (!nd_load(&tcp_g_nd, "tcp_listen_hash", tcp_listen_hash_report, 11081 NULL, NULL)) { 11082 nd_free(&tcp_g_nd); 11083 return (B_FALSE); 11084 } 11085 if (!nd_load(&tcp_g_nd, "tcp_conn_hash", tcp_conn_hash_report, 11086 NULL, NULL)) { 11087 nd_free(&tcp_g_nd); 11088 return (B_FALSE); 11089 } 11090 if (!nd_load(&tcp_g_nd, "tcp_acceptor_hash", tcp_acceptor_hash_report, 11091 NULL, NULL)) { 11092 nd_free(&tcp_g_nd); 11093 return (B_FALSE); 11094 } 11095 if (!nd_load(&tcp_g_nd, "tcp_host_param", tcp_host_param_report, 11096 tcp_host_param_set, NULL)) { 11097 nd_free(&tcp_g_nd); 11098 return (B_FALSE); 11099 } 11100 if (!nd_load(&tcp_g_nd, "tcp_host_param_ipv6", tcp_host_param_report, 11101 tcp_host_param_set_ipv6, NULL)) { 11102 nd_free(&tcp_g_nd); 11103 return (B_FALSE); 11104 } 11105 if (!nd_load(&tcp_g_nd, "tcp_1948_phrase", NULL, tcp_1948_phrase_set, 11106 NULL)) { 11107 nd_free(&tcp_g_nd); 11108 return (B_FALSE); 11109 } 11110 if (!nd_load(&tcp_g_nd, "tcp_reserved_port_list", 11111 tcp_reserved_port_list, NULL, NULL)) { 11112 nd_free(&tcp_g_nd); 11113 return (B_FALSE); 11114 } 11115 /* 11116 * Dummy ndd variables - only to convey obsolescence information 11117 * through printing of their name (no get or set routines) 11118 * XXX Remove in future releases ? 
11119 */ 11120 if (!nd_load(&tcp_g_nd, 11121 "tcp_close_wait_interval(obsoleted - " 11122 "use tcp_time_wait_interval)", NULL, NULL, NULL)) { 11123 nd_free(&tcp_g_nd); 11124 return (B_FALSE); 11125 } 11126 return (B_TRUE); 11127 } 11128 11129 /* ndd set routine for tcp_wroff_xtra, tcp_mdt_hdr_{head,tail}_min. */ 11130 /* ARGSUSED */ 11131 static int 11132 tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 11133 cred_t *cr) 11134 { 11135 long new_value; 11136 tcpparam_t *tcppa = (tcpparam_t *)cp; 11137 11138 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11139 new_value < tcppa->tcp_param_min || 11140 new_value > tcppa->tcp_param_max) { 11141 return (EINVAL); 11142 } 11143 /* 11144 * Need to make sure new_value is a multiple of 4. If it is not, 11145 * round it up. For future 64 bit requirement, we actually make it 11146 * a multiple of 8. 11147 */ 11148 if (new_value & 0x7) { 11149 new_value = (new_value & ~0x7) + 0x8; 11150 } 11151 tcppa->tcp_param_val = new_value; 11152 return (0); 11153 } 11154 11155 /* Set callback routine passed to nd_load by tcp_param_register */ 11156 /* ARGSUSED */ 11157 static int 11158 tcp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 11159 { 11160 long new_value; 11161 tcpparam_t *tcppa = (tcpparam_t *)cp; 11162 11163 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11164 new_value < tcppa->tcp_param_min || 11165 new_value > tcppa->tcp_param_max) { 11166 return (EINVAL); 11167 } 11168 tcppa->tcp_param_val = new_value; 11169 return (0); 11170 } 11171 11172 /* 11173 * Add a new piece to the tcp reassembly queue. If the gap at the beginning 11174 * is filled, return as much as we can. The message passed in may be 11175 * multi-part, chained using b_cont. "start" is the starting sequence 11176 * number for this piece. 11177 */ 11178 static mblk_t * 11179 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 11180 { 11181 uint32_t end; 11182 mblk_t *mp1; 11183 mblk_t *mp2; 11184 mblk_t *next_mp; 11185 uint32_t u1; 11186 11187 /* Walk through all the new pieces. */ 11188 do { 11189 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 11190 (uintptr_t)INT_MAX); 11191 end = start + (int)(mp->b_wptr - mp->b_rptr); 11192 next_mp = mp->b_cont; 11193 if (start == end) { 11194 /* Empty. Blast it. */ 11195 freeb(mp); 11196 continue; 11197 } 11198 mp->b_cont = NULL; 11199 TCP_REASS_SET_SEQ(mp, start); 11200 TCP_REASS_SET_END(mp, end); 11201 mp1 = tcp->tcp_reass_tail; 11202 if (!mp1) { 11203 tcp->tcp_reass_tail = mp; 11204 tcp->tcp_reass_head = mp; 11205 BUMP_MIB(&tcp_mib, tcpInDataUnorderSegs); 11206 UPDATE_MIB(&tcp_mib, 11207 tcpInDataUnorderBytes, end - start); 11208 continue; 11209 } 11210 /* New stuff completely beyond tail? */ 11211 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 11212 /* Link it on end. */ 11213 mp1->b_cont = mp; 11214 tcp->tcp_reass_tail = mp; 11215 BUMP_MIB(&tcp_mib, tcpInDataUnorderSegs); 11216 UPDATE_MIB(&tcp_mib, 11217 tcpInDataUnorderBytes, end - start); 11218 continue; 11219 } 11220 mp1 = tcp->tcp_reass_head; 11221 u1 = TCP_REASS_SEQ(mp1); 11222 /* New stuff at the front? */ 11223 if (SEQ_LT(start, u1)) { 11224 /* Yes... Check for overlap. */ 11225 mp->b_cont = mp1; 11226 tcp->tcp_reass_head = mp; 11227 tcp_reass_elim_overlap(tcp, mp); 11228 continue; 11229 } 11230 /* 11231 * The new piece fits somewhere between the head and tail. 11232 * We find our slot, where mp1 precedes us and mp2 trails. 
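 * For example (hypothetical sequence numbers): with tcp_rnxt = 1000 and
 * pieces already queued for [1100,1200) and [1400,1500), a new segment
 * covering [1250,1300) is linked between them.  Once [1000,1100)
 * arrives, everything contiguous from 1000 up to the gap at 1200 is
 * returned to the caller, leaving [1250,1300) and [1400,1500) queued.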
11233 */ 11234 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 11235 u1 = TCP_REASS_SEQ(mp2); 11236 if (SEQ_LEQ(start, u1)) 11237 break; 11238 } 11239 /* Link ourselves in */ 11240 mp->b_cont = mp2; 11241 mp1->b_cont = mp; 11242 11243 /* Trim overlap with following mblk(s) first */ 11244 tcp_reass_elim_overlap(tcp, mp); 11245 11246 /* Trim overlap with preceding mblk */ 11247 tcp_reass_elim_overlap(tcp, mp1); 11248 11249 } while (start = end, mp = next_mp); 11250 mp1 = tcp->tcp_reass_head; 11251 /* Anything ready to go? */ 11252 if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 11253 return (NULL); 11254 /* Eat what we can off the queue */ 11255 for (;;) { 11256 mp = mp1->b_cont; 11257 end = TCP_REASS_END(mp1); 11258 TCP_REASS_SET_SEQ(mp1, 0); 11259 TCP_REASS_SET_END(mp1, 0); 11260 if (!mp) { 11261 tcp->tcp_reass_tail = NULL; 11262 break; 11263 } 11264 if (end != TCP_REASS_SEQ(mp)) { 11265 mp1->b_cont = NULL; 11266 break; 11267 } 11268 mp1 = mp; 11269 } 11270 mp1 = tcp->tcp_reass_head; 11271 tcp->tcp_reass_head = mp; 11272 return (mp1); 11273 } 11274 11275 /* Eliminate any overlap that mp may have over later mblks */ 11276 static void 11277 tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 11278 { 11279 uint32_t end; 11280 mblk_t *mp1; 11281 uint32_t u1; 11282 11283 end = TCP_REASS_END(mp); 11284 while ((mp1 = mp->b_cont) != NULL) { 11285 u1 = TCP_REASS_SEQ(mp1); 11286 if (!SEQ_GT(end, u1)) 11287 break; 11288 if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 11289 mp->b_wptr -= end - u1; 11290 TCP_REASS_SET_END(mp, u1); 11291 BUMP_MIB(&tcp_mib, tcpInDataPartDupSegs); 11292 UPDATE_MIB(&tcp_mib, tcpInDataPartDupBytes, end - u1); 11293 break; 11294 } 11295 mp->b_cont = mp1->b_cont; 11296 TCP_REASS_SET_SEQ(mp1, 0); 11297 TCP_REASS_SET_END(mp1, 0); 11298 freeb(mp1); 11299 BUMP_MIB(&tcp_mib, tcpInDataDupSegs); 11300 UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, end - u1); 11301 } 11302 if (!mp1) 11303 tcp->tcp_reass_tail = mp; 11304 } 11305 11306 /* 11307 * Send up all messages queued on tcp_rcv_list. 11308 */ 11309 static uint_t 11310 tcp_rcv_drain(queue_t *q, tcp_t *tcp) 11311 { 11312 mblk_t *mp; 11313 uint_t ret = 0; 11314 uint_t thwin; 11315 #ifdef DEBUG 11316 uint_t cnt = 0; 11317 #endif 11318 /* Can't drain on an eager connection */ 11319 if (tcp->tcp_listener != NULL) 11320 return (ret); 11321 11322 /* 11323 * Handle two cases here: we are currently fused or we were 11324 * previously fused and have some urgent data to be delivered 11325 * upstream. The latter happens because we either ran out of 11326 * memory or were detached and therefore sending the SIGURG was 11327 * deferred until this point. In either case we pass control 11328 * over to tcp_fuse_rcv_drain() since it may need to complete 11329 * some work. 11330 */ 11331 if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { 11332 ASSERT(tcp->tcp_fused_sigurg_mp != NULL); 11333 if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : 11334 &tcp->tcp_fused_sigurg_mp)) 11335 return (ret); 11336 } 11337 11338 while ((mp = tcp->tcp_rcv_list) != NULL) { 11339 tcp->tcp_rcv_list = mp->b_next; 11340 mp->b_next = NULL; 11341 #ifdef DEBUG 11342 cnt += msgdsize(mp); 11343 #endif 11344 /* Does this need SSL processing first? 
*/ 11345 if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { 11346 tcp_kssl_input(tcp, mp); 11347 continue; 11348 } 11349 putnext(q, mp); 11350 } 11351 ASSERT(cnt == tcp->tcp_rcv_cnt); 11352 tcp->tcp_rcv_last_head = NULL; 11353 tcp->tcp_rcv_last_tail = NULL; 11354 tcp->tcp_rcv_cnt = 0; 11355 11356 /* Learn the latest rwnd information that we sent to the other side. */ 11357 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 11358 << tcp->tcp_rcv_ws; 11359 /* This is peer's calculated send window (our receive window). */ 11360 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 11361 /* 11362 * Increase the receive window to max. But we need to do receiver 11363 * SWS avoidance. This means that we need to check the increase of 11364 * of receive window is at least 1 MSS. 11365 */ 11366 if (canputnext(q) && (q->q_hiwat - thwin >= tcp->tcp_mss)) { 11367 /* 11368 * If the window that the other side knows is less than max 11369 * deferred acks segments, send an update immediately. 11370 */ 11371 if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { 11372 BUMP_MIB(&tcp_mib, tcpOutWinUpdate); 11373 ret = TH_ACK_NEEDED; 11374 } 11375 tcp->tcp_rwnd = q->q_hiwat; 11376 } 11377 /* No need for the push timer now. */ 11378 if (tcp->tcp_push_tid != 0) { 11379 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 11380 tcp->tcp_push_tid = 0; 11381 } 11382 return (ret); 11383 } 11384 11385 /* 11386 * Queue data on tcp_rcv_list which is a b_next chain. 11387 * tcp_rcv_last_head/tail is the last element of this chain. 11388 * Each element of the chain is a b_cont chain. 11389 * 11390 * M_DATA messages are added to the current element. 11391 * Other messages are added as new (b_next) elements. 11392 */ 11393 void 11394 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) 11395 { 11396 ASSERT(seg_len == msgdsize(mp)); 11397 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); 11398 11399 if (tcp->tcp_rcv_list == NULL) { 11400 ASSERT(tcp->tcp_rcv_last_head == NULL); 11401 tcp->tcp_rcv_list = mp; 11402 tcp->tcp_rcv_last_head = mp; 11403 } else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) { 11404 tcp->tcp_rcv_last_tail->b_cont = mp; 11405 } else { 11406 tcp->tcp_rcv_last_head->b_next = mp; 11407 tcp->tcp_rcv_last_head = mp; 11408 } 11409 11410 while (mp->b_cont) 11411 mp = mp->b_cont; 11412 11413 tcp->tcp_rcv_last_tail = mp; 11414 tcp->tcp_rcv_cnt += seg_len; 11415 tcp->tcp_rwnd -= seg_len; 11416 } 11417 11418 /* 11419 * DEFAULT TCP ENTRY POINT via squeue on READ side. 11420 * 11421 * This is the default entry function into TCP on the read side. TCP is 11422 * always entered via squeue i.e. using squeue's for mutual exclusion. 11423 * When classifier does a lookup to find the tcp, it also puts a reference 11424 * on the conn structure associated so the tcp is guaranteed to exist 11425 * when we come here. We still need to check the state because it might 11426 * as well has been closed. The squeue processing function i.e. squeue_enter, 11427 * squeue_enter_nodrain, or squeue_drain is responsible for doing the 11428 * CONN_DEC_REF. 11429 * 11430 * Apart from the default entry point, IP also sends packets directly to 11431 * tcp_rput_data for AF_INET fast path and tcp_conn_request for incoming 11432 * connections. 
11433 */ 11434 void 11435 tcp_input(void *arg, mblk_t *mp, void *arg2) 11436 { 11437 conn_t *connp = (conn_t *)arg; 11438 tcp_t *tcp = (tcp_t *)connp->conn_tcp; 11439 11440 /* arg2 is the sqp */ 11441 ASSERT(arg2 != NULL); 11442 ASSERT(mp != NULL); 11443 11444 /* 11445 * Don't accept any input on a closed tcp as this TCP logically does 11446 * not exist on the system. Don't proceed further with this TCP. 11447 * For eg. this packet could trigger another close of this tcp 11448 * which would be disastrous for tcp_refcnt. tcp_close_detached / 11449 * tcp_clean_death / tcp_closei_local must be called at most once 11450 * on a TCP. In this case we need to refeed the packet into the 11451 * classifier and figure out where the packet should go. Need to 11452 * preserve the recv_ill somehow. Until we figure that out, for 11453 * now just drop the packet if we can't classify the packet. 11454 */ 11455 if (tcp->tcp_state == TCPS_CLOSED || 11456 tcp->tcp_state == TCPS_BOUND) { 11457 conn_t *new_connp; 11458 11459 new_connp = ipcl_classify(mp, connp->conn_zoneid); 11460 if (new_connp != NULL) { 11461 tcp_reinput(new_connp, mp, arg2); 11462 return; 11463 } 11464 /* We failed to classify. For now just drop the packet */ 11465 freemsg(mp); 11466 return; 11467 } 11468 11469 if (DB_TYPE(mp) == M_DATA) 11470 tcp_rput_data(connp, mp, arg2); 11471 else 11472 tcp_rput_common(tcp, mp); 11473 } 11474 11475 /* 11476 * The read side put procedure. 11477 * The packets passed up by ip are assume to be aligned according to 11478 * OK_32PTR and the IP+TCP headers fitting in the first mblk. 11479 */ 11480 static void 11481 tcp_rput_common(tcp_t *tcp, mblk_t *mp) 11482 { 11483 /* 11484 * tcp_rput_data() does not expect M_CTL except for the case 11485 * where tcp_ipv6_recvancillary is set and we get a IN_PKTINFO 11486 * type. Need to make sure that any other M_CTLs don't make 11487 * it to tcp_rput_data since it is not expecting any and doesn't 11488 * check for it. 11489 */ 11490 if (DB_TYPE(mp) == M_CTL) { 11491 switch (*(uint32_t *)(mp->b_rptr)) { 11492 case TCP_IOC_ABORT_CONN: 11493 /* 11494 * Handle connection abort request. 11495 */ 11496 tcp_ioctl_abort_handler(tcp, mp); 11497 return; 11498 case IPSEC_IN: 11499 /* 11500 * Only secure icmp arrive in TCP and they 11501 * don't go through data path. 11502 */ 11503 tcp_icmp_error(tcp, mp); 11504 return; 11505 case IN_PKTINFO: 11506 /* 11507 * Handle IPV6_RECVPKTINFO socket option on AF_INET6 11508 * sockets that are receiving IPv4 traffic. tcp 11509 */ 11510 ASSERT(tcp->tcp_family == AF_INET6); 11511 ASSERT(tcp->tcp_ipv6_recvancillary & 11512 TCP_IPV6_RECVPKTINFO); 11513 tcp_rput_data(tcp->tcp_connp, mp, 11514 tcp->tcp_connp->conn_sqp); 11515 return; 11516 case MDT_IOC_INFO_UPDATE: 11517 /* 11518 * Handle Multidata information update; the 11519 * following routine will free the message. 11520 */ 11521 if (tcp->tcp_connp->conn_mdt_ok) { 11522 tcp_mdt_update(tcp, 11523 &((ip_mdt_info_t *)mp->b_rptr)->mdt_capab, 11524 B_FALSE); 11525 } 11526 freemsg(mp); 11527 return; 11528 default: 11529 break; 11530 } 11531 } 11532 11533 /* No point processing the message if tcp is already closed */ 11534 if (TCP_IS_DETACHED_NONEAGER(tcp)) { 11535 freemsg(mp); 11536 return; 11537 } 11538 11539 tcp_rput_other(tcp, mp); 11540 } 11541 11542 11543 /* The minimum of smoothed mean deviation in RTO calculation. */ 11544 #define TCP_SD_MIN 400 11545 11546 /* 11547 * Set RTO for this connection. The formula is from Jacobson and Karels' 11548 * "Congestion Avoidance and Control" in SIGCOMM '88. 
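 * As a worked example of the arithmetic below (illustrative numbers):
 * from a cleared estimator, a first RTT sample of 100 ms gives
 * sa = 800 and sv = 200; sv is then raised to the TCP_SD_MIN floor of
 * 400, so RTO = 800/8 + 400 + 800/32 = 525 ms (plus
 * tcp_rexmit_interval_extra and subject to the min/max clamps).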
The variable names 11549 * are the same as those in Appendix A.2 of that paper. 11550 * 11551 * m = new measurement 11552 * sa = smoothed RTT average (8 * average estimates). 11553 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). 11554 */ 11555 static void 11556 tcp_set_rto(tcp_t *tcp, clock_t rtt) 11557 { 11558 long m = TICK_TO_MSEC(rtt); 11559 clock_t sa = tcp->tcp_rtt_sa; 11560 clock_t sv = tcp->tcp_rtt_sd; 11561 clock_t rto; 11562 11563 BUMP_MIB(&tcp_mib, tcpRttUpdate); 11564 tcp->tcp_rtt_update++; 11565 11566 /* tcp_rtt_sa is not 0 means this is a new sample. */ 11567 if (sa != 0) { 11568 /* 11569 * Update average estimator: 11570 * new rtt = 7/8 old rtt + 1/8 Error 11571 */ 11572 11573 /* m is now Error in estimate. */ 11574 m -= sa >> 3; 11575 if ((sa += m) <= 0) { 11576 /* 11577 * Don't allow the smoothed average to be negative. 11578 * We use 0 to denote reinitialization of the 11579 * variables. 11580 */ 11581 sa = 1; 11582 } 11583 11584 /* 11585 * Update deviation estimator: 11586 * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev) 11587 */ 11588 if (m < 0) 11589 m = -m; 11590 m -= sv >> 2; 11591 sv += m; 11592 } else { 11593 /* 11594 * This follows BSD's implementation. So the reinitialized 11595 * RTO is 3 * m. We cannot go less than 2 because if the 11596 * link is bandwidth dominated, doubling the window size 11597 * during slow start means doubling the RTT. We want to be 11598 * more conservative when we reinitialize our estimates. 3 11599 * is just a convenient number. 11600 */ 11601 sa = m << 3; 11602 sv = m << 1; 11603 } 11604 if (sv < TCP_SD_MIN) { 11605 /* 11606 * We do not know that if sa captures the delay ACK 11607 * effect as in a long train of segments, a receiver 11608 * does not delay its ACKs. So set the minimum of sv 11609 * to be TCP_SD_MIN, which is default to 400 ms, twice 11610 * of BSD DATO. That means the minimum of mean 11611 * deviation is 100 ms. 11612 * 11613 */ 11614 sv = TCP_SD_MIN; 11615 } 11616 tcp->tcp_rtt_sa = sa; 11617 tcp->tcp_rtt_sd = sv; 11618 /* 11619 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv) 11620 * 11621 * Add tcp_rexmit_interval extra in case of extreme environment 11622 * where the algorithm fails to work. The default value of 11623 * tcp_rexmit_interval_extra should be 0. 11624 * 11625 * As we use a finer grained clock than BSD and update 11626 * RTO for every ACKs, add in another .25 of RTT to the 11627 * deviation of RTO to accomodate burstiness of 1/4 of 11628 * window size. 11629 */ 11630 rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5); 11631 11632 if (rto > tcp_rexmit_interval_max) { 11633 tcp->tcp_rto = tcp_rexmit_interval_max; 11634 } else if (rto < tcp_rexmit_interval_min) { 11635 tcp->tcp_rto = tcp_rexmit_interval_min; 11636 } else { 11637 tcp->tcp_rto = rto; 11638 } 11639 11640 /* Now, we can reset tcp_timer_backoff to use the new RTO... */ 11641 tcp->tcp_timer_backoff = 0; 11642 } 11643 11644 /* 11645 * tcp_get_seg_mp() is called to get the pointer to a segment in the 11646 * send queue which starts at the given seq. no. 11647 * 11648 * Parameters: 11649 * tcp_t *tcp: the tcp instance pointer. 11650 * uint32_t seq: the starting seq. no of the requested segment. 11651 * int32_t *off: after the execution, *off will be the offset to 11652 * the returned mblk which points to the requested seq no. 11653 * It is the caller's responsibility to send in a non-null off. 11654 * 11655 * Return: 11656 * A mblk_t pointer pointing to the requested segment in send queue. 
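 * For example (illustrative): with tcp_suna = 1000 and an xmit list of
 * two 500-byte mblks, a request for seq 1200 returns the first mblk
 * with *off = 200, while seq 1500 returns the second with *off = 0.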
11657 */ 11658 static mblk_t * 11659 tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 11660 { 11661 int32_t cnt; 11662 mblk_t *mp; 11663 11664 /* Defensive coding. Make sure we don't send incorrect data. */ 11665 if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt)) 11666 return (NULL); 11667 11668 cnt = seq - tcp->tcp_suna; 11669 mp = tcp->tcp_xmit_head; 11670 while (cnt > 0 && mp != NULL) { 11671 cnt -= mp->b_wptr - mp->b_rptr; 11672 if (cnt < 0) { 11673 cnt += mp->b_wptr - mp->b_rptr; 11674 break; 11675 } 11676 mp = mp->b_cont; 11677 } 11678 ASSERT(mp != NULL); 11679 *off = cnt; 11680 return (mp); 11681 } 11682 11683 /* 11684 * This function handles all retransmissions if SACK is enabled for this 11685 * connection. First it calculates how many segments can be retransmitted 11686 * based on tcp_pipe. Then it goes thru the notsack list to find eligible 11687 * segments. A segment is eligible if sack_cnt for that segment is greater 11688 * than or equal tcp_dupack_fast_retransmit. After it has retransmitted 11689 * all eligible segments, it checks to see if TCP can send some new segments 11690 * (fast recovery). If it can, set the appropriate flag for tcp_rput_data(). 11691 * 11692 * Parameters: 11693 * tcp_t *tcp: the tcp structure of the connection. 11694 * uint_t *flags: in return, appropriate value will be set for 11695 * tcp_rput_data(). 11696 */ 11697 static void 11698 tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) 11699 { 11700 notsack_blk_t *notsack_blk; 11701 int32_t usable_swnd; 11702 int32_t mss; 11703 uint32_t seg_len; 11704 mblk_t *xmit_mp; 11705 11706 ASSERT(tcp->tcp_sack_info != NULL); 11707 ASSERT(tcp->tcp_notsack_list != NULL); 11708 ASSERT(tcp->tcp_rexmit == B_FALSE); 11709 11710 /* Defensive coding in case there is a bug... */ 11711 if (tcp->tcp_notsack_list == NULL) { 11712 return; 11713 } 11714 notsack_blk = tcp->tcp_notsack_list; 11715 mss = tcp->tcp_mss; 11716 11717 /* 11718 * Limit the num of outstanding data in the network to be 11719 * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 11720 */ 11721 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 11722 11723 /* At least retransmit 1 MSS of data. */ 11724 if (usable_swnd <= 0) { 11725 usable_swnd = mss; 11726 } 11727 11728 /* Make sure no new RTT samples will be taken. */ 11729 tcp->tcp_csuna = tcp->tcp_snxt; 11730 11731 notsack_blk = tcp->tcp_notsack_list; 11732 while (usable_swnd > 0) { 11733 mblk_t *snxt_mp, *tmp_mp; 11734 tcp_seq begin = tcp->tcp_sack_snxt; 11735 tcp_seq end; 11736 int32_t off; 11737 11738 for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { 11739 if (SEQ_GT(notsack_blk->end, begin) && 11740 (notsack_blk->sack_cnt >= 11741 tcp_dupack_fast_retransmit)) { 11742 end = notsack_blk->end; 11743 if (SEQ_LT(begin, notsack_blk->begin)) { 11744 begin = notsack_blk->begin; 11745 } 11746 break; 11747 } 11748 } 11749 /* 11750 * All holes are filled. Manipulate tcp_cwnd to send more 11751 * if we can. Note that after the SACK recovery, tcp_cwnd is 11752 * set to tcp_cwnd_ssthresh. 
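 * For instance (illustrative numbers): if tcp_cwnd_ssthresh is 10 MSS
 * and tcp_pipe has drained to 6 MSS by the time all holes are filled,
 * and there is still unsent data, tcp_cwnd is grown to the outstanding
 * amount plus 4 MSS and TH_XMIT_NEEDED is flagged for tcp_rput_data().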
11753 */ 11754 if (notsack_blk == NULL) { 11755 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 11756 if (usable_swnd <= 0 || tcp->tcp_unsent == 0) { 11757 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; 11758 ASSERT(tcp->tcp_cwnd > 0); 11759 return; 11760 } else { 11761 usable_swnd = usable_swnd / mss; 11762 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + 11763 MAX(usable_swnd * mss, mss); 11764 *flags |= TH_XMIT_NEEDED; 11765 return; 11766 } 11767 } 11768 11769 /* 11770 * Note that we may send more than usable_swnd allows here 11771 * because of round off, but no more than 1 MSS of data. 11772 */ 11773 seg_len = end - begin; 11774 if (seg_len > mss) 11775 seg_len = mss; 11776 snxt_mp = tcp_get_seg_mp(tcp, begin, &off); 11777 ASSERT(snxt_mp != NULL); 11778 /* This should not happen. Defensive coding again... */ 11779 if (snxt_mp == NULL) { 11780 return; 11781 } 11782 11783 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, 11784 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); 11785 if (xmit_mp == NULL) 11786 return; 11787 11788 usable_swnd -= seg_len; 11789 tcp->tcp_pipe += seg_len; 11790 tcp->tcp_sack_snxt = begin + seg_len; 11791 TCP_RECORD_TRACE(tcp, xmit_mp, TCP_TRACE_SEND_PKT); 11792 tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); 11793 11794 /* 11795 * Update the send timestamp to avoid false retransmission. 11796 */ 11797 snxt_mp->b_prev = (mblk_t *)lbolt; 11798 11799 BUMP_MIB(&tcp_mib, tcpRetransSegs); 11800 UPDATE_MIB(&tcp_mib, tcpRetransBytes, seg_len); 11801 BUMP_MIB(&tcp_mib, tcpOutSackRetransSegs); 11802 /* 11803 * Update tcp_rexmit_max to extend this SACK recovery phase. 11804 * This happens when new data sent during fast recovery is 11805 * also lost. If TCP retransmits those new data, it needs 11806 * to extend SACK recover phase to avoid starting another 11807 * fast retransmit/recovery unnecessarily. 11808 */ 11809 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { 11810 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; 11811 } 11812 } 11813 } 11814 11815 /* 11816 * This function handles policy checking at TCP level for non-hard_bound/ 11817 * detached connections. 11818 */ 11819 static boolean_t 11820 tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, 11821 boolean_t secure, boolean_t mctl_present) 11822 { 11823 ipsec_latch_t *ipl = NULL; 11824 ipsec_action_t *act = NULL; 11825 mblk_t *data_mp; 11826 ipsec_in_t *ii; 11827 const char *reason; 11828 kstat_named_t *counter; 11829 11830 ASSERT(mctl_present || !secure); 11831 11832 ASSERT((ipha == NULL && ip6h != NULL) || 11833 (ip6h == NULL && ipha != NULL)); 11834 11835 /* 11836 * We don't necessarily have an ipsec_in_act action to verify 11837 * policy because of assymetrical policy where we have only 11838 * outbound policy and no inbound policy (possible with global 11839 * policy). 11840 */ 11841 if (!secure) { 11842 if (act == NULL || act->ipa_act.ipa_type == IPSEC_ACT_BYPASS || 11843 act->ipa_act.ipa_type == IPSEC_ACT_CLEAR) 11844 return (B_TRUE); 11845 ipsec_log_policy_failure(tcp->tcp_wq, IPSEC_POLICY_MISMATCH, 11846 "tcp_check_policy", ipha, ip6h, secure); 11847 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 11848 &ipdrops_tcp_clear, &tcp_dropper); 11849 return (B_FALSE); 11850 } 11851 11852 /* 11853 * We have a secure packet. 
11854 */ 11855 if (act == NULL) { 11856 ipsec_log_policy_failure(tcp->tcp_wq, 11857 IPSEC_POLICY_NOT_NEEDED, "tcp_check_policy", ipha, ip6h, 11858 secure); 11859 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 11860 &ipdrops_tcp_secure, &tcp_dropper); 11861 return (B_FALSE); 11862 } 11863 11864 /* 11865 * XXX This whole routine is currently incorrect. ipl should 11866 * be set to the latch pointer, but is currently not set, so 11867 * we initialize it to NULL to avoid picking up random garbage. 11868 */ 11869 if (ipl == NULL) 11870 return (B_TRUE); 11871 11872 data_mp = first_mp->b_cont; 11873 11874 ii = (ipsec_in_t *)first_mp->b_rptr; 11875 11876 if (ipsec_check_ipsecin_latch(ii, data_mp, ipl, ipha, ip6h, &reason, 11877 &counter)) { 11878 BUMP_MIB(&ip_mib, ipsecInSucceeded); 11879 return (B_TRUE); 11880 } 11881 (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, 11882 "tcp inbound policy mismatch: %s, packet dropped\n", 11883 reason); 11884 BUMP_MIB(&ip_mib, ipsecInFailed); 11885 11886 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, &tcp_dropper); 11887 return (B_FALSE); 11888 } 11889 11890 /* 11891 * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start 11892 * retransmission after a timeout. 11893 * 11894 * To limit the number of duplicate segments, we limit the number of segment 11895 * to be sent in one time to tcp_snd_burst, the burst variable. 11896 */ 11897 static void 11898 tcp_ss_rexmit(tcp_t *tcp) 11899 { 11900 uint32_t snxt; 11901 uint32_t smax; 11902 int32_t win; 11903 int32_t mss; 11904 int32_t off; 11905 int32_t burst = tcp->tcp_snd_burst; 11906 mblk_t *snxt_mp; 11907 11908 /* 11909 * Note that tcp_rexmit can be set even though TCP has retransmitted 11910 * all unack'ed segments. 11911 */ 11912 if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) { 11913 smax = tcp->tcp_rexmit_max; 11914 snxt = tcp->tcp_rexmit_nxt; 11915 if (SEQ_LT(snxt, tcp->tcp_suna)) { 11916 snxt = tcp->tcp_suna; 11917 } 11918 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd); 11919 win -= snxt - tcp->tcp_suna; 11920 mss = tcp->tcp_mss; 11921 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off); 11922 11923 while (SEQ_LT(snxt, smax) && (win > 0) && 11924 (burst > 0) && (snxt_mp != NULL)) { 11925 mblk_t *xmit_mp; 11926 mblk_t *old_snxt_mp = snxt_mp; 11927 uint32_t cnt = mss; 11928 11929 if (win < cnt) { 11930 cnt = win; 11931 } 11932 if (SEQ_GT(snxt + cnt, smax)) { 11933 cnt = smax - snxt; 11934 } 11935 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off, 11936 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE); 11937 if (xmit_mp == NULL) 11938 return; 11939 11940 tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); 11941 11942 snxt += cnt; 11943 win -= cnt; 11944 /* 11945 * Update the send timestamp to avoid false 11946 * retransmission. 11947 */ 11948 old_snxt_mp->b_prev = (mblk_t *)lbolt; 11949 BUMP_MIB(&tcp_mib, tcpRetransSegs); 11950 UPDATE_MIB(&tcp_mib, tcpRetransBytes, cnt); 11951 11952 tcp->tcp_rexmit_nxt = snxt; 11953 burst--; 11954 } 11955 /* 11956 * If we have transmitted all we have at the time 11957 * we started the retranmission, we can leave 11958 * the rest of the job to tcp_wput_data(). But we 11959 * need to check the send window first. If the 11960 * win is not 0, go on with tcp_wput_data(). 11961 */ 11962 if (SEQ_LT(snxt, smax) || win == 0) { 11963 return; 11964 } 11965 } 11966 /* Only call tcp_wput_data() if there is data to be sent. */ 11967 if (tcp->tcp_unsent) { 11968 tcp_wput_data(tcp, NULL, B_FALSE); 11969 } 11970 } 11971 11972 /* 11973 * Process all TCP option in SYN segment. 
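 * (For example - illustrative values only - a SYN advertising
 * <MSS 1460, wscale 2, timestamps, SACK permitted> leaves the peer's
 * MSS clamped between tcp_mss_min and the per-address-family maximum,
 * tcp_snd_ws set to 2, the timestamp option reflected into our header
 * template and a SACK info block allocated, all subject to the checks
 * spelled out below.)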
Note that this function should 11974 * be called after tcp_adapt_ire() is called so that the necessary info 11975 * from IRE is already set in the tcp structure. 11976 * 11977 * This function sets up the correct tcp_mss value according to the 11978 * MSS option value and our header size. It also sets up the window scale 11979 * and timestamp values, and initialize SACK info blocks. But it does not 11980 * change receive window size after setting the tcp_mss value. The caller 11981 * should do the appropriate change. 11982 */ 11983 void 11984 tcp_process_options(tcp_t *tcp, tcph_t *tcph) 11985 { 11986 int options; 11987 tcp_opt_t tcpopt; 11988 uint32_t mss_max; 11989 char *tmp_tcph; 11990 11991 tcpopt.tcp = NULL; 11992 options = tcp_parse_options(tcph, &tcpopt); 11993 11994 /* 11995 * Process MSS option. Note that MSS option value does not account 11996 * for IP or TCP options. This means that it is equal to MTU - minimum 11997 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 11998 * IPv6. 11999 */ 12000 if (!(options & TCP_OPT_MSS_PRESENT)) { 12001 if (tcp->tcp_ipversion == IPV4_VERSION) 12002 tcpopt.tcp_opt_mss = tcp_mss_def_ipv4; 12003 else 12004 tcpopt.tcp_opt_mss = tcp_mss_def_ipv6; 12005 } else { 12006 if (tcp->tcp_ipversion == IPV4_VERSION) 12007 mss_max = tcp_mss_max_ipv4; 12008 else 12009 mss_max = tcp_mss_max_ipv6; 12010 if (tcpopt.tcp_opt_mss < tcp_mss_min) 12011 tcpopt.tcp_opt_mss = tcp_mss_min; 12012 else if (tcpopt.tcp_opt_mss > mss_max) 12013 tcpopt.tcp_opt_mss = mss_max; 12014 } 12015 12016 /* Process Window Scale option. */ 12017 if (options & TCP_OPT_WSCALE_PRESENT) { 12018 tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 12019 tcp->tcp_snd_ws_ok = B_TRUE; 12020 } else { 12021 tcp->tcp_snd_ws = B_FALSE; 12022 tcp->tcp_snd_ws_ok = B_FALSE; 12023 tcp->tcp_rcv_ws = B_FALSE; 12024 } 12025 12026 /* Process Timestamp option. */ 12027 if ((options & TCP_OPT_TSTAMP_PRESENT) && 12028 (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { 12029 tmp_tcph = (char *)tcp->tcp_tcph; 12030 12031 tcp->tcp_snd_ts_ok = B_TRUE; 12032 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 12033 tcp->tcp_last_rcv_lbolt = lbolt64; 12034 ASSERT(OK_32PTR(tmp_tcph)); 12035 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 12036 12037 /* Fill in our template header with basic timestamp option. */ 12038 tmp_tcph += tcp->tcp_tcp_hdr_len; 12039 tmp_tcph[0] = TCPOPT_NOP; 12040 tmp_tcph[1] = TCPOPT_NOP; 12041 tmp_tcph[2] = TCPOPT_TSTAMP; 12042 tmp_tcph[3] = TCPOPT_TSTAMP_LEN; 12043 tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN; 12044 tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN; 12045 tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4); 12046 } else { 12047 tcp->tcp_snd_ts_ok = B_FALSE; 12048 } 12049 12050 /* 12051 * Process SACK options. If SACK is enabled for this connection, 12052 * then allocate the SACK info structure. Note the following ways 12053 * when tcp_snd_sack_ok is set to true. 12054 * 12055 * For active connection: in tcp_adapt_ire() called in 12056 * tcp_rput_other(), or in tcp_rput_other() when tcp_sack_permitted 12057 * is checked. 12058 * 12059 * For passive connection: in tcp_adapt_ire() called in 12060 * tcp_accept_comm(). 12061 * 12062 * That's the reason why the extra TCP_IS_DETACHED() check is there. 12063 * That check makes sure that if we did not send a SACK OK option, 12064 * we will not enable SACK for this connection even though the other 12065 * side sends us SACK OK option. For active connection, the SACK 12066 * info structure has already been allocated. 
So we need to free 12067 * it if SACK is disabled. 12068 */ 12069 if ((options & TCP_OPT_SACK_OK_PRESENT) && 12070 (tcp->tcp_snd_sack_ok || 12071 (tcp_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) { 12072 /* This should be true only in the passive case. */ 12073 if (tcp->tcp_sack_info == NULL) { 12074 ASSERT(TCP_IS_DETACHED(tcp)); 12075 tcp->tcp_sack_info = 12076 kmem_cache_alloc(tcp_sack_info_cache, KM_NOSLEEP); 12077 } 12078 if (tcp->tcp_sack_info == NULL) { 12079 tcp->tcp_snd_sack_ok = B_FALSE; 12080 } else { 12081 tcp->tcp_snd_sack_ok = B_TRUE; 12082 if (tcp->tcp_snd_ts_ok) { 12083 tcp->tcp_max_sack_blk = 3; 12084 } else { 12085 tcp->tcp_max_sack_blk = 4; 12086 } 12087 } 12088 } else { 12089 /* 12090 * Resetting tcp_snd_sack_ok to B_FALSE so that 12091 * no SACK info will be used for this 12092 * connection. This assumes that SACK usage 12093 * permission is negotiated. This may need 12094 * to be changed once this is clarified. 12095 */ 12096 if (tcp->tcp_sack_info != NULL) { 12097 ASSERT(tcp->tcp_notsack_list == NULL); 12098 kmem_cache_free(tcp_sack_info_cache, 12099 tcp->tcp_sack_info); 12100 tcp->tcp_sack_info = NULL; 12101 } 12102 tcp->tcp_snd_sack_ok = B_FALSE; 12103 } 12104 12105 /* 12106 * Now we know the exact TCP/IP header length, subtract 12107 * that from tcp_mss to get our side's MSS. 12108 */ 12109 tcp->tcp_mss -= tcp->tcp_hdr_len; 12110 /* 12111 * Here we assume that the other side's header size will be equal to 12112 * our header size. We calculate the real MSS accordingly. Need to 12113 * take into additional stuffs IPsec puts in. 12114 * 12115 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) 12116 */ 12117 tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len + tcp->tcp_ipsec_overhead - 12118 ((tcp->tcp_ipversion == IPV4_VERSION ? 12119 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); 12120 12121 /* 12122 * Set MSS to the smaller one of both ends of the connection. 12123 * We should not have called tcp_mss_set() before, but our 12124 * side of the MSS should have been set to a proper value 12125 * by tcp_adapt_ire(). tcp_mss_set() will also set up the 12126 * STREAM head parameters properly. 12127 * 12128 * If we have a larger-than-16-bit window but the other side 12129 * didn't want to do window scale, tcp_rwnd_set() will take 12130 * care of that. 12131 */ 12132 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); 12133 } 12134 12135 /* 12136 * Sends the T_CONN_IND to the listener. The caller calls this 12137 * functions via squeue to get inside the listener's perimeter 12138 * once the 3 way hand shake is done a T_CONN_IND needs to be 12139 * sent. As an optimization, the caller can call this directly 12140 * if listener's perimeter is same as eager's. 12141 */ 12142 /* ARGSUSED */ 12143 void 12144 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) 12145 { 12146 conn_t *lconnp = (conn_t *)arg; 12147 tcp_t *listener = lconnp->conn_tcp; 12148 tcp_t *tcp; 12149 struct T_conn_ind *conn_ind; 12150 ipaddr_t *addr_cache; 12151 boolean_t need_send_conn_ind = B_FALSE; 12152 12153 /* retrieve the eager */ 12154 conn_ind = (struct T_conn_ind *)mp->b_rptr; 12155 ASSERT(conn_ind->OPT_offset != 0 && 12156 conn_ind->OPT_length == sizeof (intptr_t)); 12157 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 12158 conn_ind->OPT_length); 12159 12160 /* 12161 * TLI/XTI applications will get confused by 12162 * sending eager as an option since it violates 12163 * the option semantics. So remove the eager as 12164 * option since TLI/XTI app doesn't need it anyway. 
12165 */ 12166 if (!TCP_IS_SOCKET(listener)) { 12167 conn_ind->OPT_length = 0; 12168 conn_ind->OPT_offset = 0; 12169 } 12170 if (listener->tcp_state == TCPS_CLOSED || 12171 TCP_IS_DETACHED(listener)) { 12172 /* 12173 * If listener has closed, it would have caused a 12174 * a cleanup/blowoff to happen for the eager. We 12175 * just need to return. 12176 */ 12177 freemsg(mp); 12178 return; 12179 } 12180 12181 12182 /* 12183 * if the conn_req_q is full defer passing up the 12184 * T_CONN_IND until space is availabe after t_accept() 12185 * processing 12186 */ 12187 mutex_enter(&listener->tcp_eager_lock); 12188 if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) { 12189 tcp_t *tail; 12190 12191 /* 12192 * The eager already has an extra ref put in tcp_rput_data 12193 * so that it stays till accept comes back even though it 12194 * might get into TCPS_CLOSED as a result of a TH_RST etc. 12195 */ 12196 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 12197 listener->tcp_conn_req_cnt_q0--; 12198 listener->tcp_conn_req_cnt_q++; 12199 12200 /* Move from SYN_RCVD to ESTABLISHED list */ 12201 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 12202 tcp->tcp_eager_prev_q0; 12203 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 12204 tcp->tcp_eager_next_q0; 12205 tcp->tcp_eager_prev_q0 = NULL; 12206 tcp->tcp_eager_next_q0 = NULL; 12207 12208 /* 12209 * Insert at end of the queue because sockfs 12210 * sends down T_CONN_RES in chronological 12211 * order. Leaving the older conn indications 12212 * at front of the queue helps reducing search 12213 * time. 12214 */ 12215 tail = listener->tcp_eager_last_q; 12216 if (tail != NULL) 12217 tail->tcp_eager_next_q = tcp; 12218 else 12219 listener->tcp_eager_next_q = tcp; 12220 listener->tcp_eager_last_q = tcp; 12221 tcp->tcp_eager_next_q = NULL; 12222 /* 12223 * Delay sending up the T_conn_ind until we are 12224 * done with the eager. Once we have have sent up 12225 * the T_conn_ind, the accept can potentially complete 12226 * any time and release the refhold we have on the eager. 12227 */ 12228 need_send_conn_ind = B_TRUE; 12229 } else { 12230 /* 12231 * Defer connection on q0 and set deferred 12232 * connection bit true 12233 */ 12234 tcp->tcp_conn_def_q0 = B_TRUE; 12235 12236 /* take tcp out of q0 ... */ 12237 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 12238 tcp->tcp_eager_next_q0; 12239 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 12240 tcp->tcp_eager_prev_q0; 12241 12242 /* ... and place it at the end of q0 */ 12243 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 12244 tcp->tcp_eager_next_q0 = listener; 12245 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 12246 listener->tcp_eager_prev_q0 = tcp; 12247 tcp->tcp_conn.tcp_eager_conn_ind = mp; 12248 } 12249 12250 /* we have timed out before */ 12251 if (tcp->tcp_syn_rcvd_timeout != 0) { 12252 tcp->tcp_syn_rcvd_timeout = 0; 12253 listener->tcp_syn_rcvd_timeout--; 12254 if (listener->tcp_syn_defense && 12255 listener->tcp_syn_rcvd_timeout <= 12256 (tcp_conn_req_max_q0 >> 5) && 12257 10*MINUTES < TICK_TO_MSEC(lbolt64 - 12258 listener->tcp_last_rcv_lbolt)) { 12259 /* 12260 * Turn off the defense mode if we 12261 * believe the SYN attack is over. 
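 * For example (with the usual default tcp_conn_req_max_q0 of 1024,
 * illustrative): the defense is dropped once at most 32 pending
 * connections have timed out in SYN_RCVD and more than 10 minutes
 * have elapsed measured against the listener's tcp_last_rcv_lbolt.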
12262 */ 12263 listener->tcp_syn_defense = B_FALSE; 12264 if (listener->tcp_ip_addr_cache) { 12265 kmem_free((void *)listener->tcp_ip_addr_cache, 12266 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 12267 listener->tcp_ip_addr_cache = NULL; 12268 } 12269 } 12270 } 12271 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 12272 if (addr_cache != NULL) { 12273 /* 12274 * We have finished a 3-way handshake with this 12275 * remote host. This proves the IP addr is good. 12276 * Cache it! 12277 */ 12278 addr_cache[IP_ADDR_CACHE_HASH( 12279 tcp->tcp_remote)] = tcp->tcp_remote; 12280 } 12281 mutex_exit(&listener->tcp_eager_lock); 12282 if (need_send_conn_ind) 12283 putnext(listener->tcp_rq, mp); 12284 } 12285 12286 mblk_t * 12287 tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, 12288 uint_t *ifindexp, ip6_pkt_t *ippp) 12289 { 12290 in_pktinfo_t *pinfo; 12291 ip6_t *ip6h; 12292 uchar_t *rptr; 12293 mblk_t *first_mp = mp; 12294 boolean_t mctl_present = B_FALSE; 12295 uint_t ifindex = 0; 12296 ip6_pkt_t ipp; 12297 uint_t ipvers; 12298 uint_t ip_hdr_len; 12299 12300 rptr = mp->b_rptr; 12301 ASSERT(OK_32PTR(rptr)); 12302 ASSERT(tcp != NULL); 12303 ipp.ipp_fields = 0; 12304 12305 switch DB_TYPE(mp) { 12306 case M_CTL: 12307 mp = mp->b_cont; 12308 if (mp == NULL) { 12309 freemsg(first_mp); 12310 return (NULL); 12311 } 12312 if (DB_TYPE(mp) != M_DATA) { 12313 freemsg(first_mp); 12314 return (NULL); 12315 } 12316 mctl_present = B_TRUE; 12317 break; 12318 case M_DATA: 12319 break; 12320 default: 12321 cmn_err(CE_NOTE, "tcp_find_pktinfo: unknown db_type"); 12322 freemsg(mp); 12323 return (NULL); 12324 } 12325 ipvers = IPH_HDR_VERSION(rptr); 12326 if (ipvers == IPV4_VERSION) { 12327 if (tcp == NULL) { 12328 ip_hdr_len = IPH_HDR_LENGTH(rptr); 12329 goto done; 12330 } 12331 12332 ipp.ipp_fields |= IPPF_HOPLIMIT; 12333 ipp.ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl; 12334 12335 /* 12336 * If we have IN_PKTINFO in an M_CTL and tcp_ipv6_recvancillary 12337 * has TCP_IPV6_RECVPKTINFO set, pass I/F index along in ipp. 
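 * An application asks for this with (illustrative user-level code):
 *
 *	int on = 1;
 *	(void) setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
 *	    &on, sizeof (on));
 *
 * after which IPV6_PKTINFO ancillary data is delivered even for IPv4
 * traffic received on the AF_INET6 socket.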
12338 */ 12339 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) && 12340 mctl_present) { 12341 pinfo = (in_pktinfo_t *)first_mp->b_rptr; 12342 if ((MBLKL(first_mp) == sizeof (in_pktinfo_t)) && 12343 (pinfo->in_pkt_ulp_type == IN_PKTINFO) && 12344 (pinfo->in_pkt_flags & IPF_RECVIF)) { 12345 ipp.ipp_fields |= IPPF_IFINDEX; 12346 ipp.ipp_ifindex = pinfo->in_pkt_ifindex; 12347 ifindex = pinfo->in_pkt_ifindex; 12348 } 12349 freeb(first_mp); 12350 mctl_present = B_FALSE; 12351 } 12352 ip_hdr_len = IPH_HDR_LENGTH(rptr); 12353 } else { 12354 ip6h = (ip6_t *)rptr; 12355 12356 ASSERT(ipvers == IPV6_VERSION); 12357 ipp.ipp_fields = IPPF_HOPLIMIT | IPPF_TCLASS; 12358 ipp.ipp_tclass = (ip6h->ip6_flow & 0x0FF00000) >> 20; 12359 ipp.ipp_hoplimit = ip6h->ip6_hops; 12360 12361 if (ip6h->ip6_nxt != IPPROTO_TCP) { 12362 uint8_t nexthdrp; 12363 12364 /* Look for ifindex information */ 12365 if (ip6h->ip6_nxt == IPPROTO_RAW) { 12366 ip6i_t *ip6i = (ip6i_t *)ip6h; 12367 if ((uchar_t *)&ip6i[1] > mp->b_wptr) { 12368 BUMP_MIB(&ip_mib, tcpInErrs); 12369 freemsg(first_mp); 12370 return (NULL); 12371 } 12372 12373 if (ip6i->ip6i_flags & IP6I_IFINDEX) { 12374 ASSERT(ip6i->ip6i_ifindex != 0); 12375 ipp.ipp_fields |= IPPF_IFINDEX; 12376 ipp.ipp_ifindex = ip6i->ip6i_ifindex; 12377 ifindex = ip6i->ip6i_ifindex; 12378 } 12379 rptr = (uchar_t *)&ip6i[1]; 12380 mp->b_rptr = rptr; 12381 if (rptr == mp->b_wptr) { 12382 mblk_t *mp1; 12383 mp1 = mp->b_cont; 12384 freeb(mp); 12385 mp = mp1; 12386 rptr = mp->b_rptr; 12387 } 12388 if (MBLKL(mp) < IPV6_HDR_LEN + 12389 sizeof (tcph_t)) { 12390 BUMP_MIB(&ip_mib, tcpInErrs); 12391 freemsg(first_mp); 12392 return (NULL); 12393 } 12394 ip6h = (ip6_t *)rptr; 12395 } 12396 12397 /* 12398 * Find any potentially interesting extension headers 12399 * as well as the length of the IPv6 + extension 12400 * headers. 12401 */ 12402 ip_hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp); 12403 /* Verify if this is a TCP packet */ 12404 if (nexthdrp != IPPROTO_TCP) { 12405 BUMP_MIB(&ip_mib, tcpInErrs); 12406 freemsg(first_mp); 12407 return (NULL); 12408 } 12409 } else { 12410 ip_hdr_len = IPV6_HDR_LEN; 12411 } 12412 } 12413 12414 done: 12415 if (ipversp != NULL) 12416 *ipversp = ipvers; 12417 if (ip_hdr_lenp != NULL) 12418 *ip_hdr_lenp = ip_hdr_len; 12419 if (ippp != NULL) 12420 *ippp = ipp; 12421 if (ifindexp != NULL) 12422 *ifindexp = ifindex; 12423 if (mctl_present) { 12424 freeb(first_mp); 12425 } 12426 return (mp); 12427 } 12428 12429 /* 12430 * Handle M_DATA messages from IP. Its called directly from IP via 12431 * squeue for AF_INET type sockets fast path. No M_CTL are expected 12432 * in this path. 12433 * 12434 * For everything else (including AF_INET6 sockets with 'tcp_ipversion' 12435 * v4 and v6), we are called through tcp_input() and a M_CTL can 12436 * be present for options but tcp_find_pktinfo() deals with it. We 12437 * only expect M_DATA packets after tcp_find_pktinfo() is done. 12438 * 12439 * The first argument is always the connp/tcp to which the mp belongs. 12440 * There are no exceptions to this rule. The caller has already put 12441 * a reference on this connp/tcp and once tcp_rput_data() returns, 12442 * the squeue will do the refrele. 12443 * 12444 * The TH_SYN for the listener directly go to tcp_conn_request via 12445 * squeue. 
12446 * 12447 * sqp: NULL = recursive, sqp != NULL means called from squeue 12448 */ 12449 void 12450 tcp_rput_data(void *arg, mblk_t *mp, void *arg2) 12451 { 12452 int32_t bytes_acked; 12453 int32_t gap; 12454 mblk_t *mp1; 12455 uint_t flags; 12456 uint32_t new_swnd = 0; 12457 uchar_t *iphdr; 12458 uchar_t *rptr; 12459 int32_t rgap; 12460 uint32_t seg_ack; 12461 int seg_len; 12462 uint_t ip_hdr_len; 12463 uint32_t seg_seq; 12464 tcph_t *tcph; 12465 int urp; 12466 tcp_opt_t tcpopt; 12467 uint_t ipvers; 12468 ip6_pkt_t ipp; 12469 boolean_t ofo_seg = B_FALSE; /* Out of order segment */ 12470 uint32_t cwnd; 12471 uint32_t add; 12472 int npkt; 12473 int mss; 12474 conn_t *connp = (conn_t *)arg; 12475 squeue_t *sqp = (squeue_t *)arg2; 12476 tcp_t *tcp = connp->conn_tcp; 12477 12478 /* 12479 * RST from fused tcp loopback peer should trigger an unfuse. 12480 */ 12481 if (tcp->tcp_fused) { 12482 TCP_STAT(tcp_fusion_aborted); 12483 tcp_unfuse(tcp); 12484 } 12485 12486 iphdr = mp->b_rptr; 12487 rptr = mp->b_rptr; 12488 ASSERT(OK_32PTR(rptr)); 12489 12490 /* 12491 * An AF_INET socket is not capable of receiving any pktinfo. Do inline 12492 * processing here. For rest call tcp_find_pktinfo to fill up the 12493 * necessary information. 12494 */ 12495 if (IPCL_IS_TCP4(connp)) { 12496 ipvers = IPV4_VERSION; 12497 ip_hdr_len = IPH_HDR_LENGTH(rptr); 12498 } else { 12499 mp = tcp_find_pktinfo(tcp, mp, &ipvers, &ip_hdr_len, 12500 NULL, &ipp); 12501 if (mp == NULL) { 12502 TCP_STAT(tcp_rput_v6_error); 12503 return; 12504 } 12505 iphdr = mp->b_rptr; 12506 rptr = mp->b_rptr; 12507 } 12508 ASSERT(DB_TYPE(mp) == M_DATA); 12509 12510 tcph = (tcph_t *)&rptr[ip_hdr_len]; 12511 seg_seq = ABE32_TO_U32(tcph->th_seq); 12512 seg_ack = ABE32_TO_U32(tcph->th_ack); 12513 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 12514 seg_len = (int)(mp->b_wptr - rptr) - 12515 (ip_hdr_len + TCP_HDR_LENGTH(tcph)); 12516 if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { 12517 do { 12518 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 12519 (uintptr_t)INT_MAX); 12520 seg_len += (int)(mp1->b_wptr - mp1->b_rptr); 12521 } while ((mp1 = mp1->b_cont) != NULL && 12522 mp1->b_datap->db_type == M_DATA); 12523 } 12524 12525 if (tcp->tcp_state == TCPS_TIME_WAIT) { 12526 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 12527 seg_len, tcph); 12528 return; 12529 } 12530 12531 if (sqp != NULL) { 12532 /* 12533 * This is the correct place to update tcp_last_recv_time. Note 12534 * that it is also updated for tcp structure that belongs to 12535 * global and listener queues which do not really need updating. 12536 * But that should not cause any harm. And it is updated for 12537 * all kinds of incoming segments, not only for data segments. 12538 */ 12539 tcp->tcp_last_recv_time = lbolt; 12540 } 12541 12542 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 12543 12544 BUMP_LOCAL(tcp->tcp_ibsegs); 12545 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT); 12546 12547 if ((flags & TH_URG) && sqp != NULL) { 12548 /* 12549 * TCP can't handle urgent pointers that arrive before 12550 * the connection has been accept()ed since it can't 12551 * buffer OOB data. Discard segment if this happens. 12552 * 12553 * Nor can it reassemble urgent pointers, so discard 12554 * if it's not the next segment expected. 12555 * 12556 * Otherwise, collapse chain into one mblk (discard if 12557 * that fails). This makes sure the headers, retransmitted 12558 * data, and new data all are in the same mblk. 
12559 */ 12560 ASSERT(mp != NULL); 12561 if (tcp->tcp_listener || !pullupmsg(mp, -1)) { 12562 freemsg(mp); 12563 return; 12564 } 12565 /* Update pointers into message */ 12566 iphdr = rptr = mp->b_rptr; 12567 tcph = (tcph_t *)&rptr[ip_hdr_len]; 12568 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { 12569 /* 12570 * Since we can't handle any data with this urgent 12571 * pointer that is out of sequence, we expunge 12572 * the data. This allows us to still register 12573 * the urgent mark and generate the M_PCSIG, 12574 * which we can do. 12575 */ 12576 mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); 12577 seg_len = 0; 12578 } 12579 } 12580 12581 switch (tcp->tcp_state) { 12582 case TCPS_SYN_SENT: 12583 if (flags & TH_ACK) { 12584 /* 12585 * Note that our stack cannot send data before a 12586 * connection is established, therefore the 12587 * following check is valid. Otherwise, it has 12588 * to be changed. 12589 */ 12590 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 12591 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 12592 freemsg(mp); 12593 if (flags & TH_RST) 12594 return; 12595 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 12596 tcp, seg_ack, 0, TH_RST); 12597 return; 12598 } 12599 ASSERT(tcp->tcp_suna + 1 == seg_ack); 12600 } 12601 if (flags & TH_RST) { 12602 freemsg(mp); 12603 if (flags & TH_ACK) 12604 (void) tcp_clean_death(tcp, 12605 ECONNREFUSED, 13); 12606 return; 12607 } 12608 if (!(flags & TH_SYN)) { 12609 freemsg(mp); 12610 return; 12611 } 12612 12613 /* Process all TCP options. */ 12614 tcp_process_options(tcp, tcph); 12615 /* 12616 * The following changes our rwnd to be a multiple of the 12617 * MIN(peer MSS, our MSS) for performance reason. 12618 */ 12619 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat, 12620 tcp->tcp_mss)); 12621 12622 /* Is the other end ECN capable? */ 12623 if (tcp->tcp_ecn_ok) { 12624 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 12625 tcp->tcp_ecn_ok = B_FALSE; 12626 } 12627 } 12628 /* 12629 * Clear ECN flags because it may interfere with later 12630 * processing. 12631 */ 12632 flags &= ~(TH_ECE|TH_CWR); 12633 12634 tcp->tcp_irs = seg_seq; 12635 tcp->tcp_rack = seg_seq; 12636 tcp->tcp_rnxt = seg_seq + 1; 12637 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 12638 if (!TCP_IS_DETACHED(tcp)) { 12639 /* Allocate room for SACK options if needed. */ 12640 if (tcp->tcp_snd_sack_ok) { 12641 (void) mi_set_sth_wroff(tcp->tcp_rq, 12642 tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + 12643 (tcp->tcp_loopback ? 0 : tcp_wroff_xtra)); 12644 } else { 12645 (void) mi_set_sth_wroff(tcp->tcp_rq, 12646 tcp->tcp_hdr_len + 12647 (tcp->tcp_loopback ? 0 : tcp_wroff_xtra)); 12648 } 12649 } 12650 if (flags & TH_ACK) { 12651 /* 12652 * If we can't get the confirmation upstream, pretend 12653 * we didn't even see this one. 12654 * 12655 * XXX: how can we pretend we didn't see it if we 12656 * have updated rnxt et. al. 12657 * 12658 * For loopback we defer sending up the T_CONN_CON 12659 * until after some checks below. 12660 */ 12661 mp1 = NULL; 12662 if (!tcp_conn_con(tcp, iphdr, tcph, mp, 12663 tcp->tcp_loopback ? &mp1 : NULL)) { 12664 freemsg(mp); 12665 return; 12666 } 12667 /* SYN was acked - making progress */ 12668 if (tcp->tcp_ipversion == IPV6_VERSION) 12669 tcp->tcp_ip_forward_progress = B_TRUE; 12670 12671 /* One for the SYN */ 12672 tcp->tcp_suna = tcp->tcp_iss + 1; 12673 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 12674 tcp->tcp_state = TCPS_ESTABLISHED; 12675 12676 /* 12677 * If SYN was retransmitted, need to reset all 12678 * retransmission info. 
This is because this 12679 * segment will be treated as a dup ACK. 12680 */ 12681 if (tcp->tcp_rexmit) { 12682 tcp->tcp_rexmit = B_FALSE; 12683 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 12684 tcp->tcp_rexmit_max = tcp->tcp_snxt; 12685 tcp->tcp_snd_burst = tcp->tcp_localnet ? 12686 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 12687 tcp->tcp_ms_we_have_waited = 0; 12688 12689 /* 12690 * Set tcp_cwnd back to 1 MSS, per 12691 * recommendation from 12692 * draft-floyd-incr-init-win-01.txt, 12693 * Increasing TCP's Initial Window. 12694 */ 12695 tcp->tcp_cwnd = tcp->tcp_mss; 12696 } 12697 12698 tcp->tcp_swl1 = seg_seq; 12699 tcp->tcp_swl2 = seg_ack; 12700 12701 new_swnd = BE16_TO_U16(tcph->th_win); 12702 tcp->tcp_swnd = new_swnd; 12703 if (new_swnd > tcp->tcp_max_swnd) 12704 tcp->tcp_max_swnd = new_swnd; 12705 12706 /* 12707 * Always send the three-way handshake ack immediately 12708 * in order to make the connection complete as soon as 12709 * possible on the accepting host. 12710 */ 12711 flags |= TH_ACK_NEEDED; 12712 12713 /* 12714 * Special case for loopback. At this point we have 12715 * received SYN-ACK from the remote endpoint. In 12716 * order to ensure that both endpoints reach the 12717 * fused state prior to any data exchange, the final 12718 * ACK needs to be sent before we indicate T_CONN_CON 12719 * to the module upstream. 12720 */ 12721 if (tcp->tcp_loopback) { 12722 mblk_t *ack_mp; 12723 12724 ASSERT(!tcp->tcp_unfusable); 12725 ASSERT(mp1 != NULL); 12726 /* 12727 * For loopback, we always get a pure SYN-ACK 12728 * and only need to send back the final ACK 12729 * with no data (this is because the other 12730 * tcp is ours and we don't do T/TCP). This 12731 * final ACK triggers the passive side to 12732 * perform fusion in ESTABLISHED state. 12733 */ 12734 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) { 12735 if (tcp->tcp_ack_tid != 0) { 12736 (void) TCP_TIMER_CANCEL(tcp, 12737 tcp->tcp_ack_tid); 12738 tcp->tcp_ack_tid = 0; 12739 } 12740 TCP_RECORD_TRACE(tcp, ack_mp, 12741 TCP_TRACE_SEND_PKT); 12742 tcp_send_data(tcp, tcp->tcp_wq, ack_mp); 12743 BUMP_LOCAL(tcp->tcp_obsegs); 12744 BUMP_MIB(&tcp_mib, tcpOutAck); 12745 12746 /* Send up T_CONN_CON */ 12747 putnext(tcp->tcp_rq, mp1); 12748 12749 freemsg(mp); 12750 return; 12751 } 12752 /* 12753 * Forget fusion; we need to handle more 12754 * complex cases below. Send the deferred 12755 * T_CONN_CON message upstream and proceed 12756 * as usual. Mark this tcp as not capable 12757 * of fusion. 12758 */ 12759 TCP_STAT(tcp_fusion_unfusable); 12760 tcp->tcp_unfusable = B_TRUE; 12761 putnext(tcp->tcp_rq, mp1); 12762 } 12763 12764 /* 12765 * Check to see if there is data to be sent. If 12766 * yes, set the transmit flag. Then check to see 12767 * if received data processing needs to be done. 12768 * If not, go straight to xmit_check. This short 12769 * cut is OK as we don't support T/TCP. 
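 *
 * To summarize the active-open bookkeeping performed above once the
 * SYN-ACK is accepted, here is a hedged, standalone sketch; the struct
 * and names are invented for the illustration and the real state lives
 * in tcp_t:
 *
 *	struct synack_sketch {
 *		uint32_t iss;		// our initial send sequence
 *		uint32_t suna, snxt;	// send unacked / send next
 *		uint32_t irs, rnxt;	// peer's ISS / receive next
 *		uint32_t swnd, swl1, swl2;
 *	};
 *
 *	static void
 *	accept_synack_sketch(struct synack_sketch *s, uint32_t seg_seq,
 *	    uint32_t seg_ack, uint32_t seg_wnd)
 *	{
 *		s->irs = seg_seq;
 *		s->rnxt = seg_seq + 1;	// the peer's SYN takes one unit
 *		s->suna = s->iss + 1;	// our SYN has been acknowledged
 *		s->swl1 = seg_seq;	// remember what updated the window
 *		s->swl2 = seg_ack;
 *		s->swnd = seg_wnd;	// no window scaling on a SYN segment
 *		// state := ESTABLISHED; ACK the SYN-ACK immediately
 *	}
 *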
12770 */ 12771 if (tcp->tcp_unsent) 12772 flags |= TH_XMIT_NEEDED; 12773 12774 if (seg_len == 0 && !(flags & TH_URG)) { 12775 freemsg(mp); 12776 goto xmit_check; 12777 } 12778 12779 flags &= ~TH_SYN; 12780 seg_seq++; 12781 break; 12782 } 12783 tcp->tcp_state = TCPS_SYN_RCVD; 12784 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 12785 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 12786 if (mp1) { 12787 DB_CPID(mp1) = tcp->tcp_cpid; 12788 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 12789 tcp_send_data(tcp, tcp->tcp_wq, mp1); 12790 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 12791 } 12792 freemsg(mp); 12793 return; 12794 case TCPS_SYN_RCVD: 12795 if (flags & TH_ACK) { 12796 /* 12797 * In this state, a SYN|ACK packet is either bogus 12798 * because the other side must be ACKing our SYN which 12799 * indicates it has seen the ACK for their SYN and 12800 * shouldn't retransmit it or we're crossing SYNs 12801 * on active open. 12802 */ 12803 if ((flags & TH_SYN) && !tcp->tcp_active_open) { 12804 freemsg(mp); 12805 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn", 12806 tcp, seg_ack, 0, TH_RST); 12807 return; 12808 } 12809 /* 12810 * NOTE: RFC 793 pg. 72 says this should be 12811 * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt 12812 * but that would mean we have an ack that ignored 12813 * our SYN. 12814 */ 12815 if (SEQ_LEQ(seg_ack, tcp->tcp_suna) || 12816 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 12817 freemsg(mp); 12818 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 12819 tcp, seg_ack, 0, TH_RST); 12820 return; 12821 } 12822 } 12823 break; 12824 case TCPS_LISTEN: 12825 /* 12826 * Only a TLI listener can come through this path when a 12827 * acceptor is going back to be a listener and a packet 12828 * for the acceptor hits the classifier. For a socket 12829 * listener, this can never happen because a listener 12830 * can never accept connection on itself and hence a 12831 * socket acceptor can not go back to being a listener. 12832 */ 12833 ASSERT(!TCP_IS_SOCKET(tcp)); 12834 /*FALLTHRU*/ 12835 case TCPS_CLOSED: 12836 case TCPS_BOUND: { 12837 conn_t *new_connp; 12838 12839 new_connp = ipcl_classify(mp, connp->conn_zoneid); 12840 if (new_connp != NULL) { 12841 tcp_reinput(new_connp, mp, connp->conn_sqp); 12842 return; 12843 } 12844 /* We failed to classify. For now just drop the packet */ 12845 freemsg(mp); 12846 return; 12847 } 12848 case TCPS_IDLE: 12849 /* 12850 * Handle the case where the tcp_clean_death() has happened 12851 * on a connection (application hasn't closed yet) but a packet 12852 * was already queued on squeue before tcp_clean_death() 12853 * was processed. Calling tcp_clean_death() twice on same 12854 * connection can result in weird behaviour. 12855 */ 12856 freemsg(mp); 12857 return; 12858 default: 12859 break; 12860 } 12861 12862 /* 12863 * Already on the correct queue/perimeter. 12864 * If this is a detached connection and not an eager 12865 * connection hanging off a listener then new data 12866 * (past the FIN) will cause a reset. 12867 * We do a special check here where it 12868 * is out of the main line, rather than check 12869 * if we are detached every time we see new 12870 * data down below. 12871 */ 12872 if (TCP_IS_DETACHED_NONEAGER(tcp) && 12873 (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) { 12874 BUMP_MIB(&tcp_mib, tcpInClosed); 12875 TCP_RECORD_TRACE(tcp, 12876 mp, TCP_TRACE_RECV_PKT); 12877 12878 freemsg(mp); 12879 /* 12880 * This could be an SSL closure alert. We're detached so just 12881 * acknowledge it this last time. 
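 *
 * As background for the acceptability tests used throughout this
 * function: sequence-number comparisons must survive 32-bit wraparound,
 * so they are done on the signed difference.  A hedged sketch of the
 * usual definitions and of the SYN_RCVD ACK check applied above (the
 * helper name is invented; the SEQ_* macros mirror the ones this file
 * already uses):
 *
 *	#define	SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
 *	#define	SEQ_LEQ(a, b)	((int32_t)((a) - (b)) <= 0)
 *	#define	SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)
 *	#define	SEQ_GEQ(a, b)	((int32_t)((a) - (b)) >= 0)
 *
 *	// RFC 793: an ACK must cover at least one unacknowledged byte
 *	// and must not acknowledge data we have never sent.
 *	static boolean_t
 *	ack_ok_sketch(uint32_t seg_ack, uint32_t suna, uint32_t snxt)
 *	{
 *		if (SEQ_LEQ(seg_ack, suna) || SEQ_GT(seg_ack, snxt))
 *			return (B_FALSE);
 *		return (B_TRUE);
 *	}
 *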
12882 */ 12883 if (tcp->tcp_kssl_ctx != NULL) { 12884 kssl_release_ctx(tcp->tcp_kssl_ctx); 12885 tcp->tcp_kssl_ctx = NULL; 12886 12887 tcp->tcp_rnxt += seg_len; 12888 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 12889 flags |= TH_ACK_NEEDED; 12890 goto ack_check; 12891 } 12892 12893 tcp_xmit_ctl("new data when detached", tcp, 12894 tcp->tcp_snxt, 0, TH_RST); 12895 (void) tcp_clean_death(tcp, EPROTO, 12); 12896 return; 12897 } 12898 12899 mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); 12900 urp = BE16_TO_U16(tcph->th_urp) - TCP_OLD_URP_INTERPRETATION; 12901 new_swnd = BE16_TO_U16(tcph->th_win) << 12902 ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); 12903 mss = tcp->tcp_mss; 12904 12905 if (tcp->tcp_snd_ts_ok) { 12906 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 12907 /* 12908 * This segment is not acceptable. 12909 * Drop it and send back an ACK. 12910 */ 12911 freemsg(mp); 12912 flags |= TH_ACK_NEEDED; 12913 goto ack_check; 12914 } 12915 } else if (tcp->tcp_snd_sack_ok) { 12916 ASSERT(tcp->tcp_sack_info != NULL); 12917 tcpopt.tcp = tcp; 12918 /* 12919 * SACK info in already updated in tcp_parse_options. Ignore 12920 * all other TCP options... 12921 */ 12922 (void) tcp_parse_options(tcph, &tcpopt); 12923 } 12924 try_again:; 12925 gap = seg_seq - tcp->tcp_rnxt; 12926 rgap = tcp->tcp_rwnd - (gap + seg_len); 12927 /* 12928 * gap is the amount of sequence space between what we expect to see 12929 * and what we got for seg_seq. A positive value for gap means 12930 * something got lost. A negative value means we got some old stuff. 12931 */ 12932 if (gap < 0) { 12933 /* Old stuff present. Is the SYN in there? */ 12934 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && 12935 (seg_len != 0)) { 12936 flags &= ~TH_SYN; 12937 seg_seq++; 12938 urp--; 12939 /* Recompute the gaps after noting the SYN. */ 12940 goto try_again; 12941 } 12942 BUMP_MIB(&tcp_mib, tcpInDataDupSegs); 12943 UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, 12944 (seg_len > -gap ? -gap : seg_len)); 12945 /* Remove the old stuff from seg_len. */ 12946 seg_len += gap; 12947 /* 12948 * Anything left? 12949 * Make sure to check for unack'd FIN when rest of data 12950 * has been previously ack'd. 12951 */ 12952 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 12953 /* 12954 * Resets are only valid if they lie within our offered 12955 * window. If the RST bit is set, we just ignore this 12956 * segment. 12957 */ 12958 if (flags & TH_RST) { 12959 freemsg(mp); 12960 return; 12961 } 12962 12963 /* 12964 * The arriving of dup data packets indicate that we 12965 * may have postponed an ack for too long, or the other 12966 * side's RTT estimate is out of shape. Start acking 12967 * more often. 12968 */ 12969 if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) && 12970 tcp->tcp_rack_cnt >= 1 && 12971 tcp->tcp_rack_abs_max > 2) { 12972 tcp->tcp_rack_abs_max--; 12973 } 12974 tcp->tcp_rack_cur_max = 1; 12975 12976 /* 12977 * This segment is "unacceptable". None of its 12978 * sequence space lies within our advertized window. 12979 * 12980 * Adjust seg_len to the original value for tracing. 
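 *
 * The trimming above is driven entirely by two signed quantities.  A
 * standalone, hedged restatement with an invented helper name: gap is
 * SEG.SEQ - RCV.NXT and rgap is RCV.WND - (gap + SEG.LEN); gap < 0
 * means the segment starts with bytes we already have, rgap < 0 means
 * it extends past our advertised window.
 *
 *	static void
 *	seg_trim_sketch(uint32_t seg_seq, int seg_len, uint32_t rnxt,
 *	    uint32_t rwnd, int *trim_front, int *trim_back)
 *	{
 *		int32_t gap = (int32_t)(seg_seq - rnxt);
 *		int32_t rgap = (int32_t)rwnd - (gap + seg_len);
 *
 *		*trim_front = (gap < 0) ? (int)-gap : 0;
 *		*trim_back = (rgap < 0) ? (int)-rgap : 0;
 *	}
 *
 * A segment trimmed to nothing on both ends is the "unacceptable" case
 * handled here: only its ACK field is used.
 *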
12981 */ 12982 seg_len -= gap; 12983 if (tcp->tcp_debug) { 12984 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 12985 "tcp_rput: unacceptable, gap %d, rgap %d, " 12986 "flags 0x%x, seg_seq %u, seg_ack %u, " 12987 "seg_len %d, rnxt %u, snxt %u, %s", 12988 gap, rgap, flags, seg_seq, seg_ack, 12989 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 12990 tcp_display(tcp, NULL, 12991 DISP_ADDR_AND_PORT)); 12992 } 12993 12994 /* 12995 * Arrange to send an ACK in response to the 12996 * unacceptable segment per RFC 793 page 69. There 12997 * is only one small difference between ours and the 12998 * acceptability test in the RFC - we accept ACK-only 12999 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 13000 * will be generated. 13001 * 13002 * Note that we have to ACK an ACK-only packet at least 13003 * for stacks that send 0-length keep-alives with 13004 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 13005 * section 4.2.3.6. As long as we don't ever generate 13006 * an unacceptable packet in response to an incoming 13007 * packet that is unacceptable, it should not cause 13008 * "ACK wars". 13009 */ 13010 flags |= TH_ACK_NEEDED; 13011 13012 /* 13013 * Continue processing this segment in order to use the 13014 * ACK information it contains, but skip all other 13015 * sequence-number processing. Processing the ACK 13016 * information is necessary in order to 13017 * re-synchronize connections that may have lost 13018 * synchronization. 13019 * 13020 * We clear seg_len and flag fields related to 13021 * sequence number processing as they are not 13022 * to be trusted for an unacceptable segment. 13023 */ 13024 seg_len = 0; 13025 flags &= ~(TH_SYN | TH_FIN | TH_URG); 13026 goto process_ack; 13027 } 13028 13029 /* Fix seg_seq, and chew the gap off the front. */ 13030 seg_seq = tcp->tcp_rnxt; 13031 urp += gap; 13032 do { 13033 mblk_t *mp2; 13034 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 13035 (uintptr_t)UINT_MAX); 13036 gap += (uint_t)(mp->b_wptr - mp->b_rptr); 13037 if (gap > 0) { 13038 mp->b_rptr = mp->b_wptr - gap; 13039 break; 13040 } 13041 mp2 = mp; 13042 mp = mp->b_cont; 13043 freeb(mp2); 13044 } while (gap < 0); 13045 /* 13046 * If the urgent data has already been acknowledged, we 13047 * should ignore TH_URG below 13048 */ 13049 if (urp < 0) 13050 flags &= ~TH_URG; 13051 } 13052 /* 13053 * rgap is the amount of stuff received out of window. A negative 13054 * value is the amount out of window. 13055 */ 13056 if (rgap < 0) { 13057 mblk_t *mp2; 13058 13059 if (tcp->tcp_rwnd == 0) { 13060 BUMP_MIB(&tcp_mib, tcpInWinProbe); 13061 } else { 13062 BUMP_MIB(&tcp_mib, tcpInDataPastWinSegs); 13063 UPDATE_MIB(&tcp_mib, tcpInDataPastWinBytes, -rgap); 13064 } 13065 13066 /* 13067 * seg_len does not include the FIN, so if more than 13068 * just the FIN is out of window, we act like we don't 13069 * see it. (If just the FIN is out of window, rgap 13070 * will be zero and we will go ahead and acknowledge 13071 * the FIN.) 13072 */ 13073 flags &= ~TH_FIN; 13074 13075 /* Fix seg_len and make sure there is something left. */ 13076 seg_len += rgap; 13077 if (seg_len <= 0) { 13078 /* 13079 * Resets are only valid if they lie within our offered 13080 * window. If the RST bit is set, we just ignore this 13081 * segment. 13082 */ 13083 if (flags & TH_RST) { 13084 freemsg(mp); 13085 return; 13086 } 13087 13088 /* Per RFC 793, we need to send back an ACK. */ 13089 flags |= TH_ACK_NEEDED; 13090 13091 /* 13092 * Send SIGURG as soon as possible i.e. 
even 13093 * if the TH_URG was delivered in a window probe 13094 * packet (which will be unacceptable). 13095 * 13096 * We generate a signal if none has been generated 13097 * for this connection or if this is a new urgent 13098 * byte. Also send a zero-length "unmarked" message 13099 * to inform SIOCATMARK that this is not the mark. 13100 * 13101 * tcp_urp_last_valid is cleared when the T_exdata_ind 13102 * is sent up. This plus the check for old data 13103 * (gap >= 0) handles the wraparound of the sequence 13104 * number space without having to always track the 13105 * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks 13106 * this max in its rcv_up variable). 13107 * 13108 * This prevents duplicate SIGURGS due to a "late" 13109 * zero-window probe when the T_EXDATA_IND has already 13110 * been sent up. 13111 */ 13112 if ((flags & TH_URG) && 13113 (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, 13114 tcp->tcp_urp_last))) { 13115 mp1 = allocb(0, BPRI_MED); 13116 if (mp1 == NULL) { 13117 freemsg(mp); 13118 return; 13119 } 13120 if (!TCP_IS_DETACHED(tcp) && 13121 !putnextctl1(tcp->tcp_rq, M_PCSIG, 13122 SIGURG)) { 13123 /* Try again on the rexmit. */ 13124 freemsg(mp1); 13125 freemsg(mp); 13126 return; 13127 } 13128 /* 13129 * If the next byte would be the mark 13130 * then mark with MARKNEXT else mark 13131 * with NOTMARKNEXT. 13132 */ 13133 if (gap == 0 && urp == 0) 13134 mp1->b_flag |= MSGMARKNEXT; 13135 else 13136 mp1->b_flag |= MSGNOTMARKNEXT; 13137 freemsg(tcp->tcp_urp_mark_mp); 13138 tcp->tcp_urp_mark_mp = mp1; 13139 flags |= TH_SEND_URP_MARK; 13140 tcp->tcp_urp_last_valid = B_TRUE; 13141 tcp->tcp_urp_last = urp + seg_seq; 13142 } 13143 /* 13144 * If this is a zero window probe, continue to 13145 * process the ACK part. But we need to set seg_len 13146 * to 0 to avoid data processing. Otherwise just 13147 * drop the segment and send back an ACK. 13148 */ 13149 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 13150 flags &= ~(TH_SYN | TH_URG); 13151 seg_len = 0; 13152 goto process_ack; 13153 } else { 13154 freemsg(mp); 13155 goto ack_check; 13156 } 13157 } 13158 /* Pitch out of window stuff off the end. */ 13159 rgap = seg_len; 13160 mp2 = mp; 13161 do { 13162 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 13163 (uintptr_t)INT_MAX); 13164 rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 13165 if (rgap < 0) { 13166 mp2->b_wptr += rgap; 13167 if ((mp1 = mp2->b_cont) != NULL) { 13168 mp2->b_cont = NULL; 13169 freemsg(mp1); 13170 } 13171 break; 13172 } 13173 } while ((mp2 = mp2->b_cont) != NULL); 13174 } 13175 ok:; 13176 /* 13177 * TCP should check ECN info for segments inside the window only. 13178 * Therefore the check should be done here. 13179 */ 13180 if (tcp->tcp_ecn_ok) { 13181 if (flags & TH_CWR) { 13182 tcp->tcp_ecn_echo_on = B_FALSE; 13183 } 13184 /* 13185 * Note that both ECN_CE and CWR can be set in the 13186 * same segment. In this case, we once again turn 13187 * on ECN_ECHO. 13188 */ 13189 if (tcp->tcp_ipversion == IPV4_VERSION) { 13190 uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; 13191 13192 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 13193 tcp->tcp_ecn_echo_on = B_TRUE; 13194 } 13195 } else { 13196 uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf; 13197 13198 if ((vcf & htonl(IPH_ECN_CE << 20)) == 13199 htonl(IPH_ECN_CE << 20)) { 13200 tcp->tcp_ecn_echo_on = B_TRUE; 13201 } 13202 } 13203 } 13204 13205 /* 13206 * Check whether we can update tcp_ts_recent. This test is 13207 * NOT the one in RFC 1323 3.4. 
It is from Braden, 1993, "TCP 13208 * Extensions for High Performance: An Update", Internet Draft. 13209 */ 13210 if (tcp->tcp_snd_ts_ok && 13211 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 13212 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 13213 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 13214 tcp->tcp_last_rcv_lbolt = lbolt64; 13215 } 13216 13217 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 13218 /* 13219 * FIN in an out of order segment. We record this in 13220 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. 13221 * Clear the FIN so that any check on FIN flag will fail. 13222 * Remember that FIN also counts in the sequence number 13223 * space. So we need to ack out of order FIN only segments. 13224 */ 13225 if (flags & TH_FIN) { 13226 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 13227 tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 13228 flags &= ~TH_FIN; 13229 flags |= TH_ACK_NEEDED; 13230 } 13231 if (seg_len > 0) { 13232 /* Fill in the SACK blk list. */ 13233 if (tcp->tcp_snd_sack_ok) { 13234 ASSERT(tcp->tcp_sack_info != NULL); 13235 tcp_sack_insert(tcp->tcp_sack_list, 13236 seg_seq, seg_seq + seg_len, 13237 &(tcp->tcp_num_sack_blk)); 13238 } 13239 13240 /* 13241 * Attempt reassembly and see if we have something 13242 * ready to go. 13243 */ 13244 mp = tcp_reass(tcp, mp, seg_seq); 13245 /* Always ack out of order packets */ 13246 flags |= TH_ACK_NEEDED | TH_PUSH; 13247 if (mp) { 13248 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 13249 (uintptr_t)INT_MAX); 13250 seg_len = mp->b_cont ? msgdsize(mp) : 13251 (int)(mp->b_wptr - mp->b_rptr); 13252 seg_seq = tcp->tcp_rnxt; 13253 /* 13254 * A gap is filled and the seq num and len 13255 * of the gap match that of a previously 13256 * received FIN, put the FIN flag back in. 13257 */ 13258 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 13259 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 13260 flags |= TH_FIN; 13261 tcp->tcp_valid_bits &= 13262 ~TCP_OFO_FIN_VALID; 13263 } 13264 } else { 13265 /* 13266 * Keep going even with NULL mp. 13267 * There may be a useful ACK or something else 13268 * we don't want to miss. 13269 * 13270 * But TCP should not perform fast retransmit 13271 * because of the ack number. TCP uses 13272 * seg_len == 0 to determine if it is a pure 13273 * ACK. And this is not a pure ACK. 13274 */ 13275 seg_len = 0; 13276 ofo_seg = B_TRUE; 13277 } 13278 } 13279 } else if (seg_len > 0) { 13280 BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); 13281 UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, seg_len); 13282 /* 13283 * If an out of order FIN was received before, and the seq 13284 * num and len of the new segment match that of the FIN, 13285 * put the FIN flag back in. 
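 *
 * A hedged, standalone sketch of that out-of-order FIN bookkeeping,
 * with names invented for the illustration: the FIN is stripped when it
 * arrives ahead of a hole and re-attached once in-order data reaches
 * the sequence number at which it was recorded.
 *
 *	struct ofo_fin_sketch {
 *		boolean_t valid;	// an out-of-order FIN is pending
 *		uint32_t fin_seq;	// sequence number just past it
 *	};
 *
 *	// Called when a FIN arrives beyond RCV.NXT.
 *	static void
 *	record_ofo_fin(struct ofo_fin_sketch *f, uint32_t seq, int len)
 *	{
 *		f->valid = B_TRUE;
 *		f->fin_seq = seq + len;
 *	}
 *
 *	// Called for in-order data; returns B_TRUE if the FIN flag
 *	// should be restored on this segment.
 *	static boolean_t
 *	restore_ofo_fin(struct ofo_fin_sketch *f, uint32_t seq, int len)
 *	{
 *		if (f->valid && seq + len == f->fin_seq) {
 *			f->valid = B_FALSE;
 *			return (B_TRUE);
 *		}
 *		return (B_FALSE);
 *	}
 *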
13286 */ 13287 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 13288 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 13289 flags |= TH_FIN; 13290 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 13291 } 13292 } 13293 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 13294 if (flags & TH_RST) { 13295 freemsg(mp); 13296 switch (tcp->tcp_state) { 13297 case TCPS_SYN_RCVD: 13298 (void) tcp_clean_death(tcp, ECONNREFUSED, 14); 13299 break; 13300 case TCPS_ESTABLISHED: 13301 case TCPS_FIN_WAIT_1: 13302 case TCPS_FIN_WAIT_2: 13303 case TCPS_CLOSE_WAIT: 13304 (void) tcp_clean_death(tcp, ECONNRESET, 15); 13305 break; 13306 case TCPS_CLOSING: 13307 case TCPS_LAST_ACK: 13308 (void) tcp_clean_death(tcp, 0, 16); 13309 break; 13310 default: 13311 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 13312 (void) tcp_clean_death(tcp, ENXIO, 17); 13313 break; 13314 } 13315 return; 13316 } 13317 if (flags & TH_SYN) { 13318 /* 13319 * See RFC 793, Page 71 13320 * 13321 * The seq number must be in the window as it should 13322 * be "fixed" above. If it is outside window, it should 13323 * be already rejected. Note that we allow seg_seq to be 13324 * rnxt + rwnd because we want to accept 0 window probe. 13325 */ 13326 ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 13327 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 13328 freemsg(mp); 13329 /* 13330 * If the ACK flag is not set, just use our snxt as the 13331 * seq number of the RST segment. 13332 */ 13333 if (!(flags & TH_ACK)) { 13334 seg_ack = tcp->tcp_snxt; 13335 } 13336 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 13337 TH_RST|TH_ACK); 13338 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 13339 (void) tcp_clean_death(tcp, ECONNRESET, 18); 13340 return; 13341 } 13342 /* 13343 * urp could be -1 when the urp field in the packet is 0 13344 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent 13345 * byte was at seg_seq - 1, in which case we ignore the urgent flag. 13346 */ 13347 if (flags & TH_URG && urp >= 0) { 13348 if (!tcp->tcp_urp_last_valid || 13349 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { 13350 /* 13351 * If we haven't generated the signal yet for this 13352 * urgent pointer value, do it now. Also, send up a 13353 * zero-length M_DATA indicating whether or not this is 13354 * the mark. The latter is not needed when a 13355 * T_EXDATA_IND is sent up. However, if there are 13356 * allocation failures this code relies on the sender 13357 * retransmitting and the socket code for determining 13358 * the mark should not block waiting for the peer to 13359 * transmit. Thus, for simplicity we always send up the 13360 * mark indication. 13361 */ 13362 mp1 = allocb(0, BPRI_MED); 13363 if (mp1 == NULL) { 13364 freemsg(mp); 13365 return; 13366 } 13367 if (!TCP_IS_DETACHED(tcp) && 13368 !putnextctl1(tcp->tcp_rq, M_PCSIG, SIGURG)) { 13369 /* Try again on the rexmit. */ 13370 freemsg(mp1); 13371 freemsg(mp); 13372 return; 13373 } 13374 /* 13375 * Mark with NOTMARKNEXT for now. 13376 * The code below will change this to MARKNEXT 13377 * if we are at the mark. 13378 * 13379 * If there are allocation failures (e.g. in dupmsg 13380 * below) the next time tcp_rput_data sees the urgent 13381 * segment it will send up the MSG*MARKNEXT message. 
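 *
 * As a hedged aside on the arithmetic used in this block: with the old
 * (BSD) interpretation the urgent pointer field points one byte past
 * the urgent byte, so the urp value computed earlier is the offset of
 * the urgent byte from seg_seq.  A standalone restatement of how that
 * offset is classified below (helper name invented):
 *
 *	static const char *
 *	urgent_where_sketch(int urp, int seg_len)
 *	{
 *		if (urp < 0)
 *			return ("at seg_seq - 1, already consumed; "
 *			    "ignore TH_URG");
 *		if (urp < seg_len)
 *			return ("inside this segment");
 *		if (urp == seg_len)
 *			return ("the very next byte; mark with MARKNEXT");
 *		return ("in a later segment; data left until the mark");
 *	}
 *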
13382 */ 13383 mp1->b_flag |= MSGNOTMARKNEXT; 13384 freemsg(tcp->tcp_urp_mark_mp); 13385 tcp->tcp_urp_mark_mp = mp1; 13386 flags |= TH_SEND_URP_MARK; 13387 #ifdef DEBUG 13388 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13389 "tcp_rput: sent M_PCSIG 2 seq %x urp %x " 13390 "last %x, %s", 13391 seg_seq, urp, tcp->tcp_urp_last, 13392 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 13393 #endif /* DEBUG */ 13394 tcp->tcp_urp_last_valid = B_TRUE; 13395 tcp->tcp_urp_last = urp + seg_seq; 13396 } else if (tcp->tcp_urp_mark_mp != NULL) { 13397 /* 13398 * An allocation failure prevented the previous 13399 * tcp_rput_data from sending up the allocated 13400 * MSG*MARKNEXT message - send it up this time 13401 * around. 13402 */ 13403 flags |= TH_SEND_URP_MARK; 13404 } 13405 13406 /* 13407 * If the urgent byte is in this segment, make sure that it is 13408 * all by itself. This makes it much easier to deal with the 13409 * possibility of an allocation failure on the T_exdata_ind. 13410 * Note that seg_len is the number of bytes in the segment, and 13411 * urp is the offset into the segment of the urgent byte. 13412 * urp < seg_len means that the urgent byte is in this segment. 13413 */ 13414 if (urp < seg_len) { 13415 if (seg_len != 1) { 13416 uint32_t tmp_rnxt; 13417 /* 13418 * Break it up and feed it back in. 13419 * Re-attach the IP header. 13420 */ 13421 mp->b_rptr = iphdr; 13422 if (urp > 0) { 13423 /* 13424 * There is stuff before the urgent 13425 * byte. 13426 */ 13427 mp1 = dupmsg(mp); 13428 if (!mp1) { 13429 /* 13430 * Trim from urgent byte on. 13431 * The rest will come back. 13432 */ 13433 (void) adjmsg(mp, 13434 urp - seg_len); 13435 tcp_rput_data(connp, 13436 mp, NULL); 13437 return; 13438 } 13439 (void) adjmsg(mp1, urp - seg_len); 13440 /* Feed this piece back in. */ 13441 tmp_rnxt = tcp->tcp_rnxt; 13442 tcp_rput_data(connp, mp1, NULL); 13443 /* 13444 * If the data passed back in was not 13445 * processed (ie: bad ACK) sending 13446 * the remainder back in will cause a 13447 * loop. In this case, drop the 13448 * packet and let the sender try 13449 * sending a good packet. 13450 */ 13451 if (tmp_rnxt == tcp->tcp_rnxt) { 13452 freemsg(mp); 13453 return; 13454 } 13455 } 13456 if (urp != seg_len - 1) { 13457 uint32_t tmp_rnxt; 13458 /* 13459 * There is stuff after the urgent 13460 * byte. 13461 */ 13462 mp1 = dupmsg(mp); 13463 if (!mp1) { 13464 /* 13465 * Trim everything beyond the 13466 * urgent byte. The rest will 13467 * come back. 13468 */ 13469 (void) adjmsg(mp, 13470 urp + 1 - seg_len); 13471 tcp_rput_data(connp, 13472 mp, NULL); 13473 return; 13474 } 13475 (void) adjmsg(mp1, urp + 1 - seg_len); 13476 tmp_rnxt = tcp->tcp_rnxt; 13477 tcp_rput_data(connp, mp1, NULL); 13478 /* 13479 * If the data passed back in was not 13480 * processed (ie: bad ACK) sending 13481 * the remainder back in will cause a 13482 * loop. In this case, drop the 13483 * packet and let the sender try 13484 * sending a good packet. 13485 */ 13486 if (tmp_rnxt == tcp->tcp_rnxt) { 13487 freemsg(mp); 13488 return; 13489 } 13490 } 13491 tcp_rput_data(connp, mp, NULL); 13492 return; 13493 } 13494 /* 13495 * This segment contains only the urgent byte. We 13496 * have to allocate the T_exdata_ind, if we can. 13497 */ 13498 if (!tcp->tcp_urp_mp) { 13499 struct T_exdata_ind *tei; 13500 mp1 = allocb(sizeof (struct T_exdata_ind), 13501 BPRI_MED); 13502 if (!mp1) { 13503 /* 13504 * Sigh... It'll be back. 13505 * Generate any MSG*MARK message now. 
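 *
 * For reference, the splitting above leans on dupmsg(9F) and
 * adjmsg(9F): dupmsg shares the underlying data blocks, and adjmsg
 * trims len bytes from the head when len is positive or from the tail
 * when it is negative.  A hedged sketch of the "keep only a prefix"
 * idiom (helper name invented, not the code path itself):
 *
 *	static mblk_t *
 *	copy_prefix_sketch(mblk_t *mp, ssize_t keep)
 *	{
 *		mblk_t *cp = dupmsg(mp);	// shares the data blocks
 *
 *		if (cp == NULL)
 *			return (NULL);
 *		// Negative count to adjmsg trims from the tail; keep
 *		// only the first 'keep' payload bytes.
 *		if (keep < (ssize_t)msgdsize(cp))
 *			(void) adjmsg(cp, keep - (ssize_t)msgdsize(cp));
 *		return (cp);
 *	}
 *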
13506 */ 13507 freemsg(mp); 13508 seg_len = 0; 13509 if (flags & TH_SEND_URP_MARK) { 13510 13511 13512 ASSERT(tcp->tcp_urp_mark_mp); 13513 tcp->tcp_urp_mark_mp->b_flag &= 13514 ~MSGNOTMARKNEXT; 13515 tcp->tcp_urp_mark_mp->b_flag |= 13516 MSGMARKNEXT; 13517 } 13518 goto ack_check; 13519 } 13520 mp1->b_datap->db_type = M_PROTO; 13521 tei = (struct T_exdata_ind *)mp1->b_rptr; 13522 tei->PRIM_type = T_EXDATA_IND; 13523 tei->MORE_flag = 0; 13524 mp1->b_wptr = (uchar_t *)&tei[1]; 13525 tcp->tcp_urp_mp = mp1; 13526 #ifdef DEBUG 13527 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13528 "tcp_rput: allocated exdata_ind %s", 13529 tcp_display(tcp, NULL, 13530 DISP_PORT_ONLY)); 13531 #endif /* DEBUG */ 13532 /* 13533 * There is no need to send a separate MSG*MARK 13534 * message since the T_EXDATA_IND will be sent 13535 * now. 13536 */ 13537 flags &= ~TH_SEND_URP_MARK; 13538 freemsg(tcp->tcp_urp_mark_mp); 13539 tcp->tcp_urp_mark_mp = NULL; 13540 } 13541 /* 13542 * Now we are all set. On the next putnext upstream, 13543 * tcp_urp_mp will be non-NULL and will get prepended 13544 * to what has to be this piece containing the urgent 13545 * byte. If for any reason we abort this segment below, 13546 * if it comes back, we will have this ready, or it 13547 * will get blown off in close. 13548 */ 13549 } else if (urp == seg_len) { 13550 /* 13551 * The urgent byte is the next byte after this sequence 13552 * number. If there is data it is marked with 13553 * MSGMARKNEXT and any tcp_urp_mark_mp is discarded 13554 * since it is not needed. Otherwise, if the code 13555 * above just allocated a zero-length tcp_urp_mark_mp 13556 * message, that message is tagged with MSGMARKNEXT. 13557 * Sending up these MSGMARKNEXT messages makes 13558 * SIOCATMARK work correctly even though 13559 * the T_EXDATA_IND will not be sent up until the 13560 * urgent byte arrives. 13561 */ 13562 if (seg_len != 0) { 13563 flags |= TH_MARKNEXT_NEEDED; 13564 freemsg(tcp->tcp_urp_mark_mp); 13565 tcp->tcp_urp_mark_mp = NULL; 13566 flags &= ~TH_SEND_URP_MARK; 13567 } else if (tcp->tcp_urp_mark_mp != NULL) { 13568 flags |= TH_SEND_URP_MARK; 13569 tcp->tcp_urp_mark_mp->b_flag &= 13570 ~MSGNOTMARKNEXT; 13571 tcp->tcp_urp_mark_mp->b_flag |= MSGMARKNEXT; 13572 } 13573 #ifdef DEBUG 13574 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13575 "tcp_rput: AT MARK, len %d, flags 0x%x, %s", 13576 seg_len, flags, 13577 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 13578 #endif /* DEBUG */ 13579 } else { 13580 /* Data left until we hit mark */ 13581 #ifdef DEBUG 13582 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13583 "tcp_rput: URP %d bytes left, %s", 13584 urp - seg_len, tcp_display(tcp, NULL, 13585 DISP_PORT_ONLY)); 13586 #endif /* DEBUG */ 13587 } 13588 } 13589 13590 process_ack: 13591 if (!(flags & TH_ACK)) { 13592 freemsg(mp); 13593 goto xmit_check; 13594 } 13595 } 13596 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 13597 13598 if (tcp->tcp_ipversion == IPV6_VERSION && bytes_acked > 0) 13599 tcp->tcp_ip_forward_progress = B_TRUE; 13600 if (tcp->tcp_state == TCPS_SYN_RCVD) { 13601 if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) && 13602 ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) { 13603 /* 3-way handshake complete - pass up the T_CONN_IND */ 13604 tcp_t *listener = tcp->tcp_listener; 13605 mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind; 13606 13607 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 13608 /* 13609 * We are here means eager is fine but it can 13610 * get a TH_RST at any point between now and till 13611 * accept completes and disappear. 
We need to 13612 * ensure that reference to eager is valid after 13613 * we get out of eager's perimeter. So we do 13614 * an extra refhold. 13615 */ 13616 CONN_INC_REF(connp); 13617 13618 /* 13619 * The listener also exists because of the refhold 13620 * done in tcp_conn_request. Its possible that it 13621 * might have closed. We will check that once we 13622 * get inside listeners context. 13623 */ 13624 CONN_INC_REF(listener->tcp_connp); 13625 if (listener->tcp_connp->conn_sqp == 13626 connp->conn_sqp) { 13627 tcp_send_conn_ind(listener->tcp_connp, mp, 13628 listener->tcp_connp->conn_sqp); 13629 CONN_DEC_REF(listener->tcp_connp); 13630 } else if (!tcp->tcp_loopback) { 13631 squeue_fill(listener->tcp_connp->conn_sqp, mp, 13632 tcp_send_conn_ind, 13633 listener->tcp_connp, SQTAG_TCP_CONN_IND); 13634 } else { 13635 squeue_enter(listener->tcp_connp->conn_sqp, mp, 13636 tcp_send_conn_ind, listener->tcp_connp, 13637 SQTAG_TCP_CONN_IND); 13638 } 13639 } 13640 13641 if (tcp->tcp_active_open) { 13642 /* 13643 * We are seeing the final ack in the three way 13644 * hand shake of a active open'ed connection 13645 * so we must send up a T_CONN_CON 13646 */ 13647 if (!tcp_conn_con(tcp, iphdr, tcph, mp, NULL)) { 13648 freemsg(mp); 13649 return; 13650 } 13651 /* 13652 * Don't fuse the loopback endpoints for 13653 * simultaneous active opens. 13654 */ 13655 if (tcp->tcp_loopback) { 13656 TCP_STAT(tcp_fusion_unfusable); 13657 tcp->tcp_unfusable = B_TRUE; 13658 } 13659 } 13660 13661 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 13662 bytes_acked--; 13663 /* SYN was acked - making progress */ 13664 if (tcp->tcp_ipversion == IPV6_VERSION) 13665 tcp->tcp_ip_forward_progress = B_TRUE; 13666 13667 /* 13668 * If SYN was retransmitted, need to reset all 13669 * retransmission info as this segment will be 13670 * treated as a dup ACK. 13671 */ 13672 if (tcp->tcp_rexmit) { 13673 tcp->tcp_rexmit = B_FALSE; 13674 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 13675 tcp->tcp_rexmit_max = tcp->tcp_snxt; 13676 tcp->tcp_snd_burst = tcp->tcp_localnet ? 13677 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 13678 tcp->tcp_ms_we_have_waited = 0; 13679 tcp->tcp_cwnd = mss; 13680 } 13681 13682 /* 13683 * We set the send window to zero here. 13684 * This is needed if there is data to be 13685 * processed already on the queue. 13686 * Later (at swnd_update label), the 13687 * "new_swnd > tcp_swnd" condition is satisfied 13688 * the XMIT_NEEDED flag is set in the current 13689 * (SYN_RCVD) state. This ensures tcp_wput_data() is 13690 * called if there is already data on queue in 13691 * this state. 13692 */ 13693 tcp->tcp_swnd = 0; 13694 13695 if (new_swnd > tcp->tcp_max_swnd) 13696 tcp->tcp_max_swnd = new_swnd; 13697 tcp->tcp_swl1 = seg_seq; 13698 tcp->tcp_swl2 = seg_ack; 13699 tcp->tcp_state = TCPS_ESTABLISHED; 13700 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 13701 13702 /* Fuse when both sides are in ESTABLISHED state */ 13703 if (tcp->tcp_loopback && do_tcp_fusion) 13704 tcp_fuse(tcp, iphdr, tcph); 13705 13706 } 13707 /* This code follows 4.4BSD-Lite2 mostly. */ 13708 if (bytes_acked < 0) 13709 goto est; 13710 13711 /* 13712 * If TCP is ECN capable and the congestion experience bit is 13713 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 13714 * done once per window (or more loosely, per RTT). 
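 *
 * A hedged, standalone sketch of that once-per-window reaction; the
 * struct and names are invented, the real state is in tcp_t, and the
 * actual code follows below:
 *
 *	struct ecn_sketch {
 *		uint32_t cwnd, ssthresh, snxt, suna, cwr_snd_max;
 *		uint32_t mss;
 *		boolean_t cwr;		// already reduced in this window
 *	};
 *
 *	// Called when an ACK carrying ECE arrives.
 *	static void
 *	ecn_ece_sketch(struct ecn_sketch *s, uint32_t seg_ack)
 *	{
 *		uint32_t npkt;
 *
 *		if (s->cwr && SEQ_GT(seg_ack, s->cwr_snd_max))
 *			s->cwr = B_FALSE;	// that window is over
 *		if (s->cwr)
 *			return;			// reduce only once per window
 *		npkt = ((s->snxt - s->suna) >> 1) / s->mss;
 *		s->ssthresh = MAX(npkt, 2) * s->mss;
 *		s->cwnd = npkt * s->mss;	// 0 => timer clocks out data
 *		s->cwr = B_TRUE;
 *		s->cwr_snd_max = s->snxt;	// end of in-flight data
 *	}
 *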
13715 */ 13716 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 13717 tcp->tcp_cwr = B_FALSE; 13718 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 13719 if (!tcp->tcp_cwr) { 13720 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss; 13721 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 13722 tcp->tcp_cwnd = npkt * mss; 13723 /* 13724 * If the cwnd is 0, use the timer to clock out 13725 * new segments. This is required by the ECN spec. 13726 */ 13727 if (npkt == 0) { 13728 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 13729 /* 13730 * This makes sure that when the ACK comes 13731 * back, we will increase tcp_cwnd by 1 MSS. 13732 */ 13733 tcp->tcp_cwnd_cnt = 0; 13734 } 13735 tcp->tcp_cwr = B_TRUE; 13736 /* 13737 * This marks the end of the current window of in 13738 * flight data. That is why we don't use 13739 * tcp_suna + tcp_swnd. Only data in flight can 13740 * provide ECN info. 13741 */ 13742 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 13743 tcp->tcp_ecn_cwr_sent = B_FALSE; 13744 } 13745 } 13746 13747 mp1 = tcp->tcp_xmit_head; 13748 if (bytes_acked == 0) { 13749 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 13750 int dupack_cnt; 13751 13752 BUMP_MIB(&tcp_mib, tcpInDupAck); 13753 /* 13754 * Fast retransmit. When we have seen exactly three 13755 * identical ACKs while we have unacked data 13756 * outstanding we take it as a hint that our peer 13757 * dropped something. 13758 * 13759 * If TCP is retransmitting, don't do fast retransmit. 13760 */ 13761 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt && 13762 ! tcp->tcp_rexmit) { 13763 /* Do Limited Transmit */ 13764 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 13765 tcp_dupack_fast_retransmit) { 13766 /* 13767 * RFC 3042 13768 * 13769 * What we need to do is temporarily 13770 * increase tcp_cwnd so that new 13771 * data can be sent if it is allowed 13772 * by the receive window (tcp_rwnd). 13773 * tcp_wput_data() will take care of 13774 * the rest. 13775 * 13776 * If the connection is SACK capable, 13777 * only do limited xmit when there 13778 * is SACK info. 13779 * 13780 * Note how tcp_cwnd is incremented. 13781 * The first dup ACK will increase 13782 * it by 1 MSS. The second dup ACK 13783 * will increase it by 2 MSS. This 13784 * means that only 1 new segment will 13785 * be sent for each dup ACK. 13786 */ 13787 if (tcp->tcp_unsent > 0 && 13788 (!tcp->tcp_snd_sack_ok || 13789 (tcp->tcp_snd_sack_ok && 13790 tcp->tcp_notsack_list != NULL))) { 13791 tcp->tcp_cwnd += mss << 13792 (tcp->tcp_dupack_cnt - 1); 13793 flags |= TH_LIMIT_XMIT; 13794 } 13795 } else if (dupack_cnt == 13796 tcp_dupack_fast_retransmit) { 13797 13798 /* 13799 * If we have reduced tcp_ssthresh 13800 * because of ECN, do not reduce it again 13801 * unless it is already one window of data 13802 * away. After one window of data, tcp_cwr 13803 * should then be cleared. Note that 13804 * for non ECN capable connection, tcp_cwr 13805 * should always be false. 13806 * 13807 * Adjust cwnd since the duplicate 13808 * ack indicates that a packet was 13809 * dropped (due to congestion.) 13810 */ 13811 if (!tcp->tcp_cwr) { 13812 npkt = ((tcp->tcp_snxt - 13813 tcp->tcp_suna) >> 1) / mss; 13814 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * 13815 mss; 13816 tcp->tcp_cwnd = (npkt + 13817 tcp->tcp_dupack_cnt) * mss; 13818 } 13819 if (tcp->tcp_ecn_ok) { 13820 tcp->tcp_cwr = B_TRUE; 13821 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 13822 tcp->tcp_ecn_cwr_sent = B_FALSE; 13823 } 13824 13825 /* 13826 * We do Hoe's algorithm. 
Refer to her 13827 * paper "Improving the Start-up Behavior 13828 * of a Congestion Control Scheme for TCP," 13829 * appeared in SIGCOMM'96. 13830 * 13831 * Save highest seq no we have sent so far. 13832 * Be careful about the invisible FIN byte. 13833 */ 13834 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 13835 (tcp->tcp_unsent == 0)) { 13836 tcp->tcp_rexmit_max = tcp->tcp_fss; 13837 } else { 13838 tcp->tcp_rexmit_max = tcp->tcp_snxt; 13839 } 13840 13841 /* 13842 * Do not allow bursty traffic during. 13843 * fast recovery. Refer to Fall and Floyd's 13844 * paper "Simulation-based Comparisons of 13845 * Tahoe, Reno and SACK TCP" (in CCR?) 13846 * This is a best current practise. 13847 */ 13848 tcp->tcp_snd_burst = TCP_CWND_SS; 13849 13850 /* 13851 * For SACK: 13852 * Calculate tcp_pipe, which is the 13853 * estimated number of bytes in 13854 * network. 13855 * 13856 * tcp_fack is the highest sack'ed seq num 13857 * TCP has received. 13858 * 13859 * tcp_pipe is explained in the above quoted 13860 * Fall and Floyd's paper. tcp_fack is 13861 * explained in Mathis and Mahdavi's 13862 * "Forward Acknowledgment: Refining TCP 13863 * Congestion Control" in SIGCOMM '96. 13864 */ 13865 if (tcp->tcp_snd_sack_ok) { 13866 ASSERT(tcp->tcp_sack_info != NULL); 13867 if (tcp->tcp_notsack_list != NULL) { 13868 tcp->tcp_pipe = tcp->tcp_snxt - 13869 tcp->tcp_fack; 13870 tcp->tcp_sack_snxt = seg_ack; 13871 flags |= TH_NEED_SACK_REXMIT; 13872 } else { 13873 /* 13874 * Always initialize tcp_pipe 13875 * even though we don't have 13876 * any SACK info. If later 13877 * we get SACK info and 13878 * tcp_pipe is not initialized, 13879 * funny things will happen. 13880 */ 13881 tcp->tcp_pipe = 13882 tcp->tcp_cwnd_ssthresh; 13883 } 13884 } else { 13885 flags |= TH_REXMIT_NEEDED; 13886 } /* tcp_snd_sack_ok */ 13887 13888 } else { 13889 /* 13890 * Here we perform congestion 13891 * avoidance, but NOT slow start. 13892 * This is known as the Fast 13893 * Recovery Algorithm. 13894 */ 13895 if (tcp->tcp_snd_sack_ok && 13896 tcp->tcp_notsack_list != NULL) { 13897 flags |= TH_NEED_SACK_REXMIT; 13898 tcp->tcp_pipe -= mss; 13899 if (tcp->tcp_pipe < 0) 13900 tcp->tcp_pipe = 0; 13901 } else { 13902 /* 13903 * We know that one more packet has 13904 * left the pipe thus we can update 13905 * cwnd. 13906 */ 13907 cwnd = tcp->tcp_cwnd + mss; 13908 if (cwnd > tcp->tcp_cwnd_max) 13909 cwnd = tcp->tcp_cwnd_max; 13910 tcp->tcp_cwnd = cwnd; 13911 if (tcp->tcp_unsent > 0) 13912 flags |= TH_XMIT_NEEDED; 13913 } 13914 } 13915 } 13916 } else if (tcp->tcp_zero_win_probe) { 13917 /* 13918 * If the window has opened, need to arrange 13919 * to send additional data. 13920 */ 13921 if (new_swnd != 0) { 13922 /* tcp_suna != tcp_snxt */ 13923 /* Packet contains a window update */ 13924 BUMP_MIB(&tcp_mib, tcpInWinUpdate); 13925 tcp->tcp_zero_win_probe = 0; 13926 tcp->tcp_timer_backoff = 0; 13927 tcp->tcp_ms_we_have_waited = 0; 13928 13929 /* 13930 * Transmit starting with tcp_suna since 13931 * the one byte probe is not ack'ed. 13932 * If TCP has sent more than one identical 13933 * probe, tcp_rexmit will be set. That means 13934 * tcp_ss_rexmit() will send out the one 13935 * byte along with new data. Otherwise, 13936 * fake the retransmission. 
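 *
 * Stepping back from the probe case for a moment: the duplicate-ACK
 * machinery walked through above can be condensed into a hedged,
 * standalone sketch (limited transmit for the first two dup ACKs per
 * RFC 3042, a NewReno-style reduction on the third, then fast
 * recovery).  Names are invented, and the real code additionally
 * deflates cwnd again once the limited-transmit segment goes out:
 *
 *	struct dupack_sketch {
 *		uint32_t cwnd, ssthresh, snxt, suna, mss;
 *		int dupack_cnt;
 *	};
 *
 *	// Called for a pure duplicate ACK while data is outstanding.
 *	static void
 *	dup_ack_sketch(struct dupack_sketch *s)
 *	{
 *		uint32_t npkt;
 *
 *		if (++s->dupack_cnt < 3) {
 *			s->cwnd += s->mss;	// limited transmit
 *		} else if (s->dupack_cnt == 3) {
 *			// Fast retransmit: halve the in-flight estimate,
 *			// inflate cwnd by the segments the peer has
 *			// cached, and retransmit the segment at s->suna.
 *			npkt = ((s->snxt - s->suna) >> 1) / s->mss;
 *			s->ssthresh = MAX(npkt, 2) * s->mss;
 *			s->cwnd = (npkt + 3) * s->mss;
 *		} else {
 *			s->cwnd += s->mss;	// fast recovery
 *		}
 *	}
 *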
13937 */ 13938 flags |= TH_XMIT_NEEDED; 13939 if (!tcp->tcp_rexmit) { 13940 tcp->tcp_rexmit = B_TRUE; 13941 tcp->tcp_dupack_cnt = 0; 13942 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 13943 tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 13944 } 13945 } 13946 } 13947 goto swnd_update; 13948 } 13949 13950 /* 13951 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. 13952 * If the ACK value acks something that we have not yet sent, it might 13953 * be an old duplicate segment. Send an ACK to re-synchronize the 13954 * other side. 13955 * Note: reset in response to unacceptable ACK in SYN_RECEIVE 13956 * state is handled above, so we can always just drop the segment and 13957 * send an ACK here. 13958 * 13959 * Should we send ACKs in response to ACK only segments? 13960 */ 13961 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 13962 BUMP_MIB(&tcp_mib, tcpInAckUnsent); 13963 /* drop the received segment */ 13964 freemsg(mp); 13965 13966 /* 13967 * Send back an ACK. If tcp_drop_ack_unsent_cnt is 13968 * greater than 0, check if the number of such 13969 * bogus ACks is greater than that count. If yes, 13970 * don't send back any ACK. This prevents TCP from 13971 * getting into an ACK storm if somehow an attacker 13972 * successfully spoofs an acceptable segment to our 13973 * peer. 13974 */ 13975 if (tcp_drop_ack_unsent_cnt > 0 && 13976 ++tcp->tcp_in_ack_unsent > tcp_drop_ack_unsent_cnt) { 13977 TCP_STAT(tcp_in_ack_unsent_drop); 13978 return; 13979 } 13980 mp = tcp_ack_mp(tcp); 13981 if (mp != NULL) { 13982 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 13983 BUMP_LOCAL(tcp->tcp_obsegs); 13984 BUMP_MIB(&tcp_mib, tcpOutAck); 13985 tcp_send_data(tcp, tcp->tcp_wq, mp); 13986 } 13987 return; 13988 } 13989 13990 /* 13991 * TCP gets a new ACK, update the notsack'ed list to delete those 13992 * blocks that are covered by this ACK. 13993 */ 13994 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 13995 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, 13996 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); 13997 } 13998 13999 /* 14000 * If we got an ACK after fast retransmit, check to see 14001 * if it is a partial ACK. If it is not and the congestion 14002 * window was inflated to account for the other side's 14003 * cached packets, retract it. If it is, do Hoe's algorithm. 14004 */ 14005 if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) { 14006 ASSERT(tcp->tcp_rexmit == B_FALSE); 14007 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { 14008 tcp->tcp_dupack_cnt = 0; 14009 /* 14010 * Restore the orig tcp_cwnd_ssthresh after 14011 * fast retransmit phase. 14012 */ 14013 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { 14014 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; 14015 } 14016 tcp->tcp_rexmit_max = seg_ack; 14017 tcp->tcp_cwnd_cnt = 0; 14018 tcp->tcp_snd_burst = tcp->tcp_localnet ? 14019 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 14020 14021 /* 14022 * Remove all notsack info to avoid confusion with 14023 * the next fast retrasnmit/recovery phase. 14024 */ 14025 if (tcp->tcp_snd_sack_ok && 14026 tcp->tcp_notsack_list != NULL) { 14027 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 14028 } 14029 } else { 14030 if (tcp->tcp_snd_sack_ok && 14031 tcp->tcp_notsack_list != NULL) { 14032 flags |= TH_NEED_SACK_REXMIT; 14033 tcp->tcp_pipe -= mss; 14034 if (tcp->tcp_pipe < 0) 14035 tcp->tcp_pipe = 0; 14036 } else { 14037 /* 14038 * Hoe's algorithm: 14039 * 14040 * Retransmit the unack'ed segment and 14041 * restart fast recovery. 
Note that we 14042 * need to scale back tcp_cwnd to the 14043 * original value when we started fast 14044 * recovery. This is to prevent overly 14045 * aggressive behaviour in sending new 14046 * segments. 14047 */ 14048 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + 14049 tcp_dupack_fast_retransmit * mss; 14050 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; 14051 flags |= TH_REXMIT_NEEDED; 14052 } 14053 } 14054 } else { 14055 tcp->tcp_dupack_cnt = 0; 14056 if (tcp->tcp_rexmit) { 14057 /* 14058 * TCP is retranmitting. If the ACK ack's all 14059 * outstanding data, update tcp_rexmit_max and 14060 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt 14061 * to the correct value. 14062 * 14063 * Note that SEQ_LEQ() is used. This is to avoid 14064 * unnecessary fast retransmit caused by dup ACKs 14065 * received when TCP does slow start retransmission 14066 * after a time out. During this phase, TCP may 14067 * send out segments which are already received. 14068 * This causes dup ACKs to be sent back. 14069 */ 14070 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 14071 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 14072 tcp->tcp_rexmit_nxt = seg_ack; 14073 } 14074 if (seg_ack != tcp->tcp_rexmit_max) { 14075 flags |= TH_XMIT_NEEDED; 14076 } 14077 } else { 14078 tcp->tcp_rexmit = B_FALSE; 14079 tcp->tcp_xmit_zc_clean = B_FALSE; 14080 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 14081 tcp->tcp_snd_burst = tcp->tcp_localnet ? 14082 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 14083 } 14084 tcp->tcp_ms_we_have_waited = 0; 14085 } 14086 } 14087 14088 BUMP_MIB(&tcp_mib, tcpInAckSegs); 14089 UPDATE_MIB(&tcp_mib, tcpInAckBytes, bytes_acked); 14090 tcp->tcp_suna = seg_ack; 14091 if (tcp->tcp_zero_win_probe != 0) { 14092 tcp->tcp_zero_win_probe = 0; 14093 tcp->tcp_timer_backoff = 0; 14094 } 14095 14096 /* 14097 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 14098 * Note that it cannot be the SYN being ack'ed. The code flow 14099 * will not reach here. 14100 */ 14101 if (mp1 == NULL) { 14102 goto fin_acked; 14103 } 14104 14105 /* 14106 * Update the congestion window. 14107 * 14108 * If TCP is not ECN capable or TCP is ECN capable but the 14109 * congestion experience bit is not set, increase the tcp_cwnd as 14110 * usual. 14111 */ 14112 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 14113 cwnd = tcp->tcp_cwnd; 14114 add = mss; 14115 14116 if (cwnd >= tcp->tcp_cwnd_ssthresh) { 14117 /* 14118 * This is to prevent an increase of less than 1 MSS of 14119 * tcp_cwnd. With partial increase, tcp_wput_data() 14120 * may send out tinygrams in order to preserve mblk 14121 * boundaries. 14122 * 14123 * By initializing tcp_cwnd_cnt to new tcp_cwnd and 14124 * decrementing it by 1 MSS for every ACKs, tcp_cwnd is 14125 * increased by 1 MSS for every RTTs. 14126 */ 14127 if (tcp->tcp_cwnd_cnt <= 0) { 14128 tcp->tcp_cwnd_cnt = cwnd + add; 14129 } else { 14130 tcp->tcp_cwnd_cnt -= add; 14131 add = 0; 14132 } 14133 } 14134 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); 14135 } 14136 14137 /* See if the latest urgent data has been acknowledged */ 14138 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && 14139 SEQ_GT(seg_ack, tcp->tcp_urg)) 14140 tcp->tcp_valid_bits &= ~TCP_URG_VALID; 14141 14142 /* Can we update the RTT estimates? */ 14143 if (tcp->tcp_snd_ts_ok) { 14144 /* Ignore zero timestamp echo-reply. */ 14145 if (tcpopt.tcp_opt_ts_ecr != 0) { 14146 tcp_set_rto(tcp, (int32_t)lbolt - 14147 (int32_t)tcpopt.tcp_opt_ts_ecr); 14148 } 14149 14150 /* If needed, restart the timer. 
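 * (Background on the RTT sample taken just above, not specific to this
 * branch: tcp_set_rto() folds one round-trip sample into the smoothed
 * estimates.  A hedged sketch of the classic Jacobson/Karels update it
 * approximates, with invented names and everything in clock ticks; the
 * in-kernel bookkeeping differs in details such as scaling and
 * clamping:
 *
 *	struct rto_sketch {
 *		int srtt;	// smoothed round-trip time
 *		int rttvar;	// smoothed mean deviation
 *		int rto;
 *	};
 *
 *	static void
 *	rtt_sample_sketch(struct rto_sketch *s, int rtt)
 *	{
 *		int delta;
 *
 *		if (s->srtt == 0) {		// first sample
 *			s->srtt = rtt;
 *			s->rttvar = rtt / 2;
 *		} else {
 *			delta = rtt - s->srtt;
 *			s->srtt += delta / 8;	// gain 1/8
 *			if (delta < 0)
 *				delta = -delta;
 *			s->rttvar += (delta - s->rttvar) / 4;	// gain 1/4
 *		}
 *		s->rto = s->srtt + 4 * s->rttvar;
 *	}
 *
 * End of background.)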
*/ 14151 if (tcp->tcp_set_timer == 1) { 14152 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 14153 tcp->tcp_set_timer = 0; 14154 } 14155 /* 14156 * Update tcp_csuna in case the other side stops sending 14157 * us timestamps. 14158 */ 14159 tcp->tcp_csuna = tcp->tcp_snxt; 14160 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 14161 /* 14162 * An ACK sequence we haven't seen before, so get the RTT 14163 * and update the RTO. But first check if the timestamp is 14164 * valid to use. 14165 */ 14166 if ((mp1->b_next != NULL) && 14167 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) 14168 tcp_set_rto(tcp, (int32_t)lbolt - 14169 (int32_t)(intptr_t)mp1->b_prev); 14170 else 14171 BUMP_MIB(&tcp_mib, tcpRttNoUpdate); 14172 14173 /* Remeber the last sequence to be ACKed */ 14174 tcp->tcp_csuna = seg_ack; 14175 if (tcp->tcp_set_timer == 1) { 14176 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 14177 tcp->tcp_set_timer = 0; 14178 } 14179 } else { 14180 BUMP_MIB(&tcp_mib, tcpRttNoUpdate); 14181 } 14182 14183 /* Eat acknowledged bytes off the xmit queue. */ 14184 for (;;) { 14185 mblk_t *mp2; 14186 uchar_t *wptr; 14187 14188 wptr = mp1->b_wptr; 14189 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); 14190 bytes_acked -= (int)(wptr - mp1->b_rptr); 14191 if (bytes_acked < 0) { 14192 mp1->b_rptr = wptr + bytes_acked; 14193 /* 14194 * Set a new timestamp if all the bytes timed by the 14195 * old timestamp have been ack'ed. 14196 */ 14197 if (SEQ_GT(seg_ack, 14198 (uint32_t)(uintptr_t)(mp1->b_next))) { 14199 mp1->b_prev = (mblk_t *)(uintptr_t)lbolt; 14200 mp1->b_next = NULL; 14201 } 14202 break; 14203 } 14204 mp1->b_next = NULL; 14205 mp1->b_prev = NULL; 14206 mp2 = mp1; 14207 mp1 = mp1->b_cont; 14208 14209 /* 14210 * This notification is required for some zero-copy 14211 * clients to maintain a copy semantic. After the data 14212 * is ack'ed, client is safe to modify or reuse the buffer. 14213 */ 14214 if (tcp->tcp_snd_zcopy_aware && 14215 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 14216 tcp_zcopy_notify(tcp); 14217 freeb(mp2); 14218 if (bytes_acked == 0) { 14219 if (mp1 == NULL) { 14220 /* Everything is ack'ed, clear the tail. */ 14221 tcp->tcp_xmit_tail = NULL; 14222 /* 14223 * Cancel the timer unless we are still 14224 * waiting for an ACK for the FIN packet. 14225 */ 14226 if (tcp->tcp_timer_tid != 0 && 14227 tcp->tcp_snxt == tcp->tcp_suna) { 14228 (void) TCP_TIMER_CANCEL(tcp, 14229 tcp->tcp_timer_tid); 14230 tcp->tcp_timer_tid = 0; 14231 } 14232 goto pre_swnd_update; 14233 } 14234 if (mp2 != tcp->tcp_xmit_tail) 14235 break; 14236 tcp->tcp_xmit_tail = mp1; 14237 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 14238 (uintptr_t)INT_MAX); 14239 tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr - 14240 mp1->b_rptr); 14241 break; 14242 } 14243 if (mp1 == NULL) { 14244 /* 14245 * More was acked but there is nothing more 14246 * outstanding. This means that the FIN was 14247 * just acked or that we're talking to a clown. 
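 *
 * (Aside: the congestion-window increase performed a little earlier,
 * before this queue trimming, follows the usual slow-start versus
 * congestion-avoidance split.  A hedged, standalone sketch with
 * invented names; tcp_cwnd_cnt plays the role of 'cnt' below:
 *
 *	struct cwnd_sketch {
 *		uint32_t cwnd, cwnd_max, ssthresh, mss;
 *		int32_t cnt;		// ACK countdown for linear growth
 *	};
 *
 *	// Called for an ACK of new data with no ECN congestion signal.
 *	static void
 *	cwnd_grow_sketch(struct cwnd_sketch *s)
 *	{
 *		uint32_t add = s->mss;	// slow start: one MSS per ACK
 *
 *		if (s->cwnd >= s->ssthresh) {
 *			// Congestion avoidance: spread one MSS of growth
 *			// over roughly a window's worth of ACKs.
 *			if (s->cnt <= 0) {
 *				s->cnt = (int32_t)(s->cwnd + add);
 *			} else {
 *				s->cnt -= add;
 *				add = 0;
 *			}
 *		}
 *		s->cwnd = MIN(s->cwnd + add, s->cwnd_max);
 *	}
 *
 * End of aside.)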
14248 */ 14249 fin_acked: 14250 ASSERT(tcp->tcp_fin_sent); 14251 tcp->tcp_xmit_tail = NULL; 14252 if (tcp->tcp_fin_sent) { 14253 /* FIN was acked - making progress */ 14254 if (tcp->tcp_ipversion == IPV6_VERSION && 14255 !tcp->tcp_fin_acked) 14256 tcp->tcp_ip_forward_progress = B_TRUE; 14257 tcp->tcp_fin_acked = B_TRUE; 14258 if (tcp->tcp_linger_tid != 0 && 14259 TCP_TIMER_CANCEL(tcp, 14260 tcp->tcp_linger_tid) >= 0) { 14261 tcp_stop_lingering(tcp); 14262 } 14263 } else { 14264 /* 14265 * We should never get here because 14266 * we have already checked that the 14267 * number of bytes ack'ed should be 14268 * smaller than or equal to what we 14269 * have sent so far (it is the 14270 * acceptability check of the ACK). 14271 * We can only get here if the send 14272 * queue is corrupted. 14273 * 14274 * Terminate the connection and 14275 * panic the system. It is better 14276 * for us to panic instead of 14277 * continuing to avoid other disaster. 14278 */ 14279 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 14280 tcp->tcp_rnxt, TH_RST|TH_ACK); 14281 panic("Memory corruption " 14282 "detected for connection %s.", 14283 tcp_display(tcp, NULL, 14284 DISP_ADDR_AND_PORT)); 14285 /*NOTREACHED*/ 14286 } 14287 goto pre_swnd_update; 14288 } 14289 ASSERT(mp2 != tcp->tcp_xmit_tail); 14290 } 14291 if (tcp->tcp_unsent) { 14292 flags |= TH_XMIT_NEEDED; 14293 } 14294 pre_swnd_update: 14295 tcp->tcp_xmit_head = mp1; 14296 swnd_update: 14297 /* 14298 * The following check is different from most other implementations. 14299 * For bi-directional transfer, when segments are dropped, the 14300 * "normal" check will not accept a window update in those 14301 * retransmitted segemnts. Failing to do that, TCP may send out 14302 * segments which are outside receiver's window. As TCP accepts 14303 * the ack in those retransmitted segments, if the window update in 14304 * the same segment is not accepted, TCP will incorrectly calculates 14305 * that it can send more segments. This can create a deadlock 14306 * with the receiver if its window becomes zero. 14307 */ 14308 if (SEQ_LT(tcp->tcp_swl2, seg_ack) || 14309 SEQ_LT(tcp->tcp_swl1, seg_seq) || 14310 (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) { 14311 /* 14312 * The criteria for update is: 14313 * 14314 * 1. the segment acknowledges some data. Or 14315 * 2. the segment is new, i.e. it has a higher seq num. Or 14316 * 3. the segment is not old and the advertised window is 14317 * larger than the previous advertised window. 14318 */ 14319 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd) 14320 flags |= TH_XMIT_NEEDED; 14321 tcp->tcp_swnd = new_swnd; 14322 if (new_swnd > tcp->tcp_max_swnd) 14323 tcp->tcp_max_swnd = new_swnd; 14324 tcp->tcp_swl1 = seg_seq; 14325 tcp->tcp_swl2 = seg_ack; 14326 } 14327 est: 14328 if (tcp->tcp_state > TCPS_ESTABLISHED) { 14329 14330 switch (tcp->tcp_state) { 14331 case TCPS_FIN_WAIT_1: 14332 if (tcp->tcp_fin_acked) { 14333 tcp->tcp_state = TCPS_FIN_WAIT_2; 14334 /* 14335 * We implement the non-standard BSD/SunOS 14336 * FIN_WAIT_2 flushing algorithm. 14337 * If there is no user attached to this 14338 * TCP endpoint, then this TCP struct 14339 * could hang around forever in FIN_WAIT_2 14340 * state if the peer forgets to send us 14341 * a FIN. To prevent this, we wait only 14342 * 2*MSL (a convenient time value) for 14343 * the FIN to arrive. If it doesn't show up, 14344 * we flush the TCP endpoint. This algorithm, 14345 * though a violation of RFC-793, has worked 14346 * for over 10 years in BSD systems. 
14347 * Note: SunOS 4.x waits 675 seconds before 14348 * flushing the FIN_WAIT_2 connection. 14349 */ 14350 TCP_TIMER_RESTART(tcp, 14351 tcp_fin_wait_2_flush_interval); 14352 } 14353 break; 14354 case TCPS_FIN_WAIT_2: 14355 break; /* Shutdown hook? */ 14356 case TCPS_LAST_ACK: 14357 freemsg(mp); 14358 if (tcp->tcp_fin_acked) { 14359 (void) tcp_clean_death(tcp, 0, 19); 14360 return; 14361 } 14362 goto xmit_check; 14363 case TCPS_CLOSING: 14364 if (tcp->tcp_fin_acked) { 14365 tcp->tcp_state = TCPS_TIME_WAIT; 14366 /* 14367 * Unconditionally clear the exclusive binding 14368 * bit so this TIME-WAIT connection won't 14369 * interfere with new ones. 14370 */ 14371 tcp->tcp_exclbind = 0; 14372 if (!TCP_IS_DETACHED(tcp)) { 14373 TCP_TIMER_RESTART(tcp, 14374 tcp_time_wait_interval); 14375 } else { 14376 tcp_time_wait_append(tcp); 14377 TCP_DBGSTAT(tcp_rput_time_wait); 14378 } 14379 } 14380 /*FALLTHRU*/ 14381 case TCPS_CLOSE_WAIT: 14382 freemsg(mp); 14383 goto xmit_check; 14384 default: 14385 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 14386 break; 14387 } 14388 } 14389 if (flags & TH_FIN) { 14390 /* Make sure we ack the fin */ 14391 flags |= TH_ACK_NEEDED; 14392 if (!tcp->tcp_fin_rcvd) { 14393 tcp->tcp_fin_rcvd = B_TRUE; 14394 tcp->tcp_rnxt++; 14395 tcph = tcp->tcp_tcph; 14396 U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); 14397 14398 /* 14399 * Generate the ordrel_ind at the end unless we 14400 * are an eager guy. 14401 * In the eager case tcp_rsrv will do this when run 14402 * after tcp_accept is done. 14403 */ 14404 if (tcp->tcp_listener == NULL && 14405 !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding)) 14406 flags |= TH_ORDREL_NEEDED; 14407 switch (tcp->tcp_state) { 14408 case TCPS_SYN_RCVD: 14409 case TCPS_ESTABLISHED: 14410 tcp->tcp_state = TCPS_CLOSE_WAIT; 14411 /* Keepalive? */ 14412 break; 14413 case TCPS_FIN_WAIT_1: 14414 if (!tcp->tcp_fin_acked) { 14415 tcp->tcp_state = TCPS_CLOSING; 14416 break; 14417 } 14418 /* FALLTHRU */ 14419 case TCPS_FIN_WAIT_2: 14420 tcp->tcp_state = TCPS_TIME_WAIT; 14421 /* 14422 * Unconditionally clear the exclusive binding 14423 * bit so this TIME-WAIT connection won't 14424 * interfere with new ones. 14425 */ 14426 tcp->tcp_exclbind = 0; 14427 if (!TCP_IS_DETACHED(tcp)) { 14428 TCP_TIMER_RESTART(tcp, 14429 tcp_time_wait_interval); 14430 } else { 14431 tcp_time_wait_append(tcp); 14432 TCP_DBGSTAT(tcp_rput_time_wait); 14433 } 14434 if (seg_len) { 14435 /* 14436 * implies data piggybacked on FIN. 14437 * break to handle data. 14438 */ 14439 break; 14440 } 14441 freemsg(mp); 14442 goto ack_check; 14443 } 14444 } 14445 } 14446 if (mp == NULL) 14447 goto xmit_check; 14448 if (seg_len == 0) { 14449 freemsg(mp); 14450 goto xmit_check; 14451 } 14452 if (mp->b_rptr == mp->b_wptr) { 14453 /* 14454 * The header has been consumed, so we remove the 14455 * zero-length mblk here. 14456 */ 14457 mp1 = mp; 14458 mp = mp->b_cont; 14459 freeb(mp1); 14460 } 14461 tcph = tcp->tcp_tcph; 14462 tcp->tcp_rack_cnt++; 14463 { 14464 uint32_t cur_max; 14465 14466 cur_max = tcp->tcp_rack_cur_max; 14467 if (tcp->tcp_rack_cnt >= cur_max) { 14468 /* 14469 * We have more unacked data than we should - send 14470 * an ACK now. 14471 */ 14472 flags |= TH_ACK_NEEDED; 14473 cur_max++; 14474 if (cur_max > tcp->tcp_rack_abs_max) 14475 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 14476 else 14477 tcp->tcp_rack_cur_max = cur_max; 14478 } else if (TCP_IS_DETACHED(tcp)) { 14479 /* We don't have an ACK timer for detached TCP. 
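 * An immediate ACK is therefore the only option for that case.  More
 * generally, the decision being made in this block can be restated as
 * a hedged, standalone sketch (names invented; B_TRUE means "ACK this
 * segment now", B_FALSE means "let the delayed-ACK timer handle it"):
 *
 *	static boolean_t
 *	ack_now_sketch(uint32_t unacked, uint32_t seg_len, uint32_t mss,
 *	    int rack_cnt, int rack_cur_max, boolean_t detached)
 *	{
 *		if (rack_cnt >= rack_cur_max)
 *			return (B_TRUE);	// too much data held back
 *		if (detached)
 *			return (B_TRUE);	// no timer available
 *		if (seg_len < mss && unacked != 0 && (unacked % mss) != 0)
 *			return (B_TRUE);	// odd-sized tail: ACK now
 *		return (B_FALSE);		// otherwise use the timer
 *	}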
*/ 14480 flags |= TH_ACK_NEEDED; 14481 } else if (seg_len < mss) { 14482 /* 14483 * If we get a segment that is less than an mss, and we 14484 * already have unacknowledged data, and the amount 14485 * unacknowledged is not a multiple of mss, then we 14486 * better generate an ACK now. Otherwise, this may be 14487 * the tail piece of a transaction, and we would rather 14488 * wait for the response. 14489 */ 14490 uint32_t udif; 14491 ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <= 14492 (uintptr_t)INT_MAX); 14493 udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack); 14494 if (udif && (udif % mss)) 14495 flags |= TH_ACK_NEEDED; 14496 else 14497 flags |= TH_ACK_TIMER_NEEDED; 14498 } else { 14499 /* Start delayed ack timer */ 14500 flags |= TH_ACK_TIMER_NEEDED; 14501 } 14502 } 14503 tcp->tcp_rnxt += seg_len; 14504 U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); 14505 14506 /* Update SACK list */ 14507 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 14508 tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, 14509 &(tcp->tcp_num_sack_blk)); 14510 } 14511 14512 if (tcp->tcp_urp_mp) { 14513 tcp->tcp_urp_mp->b_cont = mp; 14514 mp = tcp->tcp_urp_mp; 14515 tcp->tcp_urp_mp = NULL; 14516 /* Ready for a new signal. */ 14517 tcp->tcp_urp_last_valid = B_FALSE; 14518 #ifdef DEBUG 14519 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14520 "tcp_rput: sending exdata_ind %s", 14521 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 14522 #endif /* DEBUG */ 14523 } 14524 14525 /* 14526 * Check for ancillary data changes compared to last segment. 14527 */ 14528 if (tcp->tcp_ipv6_recvancillary != 0) { 14529 mp = tcp_rput_add_ancillary(tcp, mp, &ipp); 14530 if (mp == NULL) 14531 return; 14532 } 14533 14534 if (tcp->tcp_listener || tcp->tcp_hard_binding) { 14535 /* 14536 * Side queue inbound data until the accept happens. 14537 * tcp_accept/tcp_rput drains this when the accept happens. 14538 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or 14539 * T_EXDATA_IND) it is queued on b_next. 14540 * XXX Make urgent data use this. Requires: 14541 * Removing tcp_listener check for TH_URG 14542 * Making M_PCPROTO and MARK messages skip the eager case 14543 */ 14544 14545 if (tcp->tcp_kssl_pending) { 14546 tcp_kssl_input(tcp, mp); 14547 } else { 14548 tcp_rcv_enqueue(tcp, mp, seg_len); 14549 } 14550 } else { 14551 if (mp->b_datap->db_type != M_DATA || 14552 (flags & TH_MARKNEXT_NEEDED)) { 14553 if (tcp->tcp_rcv_list != NULL) { 14554 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14555 } 14556 ASSERT(tcp->tcp_rcv_list == NULL || 14557 tcp->tcp_fused_sigurg); 14558 if (flags & TH_MARKNEXT_NEEDED) { 14559 #ifdef DEBUG 14560 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14561 "tcp_rput: sending MSGMARKNEXT %s", 14562 tcp_display(tcp, NULL, 14563 DISP_PORT_ONLY)); 14564 #endif /* DEBUG */ 14565 mp->b_flag |= MSGMARKNEXT; 14566 flags &= ~TH_MARKNEXT_NEEDED; 14567 } 14568 14569 /* Does this need SSL processing first? */ 14570 if ((tcp->tcp_kssl_ctx != NULL) && 14571 (DB_TYPE(mp) == M_DATA)) { 14572 tcp_kssl_input(tcp, mp); 14573 } else { 14574 putnext(tcp->tcp_rq, mp); 14575 if (!canputnext(tcp->tcp_rq)) 14576 tcp->tcp_rwnd -= seg_len; 14577 } 14578 } else if ((flags & (TH_PUSH|TH_FIN)) || 14579 tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_rq->q_hiwat >> 3) { 14580 if (tcp->tcp_rcv_list != NULL) { 14581 /* 14582 * Enqueue the new segment first and then 14583 * call tcp_rcv_drain() to send all data 14584 * up. The other way to do this is to 14585 * send all queued data up and then call 14586 * putnext() to send the new segment up. 
14587 * That way we could remove the else part
14588 * later on.
14589 *
14590 * We don't do this to avoid one more call to
14591 * canputnext() as tcp_rcv_drain() needs to
14592 * call canputnext().
14593 */
14594 tcp_rcv_enqueue(tcp, mp, seg_len);
14595 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp);
14596 } else {
14597 /* Does this need SSL processing first? */
14598 if ((tcp->tcp_kssl_ctx != NULL) &&
14599 (DB_TYPE(mp) == M_DATA)) {
14600 tcp_kssl_input(tcp, mp);
14601 } else {
14602 putnext(tcp->tcp_rq, mp);
14603 if (!canputnext(tcp->tcp_rq))
14604 tcp->tcp_rwnd -= seg_len;
14605 }
14606 }
14607 } else {
14608 /*
14609 * Enqueue all packets when processing an mblk
14610 * from the co queue and also enqueue normal packets.
14611 */
14612 tcp_rcv_enqueue(tcp, mp, seg_len);
14613 }
14614 /*
14615 * Make sure the timer is running if we have data waiting
14616 * for a push bit. This provides resiliency against
14617 * implementations that do not correctly generate push bits.
14618 */
14619 if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) {
14620 /*
14621 * The connection may be closed at this point, so don't
14622 * do anything for a detached tcp.
14623 */
14624 if (!TCP_IS_DETACHED(tcp))
14625 tcp->tcp_push_tid = TCP_TIMER(tcp,
14626 tcp_push_timer,
14627 MSEC_TO_TICK(tcp_push_timer_interval));
14628 }
14629 }
14630 xmit_check:
14631 /* Is there anything left to do? */
14632 ASSERT(!(flags & TH_MARKNEXT_NEEDED));
14633 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED|
14634 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED|
14635 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0)
14636 goto done;
14637
14638 /* Any transmit work to do and a non-zero window? */
14639 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT|
14640 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) {
14641 if (flags & TH_REXMIT_NEEDED) {
14642 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna;
14643
14644 BUMP_MIB(&tcp_mib, tcpOutFastRetrans);
14645 if (snd_size > mss)
14646 snd_size = mss;
14647 if (snd_size > tcp->tcp_swnd)
14648 snd_size = tcp->tcp_swnd;
14649 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size,
14650 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size,
14651 B_TRUE);
14652
14653 if (mp1 != NULL) {
14654 tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt;
14655 tcp->tcp_csuna = tcp->tcp_snxt;
14656 BUMP_MIB(&tcp_mib, tcpRetransSegs);
14657 UPDATE_MIB(&tcp_mib, tcpRetransBytes, snd_size);
14658 TCP_RECORD_TRACE(tcp, mp1,
14659 TCP_TRACE_SEND_PKT);
14660 tcp_send_data(tcp, tcp->tcp_wq, mp1);
14661 }
14662 }
14663 if (flags & TH_NEED_SACK_REXMIT) {
14664 tcp_sack_rxmit(tcp, &flags);
14665 }
14666 /*
14667 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send
14668 * out a new segment. Note that tcp_rexmit should not be
14669 * set, otherwise TH_LIMIT_XMIT should not be set.
14670 */
14671 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) {
14672 if (!tcp->tcp_rexmit) {
14673 tcp_wput_data(tcp, NULL, B_FALSE);
14674 } else {
14675 tcp_ss_rexmit(tcp);
14676 }
14677 }
14678 /*
14679 * Adjust tcp_cwnd back to normal value after sending
14680 * new data segments.
14681 */
14682 if (flags & TH_LIMIT_XMIT) {
14683 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1);
14684 /*
14685 * This will restart the timer. Restarting the
14686 * timer is used to avoid a timeout before the
14687 * limited transmit segment's ACK gets back.
14688 */
14689 if (tcp->tcp_xmit_head != NULL)
14690 tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt;
14691 }
14692
14693 /* Anything more to do?
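 * At this point only the ACK, delayed-ACK-timer, ordrel and
 * urgent-mark actions can still be pending; the transmit-related
 * flags were all handled above.  As an illustration of the
 * TH_LIMIT_XMIT deflation above (hypothetical values): with
 * mss = 1460 and tcp_dupack_cnt = 2, tcp_cwnd is reduced by
 * 1460 << (2 - 1) = 2920 bytes, backing out the limited-transmit
 * allowance that let the new segments go out.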
*/ 14694 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| 14695 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 14696 goto done; 14697 } 14698 ack_check: 14699 if (flags & TH_SEND_URP_MARK) { 14700 ASSERT(tcp->tcp_urp_mark_mp); 14701 /* 14702 * Send up any queued data and then send the mark message 14703 */ 14704 if (tcp->tcp_rcv_list != NULL) { 14705 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14706 } 14707 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 14708 14709 mp1 = tcp->tcp_urp_mark_mp; 14710 tcp->tcp_urp_mark_mp = NULL; 14711 #ifdef DEBUG 14712 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14713 "tcp_rput: sending zero-length %s %s", 14714 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" : 14715 "MSGNOTMARKNEXT"), 14716 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 14717 #endif /* DEBUG */ 14718 putnext(tcp->tcp_rq, mp1); 14719 flags &= ~TH_SEND_URP_MARK; 14720 } 14721 if (flags & TH_ACK_NEEDED) { 14722 /* 14723 * Time to send an ack for some reason. 14724 */ 14725 mp1 = tcp_ack_mp(tcp); 14726 14727 if (mp1 != NULL) { 14728 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 14729 tcp_send_data(tcp, tcp->tcp_wq, mp1); 14730 BUMP_LOCAL(tcp->tcp_obsegs); 14731 BUMP_MIB(&tcp_mib, tcpOutAck); 14732 } 14733 if (tcp->tcp_ack_tid != 0) { 14734 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 14735 tcp->tcp_ack_tid = 0; 14736 } 14737 } 14738 if (flags & TH_ACK_TIMER_NEEDED) { 14739 /* 14740 * Arrange for deferred ACK or push wait timeout. 14741 * Start timer if it is not already running. 14742 */ 14743 if (tcp->tcp_ack_tid == 0) { 14744 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer, 14745 MSEC_TO_TICK(tcp->tcp_localnet ? 14746 (clock_t)tcp_local_dack_interval : 14747 (clock_t)tcp_deferred_ack_interval)); 14748 } 14749 } 14750 if (flags & TH_ORDREL_NEEDED) { 14751 /* 14752 * Send up the ordrel_ind unless we are an eager guy. 14753 * In the eager case tcp_rsrv will do this when run 14754 * after tcp_accept is done. 14755 */ 14756 ASSERT(tcp->tcp_listener == NULL); 14757 if (tcp->tcp_rcv_list != NULL) { 14758 /* 14759 * Push any mblk(s) enqueued from co processing. 14760 */ 14761 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14762 } 14763 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 14764 if ((mp1 = mi_tpi_ordrel_ind()) != NULL) { 14765 tcp->tcp_ordrel_done = B_TRUE; 14766 putnext(tcp->tcp_rq, mp1); 14767 if (tcp->tcp_deferred_clean_death) { 14768 /* 14769 * tcp_clean_death was deferred 14770 * for T_ORDREL_IND - do it now 14771 */ 14772 (void) tcp_clean_death(tcp, 14773 tcp->tcp_client_errno, 20); 14774 tcp->tcp_deferred_clean_death = B_FALSE; 14775 } 14776 } else { 14777 /* 14778 * Run the orderly release in the 14779 * service routine. 14780 */ 14781 qenable(tcp->tcp_rq); 14782 /* 14783 * Caveat(XXX): The machine may be so 14784 * overloaded that tcp_rsrv() is not scheduled 14785 * until after the endpoint has transitioned 14786 * to TCPS_TIME_WAIT 14787 * and tcp_time_wait_interval expires. Then 14788 * tcp_timer() will blow away state in tcp_t 14789 * and T_ORDREL_IND will never be delivered 14790 * upstream. Unlikely but potentially 14791 * a problem. 14792 */ 14793 } 14794 } 14795 done: 14796 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 14797 } 14798 14799 /* 14800 * This function does PAWS protection check. Returns B_TRUE if the 14801 * segment passes the PAWS test, else returns B_FALSE. 
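 * In outline (per RFC 1323 section 4.2): a non-RST segment whose
 * timestamp value is older than tcp_ts_recent (compared with
 * TSTMP_LT, i.e. modulo-2^32 serial-number arithmetic) is rejected,
 * unless the connection has been idle for longer than PAWS_TIMEOUT,
 * in which case tcp_ts_recent is considered stale and is simply
 * replaced.  Illustrative comparison with hypothetical values:
 * with ts_recent = 0xfffffff0, an arriving TSval of 0x00000010 is
 * treated as newer (the signed 32-bit difference is small and
 * positive), so such a segment passes the check.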
14802 */ 14803 boolean_t 14804 tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) 14805 { 14806 uint8_t flags; 14807 int options; 14808 uint8_t *up; 14809 14810 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 14811 /* 14812 * If timestamp option is aligned nicely, get values inline, 14813 * otherwise call general routine to parse. Only do that 14814 * if timestamp is the only option. 14815 */ 14816 if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH + 14817 TCPOPT_REAL_TS_LEN && 14818 OK_32PTR((up = ((uint8_t *)tcph) + 14819 TCP_MIN_HEADER_LENGTH)) && 14820 *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { 14821 tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); 14822 tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); 14823 14824 options = TCP_OPT_TSTAMP_PRESENT; 14825 } else { 14826 if (tcp->tcp_snd_sack_ok) { 14827 tcpoptp->tcp = tcp; 14828 } else { 14829 tcpoptp->tcp = NULL; 14830 } 14831 options = tcp_parse_options(tcph, tcpoptp); 14832 } 14833 14834 if (options & TCP_OPT_TSTAMP_PRESENT) { 14835 /* 14836 * Do PAWS per RFC 1323 section 4.2. Accept RST 14837 * regardless of the timestamp, page 18 RFC 1323.bis. 14838 */ 14839 if ((flags & TH_RST) == 0 && 14840 TSTMP_LT(tcpoptp->tcp_opt_ts_val, 14841 tcp->tcp_ts_recent)) { 14842 if (TSTMP_LT(lbolt64, tcp->tcp_last_rcv_lbolt + 14843 PAWS_TIMEOUT)) { 14844 /* This segment is not acceptable. */ 14845 return (B_FALSE); 14846 } else { 14847 /* 14848 * Connection has been idle for 14849 * too long. Reset the timestamp 14850 * and assume the segment is valid. 14851 */ 14852 tcp->tcp_ts_recent = 14853 tcpoptp->tcp_opt_ts_val; 14854 } 14855 } 14856 } else { 14857 /* 14858 * If we don't get a timestamp on every packet, we 14859 * figure we can't really trust 'em, so we stop sending 14860 * and parsing them. 14861 */ 14862 tcp->tcp_snd_ts_ok = B_FALSE; 14863 14864 tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 14865 tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 14866 tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); 14867 tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); 14868 if (tcp->tcp_snd_sack_ok) { 14869 ASSERT(tcp->tcp_sack_info != NULL); 14870 tcp->tcp_max_sack_blk = 4; 14871 } 14872 } 14873 return (B_TRUE); 14874 } 14875 14876 /* 14877 * Attach ancillary data to a received TCP segments for the 14878 * ancillary pieces requested by the application that are 14879 * different than they were in the previous data segment. 14880 * 14881 * Save the "current" values once memory allocation is ok so that 14882 * when memory allocation fails we can just wait for the next data segment. 14883 */ 14884 static mblk_t * 14885 tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) 14886 { 14887 struct T_optdata_ind *todi; 14888 int optlen; 14889 uchar_t *optptr; 14890 struct T_opthdr *toh; 14891 uint_t addflag; /* Which pieces to add */ 14892 mblk_t *mp1; 14893 14894 optlen = 0; 14895 addflag = 0; 14896 /* If app asked for pktinfo and the index has changed ... */ 14897 if ((ipp->ipp_fields & IPPF_IFINDEX) && 14898 ipp->ipp_ifindex != tcp->tcp_recvifindex && 14899 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)) { 14900 optlen += sizeof (struct T_opthdr) + 14901 sizeof (struct in6_pktinfo); 14902 addflag |= TCP_IPV6_RECVPKTINFO; 14903 } 14904 /* If app asked for hoplimit and it has changed ... 
*/ 14905 if ((ipp->ipp_fields & IPPF_HOPLIMIT) && 14906 ipp->ipp_hoplimit != tcp->tcp_recvhops && 14907 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPLIMIT)) { 14908 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 14909 addflag |= TCP_IPV6_RECVHOPLIMIT; 14910 } 14911 /* If app asked for tclass and it has changed ... */ 14912 if ((ipp->ipp_fields & IPPF_TCLASS) && 14913 ipp->ipp_tclass != tcp->tcp_recvtclass && 14914 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)) { 14915 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 14916 addflag |= TCP_IPV6_RECVTCLASS; 14917 } 14918 /* 14919 * If app asked for hopbyhop headers and it has changed ... 14920 * For security labels, note that (1) security labels can't change on 14921 * a connected socket at all, (2) we're connected to at most one peer, 14922 * (3) if anything changes, then it must be some other extra option. 14923 */ 14924 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) && 14925 ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, 14926 (ipp->ipp_fields & IPPF_HOPOPTS), 14927 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { 14928 optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen - 14929 tcp->tcp_label_len; 14930 addflag |= TCP_IPV6_RECVHOPOPTS; 14931 if (!ip_allocbuf((void **)&tcp->tcp_hopopts, 14932 &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), 14933 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) 14934 return (mp); 14935 } 14936 /* If app asked for dst headers before routing headers ... */ 14937 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTDSTOPTS) && 14938 ip_cmpbuf(tcp->tcp_rtdstopts, tcp->tcp_rtdstoptslen, 14939 (ipp->ipp_fields & IPPF_RTDSTOPTS), 14940 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) { 14941 optlen += sizeof (struct T_opthdr) + 14942 ipp->ipp_rtdstoptslen; 14943 addflag |= TCP_IPV6_RECVRTDSTOPTS; 14944 if (!ip_allocbuf((void **)&tcp->tcp_rtdstopts, 14945 &tcp->tcp_rtdstoptslen, (ipp->ipp_fields & IPPF_RTDSTOPTS), 14946 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) 14947 return (mp); 14948 } 14949 /* If app asked for routing headers and it has changed ... */ 14950 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) && 14951 ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, 14952 (ipp->ipp_fields & IPPF_RTHDR), 14953 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { 14954 optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; 14955 addflag |= TCP_IPV6_RECVRTHDR; 14956 if (!ip_allocbuf((void **)&tcp->tcp_rthdr, 14957 &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), 14958 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) 14959 return (mp); 14960 } 14961 /* If app asked for dest headers and it has changed ... */ 14962 if ((tcp->tcp_ipv6_recvancillary & 14963 (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) && 14964 ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, 14965 (ipp->ipp_fields & IPPF_DSTOPTS), 14966 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { 14967 optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; 14968 addflag |= TCP_IPV6_RECVDSTOPTS; 14969 if (!ip_allocbuf((void **)&tcp->tcp_dstopts, 14970 &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), 14971 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) 14972 return (mp); 14973 } 14974 14975 if (optlen == 0) { 14976 /* Nothing to add */ 14977 return (mp); 14978 } 14979 mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED); 14980 if (mp1 == NULL) { 14981 /* 14982 * Defer sending ancillary data until the next TCP segment 14983 * arrives. 
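 * Deferring is safe because the "last seen" values are only
 * recorded after the allocation succeeds, so the same comparisons
 * will fire again on the next segment and no option change is
 * lost.  Sizing illustration (hypothetical case): if only the hop
 * limit changed, optlen computed above is
 * sizeof (struct T_opthdr) + sizeof (uint_t), and the message
 * assembled below is [T_optdata_ind][T_opthdr][hoplimit].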
14984 */ 14985 return (mp); 14986 } 14987 mp1->b_cont = mp; 14988 mp = mp1; 14989 mp->b_wptr += sizeof (*todi) + optlen; 14990 mp->b_datap->db_type = M_PROTO; 14991 todi = (struct T_optdata_ind *)mp->b_rptr; 14992 todi->PRIM_type = T_OPTDATA_IND; 14993 todi->DATA_flag = 1; /* MORE data */ 14994 todi->OPT_length = optlen; 14995 todi->OPT_offset = sizeof (*todi); 14996 optptr = (uchar_t *)&todi[1]; 14997 /* 14998 * If app asked for pktinfo and the index has changed ... 14999 * Note that the local address never changes for the connection. 15000 */ 15001 if (addflag & TCP_IPV6_RECVPKTINFO) { 15002 struct in6_pktinfo *pkti; 15003 15004 toh = (struct T_opthdr *)optptr; 15005 toh->level = IPPROTO_IPV6; 15006 toh->name = IPV6_PKTINFO; 15007 toh->len = sizeof (*toh) + sizeof (*pkti); 15008 toh->status = 0; 15009 optptr += sizeof (*toh); 15010 pkti = (struct in6_pktinfo *)optptr; 15011 if (tcp->tcp_ipversion == IPV6_VERSION) 15012 pkti->ipi6_addr = tcp->tcp_ip6h->ip6_src; 15013 else 15014 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 15015 &pkti->ipi6_addr); 15016 pkti->ipi6_ifindex = ipp->ipp_ifindex; 15017 optptr += sizeof (*pkti); 15018 ASSERT(OK_32PTR(optptr)); 15019 /* Save as "last" value */ 15020 tcp->tcp_recvifindex = ipp->ipp_ifindex; 15021 } 15022 /* If app asked for hoplimit and it has changed ... */ 15023 if (addflag & TCP_IPV6_RECVHOPLIMIT) { 15024 toh = (struct T_opthdr *)optptr; 15025 toh->level = IPPROTO_IPV6; 15026 toh->name = IPV6_HOPLIMIT; 15027 toh->len = sizeof (*toh) + sizeof (uint_t); 15028 toh->status = 0; 15029 optptr += sizeof (*toh); 15030 *(uint_t *)optptr = ipp->ipp_hoplimit; 15031 optptr += sizeof (uint_t); 15032 ASSERT(OK_32PTR(optptr)); 15033 /* Save as "last" value */ 15034 tcp->tcp_recvhops = ipp->ipp_hoplimit; 15035 } 15036 /* If app asked for tclass and it has changed ... 
*/ 15037 if (addflag & TCP_IPV6_RECVTCLASS) { 15038 toh = (struct T_opthdr *)optptr; 15039 toh->level = IPPROTO_IPV6; 15040 toh->name = IPV6_TCLASS; 15041 toh->len = sizeof (*toh) + sizeof (uint_t); 15042 toh->status = 0; 15043 optptr += sizeof (*toh); 15044 *(uint_t *)optptr = ipp->ipp_tclass; 15045 optptr += sizeof (uint_t); 15046 ASSERT(OK_32PTR(optptr)); 15047 /* Save as "last" value */ 15048 tcp->tcp_recvtclass = ipp->ipp_tclass; 15049 } 15050 if (addflag & TCP_IPV6_RECVHOPOPTS) { 15051 toh = (struct T_opthdr *)optptr; 15052 toh->level = IPPROTO_IPV6; 15053 toh->name = IPV6_HOPOPTS; 15054 toh->len = sizeof (*toh) + ipp->ipp_hopoptslen - 15055 tcp->tcp_label_len; 15056 toh->status = 0; 15057 optptr += sizeof (*toh); 15058 bcopy((uchar_t *)ipp->ipp_hopopts + tcp->tcp_label_len, optptr, 15059 ipp->ipp_hopoptslen - tcp->tcp_label_len); 15060 optptr += ipp->ipp_hopoptslen - tcp->tcp_label_len; 15061 ASSERT(OK_32PTR(optptr)); 15062 /* Save as last value */ 15063 ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, 15064 (ipp->ipp_fields & IPPF_HOPOPTS), 15065 ipp->ipp_hopopts, ipp->ipp_hopoptslen); 15066 } 15067 if (addflag & TCP_IPV6_RECVRTDSTOPTS) { 15068 toh = (struct T_opthdr *)optptr; 15069 toh->level = IPPROTO_IPV6; 15070 toh->name = IPV6_RTHDRDSTOPTS; 15071 toh->len = sizeof (*toh) + ipp->ipp_rtdstoptslen; 15072 toh->status = 0; 15073 optptr += sizeof (*toh); 15074 bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen); 15075 optptr += ipp->ipp_rtdstoptslen; 15076 ASSERT(OK_32PTR(optptr)); 15077 /* Save as last value */ 15078 ip_savebuf((void **)&tcp->tcp_rtdstopts, 15079 &tcp->tcp_rtdstoptslen, 15080 (ipp->ipp_fields & IPPF_RTDSTOPTS), 15081 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); 15082 } 15083 if (addflag & TCP_IPV6_RECVRTHDR) { 15084 toh = (struct T_opthdr *)optptr; 15085 toh->level = IPPROTO_IPV6; 15086 toh->name = IPV6_RTHDR; 15087 toh->len = sizeof (*toh) + ipp->ipp_rthdrlen; 15088 toh->status = 0; 15089 optptr += sizeof (*toh); 15090 bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen); 15091 optptr += ipp->ipp_rthdrlen; 15092 ASSERT(OK_32PTR(optptr)); 15093 /* Save as last value */ 15094 ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, 15095 (ipp->ipp_fields & IPPF_RTHDR), 15096 ipp->ipp_rthdr, ipp->ipp_rthdrlen); 15097 } 15098 if (addflag & (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) { 15099 toh = (struct T_opthdr *)optptr; 15100 toh->level = IPPROTO_IPV6; 15101 toh->name = IPV6_DSTOPTS; 15102 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen; 15103 toh->status = 0; 15104 optptr += sizeof (*toh); 15105 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen); 15106 optptr += ipp->ipp_dstoptslen; 15107 ASSERT(OK_32PTR(optptr)); 15108 /* Save as last value */ 15109 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, 15110 (ipp->ipp_fields & IPPF_DSTOPTS), 15111 ipp->ipp_dstopts, ipp->ipp_dstoptslen); 15112 } 15113 ASSERT(optptr == mp->b_wptr); 15114 return (mp); 15115 } 15116 15117 15118 /* 15119 * Handle a *T_BIND_REQ that has failed either due to a T_ERROR_ACK 15120 * or a "bad" IRE detected by tcp_adapt_ire. 15121 * We can't tell if the failure was due to the laddr or the faddr 15122 * thus we clear out all addresses and ports. 
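 * The incoming mblk is reused: a T_BIND_ACK is rewritten in place
 * into a T_ERROR_ACK (TLI_error = TSYSERR, UNIX_error = the errno
 * supplied by the caller) before being passed upstream, with
 * ERROR_prim set to T_CONN_REQ for an active connect that had
 * already reached SYN_SENT and to O_T_BIND_REQ otherwise.  For
 * example, a failed active connect would typically surface to the
 * application as a T_ERROR_ACK/T_CONN_REQ carrying ENETUNREACH.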
15123 */ 15124 static void 15125 tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error) 15126 { 15127 queue_t *q = tcp->tcp_rq; 15128 tcph_t *tcph; 15129 struct T_error_ack *tea; 15130 conn_t *connp = tcp->tcp_connp; 15131 15132 15133 ASSERT(mp->b_datap->db_type == M_PCPROTO); 15134 15135 if (mp->b_cont) { 15136 freemsg(mp->b_cont); 15137 mp->b_cont = NULL; 15138 } 15139 tea = (struct T_error_ack *)mp->b_rptr; 15140 switch (tea->PRIM_type) { 15141 case T_BIND_ACK: 15142 /* 15143 * Need to unbind with classifier since we were just told that 15144 * our bind succeeded. 15145 */ 15146 tcp->tcp_hard_bound = B_FALSE; 15147 tcp->tcp_hard_binding = B_FALSE; 15148 15149 ipcl_hash_remove(connp); 15150 /* Reuse the mblk if possible */ 15151 ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >= 15152 sizeof (*tea)); 15153 mp->b_rptr = mp->b_datap->db_base; 15154 mp->b_wptr = mp->b_rptr + sizeof (*tea); 15155 tea = (struct T_error_ack *)mp->b_rptr; 15156 tea->PRIM_type = T_ERROR_ACK; 15157 tea->TLI_error = TSYSERR; 15158 tea->UNIX_error = error; 15159 if (tcp->tcp_state >= TCPS_SYN_SENT) { 15160 tea->ERROR_prim = T_CONN_REQ; 15161 } else { 15162 tea->ERROR_prim = O_T_BIND_REQ; 15163 } 15164 break; 15165 15166 case T_ERROR_ACK: 15167 if (tcp->tcp_state >= TCPS_SYN_SENT) 15168 tea->ERROR_prim = T_CONN_REQ; 15169 break; 15170 default: 15171 panic("tcp_bind_failed: unexpected TPI type"); 15172 /*NOTREACHED*/ 15173 } 15174 15175 tcp->tcp_state = TCPS_IDLE; 15176 if (tcp->tcp_ipversion == IPV4_VERSION) 15177 tcp->tcp_ipha->ipha_src = 0; 15178 else 15179 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); 15180 /* 15181 * Copy of the src addr. in tcp_t is needed since 15182 * the lookup funcs. can only look at tcp_t 15183 */ 15184 V6_SET_ZERO(tcp->tcp_ip_src_v6); 15185 15186 tcph = tcp->tcp_tcph; 15187 tcph->th_lport[0] = 0; 15188 tcph->th_lport[1] = 0; 15189 tcp_bind_hash_remove(tcp); 15190 bzero(&connp->u_port, sizeof (connp->u_port)); 15191 /* blow away saved option results if any */ 15192 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 15193 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 15194 15195 conn_delete_ire(tcp->tcp_connp, NULL); 15196 putnext(q, mp); 15197 } 15198 15199 /* 15200 * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA 15201 * messages. 15202 */ 15203 void 15204 tcp_rput_other(tcp_t *tcp, mblk_t *mp) 15205 { 15206 mblk_t *mp1; 15207 uchar_t *rptr = mp->b_rptr; 15208 queue_t *q = tcp->tcp_rq; 15209 struct T_error_ack *tea; 15210 uint32_t mss; 15211 mblk_t *syn_mp; 15212 mblk_t *mdti; 15213 int retval; 15214 mblk_t *ire_mp; 15215 15216 switch (mp->b_datap->db_type) { 15217 case M_PROTO: 15218 case M_PCPROTO: 15219 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 15220 if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) 15221 break; 15222 tea = (struct T_error_ack *)rptr; 15223 switch (tea->PRIM_type) { 15224 case T_BIND_ACK: 15225 /* 15226 * Adapt Multidata information, if any. The 15227 * following tcp_mdt_update routine will free 15228 * the message. 
15229 */ 15230 if ((mdti = tcp_mdt_info_mp(mp)) != NULL) { 15231 tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti-> 15232 b_rptr)->mdt_capab, B_TRUE); 15233 freemsg(mdti); 15234 } 15235 15236 /* Get the IRE, if we had requested for it */ 15237 ire_mp = tcp_ire_mp(mp); 15238 15239 if (tcp->tcp_hard_binding) { 15240 tcp->tcp_hard_binding = B_FALSE; 15241 tcp->tcp_hard_bound = B_TRUE; 15242 CL_INET_CONNECT(tcp); 15243 } else { 15244 if (ire_mp != NULL) 15245 freeb(ire_mp); 15246 goto after_syn_sent; 15247 } 15248 15249 retval = tcp_adapt_ire(tcp, ire_mp); 15250 if (ire_mp != NULL) 15251 freeb(ire_mp); 15252 if (retval == 0) { 15253 tcp_bind_failed(tcp, mp, 15254 (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? 15255 ENETUNREACH : EADDRNOTAVAIL)); 15256 return; 15257 } 15258 /* 15259 * Don't let an endpoint connect to itself. 15260 * Also checked in tcp_connect() but that 15261 * check can't handle the case when the 15262 * local IP address is INADDR_ANY. 15263 */ 15264 if (tcp->tcp_ipversion == IPV4_VERSION) { 15265 if ((tcp->tcp_ipha->ipha_dst == 15266 tcp->tcp_ipha->ipha_src) && 15267 (BE16_EQL(tcp->tcp_tcph->th_lport, 15268 tcp->tcp_tcph->th_fport))) { 15269 tcp_bind_failed(tcp, mp, EADDRNOTAVAIL); 15270 return; 15271 } 15272 } else { 15273 if (IN6_ARE_ADDR_EQUAL( 15274 &tcp->tcp_ip6h->ip6_dst, 15275 &tcp->tcp_ip6h->ip6_src) && 15276 (BE16_EQL(tcp->tcp_tcph->th_lport, 15277 tcp->tcp_tcph->th_fport))) { 15278 tcp_bind_failed(tcp, mp, EADDRNOTAVAIL); 15279 return; 15280 } 15281 } 15282 ASSERT(tcp->tcp_state == TCPS_SYN_SENT); 15283 /* 15284 * This should not be possible! Just for 15285 * defensive coding... 15286 */ 15287 if (tcp->tcp_state != TCPS_SYN_SENT) 15288 goto after_syn_sent; 15289 15290 if (is_system_labeled() && 15291 !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) { 15292 tcp_bind_failed(tcp, mp, EHOSTUNREACH); 15293 return; 15294 } 15295 15296 ASSERT(q == tcp->tcp_rq); 15297 /* 15298 * tcp_adapt_ire() does not adjust 15299 * for TCP/IP header length. 15300 */ 15301 mss = tcp->tcp_mss - tcp->tcp_hdr_len; 15302 15303 /* 15304 * Just make sure our rwnd is at 15305 * least tcp_recv_hiwat_mss * MSS 15306 * large, and round up to the nearest 15307 * MSS. 15308 * 15309 * We do the round up here because 15310 * we need to get the interface 15311 * MTU first before we can do the 15312 * round up. 15313 */ 15314 tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 15315 tcp_recv_hiwat_minmss * mss); 15316 q->q_hiwat = tcp->tcp_rwnd; 15317 tcp_set_ws_value(tcp); 15318 U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), 15319 tcp->tcp_tcph->th_win); 15320 if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always) 15321 tcp->tcp_snd_ws_ok = B_TRUE; 15322 15323 /* 15324 * Set tcp_snd_ts_ok to true 15325 * so that tcp_xmit_mp will 15326 * include the timestamp 15327 * option in the SYN segment. 15328 */ 15329 if (tcp_tstamp_always || 15330 (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) { 15331 tcp->tcp_snd_ts_ok = B_TRUE; 15332 } 15333 15334 /* 15335 * tcp_snd_sack_ok can be set in 15336 * tcp_adapt_ire() if the sack metric 15337 * is set. So check it here also. 15338 */ 15339 if (tcp_sack_permitted == 2 || 15340 tcp->tcp_snd_sack_ok) { 15341 if (tcp->tcp_sack_info == NULL) { 15342 tcp->tcp_sack_info = 15343 kmem_cache_alloc(tcp_sack_info_cache, 15344 KM_SLEEP); 15345 } 15346 tcp->tcp_snd_sack_ok = B_TRUE; 15347 } 15348 15349 /* 15350 * Should we use ECN? Note that the current 15351 * default value (SunOS 5.9) of tcp_ecn_permitted 15352 * is 1. 
The reason for doing this is that there 15353 * are equipments out there that will drop ECN 15354 * enabled IP packets. Setting it to 1 avoids 15355 * compatibility problems. 15356 */ 15357 if (tcp_ecn_permitted == 2) 15358 tcp->tcp_ecn_ok = B_TRUE; 15359 15360 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 15361 syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 15362 tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 15363 if (syn_mp) { 15364 cred_t *cr; 15365 pid_t pid; 15366 15367 /* 15368 * Obtain the credential from the 15369 * thread calling connect(); the credential 15370 * lives on in the second mblk which 15371 * originated from T_CONN_REQ and is echoed 15372 * with the T_BIND_ACK from ip. If none 15373 * can be found, default to the creator 15374 * of the socket. 15375 */ 15376 if (mp->b_cont == NULL || 15377 (cr = DB_CRED(mp->b_cont)) == NULL) { 15378 cr = tcp->tcp_cred; 15379 pid = tcp->tcp_cpid; 15380 } else { 15381 pid = DB_CPID(mp->b_cont); 15382 } 15383 15384 TCP_RECORD_TRACE(tcp, syn_mp, 15385 TCP_TRACE_SEND_PKT); 15386 mblk_setcred(syn_mp, cr); 15387 DB_CPID(syn_mp) = pid; 15388 tcp_send_data(tcp, tcp->tcp_wq, syn_mp); 15389 } 15390 after_syn_sent: 15391 /* 15392 * A trailer mblk indicates a waiting client upstream. 15393 * We complete here the processing begun in 15394 * either tcp_bind() or tcp_connect() by passing 15395 * upstream the reply message they supplied. 15396 */ 15397 mp1 = mp; 15398 mp = mp->b_cont; 15399 freeb(mp1); 15400 if (mp) 15401 break; 15402 return; 15403 case T_ERROR_ACK: 15404 if (tcp->tcp_debug) { 15405 (void) strlog(TCP_MOD_ID, 0, 1, 15406 SL_TRACE|SL_ERROR, 15407 "tcp_rput_other: case T_ERROR_ACK, " 15408 "ERROR_prim == %d", 15409 tea->ERROR_prim); 15410 } 15411 switch (tea->ERROR_prim) { 15412 case O_T_BIND_REQ: 15413 case T_BIND_REQ: 15414 tcp_bind_failed(tcp, mp, 15415 (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? 15416 ENETUNREACH : EADDRNOTAVAIL)); 15417 return; 15418 case T_UNBIND_REQ: 15419 tcp->tcp_hard_binding = B_FALSE; 15420 tcp->tcp_hard_bound = B_FALSE; 15421 if (mp->b_cont) { 15422 freemsg(mp->b_cont); 15423 mp->b_cont = NULL; 15424 } 15425 if (tcp->tcp_unbind_pending) 15426 tcp->tcp_unbind_pending = 0; 15427 else { 15428 /* From tcp_ip_unbind() - free */ 15429 freemsg(mp); 15430 return; 15431 } 15432 break; 15433 case T_SVR4_OPTMGMT_REQ: 15434 if (tcp->tcp_drop_opt_ack_cnt > 0) { 15435 /* T_OPTMGMT_REQ generated by TCP */ 15436 printf("T_SVR4_OPTMGMT_REQ failed " 15437 "%d/%d - dropped (cnt %d)\n", 15438 tea->TLI_error, tea->UNIX_error, 15439 tcp->tcp_drop_opt_ack_cnt); 15440 freemsg(mp); 15441 tcp->tcp_drop_opt_ack_cnt--; 15442 return; 15443 } 15444 break; 15445 } 15446 if (tea->ERROR_prim == T_SVR4_OPTMGMT_REQ && 15447 tcp->tcp_drop_opt_ack_cnt > 0) { 15448 printf("T_SVR4_OPTMGMT_REQ failed %d/%d " 15449 "- dropped (cnt %d)\n", 15450 tea->TLI_error, tea->UNIX_error, 15451 tcp->tcp_drop_opt_ack_cnt); 15452 freemsg(mp); 15453 tcp->tcp_drop_opt_ack_cnt--; 15454 return; 15455 } 15456 break; 15457 case T_OPTMGMT_ACK: 15458 if (tcp->tcp_drop_opt_ack_cnt > 0) { 15459 /* T_OPTMGMT_REQ generated by TCP */ 15460 freemsg(mp); 15461 tcp->tcp_drop_opt_ack_cnt--; 15462 return; 15463 } 15464 break; 15465 default: 15466 break; 15467 } 15468 break; 15469 case M_CTL: 15470 /* 15471 * ICMP messages. 15472 */ 15473 tcp_icmp_error(tcp, mp); 15474 return; 15475 case M_FLUSH: 15476 if (*rptr & FLUSHR) 15477 flushq(q, FLUSHDATA); 15478 break; 15479 default: 15480 break; 15481 } 15482 /* 15483 * Make sure we set this bit before sending the ACK for 15484 * bind. 
Otherwise accept could possibly run and free 15485 * this tcp struct. 15486 */ 15487 putnext(q, mp); 15488 } 15489 15490 /* 15491 * Called as the result of a qbufcall or a qtimeout to remedy a failure 15492 * to allocate a T_ordrel_ind in tcp_rsrv(). qenable(q) will make 15493 * tcp_rsrv() try again. 15494 */ 15495 static void 15496 tcp_ordrel_kick(void *arg) 15497 { 15498 conn_t *connp = (conn_t *)arg; 15499 tcp_t *tcp = connp->conn_tcp; 15500 15501 tcp->tcp_ordrelid = 0; 15502 tcp->tcp_timeout = B_FALSE; 15503 if (!TCP_IS_DETACHED(tcp) && tcp->tcp_rq != NULL && 15504 tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 15505 qenable(tcp->tcp_rq); 15506 } 15507 } 15508 15509 /* ARGSUSED */ 15510 static void 15511 tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) 15512 { 15513 conn_t *connp = (conn_t *)arg; 15514 tcp_t *tcp = connp->conn_tcp; 15515 queue_t *q = tcp->tcp_rq; 15516 uint_t thwin; 15517 15518 freeb(mp); 15519 15520 TCP_STAT(tcp_rsrv_calls); 15521 15522 if (TCP_IS_DETACHED(tcp) || q == NULL) { 15523 return; 15524 } 15525 15526 if (tcp->tcp_fused) { 15527 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 15528 15529 ASSERT(tcp->tcp_fused); 15530 ASSERT(peer_tcp != NULL && peer_tcp->tcp_fused); 15531 ASSERT(peer_tcp->tcp_loopback_peer == tcp); 15532 ASSERT(!TCP_IS_DETACHED(tcp)); 15533 ASSERT(tcp->tcp_connp->conn_sqp == 15534 peer_tcp->tcp_connp->conn_sqp); 15535 15536 /* 15537 * Normally we would not get backenabled in synchronous 15538 * streams mode, but in case this happens, we need to plug 15539 * synchronous streams during our drain to prevent a race 15540 * with tcp_fuse_rrw() or tcp_fuse_rinfop(). 15541 */ 15542 TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); 15543 if (tcp->tcp_rcv_list != NULL) 15544 (void) tcp_rcv_drain(tcp->tcp_rq, tcp); 15545 15546 tcp_clrqfull(peer_tcp); 15547 TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); 15548 TCP_STAT(tcp_fusion_backenabled); 15549 return; 15550 } 15551 15552 if (canputnext(q)) { 15553 tcp->tcp_rwnd = q->q_hiwat; 15554 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 15555 << tcp->tcp_rcv_ws; 15556 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 15557 /* 15558 * Send back a window update immediately if TCP is above 15559 * ESTABLISHED state and the increase of the rcv window 15560 * that the other side knows is at least 1 MSS after flow 15561 * control is lifted. 15562 */ 15563 if (tcp->tcp_state >= TCPS_ESTABLISHED && 15564 (q->q_hiwat - thwin >= tcp->tcp_mss)) { 15565 tcp_xmit_ctl(NULL, tcp, 15566 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 15567 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 15568 BUMP_MIB(&tcp_mib, tcpOutWinUpdate); 15569 } 15570 } 15571 /* Handle a failure to allocate a T_ORDREL_IND here */ 15572 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 15573 ASSERT(tcp->tcp_listener == NULL); 15574 if (tcp->tcp_rcv_list != NULL) { 15575 (void) tcp_rcv_drain(q, tcp); 15576 } 15577 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 15578 mp = mi_tpi_ordrel_ind(); 15579 if (mp) { 15580 tcp->tcp_ordrel_done = B_TRUE; 15581 putnext(q, mp); 15582 if (tcp->tcp_deferred_clean_death) { 15583 /* 15584 * tcp_clean_death was deferred for 15585 * T_ORDREL_IND - do it now 15586 */ 15587 tcp->tcp_deferred_clean_death = B_FALSE; 15588 (void) tcp_clean_death(tcp, 15589 tcp->tcp_client_errno, 22); 15590 } 15591 } else if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) { 15592 /* 15593 * If there isn't already a timer running 15594 * start one. Use a 4 second 15595 * timer as a fallback since it can't fail. 
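 * The callback, tcp_ordrel_kick() above, just clears tcp_timeout
 * and qenables the read queue so that tcp_rsrv() runs again and
 * retries the T_ORDREL_IND allocation.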
15596 */
15597 tcp->tcp_timeout = B_TRUE;
15598 tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick,
15599 MSEC_TO_TICK(4000));
15600 }
15601 }
15602 }
15603
15604 /*
15605 * The read side service routine is called mostly when we get back-enabled as a
15606 * result of flow control relief. Since we don't actually queue anything in
15607 * TCP, we have no data to send out of here. What we do is reopen the receive
15608 * window and send out a window update.
15609 * This routine is also called to drive an orderly release message upstream
15610 * if the attempt in tcp_rput failed.
15611 */
15612 static void
15613 tcp_rsrv(queue_t *q)
15614 {
15615 conn_t *connp = Q_TO_CONN(q);
15616 tcp_t *tcp = connp->conn_tcp;
15617 mblk_t *mp;
15618
15619 /* No code does a putq on the read side */
15620 ASSERT(q->q_first == NULL);
15621
15622 /* Nothing to do for the default queue */
15623 if (q == tcp_g_q) {
15624 return;
15625 }
15626
15627 mp = allocb(0, BPRI_HI);
15628 if (mp == NULL) {
15629 /*
15630 * We are under memory pressure. Return for now and
15631 * we will be called again later.
15632 */
15633 if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) {
15634 /*
15635 * If there isn't already a timer running,
15636 * start one. Use a 4 second
15637 * timer as a fallback since it can't fail.
15638 */
15639 tcp->tcp_timeout = B_TRUE;
15640 tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick,
15641 MSEC_TO_TICK(4000));
15642 }
15643 return;
15644 }
15645 CONN_INC_REF(connp);
15646 squeue_enter(connp->conn_sqp, mp, tcp_rsrv_input, connp,
15647 SQTAG_TCP_RSRV);
15648 }
15649
15650 /*
15651 * tcp_rwnd_set() is called to adjust the receive window to a desired value.
15652 * We do not allow the receive window to shrink. After setting rwnd,
15653 * set the flow control hiwat of the stream.
15654 *
15655 * This function is called in 2 cases:
15656 *
15657 * 1) Before data transfer begins, in tcp_accept_comm() for accepting a
15658 * connection (passive open) and in tcp_rput_data() for active connect.
15659 * This is called after tcp_mss_set() when the desired MSS value is known.
15660 * This makes sure that our window size is a multiple of the other side's
15661 * MSS.
15662 * 2) Handling SO_RCVBUF option.
15663 *
15664 * It is ASSUMED that the requested size is a multiple of the current MSS.
15665 *
15666 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the
15667 * user requests it.
15668 */
15669 static int
15670 tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd)
15671 {
15672 uint32_t mss = tcp->tcp_mss;
15673 uint32_t old_max_rwnd;
15674 uint32_t max_transmittable_rwnd;
15675 boolean_t tcp_detached = TCP_IS_DETACHED(tcp);
15676
15677 if (tcp->tcp_fused) {
15678 size_t sth_hiwat;
15679 tcp_t *peer_tcp = tcp->tcp_loopback_peer;
15680
15681 ASSERT(peer_tcp != NULL);
15682 /*
15683 * Record the stream head's high water mark for
15684 * this endpoint; this is used for flow-control
15685 * purposes in tcp_fuse_output().
15686 */
15687 sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd);
15688 if (!tcp_detached)
15689 (void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat);
15690
15691 /*
15692 * In the fusion case, the maxpsz stream head value of
15693 * our peer is set according to its send buffer size
15694 * and our receive buffer size; since the latter may
15695 * have changed we need to update the peer's maxpsz.
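 * For the non-fused path below, a rounding illustration with
 * hypothetical values: MSS_ROUNDUP(49152, 1460) rounds up to
 * 34 * 1460 = 49640, and rwnd is never allowed below
 * tcp_recv_hiwat_minmss * mss (e.g. 4 * 1460 = 5840 when that
 * tunable is set to 4).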
15696 */
15697 (void) tcp_maxpsz_set(peer_tcp, B_TRUE);
15698 return (rwnd);
15699 }
15700
15701 if (tcp_detached)
15702 old_max_rwnd = tcp->tcp_rwnd;
15703 else
15704 old_max_rwnd = tcp->tcp_rq->q_hiwat;
15705
15706 /*
15707 * Insist on a receive window that is at least
15708 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid
15709 * funny TCP interactions between the Nagle algorithm, SWS avoidance
15710 * and delayed acknowledgement.
15711 */
15712 rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss);
15713
15714 /*
15715 * If window size info has already been exchanged, TCP should not
15716 * shrink the window. Shrinking the window is doable if done carefully.
15717 * We may add that support later. But so far there is no real
15718 * need to do that.
15719 */
15720 if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) {
15721 /* MSS may have changed, do a round up again. */
15722 rwnd = MSS_ROUNDUP(old_max_rwnd, mss);
15723 }
15724
15725 /*
15726 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check
15727 * can be applied even before the window scale option is decided.
15728 */
15729 max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws;
15730 if (rwnd > max_transmittable_rwnd) {
15731 rwnd = max_transmittable_rwnd -
15732 (max_transmittable_rwnd % mss);
15733 if (rwnd < mss)
15734 rwnd = max_transmittable_rwnd;
15735 /*
15736 * If we're over the limit we may have to back down tcp_rwnd.
15737 * The increment below won't work for us. So we set all three
15738 * here and the increment below will have no effect.
15739 */
15740 tcp->tcp_rwnd = old_max_rwnd = rwnd;
15741 }
15742 if (tcp->tcp_localnet) {
15743 tcp->tcp_rack_abs_max =
15744 MIN(tcp_local_dacks_max, rwnd / mss / 2);
15745 } else {
15746 /*
15747 * For a remote host on a different subnet (through a router),
15748 * we ack every other packet to conform to RFC 1122.
15749 * tcp_deferred_acks_max defaults to 2.
15750 */
15751 tcp->tcp_rack_abs_max =
15752 MIN(tcp_deferred_acks_max, rwnd / mss / 2);
15753 }
15754 if (tcp->tcp_rack_cur_max > tcp->tcp_rack_abs_max)
15755 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max;
15756 else
15757 tcp->tcp_rack_cur_max = 0;
15758 /*
15759 * Increment the current rwnd by the amount the maximum grew (we
15760 * cannot overwrite it since we might be in the middle of a
15761 * connection).
15762 */
15763 tcp->tcp_rwnd += rwnd - old_max_rwnd;
15764 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win);
15765 if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max)
15766 tcp->tcp_cwnd_max = rwnd;
15767
15768 if (tcp_detached)
15769 return (rwnd);
15770 /*
15771 * We set the maximum receive window into rq->q_hiwat.
15772 * This is not actually used for flow control.
15773 */
15774 tcp->tcp_rq->q_hiwat = rwnd;
15775 /*
15776 * Set the Stream head high water mark. This doesn't have to be
15777 * here, since we are simply using default values, but we would
15778 * prefer to choose these values algorithmically, with a likely
15779 * relationship to rwnd.
15780 */
15781 (void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd, tcp_sth_rcv_hiwat));
15782 return (rwnd);
15783 }
15784
15785 /*
15786 * Return SNMP stuff in buffer in mpdata.
15787 */ 15788 int 15789 tcp_snmp_get(queue_t *q, mblk_t *mpctl) 15790 { 15791 mblk_t *mpdata; 15792 mblk_t *mp_conn_ctl = NULL; 15793 mblk_t *mp_conn_tail; 15794 mblk_t *mp_attr_ctl = NULL; 15795 mblk_t *mp_attr_tail; 15796 mblk_t *mp6_conn_ctl = NULL; 15797 mblk_t *mp6_conn_tail; 15798 mblk_t *mp6_attr_ctl = NULL; 15799 mblk_t *mp6_attr_tail; 15800 struct opthdr *optp; 15801 mib2_tcpConnEntry_t tce; 15802 mib2_tcp6ConnEntry_t tce6; 15803 mib2_transportMLPEntry_t mlp; 15804 connf_t *connfp; 15805 conn_t *connp; 15806 int i; 15807 boolean_t ispriv; 15808 zoneid_t zoneid; 15809 int v4_conn_idx; 15810 int v6_conn_idx; 15811 15812 if (mpctl == NULL || 15813 (mpdata = mpctl->b_cont) == NULL || 15814 (mp_conn_ctl = copymsg(mpctl)) == NULL || 15815 (mp_attr_ctl = copymsg(mpctl)) == NULL || 15816 (mp6_conn_ctl = copymsg(mpctl)) == NULL || 15817 (mp6_attr_ctl = copymsg(mpctl)) == NULL) { 15818 freemsg(mp_conn_ctl); 15819 freemsg(mp_attr_ctl); 15820 freemsg(mp6_conn_ctl); 15821 freemsg(mp6_attr_ctl); 15822 return (0); 15823 } 15824 15825 /* build table of connections -- need count in fixed part */ 15826 SET_MIB(tcp_mib.tcpRtoAlgorithm, 4); /* vanj */ 15827 SET_MIB(tcp_mib.tcpRtoMin, tcp_rexmit_interval_min); 15828 SET_MIB(tcp_mib.tcpRtoMax, tcp_rexmit_interval_max); 15829 SET_MIB(tcp_mib.tcpMaxConn, -1); 15830 SET_MIB(tcp_mib.tcpCurrEstab, 0); 15831 15832 ispriv = 15833 secpolicy_net_config((Q_TO_CONN(q))->conn_cred, B_TRUE) == 0; 15834 zoneid = Q_TO_CONN(q)->conn_zoneid; 15835 15836 v4_conn_idx = v6_conn_idx = 0; 15837 mp_conn_tail = mp_attr_tail = mp6_conn_tail = mp6_attr_tail = NULL; 15838 15839 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 15840 15841 connfp = &ipcl_globalhash_fanout[i]; 15842 15843 connp = NULL; 15844 15845 while ((connp = 15846 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 15847 tcp_t *tcp; 15848 boolean_t needattr; 15849 15850 if (connp->conn_zoneid != zoneid) 15851 continue; /* not in this zone */ 15852 15853 tcp = connp->conn_tcp; 15854 UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs); 15855 tcp->tcp_ibsegs = 0; 15856 UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs); 15857 tcp->tcp_obsegs = 0; 15858 15859 tce6.tcp6ConnState = tce.tcpConnState = 15860 tcp_snmp_state(tcp); 15861 if (tce.tcpConnState == MIB2_TCP_established || 15862 tce.tcpConnState == MIB2_TCP_closeWait) 15863 BUMP_MIB(&tcp_mib, tcpCurrEstab); 15864 15865 needattr = B_FALSE; 15866 bzero(&mlp, sizeof (mlp)); 15867 if (connp->conn_mlp_type != mlptSingle) { 15868 if (connp->conn_mlp_type == mlptShared || 15869 connp->conn_mlp_type == mlptBoth) 15870 mlp.tme_flags |= MIB2_TMEF_SHARED; 15871 if (connp->conn_mlp_type == mlptPrivate || 15872 connp->conn_mlp_type == mlptBoth) 15873 mlp.tme_flags |= MIB2_TMEF_PRIVATE; 15874 needattr = B_TRUE; 15875 } 15876 if (connp->conn_peercred != NULL) { 15877 ts_label_t *tsl; 15878 15879 tsl = crgetlabel(connp->conn_peercred); 15880 mlp.tme_doi = label2doi(tsl); 15881 mlp.tme_label = *label2bslabel(tsl); 15882 needattr = B_TRUE; 15883 } 15884 15885 /* Create a message to report on IPv6 entries */ 15886 if (tcp->tcp_ipversion == IPV6_VERSION) { 15887 tce6.tcp6ConnLocalAddress = tcp->tcp_ip_src_v6; 15888 tce6.tcp6ConnRemAddress = tcp->tcp_remote_v6; 15889 tce6.tcp6ConnLocalPort = ntohs(tcp->tcp_lport); 15890 tce6.tcp6ConnRemPort = ntohs(tcp->tcp_fport); 15891 tce6.tcp6ConnIfIndex = tcp->tcp_bound_if; 15892 /* Don't want just anybody seeing these... 
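 * For unprivileged callers (ispriv == B_FALSE) the raw sequence
 * numbers are hidden: ce_snxt is reported as snxt - suna (bytes
 * outstanding in the send direction), ce_rnxt as rnxt - rack
 * (bytes received but not yet acknowledged), and ce_suna/ce_rack
 * as 0, so netstat can still derive queue sizes without exposing
 * the actual sequence numbers.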
*/ 15893 if (ispriv) { 15894 tce6.tcp6ConnEntryInfo.ce_snxt = 15895 tcp->tcp_snxt; 15896 tce6.tcp6ConnEntryInfo.ce_suna = 15897 tcp->tcp_suna; 15898 tce6.tcp6ConnEntryInfo.ce_rnxt = 15899 tcp->tcp_rnxt; 15900 tce6.tcp6ConnEntryInfo.ce_rack = 15901 tcp->tcp_rack; 15902 } else { 15903 /* 15904 * Netstat, unfortunately, uses this to 15905 * get send/receive queue sizes. How to fix? 15906 * Why not compute the difference only? 15907 */ 15908 tce6.tcp6ConnEntryInfo.ce_snxt = 15909 tcp->tcp_snxt - tcp->tcp_suna; 15910 tce6.tcp6ConnEntryInfo.ce_suna = 0; 15911 tce6.tcp6ConnEntryInfo.ce_rnxt = 15912 tcp->tcp_rnxt - tcp->tcp_rack; 15913 tce6.tcp6ConnEntryInfo.ce_rack = 0; 15914 } 15915 15916 tce6.tcp6ConnEntryInfo.ce_swnd = tcp->tcp_swnd; 15917 tce6.tcp6ConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; 15918 tce6.tcp6ConnEntryInfo.ce_rto = tcp->tcp_rto; 15919 tce6.tcp6ConnEntryInfo.ce_mss = tcp->tcp_mss; 15920 tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; 15921 15922 (void) snmp_append_data2(mp6_conn_ctl->b_cont, 15923 &mp6_conn_tail, (char *)&tce6, sizeof (tce6)); 15924 15925 mlp.tme_connidx = v6_conn_idx++; 15926 if (needattr) 15927 (void) snmp_append_data2(mp6_attr_ctl->b_cont, 15928 &mp6_attr_tail, (char *)&mlp, sizeof (mlp)); 15929 } 15930 /* 15931 * Create an IPv4 table entry for IPv4 entries and also 15932 * for IPv6 entries which are bound to in6addr_any 15933 * but don't have IPV6_V6ONLY set. 15934 * (i.e. anything an IPv4 peer could connect to) 15935 */ 15936 if (tcp->tcp_ipversion == IPV4_VERSION || 15937 (tcp->tcp_state <= TCPS_LISTEN && 15938 !tcp->tcp_connp->conn_ipv6_v6only && 15939 IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6))) { 15940 if (tcp->tcp_ipversion == IPV6_VERSION) { 15941 tce.tcpConnRemAddress = INADDR_ANY; 15942 tce.tcpConnLocalAddress = INADDR_ANY; 15943 } else { 15944 tce.tcpConnRemAddress = 15945 tcp->tcp_remote; 15946 tce.tcpConnLocalAddress = 15947 tcp->tcp_ip_src; 15948 } 15949 tce.tcpConnLocalPort = ntohs(tcp->tcp_lport); 15950 tce.tcpConnRemPort = ntohs(tcp->tcp_fport); 15951 /* Don't want just anybody seeing these... */ 15952 if (ispriv) { 15953 tce.tcpConnEntryInfo.ce_snxt = 15954 tcp->tcp_snxt; 15955 tce.tcpConnEntryInfo.ce_suna = 15956 tcp->tcp_suna; 15957 tce.tcpConnEntryInfo.ce_rnxt = 15958 tcp->tcp_rnxt; 15959 tce.tcpConnEntryInfo.ce_rack = 15960 tcp->tcp_rack; 15961 } else { 15962 /* 15963 * Netstat, unfortunately, uses this to 15964 * get send/receive queue sizes. How 15965 * to fix? 15966 * Why not compute the difference only? 
15967 */ 15968 tce.tcpConnEntryInfo.ce_snxt = 15969 tcp->tcp_snxt - tcp->tcp_suna; 15970 tce.tcpConnEntryInfo.ce_suna = 0; 15971 tce.tcpConnEntryInfo.ce_rnxt = 15972 tcp->tcp_rnxt - tcp->tcp_rack; 15973 tce.tcpConnEntryInfo.ce_rack = 0; 15974 } 15975 15976 tce.tcpConnEntryInfo.ce_swnd = tcp->tcp_swnd; 15977 tce.tcpConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; 15978 tce.tcpConnEntryInfo.ce_rto = tcp->tcp_rto; 15979 tce.tcpConnEntryInfo.ce_mss = tcp->tcp_mss; 15980 tce.tcpConnEntryInfo.ce_state = 15981 tcp->tcp_state; 15982 15983 (void) snmp_append_data2(mp_conn_ctl->b_cont, 15984 &mp_conn_tail, (char *)&tce, sizeof (tce)); 15985 15986 mlp.tme_connidx = v4_conn_idx++; 15987 if (needattr) 15988 (void) snmp_append_data2( 15989 mp_attr_ctl->b_cont, 15990 &mp_attr_tail, (char *)&mlp, 15991 sizeof (mlp)); 15992 } 15993 } 15994 } 15995 15996 /* fixed length structure for IPv4 and IPv6 counters */ 15997 SET_MIB(tcp_mib.tcpConnTableSize, sizeof (mib2_tcpConnEntry_t)); 15998 SET_MIB(tcp_mib.tcp6ConnTableSize, sizeof (mib2_tcp6ConnEntry_t)); 15999 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 16000 optp->level = MIB2_TCP; 16001 optp->name = 0; 16002 (void) snmp_append_data(mpdata, (char *)&tcp_mib, sizeof (tcp_mib)); 16003 optp->len = msgdsize(mpdata); 16004 qreply(q, mpctl); 16005 16006 /* table of connections... */ 16007 optp = (struct opthdr *)&mp_conn_ctl->b_rptr[ 16008 sizeof (struct T_optmgmt_ack)]; 16009 optp->level = MIB2_TCP; 16010 optp->name = MIB2_TCP_CONN; 16011 optp->len = msgdsize(mp_conn_ctl->b_cont); 16012 qreply(q, mp_conn_ctl); 16013 16014 /* table of MLP attributes... */ 16015 optp = (struct opthdr *)&mp_attr_ctl->b_rptr[ 16016 sizeof (struct T_optmgmt_ack)]; 16017 optp->level = MIB2_TCP; 16018 optp->name = EXPER_XPORT_MLP; 16019 optp->len = msgdsize(mp_attr_ctl->b_cont); 16020 if (optp->len == 0) 16021 freemsg(mp_attr_ctl); 16022 else 16023 qreply(q, mp_attr_ctl); 16024 16025 /* table of IPv6 connections... */ 16026 optp = (struct opthdr *)&mp6_conn_ctl->b_rptr[ 16027 sizeof (struct T_optmgmt_ack)]; 16028 optp->level = MIB2_TCP6; 16029 optp->name = MIB2_TCP6_CONN; 16030 optp->len = msgdsize(mp6_conn_ctl->b_cont); 16031 qreply(q, mp6_conn_ctl); 16032 16033 /* table of IPv6 MLP attributes... */ 16034 optp = (struct opthdr *)&mp6_attr_ctl->b_rptr[ 16035 sizeof (struct T_optmgmt_ack)]; 16036 optp->level = MIB2_TCP6; 16037 optp->name = EXPER_XPORT_MLP; 16038 optp->len = msgdsize(mp6_attr_ctl->b_cont); 16039 if (optp->len == 0) 16040 freemsg(mp6_attr_ctl); 16041 else 16042 qreply(q, mp6_attr_ctl); 16043 return (1); 16044 } 16045 16046 /* Return 0 if invalid set request, 1 otherwise, including non-tcp requests */ 16047 /* ARGSUSED */ 16048 int 16049 tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) 16050 { 16051 mib2_tcpConnEntry_t *tce = (mib2_tcpConnEntry_t *)ptr; 16052 16053 switch (level) { 16054 case MIB2_TCP: 16055 switch (name) { 16056 case 13: 16057 if (tce->tcpConnState != MIB2_TCP_deleteTCB) 16058 return (0); 16059 /* TODO: delete entry defined by tce */ 16060 return (1); 16061 default: 16062 return (0); 16063 } 16064 default: 16065 return (1); 16066 } 16067 } 16068 16069 /* Translate TCP state to MIB2 TCP state. 
*/
16070 static int
16071 tcp_snmp_state(tcp_t *tcp)
16072 {
16073 if (tcp == NULL)
16074 return (0);
16075
16076 switch (tcp->tcp_state) {
16077 case TCPS_CLOSED:
16078 case TCPS_IDLE: /* RFC1213 doesn't have analogue for IDLE & BOUND */
16079 case TCPS_BOUND:
16080 return (MIB2_TCP_closed);
16081 case TCPS_LISTEN:
16082 return (MIB2_TCP_listen);
16083 case TCPS_SYN_SENT:
16084 return (MIB2_TCP_synSent);
16085 case TCPS_SYN_RCVD:
16086 return (MIB2_TCP_synReceived);
16087 case TCPS_ESTABLISHED:
16088 return (MIB2_TCP_established);
16089 case TCPS_CLOSE_WAIT:
16090 return (MIB2_TCP_closeWait);
16091 case TCPS_FIN_WAIT_1:
16092 return (MIB2_TCP_finWait1);
16093 case TCPS_CLOSING:
16094 return (MIB2_TCP_closing);
16095 case TCPS_LAST_ACK:
16096 return (MIB2_TCP_lastAck);
16097 case TCPS_FIN_WAIT_2:
16098 return (MIB2_TCP_finWait2);
16099 case TCPS_TIME_WAIT:
16100 return (MIB2_TCP_timeWait);
16101 default:
16102 return (0);
16103 }
16104 }
16105
16106 static char tcp_report_header[] =
16107 "TCP " MI_COL_HDRPAD_STR
16108 "zone dest snxt suna "
16109 "swnd rnxt rack rwnd rto mss w sw rw t "
16110 "recent [lport,fport] state";
16111
16112 /*
16113 * TCP status report triggered via the Named Dispatch mechanism.
16114 */
16115 /* ARGSUSED */
16116 static void
16117 tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, tcp_t *thisstream,
16118 cred_t *cr)
16119 {
16120 char hash[10], addrbuf[INET6_ADDRSTRLEN];
16121 boolean_t ispriv = secpolicy_net_config(cr, B_TRUE) == 0;
16122 char cflag;
16123 in6_addr_t v6dst;
16124 char buf[80];
16125 uint_t print_len, buf_len;
16126
16127 buf_len = mp->b_datap->db_lim - mp->b_wptr;
16128 if (buf_len <= 0)
16129 return;
16130
16131 if (hashval >= 0)
16132 (void) sprintf(hash, "%03d ", hashval);
16133 else
16134 hash[0] = '\0';
16135
16136 /*
16137 * Note that we use the remote address in the tcp_b structure.
16138 * This means that it will print out the real destination address,
16139 * not the next hop's address if source routing is used. This
16140 * avoids confusion in the output because the user may not
16141 * know that source routing is used for a connection.
16142 */
16143 if (tcp->tcp_ipversion == IPV4_VERSION) {
16144 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &v6dst);
16145 } else {
16146 v6dst = tcp->tcp_remote_v6;
16147 }
16148 (void) inet_ntop(AF_INET6, &v6dst, addrbuf, sizeof (addrbuf));
16149 /*
16150 * The ispriv checks are so that normal users cannot determine
16151 * sequence number information using NDD.
16152 */
16153
16154 if (TCP_IS_DETACHED(tcp))
16155 cflag = '*';
16156 else
16157 cflag = ' ';
16158 print_len = snprintf((char *)mp->b_wptr, buf_len,
16159 "%s " MI_COL_PTRFMT_STR "%d %s %08x %08x %010d %08x %08x "
16160 "%010d %05ld %05d %1d %02d %02d %1d %08x %s%c\n",
16161 hash,
16162 (void *)tcp,
16163 tcp->tcp_connp->conn_zoneid,
16164 addrbuf,
16165 (ispriv) ? tcp->tcp_snxt : 0,
16166 (ispriv) ? tcp->tcp_suna : 0,
16167 tcp->tcp_swnd,
16168 (ispriv) ? tcp->tcp_rnxt : 0,
16169 (ispriv) ? tcp->tcp_rack : 0,
16170 tcp->tcp_rwnd,
16171 tcp->tcp_rto,
16172 tcp->tcp_mss,
16173 tcp->tcp_snd_ws_ok,
16174 tcp->tcp_snd_ws,
16175 tcp->tcp_rcv_ws,
16176 tcp->tcp_snd_ts_ok,
16177 tcp->tcp_ts_recent,
16178 tcp_display(tcp, buf, DISP_PORT_ONLY), cflag);
16179 if (print_len < buf_len) {
16180 ((mblk_t *)mp)->b_wptr += print_len;
16181 } else {
16182 ((mblk_t *)mp)->b_wptr += buf_len;
16183 }
16184 }
16185
16186 /*
16187 * TCP status report (for listeners only) triggered via the Named Dispatch
16188 * mechanism.
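 * Each output line shows the bind-hash bucket, the tcp pointer,
 * the zone id, the bound address, the local port, the connection
 * request sequence number, the q0/q/max backlog counters, and a
 * trailing '*' when SYN-flood defense is active.  A purely
 * hypothetical line might look like:
 *   003 ffffff01d2b3c040 0 ::ffff:192.168.1.5 00080 00000007 0/2/128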
16189 */ 16190 /* ARGSUSED */ 16191 static void 16192 tcp_report_listener(mblk_t *mp, tcp_t *tcp, int hashval) 16193 { 16194 char addrbuf[INET6_ADDRSTRLEN]; 16195 in6_addr_t v6dst; 16196 uint_t print_len, buf_len; 16197 16198 buf_len = mp->b_datap->db_lim - mp->b_wptr; 16199 if (buf_len <= 0) 16200 return; 16201 16202 if (tcp->tcp_ipversion == IPV4_VERSION) { 16203 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6dst); 16204 (void) inet_ntop(AF_INET6, &v6dst, addrbuf, sizeof (addrbuf)); 16205 } else { 16206 (void) inet_ntop(AF_INET6, &tcp->tcp_ip6h->ip6_src, 16207 addrbuf, sizeof (addrbuf)); 16208 } 16209 print_len = snprintf((char *)mp->b_wptr, buf_len, 16210 "%03d " 16211 MI_COL_PTRFMT_STR 16212 "%d %s %05u %08u %d/%d/%d%c\n", 16213 hashval, (void *)tcp, 16214 tcp->tcp_connp->conn_zoneid, 16215 addrbuf, 16216 (uint_t)BE16_TO_U16(tcp->tcp_tcph->th_lport), 16217 tcp->tcp_conn_req_seqnum, 16218 tcp->tcp_conn_req_cnt_q0, tcp->tcp_conn_req_cnt_q, 16219 tcp->tcp_conn_req_max, 16220 tcp->tcp_syn_defense ? '*' : ' '); 16221 if (print_len < buf_len) { 16222 ((mblk_t *)mp)->b_wptr += print_len; 16223 } else { 16224 ((mblk_t *)mp)->b_wptr += buf_len; 16225 } 16226 } 16227 16228 /* TCP status report triggered via the Named Dispatch mechanism. */ 16229 /* ARGSUSED */ 16230 static int 16231 tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16232 { 16233 tcp_t *tcp; 16234 int i; 16235 conn_t *connp; 16236 connf_t *connfp; 16237 zoneid_t zoneid; 16238 16239 /* 16240 * Because of the ndd constraint, at most we can have 64K buffer 16241 * to put in all TCP info. So to be more efficient, just 16242 * allocate a 64K buffer here, assuming we need that large buffer. 16243 * This may be a problem as any user can read tcp_status. Therefore 16244 * we limit the rate of doing this using tcp_ndd_get_info_interval. 16245 * This should be OK as normal users should not do this too often. 16246 */ 16247 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16248 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16249 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16250 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16251 return (0); 16252 } 16253 } 16254 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16255 /* The following may work even if we cannot get a large buf. */ 16256 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16257 return (0); 16258 } 16259 16260 (void) mi_mpprintf(mp, "%s", tcp_report_header); 16261 16262 zoneid = Q_TO_CONN(q)->conn_zoneid; 16263 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 16264 16265 connfp = &ipcl_globalhash_fanout[i]; 16266 16267 connp = NULL; 16268 16269 while ((connp = 16270 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 16271 tcp = connp->conn_tcp; 16272 if (zoneid != GLOBAL_ZONEID && 16273 zoneid != connp->conn_zoneid) 16274 continue; 16275 tcp_report_item(mp->b_cont, tcp, -1, tcp, 16276 cr); 16277 } 16278 16279 } 16280 16281 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16282 return (0); 16283 } 16284 16285 /* TCP status report triggered via the Named Dispatch mechanism. */ 16286 /* ARGSUSED */ 16287 static int 16288 tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16289 { 16290 tf_t *tbf; 16291 tcp_t *tcp; 16292 int i; 16293 zoneid_t zoneid; 16294 16295 /* Refer to comments in tcp_status_report(). 
*/ 16296 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16297 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16298 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16299 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16300 return (0); 16301 } 16302 } 16303 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16304 /* The following may work even if we cannot get a large buf. */ 16305 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16306 return (0); 16307 } 16308 16309 (void) mi_mpprintf(mp, " %s", tcp_report_header); 16310 16311 zoneid = Q_TO_CONN(q)->conn_zoneid; 16312 16313 for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { 16314 tbf = &tcp_bind_fanout[i]; 16315 mutex_enter(&tbf->tf_lock); 16316 for (tcp = tbf->tf_tcp; tcp != NULL; 16317 tcp = tcp->tcp_bind_hash) { 16318 if (zoneid != GLOBAL_ZONEID && 16319 zoneid != tcp->tcp_connp->conn_zoneid) 16320 continue; 16321 CONN_INC_REF(tcp->tcp_connp); 16322 tcp_report_item(mp->b_cont, tcp, i, 16323 Q_TO_TCP(q), cr); 16324 CONN_DEC_REF(tcp->tcp_connp); 16325 } 16326 mutex_exit(&tbf->tf_lock); 16327 } 16328 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16329 return (0); 16330 } 16331 16332 /* TCP status report triggered via the Named Dispatch mechanism. */ 16333 /* ARGSUSED */ 16334 static int 16335 tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16336 { 16337 connf_t *connfp; 16338 conn_t *connp; 16339 tcp_t *tcp; 16340 int i; 16341 zoneid_t zoneid; 16342 16343 /* Refer to comments in tcp_status_report(). */ 16344 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16345 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16346 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16347 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16348 return (0); 16349 } 16350 } 16351 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16352 /* The following may work even if we cannot get a large buf. */ 16353 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16354 return (0); 16355 } 16356 16357 (void) mi_mpprintf(mp, 16358 " TCP " MI_COL_HDRPAD_STR 16359 "zone IP addr port seqnum backlog (q0/q/max)"); 16360 16361 zoneid = Q_TO_CONN(q)->conn_zoneid; 16362 16363 for (i = 0; i < ipcl_bind_fanout_size; i++) { 16364 connfp = &ipcl_bind_fanout[i]; 16365 connp = NULL; 16366 while ((connp = 16367 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 16368 tcp = connp->conn_tcp; 16369 if (zoneid != GLOBAL_ZONEID && 16370 zoneid != connp->conn_zoneid) 16371 continue; 16372 tcp_report_listener(mp->b_cont, tcp, i); 16373 } 16374 } 16375 16376 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16377 return (0); 16378 } 16379 16380 /* TCP status report triggered via the Named Dispatch mechanism. */ 16381 /* ARGSUSED */ 16382 static int 16383 tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16384 { 16385 connf_t *connfp; 16386 conn_t *connp; 16387 tcp_t *tcp; 16388 int i; 16389 zoneid_t zoneid; 16390 16391 /* Refer to comments in tcp_status_report(). */ 16392 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16393 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16394 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16395 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16396 return (0); 16397 } 16398 } 16399 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16400 /* The following may work even if we cannot get a large buf. 
*/ 16401 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16402 return (0); 16403 } 16404 16405 (void) mi_mpprintf(mp, "tcp_conn_hash_size = %d", 16406 ipcl_conn_fanout_size); 16407 (void) mi_mpprintf(mp, " %s", tcp_report_header); 16408 16409 zoneid = Q_TO_CONN(q)->conn_zoneid; 16410 16411 for (i = 0; i < ipcl_conn_fanout_size; i++) { 16412 connfp = &ipcl_conn_fanout[i]; 16413 connp = NULL; 16414 while ((connp = 16415 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 16416 tcp = connp->conn_tcp; 16417 if (zoneid != GLOBAL_ZONEID && 16418 zoneid != connp->conn_zoneid) 16419 continue; 16420 tcp_report_item(mp->b_cont, tcp, i, 16421 Q_TO_TCP(q), cr); 16422 } 16423 } 16424 16425 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16426 return (0); 16427 } 16428 16429 /* TCP status report triggered via the Named Dispatch mechanism. */ 16430 /* ARGSUSED */ 16431 static int 16432 tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16433 { 16434 tf_t *tf; 16435 tcp_t *tcp; 16436 int i; 16437 zoneid_t zoneid; 16438 16439 /* Refer to comments in tcp_status_report(). */ 16440 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16441 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16442 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16443 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16444 return (0); 16445 } 16446 } 16447 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16448 /* The following may work even if we cannot get a large buf. */ 16449 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16450 return (0); 16451 } 16452 16453 (void) mi_mpprintf(mp, " %s", tcp_report_header); 16454 16455 zoneid = Q_TO_CONN(q)->conn_zoneid; 16456 16457 for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { 16458 tf = &tcp_acceptor_fanout[i]; 16459 mutex_enter(&tf->tf_lock); 16460 for (tcp = tf->tf_tcp; tcp != NULL; 16461 tcp = tcp->tcp_acceptor_hash) { 16462 if (zoneid != GLOBAL_ZONEID && 16463 zoneid != tcp->tcp_connp->conn_zoneid) 16464 continue; 16465 tcp_report_item(mp->b_cont, tcp, i, 16466 Q_TO_TCP(q), cr); 16467 } 16468 mutex_exit(&tf->tf_lock); 16469 } 16470 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16471 return (0); 16472 } 16473 16474 /* 16475 * tcp_timer is the timer service routine. It handles the retransmission, 16476 * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out 16477 * from the state of the tcp instance what kind of action needs to be done 16478 * at the time it is called. 
16479 */ 16480 static void 16481 tcp_timer(void *arg) 16482 { 16483 mblk_t *mp; 16484 clock_t first_threshold; 16485 clock_t second_threshold; 16486 clock_t ms; 16487 uint32_t mss; 16488 conn_t *connp = (conn_t *)arg; 16489 tcp_t *tcp = connp->conn_tcp; 16490 16491 tcp->tcp_timer_tid = 0; 16492 16493 if (tcp->tcp_fused) 16494 return; 16495 16496 first_threshold = tcp->tcp_first_timer_threshold; 16497 second_threshold = tcp->tcp_second_timer_threshold; 16498 switch (tcp->tcp_state) { 16499 case TCPS_IDLE: 16500 case TCPS_BOUND: 16501 case TCPS_LISTEN: 16502 return; 16503 case TCPS_SYN_RCVD: { 16504 tcp_t *listener = tcp->tcp_listener; 16505 16506 if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) { 16507 ASSERT(tcp->tcp_rq == listener->tcp_rq); 16508 /* it's our first timeout */ 16509 tcp->tcp_syn_rcvd_timeout = 1; 16510 mutex_enter(&listener->tcp_eager_lock); 16511 listener->tcp_syn_rcvd_timeout++; 16512 if (!listener->tcp_syn_defense && 16513 (listener->tcp_syn_rcvd_timeout > 16514 (tcp_conn_req_max_q0 >> 2)) && 16515 (tcp_conn_req_max_q0 > 200)) { 16516 /* We may be under attack. Put on a defense. */ 16517 listener->tcp_syn_defense = B_TRUE; 16518 cmn_err(CE_WARN, "High TCP connect timeout " 16519 "rate! System (port %d) may be under a " 16520 "SYN flood attack!", 16521 BE16_TO_U16(listener->tcp_tcph->th_lport)); 16522 16523 listener->tcp_ip_addr_cache = kmem_zalloc( 16524 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t), 16525 KM_NOSLEEP); 16526 } 16527 mutex_exit(&listener->tcp_eager_lock); 16528 } 16529 } 16530 /* FALLTHRU */ 16531 case TCPS_SYN_SENT: 16532 first_threshold = tcp->tcp_first_ctimer_threshold; 16533 second_threshold = tcp->tcp_second_ctimer_threshold; 16534 break; 16535 case TCPS_ESTABLISHED: 16536 case TCPS_FIN_WAIT_1: 16537 case TCPS_CLOSING: 16538 case TCPS_CLOSE_WAIT: 16539 case TCPS_LAST_ACK: 16540 /* If we have data to rexmit */ 16541 if (tcp->tcp_suna != tcp->tcp_snxt) { 16542 clock_t time_to_wait; 16543 16544 BUMP_MIB(&tcp_mib, tcpTimRetrans); 16545 if (!tcp->tcp_xmit_head) 16546 break; 16547 time_to_wait = lbolt - 16548 (clock_t)tcp->tcp_xmit_head->b_prev; 16549 time_to_wait = tcp->tcp_rto - 16550 TICK_TO_MSEC(time_to_wait); 16551 /* 16552 * If the timer fires too early, 1 clock tick earlier, 16553 * restart the timer. 16554 */ 16555 if (time_to_wait > msec_per_tick) { 16556 TCP_STAT(tcp_timer_fire_early); 16557 TCP_TIMER_RESTART(tcp, time_to_wait); 16558 return; 16559 } 16560 /* 16561 * When we probe zero windows, we force the swnd open. 16562 * If our peer acks with a closed window swnd will be 16563 * set to zero by tcp_rput(). As long as we are 16564 * receiving acks tcp_rput will 16565 * reset 'tcp_ms_we_have_waited' so as not to trip the 16566 * first and second interval actions. NOTE: the timer 16567 * interval is allowed to continue its exponential 16568 * backoff. 16569 */ 16570 if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { 16571 if (tcp->tcp_debug) { 16572 (void) strlog(TCP_MOD_ID, 0, 1, 16573 SL_TRACE, "tcp_timer: zero win"); 16574 } 16575 } else { 16576 /* 16577 * After retransmission, we need to do 16578 * slow start. Set the ssthresh to one 16579 * half of current effective window and 16580 * cwnd to one MSS. Also reset 16581 * tcp_cwnd_cnt. 16582 * 16583 * Note that if tcp_ssthresh is reduced because 16584 * of ECN, do not reduce it again unless it is 16585 * already one window of data away (tcp_cwr 16586 * should then be cleared) or this is a 16587 * timeout for a retransmitted segment. 
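 *
 * A small worked example of the arithmetic below (values are purely
 * illustrative): with tcp_mss == 1460 and 11680 bytes outstanding
 * (tcp_snxt - tcp_suna == 8 segments) on the first timeout,
 *
 *	npkt          = (11680 >> 1) / 1460 = 4
 *	cwnd_ssthresh = MAX(4, 2) * 1460    = 5840
 *	cwnd          = 1460
 *
 * so the connection re-enters slow start and has to grow back to four
 * segments in flight before switching to congestion avoidance.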
16588 */ 16589 uint32_t npkt; 16590 16591 if (!tcp->tcp_cwr || tcp->tcp_rexmit) { 16592 npkt = ((tcp->tcp_timer_backoff ? 16593 tcp->tcp_cwnd_ssthresh : 16594 tcp->tcp_snxt - 16595 tcp->tcp_suna) >> 1) / tcp->tcp_mss; 16596 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * 16597 tcp->tcp_mss; 16598 } 16599 tcp->tcp_cwnd = tcp->tcp_mss; 16600 tcp->tcp_cwnd_cnt = 0; 16601 if (tcp->tcp_ecn_ok) { 16602 tcp->tcp_cwr = B_TRUE; 16603 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 16604 tcp->tcp_ecn_cwr_sent = B_FALSE; 16605 } 16606 } 16607 break; 16608 } 16609 /* 16610 * We have something to send yet we cannot send. The 16611 * reason can be: 16612 * 16613 * 1. Zero send window: we need to do zero window probe. 16614 * 2. Zero cwnd: because of ECN, we need to "clock out 16615 * segments. 16616 * 3. SWS avoidance: receiver may have shrunk window, 16617 * reset our knowledge. 16618 * 16619 * Note that condition 2 can happen with either 1 or 16620 * 3. But 1 and 3 are exclusive. 16621 */ 16622 if (tcp->tcp_unsent != 0) { 16623 if (tcp->tcp_cwnd == 0) { 16624 /* 16625 * Set tcp_cwnd to 1 MSS so that a 16626 * new segment can be sent out. We 16627 * are "clocking out" new data when 16628 * the network is really congested. 16629 */ 16630 ASSERT(tcp->tcp_ecn_ok); 16631 tcp->tcp_cwnd = tcp->tcp_mss; 16632 } 16633 if (tcp->tcp_swnd == 0) { 16634 /* Extend window for zero window probe */ 16635 tcp->tcp_swnd++; 16636 tcp->tcp_zero_win_probe = B_TRUE; 16637 BUMP_MIB(&tcp_mib, tcpOutWinProbe); 16638 } else { 16639 /* 16640 * Handle timeout from sender SWS avoidance. 16641 * Reset our knowledge of the max send window 16642 * since the receiver might have reduced its 16643 * receive buffer. Avoid setting tcp_max_swnd 16644 * to one since that will essentially disable 16645 * the SWS checks. 16646 * 16647 * Note that since we don't have a SWS 16648 * state variable, if the timeout is set 16649 * for ECN but not for SWS, this 16650 * code will also be executed. This is 16651 * fine as tcp_max_swnd is updated 16652 * constantly and it will not affect 16653 * anything. 16654 */ 16655 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2); 16656 } 16657 tcp_wput_data(tcp, NULL, B_FALSE); 16658 return; 16659 } 16660 /* Is there a FIN that needs to be to re retransmitted? */ 16661 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 16662 !tcp->tcp_fin_acked) 16663 break; 16664 /* Nothing to do, return without restarting timer. */ 16665 TCP_STAT(tcp_timer_fire_miss); 16666 return; 16667 case TCPS_FIN_WAIT_2: 16668 /* 16669 * User closed the TCP endpoint and peer ACK'ed our FIN. 16670 * We waited some time for for peer's FIN, but it hasn't 16671 * arrived. We flush the connection now to avoid 16672 * case where the peer has rebooted. 16673 */ 16674 if (TCP_IS_DETACHED(tcp)) { 16675 (void) tcp_clean_death(tcp, 0, 23); 16676 } else { 16677 TCP_TIMER_RESTART(tcp, tcp_fin_wait_2_flush_interval); 16678 } 16679 return; 16680 case TCPS_TIME_WAIT: 16681 (void) tcp_clean_death(tcp, 0, 24); 16682 return; 16683 default: 16684 if (tcp->tcp_debug) { 16685 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 16686 "tcp_timer: strange state (%d) %s", 16687 tcp->tcp_state, tcp_display(tcp, NULL, 16688 DISP_PORT_ONLY)); 16689 } 16690 return; 16691 } 16692 if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { 16693 /* 16694 * For zero window probe, we need to send indefinitely, 16695 * unless we have not heard from the other side for some 16696 * time... 
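 *
 * In other words (assuming the usual abort threshold of roughly eight
 * minutes for second_threshold): a normal retransmission gives up once
 * tcp_ms_we_have_waited exceeds that threshold, but a zero window
 * probe keeps going for as long as the probes are still being
 * answered, since every segment from the peer refreshes
 * tcp_last_recv_time.  Only when the peer has been silent for more
 * than second_threshold ms is the probing connection finally aborted
 * (with ETIMEDOUT unless a more specific error is already pending).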
16697 */ 16698 if ((tcp->tcp_zero_win_probe == 0) || 16699 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) > 16700 second_threshold)) { 16701 BUMP_MIB(&tcp_mib, tcpTimRetransDrop); 16702 /* 16703 * If TCP is in SYN_RCVD state, send back a 16704 * RST|ACK as BSD does. Note that tcp_zero_win_probe 16705 * should be zero in TCPS_SYN_RCVD state. 16706 */ 16707 if (tcp->tcp_state == TCPS_SYN_RCVD) { 16708 tcp_xmit_ctl("tcp_timer: RST sent on timeout " 16709 "in SYN_RCVD", 16710 tcp, tcp->tcp_snxt, 16711 tcp->tcp_rnxt, TH_RST | TH_ACK); 16712 } 16713 (void) tcp_clean_death(tcp, 16714 tcp->tcp_client_errno ? 16715 tcp->tcp_client_errno : ETIMEDOUT, 25); 16716 return; 16717 } else { 16718 /* 16719 * Set tcp_ms_we_have_waited to second_threshold 16720 * so that in next timeout, we will do the above 16721 * check (lbolt - tcp_last_recv_time). This is 16722 * also to avoid overflow. 16723 * 16724 * We don't need to decrement tcp_timer_backoff 16725 * to avoid overflow because it will be decremented 16726 * later if new timeout value is greater than 16727 * tcp_rexmit_interval_max. In the case when 16728 * tcp_rexmit_interval_max is greater than 16729 * second_threshold, it means that we will wait 16730 * longer than second_threshold to send the next 16731 * window probe. 16732 */ 16733 tcp->tcp_ms_we_have_waited = second_threshold; 16734 } 16735 } else if (ms > first_threshold) { 16736 if (tcp->tcp_snd_zcopy_aware && (!tcp->tcp_xmit_zc_clean) && 16737 tcp->tcp_xmit_head != NULL) { 16738 tcp->tcp_xmit_head = 16739 tcp_zcopy_backoff(tcp, tcp->tcp_xmit_head, 1); 16740 } 16741 /* 16742 * We have been retransmitting for too long... The RTT 16743 * we calculated is probably incorrect. Reinitialize it. 16744 * Need to compensate for 0 tcp_rtt_sa. Reset 16745 * tcp_rtt_update so that we won't accidentally cache a 16746 * bad value. But only do this if this is not a zero 16747 * window probe. 16748 */ 16749 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { 16750 tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + 16751 (tcp->tcp_rtt_sa >> 5); 16752 tcp->tcp_rtt_sa = 0; 16753 tcp_ip_notify(tcp); 16754 tcp->tcp_rtt_update = 0; 16755 } 16756 } 16757 tcp->tcp_timer_backoff++; 16758 if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 16759 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < 16760 tcp_rexmit_interval_min) { 16761 /* 16762 * This means the original RTO is tcp_rexmit_interval_min. 16763 * So we will use tcp_rexmit_interval_min as the RTO value 16764 * and do the backoff. 16765 */ 16766 ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff; 16767 } else { 16768 ms <<= tcp->tcp_timer_backoff; 16769 } 16770 if (ms > tcp_rexmit_interval_max) { 16771 ms = tcp_rexmit_interval_max; 16772 /* 16773 * ms is at max, decrement tcp_timer_backoff to avoid 16774 * overflow. 16775 */ 16776 tcp->tcp_timer_backoff--; 16777 } 16778 tcp->tcp_ms_we_have_waited += ms; 16779 if (tcp->tcp_zero_win_probe == 0) { 16780 tcp->tcp_rto = ms; 16781 } 16782 TCP_TIMER_RESTART(tcp, ms); 16783 /* 16784 * This is after a timeout and tcp_rto is backed off. Set 16785 * tcp_set_timer to 1 so that next time RTO is updated, we will 16786 * restart the timer with a correct value. 
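 *
 * For illustration of the backoff computed above (hypothetical
 * values): with tcp_rtt_sa == 2400, tcp_rtt_sd == 400,
 * tcp_rexmit_interval_extra == 0 and this being the third consecutive
 * timeout (tcp_timer_backoff == 3),
 *
 *	ms = (2400 >> 3) + 400 + 0 + (2400 >> 5) = 775
 *	ms <<= 3                                 = 6200
 *
 * Had the base value fallen below tcp_rexmit_interval_min, the
 * minimum would have been shifted instead; either way the result is
 * clamped to tcp_rexmit_interval_max before the timer is restarted.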
16787 */ 16788 tcp->tcp_set_timer = 1; 16789 mss = tcp->tcp_snxt - tcp->tcp_suna; 16790 if (mss > tcp->tcp_mss) 16791 mss = tcp->tcp_mss; 16792 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) 16793 mss = tcp->tcp_swnd; 16794 16795 if ((mp = tcp->tcp_xmit_head) != NULL) 16796 mp->b_prev = (mblk_t *)lbolt; 16797 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, 16798 B_TRUE); 16799 16800 /* 16801 * When slow start after retransmission begins, start with 16802 * this seq no. tcp_rexmit_max marks the end of special slow 16803 * start phase. tcp_snd_burst controls how many segments 16804 * can be sent because of an ack. 16805 */ 16806 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 16807 tcp->tcp_snd_burst = TCP_CWND_SS; 16808 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 16809 (tcp->tcp_unsent == 0)) { 16810 tcp->tcp_rexmit_max = tcp->tcp_fss; 16811 } else { 16812 tcp->tcp_rexmit_max = tcp->tcp_snxt; 16813 } 16814 tcp->tcp_rexmit = B_TRUE; 16815 tcp->tcp_dupack_cnt = 0; 16816 16817 /* 16818 * Remove all rexmit SACK blk to start from fresh. 16819 */ 16820 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 16821 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 16822 tcp->tcp_num_notsack_blk = 0; 16823 tcp->tcp_cnt_notsack_list = 0; 16824 } 16825 if (mp == NULL) { 16826 return; 16827 } 16828 /* Attach credentials to retransmitted initial SYNs. */ 16829 if (tcp->tcp_state == TCPS_SYN_SENT) { 16830 mblk_setcred(mp, tcp->tcp_cred); 16831 DB_CPID(mp) = tcp->tcp_cpid; 16832 } 16833 16834 tcp->tcp_csuna = tcp->tcp_snxt; 16835 BUMP_MIB(&tcp_mib, tcpRetransSegs); 16836 UPDATE_MIB(&tcp_mib, tcpRetransBytes, mss); 16837 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 16838 tcp_send_data(tcp, tcp->tcp_wq, mp); 16839 16840 } 16841 16842 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ 16843 static void 16844 tcp_unbind(tcp_t *tcp, mblk_t *mp) 16845 { 16846 conn_t *connp; 16847 16848 switch (tcp->tcp_state) { 16849 case TCPS_BOUND: 16850 case TCPS_LISTEN: 16851 break; 16852 default: 16853 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 16854 return; 16855 } 16856 16857 /* 16858 * Need to clean up all the eagers since after the unbind, segments 16859 * will no longer be delivered to this listener stream. 16860 */ 16861 mutex_enter(&tcp->tcp_eager_lock); 16862 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 16863 tcp_eager_cleanup(tcp, 0); 16864 } 16865 mutex_exit(&tcp->tcp_eager_lock); 16866 16867 if (tcp->tcp_ipversion == IPV4_VERSION) { 16868 tcp->tcp_ipha->ipha_src = 0; 16869 } else { 16870 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); 16871 } 16872 V6_SET_ZERO(tcp->tcp_ip_src_v6); 16873 bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport)); 16874 tcp_bind_hash_remove(tcp); 16875 tcp->tcp_state = TCPS_IDLE; 16876 tcp->tcp_mdt = B_FALSE; 16877 /* Send M_FLUSH according to TPI */ 16878 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 16879 connp = tcp->tcp_connp; 16880 connp->conn_mdt_ok = B_FALSE; 16881 ipcl_hash_remove(connp); 16882 bzero(&connp->conn_ports, sizeof (connp->conn_ports)); 16883 mp = mi_tpi_ok_ack_alloc(mp); 16884 putnext(tcp->tcp_rq, mp); 16885 } 16886 16887 /* 16888 * Don't let port fall into the privileged range. 16889 * Since the extra privileged ports can be arbitrary we also 16890 * ensure that we exclude those from consideration. 16891 * tcp_g_epriv_ports is not sorted thus we loop over it until 16892 * there are no changes. 
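 *
 * As a rough illustration of the random selection done below (32768
 * and 65535 are the default anonymous port bounds and may have been
 * tuned): a random 16-bit value of, say, 12345 falls below
 * tcp_smallest_anon_port and is therefore remapped to
 *
 *	32768 + 12345 % (65535 - 32768) = 45113
 *
 * which lands inside the anonymous range before the privileged and
 * extra-privileged port checks are applied.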
16893 * 16894 * Note: No locks are held when inspecting tcp_g_*epriv_ports 16895 * but instead the code relies on: 16896 * - the fact that the address of the array and its size never changes 16897 * - the atomic assignment of the elements of the array 16898 * 16899 * Returns 0 if there are no more ports available. 16900 * 16901 * TS note: skip multilevel ports. 16902 */ 16903 static in_port_t 16904 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) 16905 { 16906 int i; 16907 boolean_t restart = B_FALSE; 16908 16909 if (random && tcp_random_anon_port != 0) { 16910 (void) random_get_pseudo_bytes((uint8_t *)&port, 16911 sizeof (in_port_t)); 16912 /* 16913 * Unless changed by a sys admin, the smallest anon port 16914 * is 32768 and the largest anon port is 65535. It is 16915 * very likely (50%) for the random port to be smaller 16916 * than the smallest anon port. When that happens, 16917 * add port % (anon port range) to the smallest anon 16918 * port to get the random port. It should fall into the 16919 * valid anon port range. 16920 */ 16921 if (port < tcp_smallest_anon_port) { 16922 port = tcp_smallest_anon_port + 16923 port % (tcp_largest_anon_port - 16924 tcp_smallest_anon_port); 16925 } 16926 } 16927 16928 retry: 16929 if (port < tcp_smallest_anon_port) 16930 port = (in_port_t)tcp_smallest_anon_port; 16931 16932 if (port > tcp_largest_anon_port) { 16933 if (restart) 16934 return (0); 16935 restart = B_TRUE; 16936 port = (in_port_t)tcp_smallest_anon_port; 16937 } 16938 16939 if (port < tcp_smallest_nonpriv_port) 16940 port = (in_port_t)tcp_smallest_nonpriv_port; 16941 16942 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 16943 if (port == tcp_g_epriv_ports[i]) { 16944 port++; 16945 /* 16946 * Make sure whether the port is in the 16947 * valid range. 16948 */ 16949 goto retry; 16950 } 16951 } 16952 if (is_system_labeled() && 16953 (i = tsol_next_port(crgetzone(tcp->tcp_cred), port, 16954 IPPROTO_TCP, B_TRUE)) != 0) { 16955 port = i; 16956 goto retry; 16957 } 16958 return (port); 16959 } 16960 16961 /* 16962 * Return the next anonymous port in the privileged port range for 16963 * bind checking. It starts at IPPORT_RESERVED - 1 and goes 16964 * downwards. This is the same behavior as documented in the userland 16965 * library call rresvport(3N). 16966 * 16967 * TS note: skip multilevel ports. 16968 */ 16969 static in_port_t 16970 tcp_get_next_priv_port(const tcp_t *tcp) 16971 { 16972 static in_port_t next_priv_port = IPPORT_RESERVED - 1; 16973 in_port_t nextport; 16974 boolean_t restart = B_FALSE; 16975 16976 retry: 16977 if (next_priv_port < tcp_min_anonpriv_port || 16978 next_priv_port >= IPPORT_RESERVED) { 16979 next_priv_port = IPPORT_RESERVED - 1; 16980 if (restart) 16981 return (0); 16982 restart = B_TRUE; 16983 } 16984 if (is_system_labeled() && 16985 (nextport = tsol_next_port(crgetzone(tcp->tcp_cred), 16986 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { 16987 next_priv_port = nextport; 16988 goto retry; 16989 } 16990 return (next_priv_port--); 16991 } 16992 16993 /* The write side r/w procedure. */ 16994 16995 #if CCS_STATS 16996 struct { 16997 struct { 16998 int64_t count, bytes; 16999 } tot, hit; 17000 } wrw_stats; 17001 #endif 17002 17003 /* 17004 * Call by tcp_wput() to handle all non data, except M_PROTO and M_PCPROTO, 17005 * messages. 
17006 */ 17007 /* ARGSUSED */ 17008 static void 17009 tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) 17010 { 17011 conn_t *connp = (conn_t *)arg; 17012 tcp_t *tcp = connp->conn_tcp; 17013 queue_t *q = tcp->tcp_wq; 17014 17015 ASSERT(DB_TYPE(mp) != M_IOCTL); 17016 /* 17017 * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close. 17018 * Once the close starts, streamhead and sockfs will not let any data 17019 * packets come down (close ensures that there are no threads using the 17020 * queue and no new threads will come down) but since qprocsoff() 17021 * hasn't happened yet, a M_FLUSH or some non data message might 17022 * get reflected back (in response to our own FLUSHRW) and get 17023 * processed after tcp_close() is done. The conn would still be valid 17024 * because a ref would have added but we need to check the state 17025 * before actually processing the packet. 17026 */ 17027 if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) { 17028 freemsg(mp); 17029 return; 17030 } 17031 17032 switch (DB_TYPE(mp)) { 17033 case M_IOCDATA: 17034 tcp_wput_iocdata(tcp, mp); 17035 break; 17036 case M_FLUSH: 17037 tcp_wput_flush(tcp, mp); 17038 break; 17039 default: 17040 CALL_IP_WPUT(connp, q, mp); 17041 break; 17042 } 17043 } 17044 17045 /* 17046 * The TCP fast path write put procedure. 17047 * NOTE: the logic of the fast path is duplicated from tcp_wput_data() 17048 */ 17049 /* ARGSUSED */ 17050 void 17051 tcp_output(void *arg, mblk_t *mp, void *arg2) 17052 { 17053 int len; 17054 int hdrlen; 17055 int plen; 17056 mblk_t *mp1; 17057 uchar_t *rptr; 17058 uint32_t snxt; 17059 tcph_t *tcph; 17060 struct datab *db; 17061 uint32_t suna; 17062 uint32_t mss; 17063 ipaddr_t *dst; 17064 ipaddr_t *src; 17065 uint32_t sum; 17066 int usable; 17067 conn_t *connp = (conn_t *)arg; 17068 tcp_t *tcp = connp->conn_tcp; 17069 uint32_t msize; 17070 17071 /* 17072 * Try and ASSERT the minimum possible references on the 17073 * conn early enough. Since we are executing on write side, 17074 * the connection is obviously not detached and that means 17075 * there is a ref each for TCP and IP. Since we are behind 17076 * the squeue, the minimum references needed are 3. If the 17077 * conn is in classifier hash list, there should be an 17078 * extra ref for that (we check both the possibilities). 17079 */ 17080 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 17081 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 17082 17083 ASSERT(DB_TYPE(mp) == M_DATA); 17084 msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 17085 17086 mutex_enter(&connp->conn_lock); 17087 tcp->tcp_squeue_bytes -= msize; 17088 mutex_exit(&connp->conn_lock); 17089 17090 /* Bypass tcp protocol for fused tcp loopback */ 17091 if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 17092 return; 17093 17094 mss = tcp->tcp_mss; 17095 if (tcp->tcp_xmit_zc_clean) 17096 mp = tcp_zcopy_backoff(tcp, mp, 0); 17097 17098 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 17099 len = (int)(mp->b_wptr - mp->b_rptr); 17100 17101 /* 17102 * Criteria for fast path: 17103 * 17104 * 1. no unsent data 17105 * 2. single mblk in request 17106 * 3. connection established 17107 * 4. data in mblk 17108 * 5. len <= mss 17109 * 6. 
no tcp_valid bits 17110 */ 17111 if ((tcp->tcp_unsent != 0) || 17112 (tcp->tcp_cork) || 17113 (mp->b_cont != NULL) || 17114 (tcp->tcp_state != TCPS_ESTABLISHED) || 17115 (len == 0) || 17116 (len > mss) || 17117 (tcp->tcp_valid_bits != 0)) { 17118 tcp_wput_data(tcp, mp, B_FALSE); 17119 return; 17120 } 17121 17122 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 17123 ASSERT(tcp->tcp_fin_sent == 0); 17124 17125 /* queue new packet onto retransmission queue */ 17126 if (tcp->tcp_xmit_head == NULL) { 17127 tcp->tcp_xmit_head = mp; 17128 } else { 17129 tcp->tcp_xmit_last->b_cont = mp; 17130 } 17131 tcp->tcp_xmit_last = mp; 17132 tcp->tcp_xmit_tail = mp; 17133 17134 /* find out how much we can send */ 17135 /* BEGIN CSTYLED */ 17136 /* 17137 * un-acked usable 17138 * |--------------|-----------------| 17139 * tcp_suna tcp_snxt tcp_suna+tcp_swnd 17140 */ 17141 /* END CSTYLED */ 17142 17143 /* start sending from tcp_snxt */ 17144 snxt = tcp->tcp_snxt; 17145 17146 /* 17147 * Check to see if this connection has been idled for some 17148 * time and no ACK is expected. If it is, we need to slow 17149 * start again to get back the connection's "self-clock" as 17150 * described in VJ's paper. 17151 * 17152 * Refer to the comment in tcp_mss_set() for the calculation 17153 * of tcp_cwnd after idle. 17154 */ 17155 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 17156 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 17157 SET_TCP_INIT_CWND(tcp, mss, tcp_slow_start_after_idle); 17158 } 17159 17160 usable = tcp->tcp_swnd; /* tcp window size */ 17161 if (usable > tcp->tcp_cwnd) 17162 usable = tcp->tcp_cwnd; /* congestion window smaller */ 17163 usable -= snxt; /* subtract stuff already sent */ 17164 suna = tcp->tcp_suna; 17165 usable += suna; 17166 /* usable can be < 0 if the congestion window is smaller */ 17167 if (len > usable) { 17168 /* Can't send complete M_DATA in one shot */ 17169 goto slow; 17170 } 17171 17172 if (tcp->tcp_flow_stopped && 17173 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 17174 tcp_clrqfull(tcp); 17175 } 17176 17177 /* 17178 * determine if anything to send (Nagle). 17179 * 17180 * 1. len < tcp_mss (i.e. small) 17181 * 2. unacknowledged data present 17182 * 3. len < nagle limit 17183 * 4. last packet sent < nagle limit (previous packet sent) 17184 */ 17185 if ((len < mss) && (snxt != suna) && 17186 (len < (int)tcp->tcp_naglim) && 17187 (tcp->tcp_last_sent_len < tcp->tcp_naglim)) { 17188 /* 17189 * This was the first unsent packet and normally 17190 * mss < xmit_hiwater so there is no need to worry 17191 * about flow control. The next packet will go 17192 * through the flow control check in tcp_wput_data(). 
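 *
 * To illustrate the Nagle test above with hypothetical numbers: with
 * mss == 1460 and tcp_naglim == 1460, a 100-byte write issued while
 * earlier data is still unacknowledged (snxt != suna) and the
 * previously transmitted packet was itself a small one ends up here
 * and is simply queued.  A full-sized 1460-byte write, or any write
 * made while the pipe is idle (snxt == suna), falls through and is
 * transmitted right away.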
17193 */ 17194 /* leftover work from above */ 17195 tcp->tcp_unsent = len; 17196 tcp->tcp_xmit_tail_unsent = len; 17197 17198 return; 17199 } 17200 17201 /* len <= tcp->tcp_mss && len == unsent so no silly window */ 17202 17203 if (snxt == suna) { 17204 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 17205 } 17206 17207 /* we have always sent something */ 17208 tcp->tcp_rack_cnt = 0; 17209 17210 tcp->tcp_snxt = snxt + len; 17211 tcp->tcp_rack = tcp->tcp_rnxt; 17212 17213 if ((mp1 = dupb(mp)) == 0) 17214 goto no_memory; 17215 mp->b_prev = (mblk_t *)(uintptr_t)lbolt; 17216 mp->b_next = (mblk_t *)(uintptr_t)snxt; 17217 17218 /* adjust tcp header information */ 17219 tcph = tcp->tcp_tcph; 17220 tcph->th_flags[0] = (TH_ACK|TH_PUSH); 17221 17222 sum = len + tcp->tcp_tcp_hdr_len + tcp->tcp_sum; 17223 sum = (sum >> 16) + (sum & 0xFFFF); 17224 U16_TO_ABE16(sum, tcph->th_sum); 17225 17226 U32_TO_ABE32(snxt, tcph->th_seq); 17227 17228 BUMP_MIB(&tcp_mib, tcpOutDataSegs); 17229 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); 17230 BUMP_LOCAL(tcp->tcp_obsegs); 17231 17232 /* Update the latest receive window size in TCP header. */ 17233 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 17234 tcph->th_win); 17235 17236 tcp->tcp_last_sent_len = (ushort_t)len; 17237 17238 plen = len + tcp->tcp_hdr_len; 17239 17240 if (tcp->tcp_ipversion == IPV4_VERSION) { 17241 tcp->tcp_ipha->ipha_length = htons(plen); 17242 } else { 17243 tcp->tcp_ip6h->ip6_plen = htons(plen - 17244 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 17245 } 17246 17247 /* see if we need to allocate a mblk for the headers */ 17248 hdrlen = tcp->tcp_hdr_len; 17249 rptr = mp1->b_rptr - hdrlen; 17250 db = mp1->b_datap; 17251 if ((db->db_ref != 2) || rptr < db->db_base || 17252 (!OK_32PTR(rptr))) { 17253 /* NOTE: we assume allocb returns an OK_32PTR */ 17254 mp = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 17255 tcp_wroff_xtra, BPRI_MED); 17256 if (!mp) { 17257 freemsg(mp1); 17258 goto no_memory; 17259 } 17260 mp->b_cont = mp1; 17261 mp1 = mp; 17262 /* Leave room for Link Level header */ 17263 /* hdrlen = tcp->tcp_hdr_len; */ 17264 rptr = &mp1->b_rptr[tcp_wroff_xtra]; 17265 mp1->b_wptr = &rptr[hdrlen]; 17266 } 17267 mp1->b_rptr = rptr; 17268 17269 /* Fill in the timestamp option. */ 17270 if (tcp->tcp_snd_ts_ok) { 17271 U32_TO_BE32((uint32_t)lbolt, 17272 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 17273 U32_TO_BE32(tcp->tcp_ts_recent, 17274 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 17275 } else { 17276 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 17277 } 17278 17279 /* copy header into outgoing packet */ 17280 dst = (ipaddr_t *)rptr; 17281 src = (ipaddr_t *)tcp->tcp_iphc; 17282 dst[0] = src[0]; 17283 dst[1] = src[1]; 17284 dst[2] = src[2]; 17285 dst[3] = src[3]; 17286 dst[4] = src[4]; 17287 dst[5] = src[5]; 17288 dst[6] = src[6]; 17289 dst[7] = src[7]; 17290 dst[8] = src[8]; 17291 dst[9] = src[9]; 17292 if (hdrlen -= 40) { 17293 hdrlen >>= 2; 17294 dst += 10; 17295 src += 10; 17296 do { 17297 *dst++ = *src++; 17298 } while (--hdrlen); 17299 } 17300 17301 /* 17302 * Set the ECN info in the TCP header. Note that this 17303 * is not the template header. 
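 *
 * (That is, rptr points at the copy of the headers placed in the
 * outgoing mblk just above, so the ECT codepoint and any ECE/CWR
 * flags set below affect this packet only and never touch the cached
 * header template in tcp_iphc.)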
17304 */ 17305 if (tcp->tcp_ecn_ok) { 17306 SET_ECT(tcp, rptr); 17307 17308 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 17309 if (tcp->tcp_ecn_echo_on) 17310 tcph->th_flags[0] |= TH_ECE; 17311 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 17312 tcph->th_flags[0] |= TH_CWR; 17313 tcp->tcp_ecn_cwr_sent = B_TRUE; 17314 } 17315 } 17316 17317 if (tcp->tcp_ip_forward_progress) { 17318 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 17319 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 17320 tcp->tcp_ip_forward_progress = B_FALSE; 17321 } 17322 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 17323 tcp_send_data(tcp, tcp->tcp_wq, mp1); 17324 return; 17325 17326 /* 17327 * If we ran out of memory, we pretend to have sent the packet 17328 * and that it was lost on the wire. 17329 */ 17330 no_memory: 17331 return; 17332 17333 slow: 17334 /* leftover work from above */ 17335 tcp->tcp_unsent = len; 17336 tcp->tcp_xmit_tail_unsent = len; 17337 tcp_wput_data(tcp, NULL, B_FALSE); 17338 } 17339 17340 /* 17341 * The function called through squeue to get behind eager's perimeter to 17342 * finish the accept processing. 17343 */ 17344 /* ARGSUSED */ 17345 void 17346 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) 17347 { 17348 conn_t *connp = (conn_t *)arg; 17349 tcp_t *tcp = connp->conn_tcp; 17350 queue_t *q = tcp->tcp_rq; 17351 mblk_t *mp1; 17352 mblk_t *stropt_mp = mp; 17353 struct stroptions *stropt; 17354 uint_t thwin; 17355 17356 /* 17357 * Drop the eager's ref on the listener, that was placed when 17358 * this eager began life in tcp_conn_request. 17359 */ 17360 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 17361 17362 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { 17363 /* 17364 * Someone blewoff the eager before we could finish 17365 * the accept. 17366 * 17367 * The only reason eager exists it because we put in 17368 * a ref on it when conn ind went up. We need to send 17369 * a disconnect indication up while the last reference 17370 * on the eager will be dropped by the squeue when we 17371 * return. 17372 */ 17373 ASSERT(tcp->tcp_listener == NULL); 17374 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { 17375 struct T_discon_ind *tdi; 17376 17377 (void) putnextctl1(q, M_FLUSH, FLUSHRW); 17378 /* 17379 * Let us reuse the incoming mblk to avoid memory 17380 * allocation failure problems. We know that the 17381 * size of the incoming mblk i.e. stroptions is greater 17382 * than sizeof T_discon_ind. So the reallocb below 17383 * can't fail. 
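 *
 * (For a sense of scale: struct T_discon_ind is just three t_scalar_t
 * fields, while struct stroptions carries a dozen or so members, so
 * the buffer that came down is already comfortably larger than what
 * is being asked for here.)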
17384 */ 17385 freemsg(mp->b_cont); 17386 mp->b_cont = NULL; 17387 ASSERT(DB_REF(mp) == 1); 17388 mp = reallocb(mp, sizeof (struct T_discon_ind), 17389 B_FALSE); 17390 ASSERT(mp != NULL); 17391 DB_TYPE(mp) = M_PROTO; 17392 ((union T_primitives *)mp->b_rptr)->type = T_DISCON_IND; 17393 tdi = (struct T_discon_ind *)mp->b_rptr; 17394 if (tcp->tcp_issocket) { 17395 tdi->DISCON_reason = ECONNREFUSED; 17396 tdi->SEQ_number = 0; 17397 } else { 17398 tdi->DISCON_reason = ENOPROTOOPT; 17399 tdi->SEQ_number = 17400 tcp->tcp_conn_req_seqnum; 17401 } 17402 mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind); 17403 putnext(q, mp); 17404 } else { 17405 freemsg(mp); 17406 } 17407 if (tcp->tcp_hard_binding) { 17408 tcp->tcp_hard_binding = B_FALSE; 17409 tcp->tcp_hard_bound = B_TRUE; 17410 } 17411 tcp->tcp_detached = B_FALSE; 17412 return; 17413 } 17414 17415 mp1 = stropt_mp->b_cont; 17416 stropt_mp->b_cont = NULL; 17417 ASSERT(DB_TYPE(stropt_mp) == M_SETOPTS); 17418 stropt = (struct stroptions *)stropt_mp->b_rptr; 17419 17420 while (mp1 != NULL) { 17421 mp = mp1; 17422 mp1 = mp1->b_cont; 17423 mp->b_cont = NULL; 17424 tcp->tcp_drop_opt_ack_cnt++; 17425 CALL_IP_WPUT(connp, tcp->tcp_wq, mp); 17426 } 17427 mp = NULL; 17428 17429 /* 17430 * For a loopback connection with tcp_direct_sockfs on, note that 17431 * we don't have to protect tcp_rcv_list yet because synchronous 17432 * streams has not yet been enabled and tcp_fuse_rrw() cannot 17433 * possibly race with us. 17434 */ 17435 17436 /* 17437 * Set the max window size (tcp_rq->q_hiwat) of the acceptor 17438 * properly. This is the first time we know of the acceptor' 17439 * queue. So we do it here. 17440 */ 17441 if (tcp->tcp_rcv_list == NULL) { 17442 /* 17443 * Recv queue is empty, tcp_rwnd should not have changed. 17444 * That means it should be equal to the listener's tcp_rwnd. 17445 */ 17446 tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd; 17447 } else { 17448 #ifdef DEBUG 17449 uint_t cnt = 0; 17450 17451 mp1 = tcp->tcp_rcv_list; 17452 while ((mp = mp1) != NULL) { 17453 mp1 = mp->b_next; 17454 cnt += msgdsize(mp); 17455 } 17456 ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt); 17457 #endif 17458 /* There is some data, add them back to get the max. */ 17459 tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; 17460 } 17461 17462 stropt->so_flags = SO_HIWAT; 17463 stropt->so_hiwat = MAX(q->q_hiwat, tcp_sth_rcv_hiwat); 17464 17465 stropt->so_flags |= SO_MAXBLK; 17466 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 17467 17468 /* 17469 * This is the first time we run on the correct 17470 * queue after tcp_accept. So fix all the q parameters 17471 * here. 17472 */ 17473 /* Allocate room for SACK options if needed. */ 17474 stropt->so_flags |= SO_WROFF; 17475 if (tcp->tcp_fused) { 17476 ASSERT(tcp->tcp_loopback); 17477 ASSERT(tcp->tcp_loopback_peer != NULL); 17478 /* 17479 * For fused tcp loopback, set the stream head's write 17480 * offset value to zero since we won't be needing any room 17481 * for TCP/IP headers. This would also improve performance 17482 * since it would reduce the amount of work done by kmem. 17483 * Non-fused tcp loopback case is handled separately below. 17484 */ 17485 stropt->so_wroff = 0; 17486 /* 17487 * Record the stream head's high water mark for this endpoint; 17488 * this is used for flow-control purposes in tcp_fuse_output(). 17489 */ 17490 stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat); 17491 /* 17492 * Update the peer's transmit parameters according to 17493 * our recently calculated high water mark value. 
17494 */ 17495 (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); 17496 } else if (tcp->tcp_snd_sack_ok) { 17497 stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + 17498 (tcp->tcp_loopback ? 0 : tcp_wroff_xtra); 17499 } else { 17500 stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : 17501 tcp_wroff_xtra); 17502 } 17503 17504 /* 17505 * If this is endpoint is handling SSL, then reserve extra 17506 * offset and space at the end. 17507 * Also have the stream head allocate SSL3_MAX_RECORD_LEN packets, 17508 * overriding the previous setting. The extra cost of signing and 17509 * encrypting multiple MSS-size records (12 of them with Ethernet), 17510 * instead of a single contiguous one by the stream head 17511 * largely outweighs the statistical reduction of ACKs, when 17512 * applicable. The peer will also save on decyption and verification 17513 * costs. 17514 */ 17515 if (tcp->tcp_kssl_ctx != NULL) { 17516 stropt->so_wroff += SSL3_WROFFSET; 17517 17518 stropt->so_flags |= SO_TAIL; 17519 stropt->so_tail = SSL3_MAX_TAIL_LEN; 17520 17521 stropt->so_maxblk = SSL3_MAX_RECORD_LEN; 17522 } 17523 17524 /* Send the options up */ 17525 putnext(q, stropt_mp); 17526 17527 /* 17528 * Pass up any data and/or a fin that has been received. 17529 * 17530 * Adjust receive window in case it had decreased 17531 * (because there is data <=> tcp_rcv_list != NULL) 17532 * while the connection was detached. Note that 17533 * in case the eager was flow-controlled, w/o this 17534 * code, the rwnd may never open up again! 17535 */ 17536 if (tcp->tcp_rcv_list != NULL) { 17537 /* We drain directly in case of fused tcp loopback */ 17538 if (!tcp->tcp_fused && canputnext(q)) { 17539 tcp->tcp_rwnd = q->q_hiwat; 17540 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 17541 << tcp->tcp_rcv_ws; 17542 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 17543 if (tcp->tcp_state >= TCPS_ESTABLISHED && 17544 (q->q_hiwat - thwin >= tcp->tcp_mss)) { 17545 tcp_xmit_ctl(NULL, 17546 tcp, (tcp->tcp_swnd == 0) ? 17547 tcp->tcp_suna : tcp->tcp_snxt, 17548 tcp->tcp_rnxt, TH_ACK); 17549 BUMP_MIB(&tcp_mib, tcpOutWinUpdate); 17550 } 17551 17552 } 17553 (void) tcp_rcv_drain(q, tcp); 17554 17555 /* 17556 * For fused tcp loopback, back-enable peer endpoint 17557 * if it's currently flow-controlled. 17558 */ 17559 if (tcp->tcp_fused && 17560 tcp->tcp_loopback_peer->tcp_flow_stopped) { 17561 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 17562 17563 ASSERT(peer_tcp != NULL); 17564 ASSERT(peer_tcp->tcp_fused); 17565 17566 tcp_clrqfull(peer_tcp); 17567 TCP_STAT(tcp_fusion_backenabled); 17568 } 17569 } 17570 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 17571 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 17572 mp = mi_tpi_ordrel_ind(); 17573 if (mp) { 17574 tcp->tcp_ordrel_done = B_TRUE; 17575 putnext(q, mp); 17576 if (tcp->tcp_deferred_clean_death) { 17577 /* 17578 * tcp_clean_death was deferred 17579 * for T_ORDREL_IND - do it now 17580 */ 17581 (void) tcp_clean_death(tcp, 17582 tcp->tcp_client_errno, 21); 17583 tcp->tcp_deferred_clean_death = B_FALSE; 17584 } 17585 } else { 17586 /* 17587 * Run the orderly release in the 17588 * service routine. 
17589 */ 17590 qenable(q); 17591 } 17592 } 17593 if (tcp->tcp_hard_binding) { 17594 tcp->tcp_hard_binding = B_FALSE; 17595 tcp->tcp_hard_bound = B_TRUE; 17596 } 17597 17598 tcp->tcp_detached = B_FALSE; 17599 17600 /* We can enable synchronous streams now */ 17601 if (tcp->tcp_fused) { 17602 tcp_fuse_syncstr_enable_pair(tcp); 17603 } 17604 17605 if (tcp->tcp_ka_enabled) { 17606 tcp->tcp_ka_last_intrvl = 0; 17607 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 17608 MSEC_TO_TICK(tcp->tcp_ka_interval)); 17609 } 17610 17611 /* 17612 * At this point, eager is fully established and will 17613 * have the following references - 17614 * 17615 * 2 references for connection to exist (1 for TCP and 1 for IP). 17616 * 1 reference for the squeue which will be dropped by the squeue as 17617 * soon as this function returns. 17618 * There will be 1 additonal reference for being in classifier 17619 * hash list provided something bad hasn't happened. 17620 */ 17621 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 17622 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 17623 } 17624 17625 /* 17626 * The function called through squeue to get behind listener's perimeter to 17627 * send a deffered conn_ind. 17628 */ 17629 /* ARGSUSED */ 17630 void 17631 tcp_send_pending(void *arg, mblk_t *mp, void *arg2) 17632 { 17633 conn_t *connp = (conn_t *)arg; 17634 tcp_t *listener = connp->conn_tcp; 17635 17636 if (listener->tcp_state == TCPS_CLOSED || 17637 TCP_IS_DETACHED(listener)) { 17638 /* 17639 * If listener has closed, it would have caused a 17640 * a cleanup/blowoff to happen for the eager. 17641 */ 17642 tcp_t *tcp; 17643 struct T_conn_ind *conn_ind; 17644 17645 conn_ind = (struct T_conn_ind *)mp->b_rptr; 17646 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 17647 conn_ind->OPT_length); 17648 /* 17649 * We need to drop the ref on eager that was put 17650 * tcp_rput_data() before trying to send the conn_ind 17651 * to listener. The conn_ind was deferred in tcp_send_conn_ind 17652 * and tcp_wput_accept() is sending this deferred conn_ind but 17653 * listener is closed so we drop the ref. 17654 */ 17655 CONN_DEC_REF(tcp->tcp_connp); 17656 freemsg(mp); 17657 return; 17658 } 17659 putnext(listener->tcp_rq, mp); 17660 } 17661 17662 17663 /* 17664 * This is the STREAMS entry point for T_CONN_RES coming down on 17665 * Acceptor STREAM when sockfs listener does accept processing. 17666 * Read the block comment on top pf tcp_conn_request(). 17667 */ 17668 void 17669 tcp_wput_accept(queue_t *q, mblk_t *mp) 17670 { 17671 queue_t *rq = RD(q); 17672 struct T_conn_res *conn_res; 17673 tcp_t *eager; 17674 tcp_t *listener; 17675 struct T_ok_ack *ok; 17676 t_scalar_t PRIM_type; 17677 mblk_t *opt_mp; 17678 conn_t *econnp; 17679 17680 ASSERT(DB_TYPE(mp) == M_PROTO); 17681 17682 conn_res = (struct T_conn_res *)mp->b_rptr; 17683 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 17684 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { 17685 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 17686 if (mp != NULL) 17687 putnext(rq, mp); 17688 return; 17689 } 17690 switch (conn_res->PRIM_type) { 17691 case O_T_CONN_RES: 17692 case T_CONN_RES: 17693 /* 17694 * We pass up an err ack if allocb fails. This will 17695 * cause sockfs to issue a T_DISCON_REQ which will cause 17696 * tcp_eager_blowoff to be called. sockfs will then call 17697 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream. 
17698 * we need to do the allocb up here because we have to 17699 * make sure rq->q_qinfo->qi_qclose still points to the 17700 * correct function (tcpclose_accept) in case allocb 17701 * fails. 17702 */ 17703 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI); 17704 if (opt_mp == NULL) { 17705 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 17706 if (mp != NULL) 17707 putnext(rq, mp); 17708 return; 17709 } 17710 17711 bcopy(mp->b_rptr + conn_res->OPT_offset, 17712 &eager, conn_res->OPT_length); 17713 PRIM_type = conn_res->PRIM_type; 17714 mp->b_datap->db_type = M_PCPROTO; 17715 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); 17716 ok = (struct T_ok_ack *)mp->b_rptr; 17717 ok->PRIM_type = T_OK_ACK; 17718 ok->CORRECT_prim = PRIM_type; 17719 econnp = eager->tcp_connp; 17720 econnp->conn_dev = (dev_t)q->q_ptr; 17721 eager->tcp_rq = rq; 17722 eager->tcp_wq = q; 17723 rq->q_ptr = econnp; 17724 rq->q_qinfo = &tcp_rinit; 17725 q->q_ptr = econnp; 17726 q->q_qinfo = &tcp_winit; 17727 listener = eager->tcp_listener; 17728 eager->tcp_issocket = B_TRUE; 17729 econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; 17730 17731 /* Put the ref for IP */ 17732 CONN_INC_REF(econnp); 17733 17734 /* 17735 * We should have minimum of 3 references on the conn 17736 * at this point. One each for TCP and IP and one for 17737 * the T_conn_ind that was sent up when the 3-way handshake 17738 * completed. In the normal case we would also have another 17739 * reference (making a total of 4) for the conn being in the 17740 * classifier hash list. However the eager could have received 17741 * an RST subsequently and tcp_closei_local could have removed 17742 * the eager from the classifier hash list, hence we can't 17743 * assert that reference. 17744 */ 17745 ASSERT(econnp->conn_ref >= 3); 17746 17747 /* 17748 * Send the new local address also up to sockfs. There 17749 * should already be enough space in the mp that came 17750 * down from soaccept(). 17751 */ 17752 if (eager->tcp_family == AF_INET) { 17753 sin_t *sin; 17754 17755 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 17756 (sizeof (struct T_ok_ack) + sizeof (sin_t))); 17757 sin = (sin_t *)mp->b_wptr; 17758 mp->b_wptr += sizeof (sin_t); 17759 sin->sin_family = AF_INET; 17760 sin->sin_port = eager->tcp_lport; 17761 sin->sin_addr.s_addr = eager->tcp_ipha->ipha_src; 17762 } else { 17763 sin6_t *sin6; 17764 17765 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 17766 sizeof (struct T_ok_ack) + sizeof (sin6_t)); 17767 sin6 = (sin6_t *)mp->b_wptr; 17768 mp->b_wptr += sizeof (sin6_t); 17769 sin6->sin6_family = AF_INET6; 17770 sin6->sin6_port = eager->tcp_lport; 17771 if (eager->tcp_ipversion == IPV4_VERSION) { 17772 sin6->sin6_flowinfo = 0; 17773 IN6_IPADDR_TO_V4MAPPED( 17774 eager->tcp_ipha->ipha_src, 17775 &sin6->sin6_addr); 17776 } else { 17777 ASSERT(eager->tcp_ip6h != NULL); 17778 sin6->sin6_flowinfo = 17779 eager->tcp_ip6h->ip6_vcf & 17780 ~IPV6_VERS_AND_FLOW_MASK; 17781 sin6->sin6_addr = eager->tcp_ip6h->ip6_src; 17782 } 17783 sin6->sin6_scope_id = 0; 17784 sin6->__sin6_src_id = 0; 17785 } 17786 17787 putnext(rq, mp); 17788 17789 opt_mp->b_datap->db_type = M_SETOPTS; 17790 opt_mp->b_wptr += sizeof (struct stroptions); 17791 17792 /* 17793 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO 17794 * from listener to acceptor. The message is chained on the 17795 * bind_mp which tcp_rput_other will send down to IP. 
17796 */ 17797 if (listener->tcp_bound_if != 0) { 17798 /* allocate optmgmt req */ 17799 mp = tcp_setsockopt_mp(IPPROTO_IPV6, 17800 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if, 17801 sizeof (int)); 17802 if (mp != NULL) 17803 linkb(opt_mp, mp); 17804 } 17805 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { 17806 uint_t on = 1; 17807 17808 /* allocate optmgmt req */ 17809 mp = tcp_setsockopt_mp(IPPROTO_IPV6, 17810 IPV6_RECVPKTINFO, (char *)&on, sizeof (on)); 17811 if (mp != NULL) 17812 linkb(opt_mp, mp); 17813 } 17814 17815 17816 mutex_enter(&listener->tcp_eager_lock); 17817 17818 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 17819 17820 tcp_t *tail; 17821 tcp_t *tcp; 17822 mblk_t *mp1; 17823 17824 tcp = listener->tcp_eager_prev_q0; 17825 /* 17826 * listener->tcp_eager_prev_q0 points to the TAIL of the 17827 * deferred T_conn_ind queue. We need to get to the head 17828 * of the queue in order to send up T_conn_ind the same 17829 * order as how the 3WHS is completed. 17830 */ 17831 while (tcp != listener) { 17832 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 && 17833 !tcp->tcp_kssl_pending) 17834 break; 17835 else 17836 tcp = tcp->tcp_eager_prev_q0; 17837 } 17838 /* None of the pending eagers can be sent up now */ 17839 if (tcp == listener) 17840 goto no_more_eagers; 17841 17842 mp1 = tcp->tcp_conn.tcp_eager_conn_ind; 17843 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 17844 /* Move from q0 to q */ 17845 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 17846 listener->tcp_conn_req_cnt_q0--; 17847 listener->tcp_conn_req_cnt_q++; 17848 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 17849 tcp->tcp_eager_prev_q0; 17850 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 17851 tcp->tcp_eager_next_q0; 17852 tcp->tcp_eager_prev_q0 = NULL; 17853 tcp->tcp_eager_next_q0 = NULL; 17854 tcp->tcp_conn_def_q0 = B_FALSE; 17855 17856 /* 17857 * Insert at end of the queue because sockfs sends 17858 * down T_CONN_RES in chronological order. Leaving 17859 * the older conn indications at front of the queue 17860 * helps reducing search time. 17861 */ 17862 tail = listener->tcp_eager_last_q; 17863 if (tail != NULL) { 17864 tail->tcp_eager_next_q = tcp; 17865 } else { 17866 listener->tcp_eager_next_q = tcp; 17867 } 17868 listener->tcp_eager_last_q = tcp; 17869 tcp->tcp_eager_next_q = NULL; 17870 17871 /* Need to get inside the listener perimeter */ 17872 CONN_INC_REF(listener->tcp_connp); 17873 squeue_fill(listener->tcp_connp->conn_sqp, mp1, 17874 tcp_send_pending, listener->tcp_connp, 17875 SQTAG_TCP_SEND_PENDING); 17876 } 17877 no_more_eagers: 17878 tcp_eager_unlink(eager); 17879 mutex_exit(&listener->tcp_eager_lock); 17880 17881 /* 17882 * At this point, the eager is detached from the listener 17883 * but we still have an extra refs on eager (apart from the 17884 * usual tcp references). The ref was placed in tcp_rput_data 17885 * before sending the conn_ind in tcp_send_conn_ind. 17886 * The ref will be dropped in tcp_accept_finish(). 
17887 */ 17888 squeue_enter_nodrain(econnp->conn_sqp, opt_mp, 17889 tcp_accept_finish, econnp, SQTAG_TCP_ACCEPT_FINISH_Q0); 17890 return; 17891 default: 17892 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); 17893 if (mp != NULL) 17894 putnext(rq, mp); 17895 return; 17896 } 17897 } 17898 17899 void 17900 tcp_wput(queue_t *q, mblk_t *mp) 17901 { 17902 conn_t *connp = Q_TO_CONN(q); 17903 tcp_t *tcp; 17904 void (*output_proc)(); 17905 t_scalar_t type; 17906 uchar_t *rptr; 17907 struct iocblk *iocp; 17908 uint32_t msize; 17909 17910 ASSERT(connp->conn_ref >= 2); 17911 17912 switch (DB_TYPE(mp)) { 17913 case M_DATA: 17914 tcp = connp->conn_tcp; 17915 ASSERT(tcp != NULL); 17916 17917 msize = msgdsize(mp); 17918 17919 mutex_enter(&connp->conn_lock); 17920 CONN_INC_REF_LOCKED(connp); 17921 17922 tcp->tcp_squeue_bytes += msize; 17923 if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { 17924 mutex_exit(&connp->conn_lock); 17925 tcp_setqfull(tcp); 17926 } else 17927 mutex_exit(&connp->conn_lock); 17928 17929 (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, 17930 tcp_output, connp, SQTAG_TCP_OUTPUT); 17931 return; 17932 case M_PROTO: 17933 case M_PCPROTO: 17934 /* 17935 * if it is a snmp message, don't get behind the squeue 17936 */ 17937 tcp = connp->conn_tcp; 17938 rptr = mp->b_rptr; 17939 if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 17940 type = ((union T_primitives *)rptr)->type; 17941 } else { 17942 if (tcp->tcp_debug) { 17943 (void) strlog(TCP_MOD_ID, 0, 1, 17944 SL_ERROR|SL_TRACE, 17945 "tcp_wput_proto, dropping one..."); 17946 } 17947 freemsg(mp); 17948 return; 17949 } 17950 if (type == T_SVR4_OPTMGMT_REQ) { 17951 cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred); 17952 if (snmpcom_req(q, mp, tcp_snmp_set, tcp_snmp_get, 17953 cr)) { 17954 /* 17955 * This was a SNMP request 17956 */ 17957 return; 17958 } else { 17959 output_proc = tcp_wput_proto; 17960 } 17961 } else { 17962 output_proc = tcp_wput_proto; 17963 } 17964 break; 17965 case M_IOCTL: 17966 /* 17967 * Most ioctls can be processed right away without going via 17968 * squeues - process them right here. Those that do require 17969 * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK) 17970 * are processed by tcp_wput_ioctl(). 17971 */ 17972 iocp = (struct iocblk *)mp->b_rptr; 17973 tcp = connp->conn_tcp; 17974 17975 switch (iocp->ioc_cmd) { 17976 case TCP_IOC_ABORT_CONN: 17977 tcp_ioctl_abort_conn(q, mp); 17978 return; 17979 case TI_GETPEERNAME: 17980 if (tcp->tcp_state < TCPS_SYN_RCVD) { 17981 iocp->ioc_error = ENOTCONN; 17982 iocp->ioc_count = 0; 17983 mp->b_datap->db_type = M_IOCACK; 17984 qreply(q, mp); 17985 return; 17986 } 17987 /* FALLTHRU */ 17988 case TI_GETMYNAME: 17989 mi_copyin(q, mp, NULL, 17990 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 17991 return; 17992 case ND_SET: 17993 /* nd_getset does the necessary checks */ 17994 case ND_GET: 17995 if (!nd_getset(q, tcp_g_nd, mp)) { 17996 CALL_IP_WPUT(connp, q, mp); 17997 return; 17998 } 17999 qreply(q, mp); 18000 return; 18001 case TCP_IOC_DEFAULT_Q: 18002 /* 18003 * Wants to be the default wq. Check the credentials 18004 * first, the rest is executed via squeue. 
18005 */ 18006 if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { 18007 iocp->ioc_error = EPERM; 18008 iocp->ioc_count = 0; 18009 mp->b_datap->db_type = M_IOCACK; 18010 qreply(q, mp); 18011 return; 18012 } 18013 output_proc = tcp_wput_ioctl; 18014 break; 18015 default: 18016 output_proc = tcp_wput_ioctl; 18017 break; 18018 } 18019 break; 18020 default: 18021 output_proc = tcp_wput_nondata; 18022 break; 18023 } 18024 18025 CONN_INC_REF(connp); 18026 (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, 18027 output_proc, connp, SQTAG_TCP_WPUT_OTHER); 18028 } 18029 18030 /* 18031 * Initial STREAMS write side put() procedure for sockets. It tries to 18032 * handle the T_CAPABILITY_REQ which sockfs sends down while setting 18033 * up the socket without using the squeue. Non T_CAPABILITY_REQ messages 18034 * are handled by tcp_wput() as usual. 18035 * 18036 * All further messages will also be handled by tcp_wput() because we cannot 18037 * be sure that the above short cut is safe later. 18038 */ 18039 static void 18040 tcp_wput_sock(queue_t *wq, mblk_t *mp) 18041 { 18042 conn_t *connp = Q_TO_CONN(wq); 18043 tcp_t *tcp = connp->conn_tcp; 18044 struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr; 18045 18046 ASSERT(wq->q_qinfo == &tcp_sock_winit); 18047 wq->q_qinfo = &tcp_winit; 18048 18049 ASSERT(IPCL_IS_TCP(connp)); 18050 ASSERT(TCP_IS_SOCKET(tcp)); 18051 18052 if (DB_TYPE(mp) == M_PCPROTO && 18053 MBLKL(mp) == sizeof (struct T_capability_req) && 18054 car->PRIM_type == T_CAPABILITY_REQ) { 18055 tcp_capability_req(tcp, mp); 18056 return; 18057 } 18058 18059 tcp_wput(wq, mp); 18060 } 18061 18062 static boolean_t 18063 tcp_zcopy_check(tcp_t *tcp) 18064 { 18065 conn_t *connp = tcp->tcp_connp; 18066 ire_t *ire; 18067 boolean_t zc_enabled = B_FALSE; 18068 18069 if (do_tcpzcopy == 2) 18070 zc_enabled = B_TRUE; 18071 else if (tcp->tcp_ipversion == IPV4_VERSION && 18072 IPCL_IS_CONNECTED(connp) && 18073 (connp->conn_flags & IPCL_CHECK_POLICY) == 0 && 18074 connp->conn_dontroute == 0 && 18075 !connp->conn_nexthop_set && 18076 connp->conn_xmit_if_ill == NULL && 18077 connp->conn_nofailover_ill == NULL && 18078 do_tcpzcopy == 1) { 18079 /* 18080 * the checks above closely resemble the fast path checks 18081 * in tcp_send_data(). 
18082 */ 18083 mutex_enter(&connp->conn_lock); 18084 ire = connp->conn_ire_cache; 18085 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 18086 if (ire != NULL && !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18087 IRE_REFHOLD(ire); 18088 if (ire->ire_stq != NULL) { 18089 ill_t *ill = (ill_t *)ire->ire_stq->q_ptr; 18090 18091 zc_enabled = ill && (ill->ill_capabilities & 18092 ILL_CAPAB_ZEROCOPY) && 18093 (ill->ill_zerocopy_capab-> 18094 ill_zerocopy_flags != 0); 18095 } 18096 IRE_REFRELE(ire); 18097 } 18098 mutex_exit(&connp->conn_lock); 18099 } 18100 tcp->tcp_snd_zcopy_on = zc_enabled; 18101 if (!TCP_IS_DETACHED(tcp)) { 18102 if (zc_enabled) { 18103 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMSAFE); 18104 TCP_STAT(tcp_zcopy_on); 18105 } else { 18106 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); 18107 TCP_STAT(tcp_zcopy_off); 18108 } 18109 } 18110 return (zc_enabled); 18111 } 18112 18113 static mblk_t * 18114 tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp) 18115 { 18116 if (do_tcpzcopy == 2) 18117 return (bp); 18118 else if (tcp->tcp_snd_zcopy_on) { 18119 tcp->tcp_snd_zcopy_on = B_FALSE; 18120 if (!TCP_IS_DETACHED(tcp)) { 18121 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); 18122 TCP_STAT(tcp_zcopy_disable); 18123 } 18124 } 18125 return (tcp_zcopy_backoff(tcp, bp, 0)); 18126 } 18127 18128 /* 18129 * Backoff from a zero-copy mblk by copying data to a new mblk and freeing 18130 * the original desballoca'ed segmapped mblk. 18131 */ 18132 static mblk_t * 18133 tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist) 18134 { 18135 mblk_t *head, *tail, *nbp; 18136 if (IS_VMLOANED_MBLK(bp)) { 18137 TCP_STAT(tcp_zcopy_backoff); 18138 if ((head = copyb(bp)) == NULL) { 18139 /* fail to backoff; leave it for the next backoff */ 18140 tcp->tcp_xmit_zc_clean = B_FALSE; 18141 return (bp); 18142 } 18143 if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 18144 if (fix_xmitlist) 18145 tcp_zcopy_notify(tcp); 18146 else 18147 head->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 18148 } 18149 nbp = bp->b_cont; 18150 if (fix_xmitlist) { 18151 head->b_prev = bp->b_prev; 18152 head->b_next = bp->b_next; 18153 if (tcp->tcp_xmit_tail == bp) 18154 tcp->tcp_xmit_tail = head; 18155 } 18156 bp->b_next = NULL; 18157 bp->b_prev = NULL; 18158 freeb(bp); 18159 } else { 18160 head = bp; 18161 nbp = bp->b_cont; 18162 } 18163 tail = head; 18164 while (nbp) { 18165 if (IS_VMLOANED_MBLK(nbp)) { 18166 TCP_STAT(tcp_zcopy_backoff); 18167 if ((tail->b_cont = copyb(nbp)) == NULL) { 18168 tcp->tcp_xmit_zc_clean = B_FALSE; 18169 tail->b_cont = nbp; 18170 return (head); 18171 } 18172 tail = tail->b_cont; 18173 if (nbp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 18174 if (fix_xmitlist) 18175 tcp_zcopy_notify(tcp); 18176 else 18177 tail->b_datap->db_struioflag |= 18178 STRUIO_ZCNOTIFY; 18179 } 18180 bp = nbp; 18181 nbp = nbp->b_cont; 18182 if (fix_xmitlist) { 18183 tail->b_prev = bp->b_prev; 18184 tail->b_next = bp->b_next; 18185 if (tcp->tcp_xmit_tail == bp) 18186 tcp->tcp_xmit_tail = tail; 18187 } 18188 bp->b_next = NULL; 18189 bp->b_prev = NULL; 18190 freeb(bp); 18191 } else { 18192 tail->b_cont = nbp; 18193 tail = nbp; 18194 nbp = nbp->b_cont; 18195 } 18196 } 18197 if (fix_xmitlist) { 18198 tcp->tcp_xmit_last = tail; 18199 tcp->tcp_xmit_zc_clean = B_TRUE; 18200 } 18201 return (head); 18202 } 18203 18204 static void 18205 tcp_zcopy_notify(tcp_t *tcp) 18206 { 18207 struct stdata *stp; 18208 18209 if (tcp->tcp_detached) 18210 return; 18211 stp = STREAM(tcp->tcp_rq); 18212 mutex_enter(&stp->sd_lock); 18213 stp->sd_flag |= 
STZCNOTIFY; 18214 cv_broadcast(&stp->sd_zcopy_wait); 18215 mutex_exit(&stp->sd_lock); 18216 } 18217 18218 static void 18219 tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) 18220 { 18221 ipha_t *ipha; 18222 ipaddr_t src; 18223 ipaddr_t dst; 18224 uint32_t cksum; 18225 ire_t *ire; 18226 uint16_t *up; 18227 ill_t *ill; 18228 conn_t *connp = tcp->tcp_connp; 18229 uint32_t hcksum_txflags = 0; 18230 mblk_t *ire_fp_mp; 18231 uint_t ire_fp_mp_len; 18232 18233 ASSERT(DB_TYPE(mp) == M_DATA); 18234 18235 if (DB_CRED(mp) == NULL) 18236 mblk_setcred(mp, CONN_CRED(connp)); 18237 18238 ipha = (ipha_t *)mp->b_rptr; 18239 src = ipha->ipha_src; 18240 dst = ipha->ipha_dst; 18241 18242 /* 18243 * Drop off fast path for IPv6 and also if options are present or 18244 * we need to resolve a TS label. 18245 */ 18246 if (tcp->tcp_ipversion != IPV4_VERSION || 18247 !IPCL_IS_CONNECTED(connp) || 18248 (connp->conn_flags & IPCL_CHECK_POLICY) != 0 || 18249 connp->conn_dontroute || 18250 connp->conn_nexthop_set || 18251 connp->conn_xmit_if_ill != NULL || 18252 connp->conn_nofailover_ill != NULL || 18253 !connp->conn_ulp_labeled || 18254 ipha->ipha_ident == IP_HDR_INCLUDED || 18255 ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || 18256 IPP_ENABLED(IPP_LOCAL_OUT)) { 18257 if (tcp->tcp_snd_zcopy_aware) 18258 mp = tcp_zcopy_disable(tcp, mp); 18259 TCP_STAT(tcp_ip_send); 18260 CALL_IP_WPUT(connp, q, mp); 18261 return; 18262 } 18263 18264 mutex_enter(&connp->conn_lock); 18265 ire = connp->conn_ire_cache; 18266 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 18267 if (ire != NULL && ire->ire_addr == dst && 18268 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18269 IRE_REFHOLD(ire); 18270 mutex_exit(&connp->conn_lock); 18271 } else { 18272 boolean_t cached = B_FALSE; 18273 18274 /* force a recheck later on */ 18275 tcp->tcp_ire_ill_check_done = B_FALSE; 18276 18277 TCP_DBGSTAT(tcp_ire_null1); 18278 connp->conn_ire_cache = NULL; 18279 mutex_exit(&connp->conn_lock); 18280 if (ire != NULL) 18281 IRE_REFRELE_NOTR(ire); 18282 ire = ire_cache_lookup(dst, connp->conn_zoneid, 18283 MBLK_GETLABEL(mp)); 18284 if (ire == NULL) { 18285 if (tcp->tcp_snd_zcopy_aware) 18286 mp = tcp_zcopy_backoff(tcp, mp, 0); 18287 TCP_STAT(tcp_ire_null); 18288 CALL_IP_WPUT(connp, q, mp); 18289 return; 18290 } 18291 IRE_REFHOLD_NOTR(ire); 18292 /* 18293 * Since we are inside the squeue, there cannot be another 18294 * thread in TCP trying to set the conn_ire_cache now. The 18295 * check for IRE_MARK_CONDEMNED ensures that an interface 18296 * unplumb thread has not yet started cleaning up the conns. 18297 * Hence we don't need to grab the conn lock. 18298 */ 18299 if (!(connp->conn_state_flags & CONN_CLOSING)) { 18300 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 18301 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18302 connp->conn_ire_cache = ire; 18303 cached = B_TRUE; 18304 } 18305 rw_exit(&ire->ire_bucket->irb_lock); 18306 } 18307 18308 /* 18309 * We can continue to use the ire but since it was 18310 * not cached, we should drop the extra reference. 18311 */ 18312 if (!cached) 18313 IRE_REFRELE_NOTR(ire); 18314 18315 /* 18316 * Rampart note: no need to select a new label here, since 18317 * labels are not allowed to change during the life of a TCP 18318 * connection. 18319 */ 18320 } 18321 18322 /* 18323 * The following if case identifies whether or not 18324 * we are forced to take the slowpath. 
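 *
 * (Roughly: a multirouted destination, a route with no transmit
 * queue, a datagram larger than the route's ire_max_frag, a missing
 * link-layer fast path header, or an mblk without enough headroom to
 * prepend that header all push the packet through CALL_IP_WPUT()
 * rather than the direct transmit below.)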
18325 */ 18326 if (ire->ire_flags & RTF_MULTIRT || 18327 ire->ire_stq == NULL || 18328 ire->ire_max_frag < ntohs(ipha->ipha_length) || 18329 (ire->ire_nce != NULL && 18330 (ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) || 18331 (ire_fp_mp_len = MBLKL(ire_fp_mp)) > MBLKHEAD(mp)) { 18332 if (tcp->tcp_snd_zcopy_aware) 18333 mp = tcp_zcopy_disable(tcp, mp); 18334 TCP_STAT(tcp_ip_ire_send); 18335 IRE_REFRELE(ire); 18336 CALL_IP_WPUT(connp, q, mp); 18337 return; 18338 } 18339 18340 ill = ire_to_ill(ire); 18341 if (connp->conn_outgoing_ill != NULL) { 18342 ill_t *conn_outgoing_ill = NULL; 18343 /* 18344 * Choose a good ill in the group to send the packets on. 18345 */ 18346 ire = conn_set_outgoing_ill(connp, ire, &conn_outgoing_ill); 18347 ill = ire_to_ill(ire); 18348 } 18349 ASSERT(ill != NULL); 18350 18351 if (!tcp->tcp_ire_ill_check_done) { 18352 tcp_ire_ill_check(tcp, ire, ill, B_TRUE); 18353 tcp->tcp_ire_ill_check_done = B_TRUE; 18354 } 18355 18356 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 18357 ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 18358 #ifndef _BIG_ENDIAN 18359 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 18360 #endif 18361 18362 /* 18363 * Check to see if we need to re-enable MDT for this connection 18364 * because it was previously disabled due to changes in the ill; 18365 * note that by doing it here, this re-enabling only applies when 18366 * the packet is not dispatched through CALL_IP_WPUT(). 18367 * 18368 * That means for IPv4, it is worth re-enabling MDT for the fastpath 18369 * case, since that's how we ended up here. For IPv6, we do the 18370 * re-enabling work in ip_xmit_v6(), albeit indirectly via squeue. 18371 */ 18372 if (connp->conn_mdt_ok && !tcp->tcp_mdt && ILL_MDT_USABLE(ill)) { 18373 /* 18374 * Restore MDT for this connection, so that next time around 18375 * it is eligible to go through tcp_multisend() path again. 18376 */ 18377 TCP_STAT(tcp_mdt_conn_resumed1); 18378 tcp->tcp_mdt = B_TRUE; 18379 ip1dbg(("tcp_send_data: reenabling MDT for connp %p on " 18380 "interface %s\n", (void *)connp, ill->ill_name)); 18381 } 18382 18383 if (tcp->tcp_snd_zcopy_aware) { 18384 if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || 18385 (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) 18386 mp = tcp_zcopy_disable(tcp, mp); 18387 /* 18388 * we shouldn't need to reset ipha as the mp containing 18389 * ipha should never be a zero-copy mp. 18390 */ 18391 } 18392 18393 if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { 18394 ASSERT(ill->ill_hcksum_capab != NULL); 18395 hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; 18396 } 18397 18398 /* pseudo-header checksum (do it in parts for IP header checksum) */ 18399 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 18400 18401 ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); 18402 up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); 18403 18404 IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, 18405 IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); 18406 18407 /* Software checksum? 
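 * If IP_CKSUM_XMIT_FAST() above could not hand the work to the hardware,
 * it computed the TCP checksum in software and left DB_CKSUMFLAGS(mp)
 * clear, so account for the software-checksummed bytes here.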
*/ 18408 if (DB_CKSUMFLAGS(mp) == 0) { 18409 TCP_STAT(tcp_out_sw_cksum); 18410 TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, 18411 ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); 18412 } 18413 18414 ipha->ipha_fragment_offset_and_flags |= 18415 (uint32_t)htons(ire->ire_frag_flag); 18416 18417 /* Calculate IP header checksum if hardware isn't capable */ 18418 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { 18419 IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0], 18420 ((uint16_t *)ipha)[4]); 18421 } 18422 18423 ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); 18424 mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; 18425 bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); 18426 18427 UPDATE_OB_PKT_COUNT(ire); 18428 ire->ire_last_used_time = lbolt; 18429 BUMP_MIB(&ip_mib, ipOutRequests); 18430 18431 if (ILL_DLS_CAPABLE(ill)) { 18432 /* 18433 * Send the packet directly to DLD, where it may be queued 18434 * depending on the availability of transmit resources at 18435 * the media layer. 18436 */ 18437 IP_DLS_ILL_TX(ill, mp); 18438 } else { 18439 putnext(ire->ire_stq, mp); 18440 } 18441 IRE_REFRELE(ire); 18442 } 18443 18444 /* 18445 * This handles the case when the receiver has shrunk its win. Per RFC 1122, 18446 * if the receiver shrinks the window, i.e. moves the right window edge to the 18447 * left, then we should not send new data, but should retransmit normally the 18448 * old unacked data between suna and suna + swnd. We might have sent data 18449 * that is now outside the new window; pretend that we didn't send it. 18450 */ 18451 static void 18452 tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count) 18453 { 18454 uint32_t snxt = tcp->tcp_snxt; 18455 mblk_t *xmit_tail; 18456 int32_t offset; 18457 18458 ASSERT(shrunk_count > 0); 18459 18460 /* Pretend we didn't send the data outside the window */ 18461 snxt -= shrunk_count; 18462 18463 /* Get the mblk and the offset in it per the shrunk window */ 18464 xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset); 18465 18466 ASSERT(xmit_tail != NULL); 18467 18468 /* Reset all the values per the now shrunk window */ 18469 tcp->tcp_snxt = snxt; 18470 tcp->tcp_xmit_tail = xmit_tail; 18471 tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr - xmit_tail->b_rptr - 18472 offset; 18473 tcp->tcp_unsent += shrunk_count; 18474 18475 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0) 18476 /* 18477 * Make sure the timer is running so that we will probe a zero 18478 * window. 18479 */ 18480 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 18481 } 18482 18483 18484 /* 18485 * The TCP normal data output path. 18486 * NOTE: the logic of the fast path is duplicated from this function. 18487 */ 18488 static void 18489 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) 18490 { 18491 int len; 18492 mblk_t *local_time; 18493 mblk_t *mp1; 18494 uint32_t snxt; 18495 int tail_unsent; 18496 int tcpstate; 18497 int usable = 0; 18498 mblk_t *xmit_tail; 18499 queue_t *q = tcp->tcp_wq; 18500 int32_t mss; 18501 int32_t num_sack_blk = 0; 18502 int32_t tcp_hdr_len; 18503 int32_t tcp_tcp_hdr_len; 18504 int mdt_thres; 18505 int rc; 18506 18507 tcpstate = tcp->tcp_state; 18508 if (mp == NULL) { 18509 /* 18510 * tcp_wput_data() with NULL mp should only be called when 18511 * there is unsent data. 18512 */ 18513 ASSERT(tcp->tcp_unsent > 0); 18514 /* Really tacky... but we need this for detached closes.
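 * A NULL mp simply means "flush what is already queued": len is taken
 * from tcp_unsent and we jump straight to data_null to (re)start
 * transmission from tcp_xmit_tail.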
*/ 18515 len = tcp->tcp_unsent; 18516 goto data_null; 18517 } 18518 18519 #if CCS_STATS 18520 wrw_stats.tot.count++; 18521 wrw_stats.tot.bytes += msgdsize(mp); 18522 #endif 18523 ASSERT(mp->b_datap->db_type == M_DATA); 18524 /* 18525 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 18526 * or before a connection attempt has begun. 18527 */ 18528 if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 18529 (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 18530 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 18531 #ifdef DEBUG 18532 cmn_err(CE_WARN, 18533 "tcp_wput_data: data after ordrel, %s", 18534 tcp_display(tcp, NULL, 18535 DISP_ADDR_AND_PORT)); 18536 #else 18537 if (tcp->tcp_debug) { 18538 (void) strlog(TCP_MOD_ID, 0, 1, 18539 SL_TRACE|SL_ERROR, 18540 "tcp_wput_data: data after ordrel, %s\n", 18541 tcp_display(tcp, NULL, 18542 DISP_ADDR_AND_PORT)); 18543 } 18544 #endif /* DEBUG */ 18545 } 18546 if (tcp->tcp_snd_zcopy_aware && 18547 (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0) 18548 tcp_zcopy_notify(tcp); 18549 freemsg(mp); 18550 if (tcp->tcp_flow_stopped && 18551 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 18552 tcp_clrqfull(tcp); 18553 } 18554 return; 18555 } 18556 18557 /* Strip empties */ 18558 for (;;) { 18559 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 18560 (uintptr_t)INT_MAX); 18561 len = (int)(mp->b_wptr - mp->b_rptr); 18562 if (len > 0) 18563 break; 18564 mp1 = mp; 18565 mp = mp->b_cont; 18566 freeb(mp1); 18567 if (!mp) { 18568 return; 18569 } 18570 } 18571 18572 /* If we are the first on the list ... */ 18573 if (tcp->tcp_xmit_head == NULL) { 18574 tcp->tcp_xmit_head = mp; 18575 tcp->tcp_xmit_tail = mp; 18576 tcp->tcp_xmit_tail_unsent = len; 18577 } else { 18578 /* If tiny tx and room in txq tail, pullup to save mblks. */ 18579 struct datab *dp; 18580 18581 mp1 = tcp->tcp_xmit_last; 18582 if (len < tcp_tx_pull_len && 18583 (dp = mp1->b_datap)->db_ref == 1 && 18584 dp->db_lim - mp1->b_wptr >= len) { 18585 ASSERT(len > 0); 18586 ASSERT(!mp1->b_cont); 18587 if (len == 1) { 18588 *mp1->b_wptr++ = *mp->b_rptr; 18589 } else { 18590 bcopy(mp->b_rptr, mp1->b_wptr, len); 18591 mp1->b_wptr += len; 18592 } 18593 if (mp1 == tcp->tcp_xmit_tail) 18594 tcp->tcp_xmit_tail_unsent += len; 18595 mp1->b_cont = mp->b_cont; 18596 if (tcp->tcp_snd_zcopy_aware && 18597 (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 18598 mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 18599 freeb(mp); 18600 mp = mp1; 18601 } else { 18602 tcp->tcp_xmit_last->b_cont = mp; 18603 } 18604 len += tcp->tcp_unsent; 18605 } 18606 18607 /* Tack on however many more positive length mblks we have */ 18608 if ((mp1 = mp->b_cont) != NULL) { 18609 do { 18610 int tlen; 18611 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 18612 (uintptr_t)INT_MAX); 18613 tlen = (int)(mp1->b_wptr - mp1->b_rptr); 18614 if (tlen <= 0) { 18615 mp->b_cont = mp1->b_cont; 18616 freeb(mp1); 18617 } else { 18618 len += tlen; 18619 mp = mp1; 18620 } 18621 } while ((mp1 = mp->b_cont) != NULL); 18622 } 18623 tcp->tcp_xmit_last = mp; 18624 tcp->tcp_unsent = len; 18625 18626 if (urgent) 18627 usable = 1; 18628 18629 data_null: 18630 snxt = tcp->tcp_snxt; 18631 xmit_tail = tcp->tcp_xmit_tail; 18632 tail_unsent = tcp->tcp_xmit_tail_unsent; 18633 18634 /* 18635 * Note that tcp_mss has been adjusted to take into account the 18636 * timestamp option if applicable. Because SACK options do not 18637 * appear in every TCP segments and they are of variable lengths, 18638 * they cannot be included in tcp_mss. 
Thus we need to calculate 18639 * the actual segment length when we need to send a segment which 18640 * includes SACK options. 18641 */ 18642 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 18643 int32_t opt_len; 18644 18645 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 18646 tcp->tcp_num_sack_blk); 18647 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 18648 2 + TCPOPT_HEADER_LEN; 18649 mss = tcp->tcp_mss - opt_len; 18650 tcp_hdr_len = tcp->tcp_hdr_len + opt_len; 18651 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + opt_len; 18652 } else { 18653 mss = tcp->tcp_mss; 18654 tcp_hdr_len = tcp->tcp_hdr_len; 18655 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; 18656 } 18657 18658 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 18659 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 18660 SET_TCP_INIT_CWND(tcp, mss, tcp_slow_start_after_idle); 18661 } 18662 if (tcpstate == TCPS_SYN_RCVD) { 18663 /* 18664 * The three-way connection establishment handshake is not 18665 * complete yet. We want to queue the data for transmission 18666 * after entering ESTABLISHED state (RFC793). A jump to the 18667 * "done" label effectively leaves data on the queue. 18668 */ 18669 goto done; 18670 } else { 18671 int usable_r; 18672 18673 /* 18674 * In the special case when cwnd is zero, which can only 18675 * happen if the connection is ECN capable, return now. 18676 * New segments are sent using tcp_timer(). The timer 18677 * is set in tcp_rput_data(). 18678 */ 18679 if (tcp->tcp_cwnd == 0) { 18680 /* 18681 * Note that tcp_cwnd is 0 before 3-way handshake is 18682 * finished. 18683 */ 18684 ASSERT(tcp->tcp_ecn_ok || 18685 tcp->tcp_state < TCPS_ESTABLISHED); 18686 return; 18687 } 18688 18689 /* NOTE: trouble if xmitting while SYN not acked? */ 18690 usable_r = snxt - tcp->tcp_suna; 18691 usable_r = tcp->tcp_swnd - usable_r; 18692 18693 /* 18694 * Check if the receiver has shrunk the window. If 18695 * tcp_wput_data() with NULL mp is called, tcp_fin_sent 18696 * cannot be set as there is unsent data, so FIN cannot 18697 * be sent out. Otherwise, we need to take into account 18698 * the FIN as it consumes an "invisible" sequence number. 18699 */ 18700 ASSERT(tcp->tcp_fin_sent == 0); 18701 if (usable_r < 0) { 18702 /* 18703 * The receiver has shrunk the window and we have sent 18704 * -usable_r bytes of data beyond the window; re-adjust. 18705 * 18706 * If TCP window scaling is enabled, there can be a 18707 * round-down error as the advertised receive window 18708 * is actually right shifted n bits. This means that 18709 * the information in the lower n bits is wiped out. It will look 18710 * like the window is shrunk. Do a check here to 18711 * see if the shrunk amount is actually within the 18712 * error in window calculation. If it is, just 18713 * return. Note that this check is inside the 18714 * shrunk window check. This makes sure that even 18715 * though tcp_process_shrunk_swnd() is not called, 18716 * we will stop further processing. 18717 */ 18718 if ((-usable_r >> tcp->tcp_snd_ws) > 0) { 18719 tcp_process_shrunk_swnd(tcp, -usable_r); 18720 } 18721 return; 18722 } 18723 18724 /* usable = MIN(swnd, cwnd) - unacked_bytes */ 18725 if (tcp->tcp_swnd > tcp->tcp_cwnd) 18726 usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd; 18727 18728 /* usable = MIN(usable, unsent) */ 18729 if (usable_r > len) 18730 usable_r = len; 18731 18732 /* usable = MAX(usable, {1 for urgent, 0 for data}) */ 18733 if (usable_r > 0) { 18734 usable = usable_r; 18735 } else { 18736 /* Bypass all other unnecessary processing.
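 * (As an illustration of the arithmetic above, with made-up numbers:
 * swnd = 48k, cwnd = 16k, snxt - suna = 4k and 64k of unsent data give
 * usable_r = 48k - 4k = 44k, minus (swnd - cwnd) = 32k leaves 12k,
 * which is then capped by len; only a non-positive result lands here.)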
*/ 18737 goto done; 18738 } 18739 } 18740 18741 local_time = (mblk_t *)lbolt; 18742 18743 /* 18744 * "Our" Nagle Algorithm. This is not the same as in the old 18745 * BSD. This is more in line with the true intent of Nagle. 18746 * 18747 * The conditions are: 18748 * 1. The amount of unsent data (or amount of data which can be 18749 * sent, whichever is smaller) is less than the Nagle limit. 18750 * 2. The last sent size is also less than the Nagle limit. 18751 * 3. There is unack'ed data. 18752 * 4. Urgent pointer is not set. Send urgent data ignoring the 18753 * Nagle algorithm. This reduces the probability that urgent 18754 * bytes get "merged" together. 18755 * 5. The app has not closed the connection. This eliminates the 18756 * wait time of the receiving side waiting for the last piece of 18757 * (small) data. 18758 * 18759 * If all are satisfied, exit without sending anything. Note 18760 * that the Nagle limit can be smaller than 1 MSS. The Nagle limit is 18761 * the smaller of 1 MSS and the global tcp_naglim_def (which defaults to 18762 * 4095). 18763 */ 18764 if (usable < (int)tcp->tcp_naglim && 18765 tcp->tcp_naglim > tcp->tcp_last_sent_len && 18766 snxt != tcp->tcp_suna && 18767 !(tcp->tcp_valid_bits & TCP_URG_VALID) && 18768 !(tcp->tcp_valid_bits & TCP_FSS_VALID)) { 18769 goto done; 18770 } 18771 18772 if (tcp->tcp_cork) { 18773 /* 18774 * If the tcp->tcp_cork option is set, then we have to force 18775 * TCP not to send partial segments (smaller than MSS bytes). 18776 * We round usable down to a whole number of full-mss segments 18777 * and save the rest of the remaining data for later. 18778 */ 18779 if (usable < mss) 18780 goto done; 18781 usable = (usable / mss) * mss; 18782 } 18783 18784 /* Update the latest receive window size in TCP header. */ 18785 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 18786 tcp->tcp_tcph->th_win); 18787 18788 /* 18789 * Determine if it's worthwhile to attempt MDT, based on: 18790 * 18791 * 1. Simple TCP/IP{v4,v6} (no options). 18792 * 2. IPSEC/IPQoS processing is not needed for the TCP connection. 18793 * 3. The TCP connection is in ESTABLISHED state. 18794 * 4. The TCP is not detached. 18795 * 18796 * If any of the above conditions have changed during the 18797 * connection, stop using MDT and restore the stream head 18798 * parameters accordingly.
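 * The test below is the run-time mirror of these conditions: simple
 * IPv4/IPv6 headers, ESTABLISHED state, an attached stream, a
 * Multidata-capable fastpath conn, and no IPsec or IPQoS processing.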
18799 */ 18800 if (tcp->tcp_mdt && 18801 ((tcp->tcp_ipversion == IPV4_VERSION && 18802 tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || 18803 (tcp->tcp_ipversion == IPV6_VERSION && 18804 tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) || 18805 tcp->tcp_state != TCPS_ESTABLISHED || 18806 TCP_IS_DETACHED(tcp) || !CONN_IS_MD_FASTPATH(tcp->tcp_connp) || 18807 CONN_IPSEC_OUT_ENCAPSULATED(tcp->tcp_connp) || 18808 IPP_ENABLED(IPP_LOCAL_OUT))) { 18809 tcp->tcp_connp->conn_mdt_ok = B_FALSE; 18810 tcp->tcp_mdt = B_FALSE; 18811 18812 /* Anything other than detached is considered pathological */ 18813 if (!TCP_IS_DETACHED(tcp)) { 18814 TCP_STAT(tcp_mdt_conn_halted1); 18815 (void) tcp_maxpsz_set(tcp, B_TRUE); 18816 } 18817 } 18818 18819 /* Use MDT if sendable amount is greater than the threshold */ 18820 if (tcp->tcp_mdt && 18821 (mdt_thres = mss << tcp_mdt_smss_threshold, usable > mdt_thres) && 18822 (tail_unsent > mdt_thres || (xmit_tail->b_cont != NULL && 18823 MBLKL(xmit_tail->b_cont) > mdt_thres)) && 18824 (tcp->tcp_valid_bits == 0 || 18825 tcp->tcp_valid_bits == TCP_FSS_VALID)) { 18826 ASSERT(tcp->tcp_connp->conn_mdt_ok); 18827 rc = tcp_multisend(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, 18828 num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, 18829 local_time, mdt_thres); 18830 } else { 18831 rc = tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, 18832 num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, 18833 local_time, INT_MAX); 18834 } 18835 18836 /* Pretend that all we were trying to send really got sent */ 18837 if (rc < 0 && tail_unsent < 0) { 18838 do { 18839 xmit_tail = xmit_tail->b_cont; 18840 xmit_tail->b_prev = local_time; 18841 ASSERT((uintptr_t)(xmit_tail->b_wptr - 18842 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 18843 tail_unsent += (int)(xmit_tail->b_wptr - 18844 xmit_tail->b_rptr); 18845 } while (tail_unsent < 0); 18846 } 18847 done:; 18848 tcp->tcp_xmit_tail = xmit_tail; 18849 tcp->tcp_xmit_tail_unsent = tail_unsent; 18850 len = tcp->tcp_snxt - snxt; 18851 if (len) { 18852 /* 18853 * If new data was sent, need to update the notsack 18854 * list, which is, afterall, data blocks that have 18855 * not been sack'ed by the receiver. New data is 18856 * not sack'ed. 18857 */ 18858 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 18859 /* len is a negative value. */ 18860 tcp->tcp_pipe -= len; 18861 tcp_notsack_update(&(tcp->tcp_notsack_list), 18862 tcp->tcp_snxt, snxt, 18863 &(tcp->tcp_num_notsack_blk), 18864 &(tcp->tcp_cnt_notsack_list)); 18865 } 18866 tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; 18867 tcp->tcp_rack = tcp->tcp_rnxt; 18868 tcp->tcp_rack_cnt = 0; 18869 if ((snxt + len) == tcp->tcp_suna) { 18870 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 18871 } 18872 } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { 18873 /* 18874 * Didn't send anything. Make sure the timer is running 18875 * so that we will probe a zero window. 18876 */ 18877 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 18878 } 18879 /* Note that len is the amount we just sent but with a negative sign */ 18880 tcp->tcp_unsent += len; 18881 if (tcp->tcp_flow_stopped) { 18882 if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 18883 tcp_clrqfull(tcp); 18884 } 18885 } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) { 18886 tcp_setqfull(tcp); 18887 } 18888 } 18889 18890 /* 18891 * tcp_fill_header is called by tcp_send() and tcp_multisend() to fill the 18892 * outgoing TCP header with the template header, as well as other 18893 * options such as time-stamp, ECN and/or SACK. 
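 * rptr points at where the IP and TCP headers of this segment are to be
 * written and must be 32-bit aligned; the timestamp option (using `now')
 * is stamped into the template first, the template in tcp_iphc is then
 * copied to rptr, and ECN flags and SACK blocks are filled in on the
 * copy as needed.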
18894 */ 18895 static void 18896 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) 18897 { 18898 tcph_t *tcp_tmpl, *tcp_h; 18899 uint32_t *dst, *src; 18900 int hdrlen; 18901 18902 ASSERT(OK_32PTR(rptr)); 18903 18904 /* Template header */ 18905 tcp_tmpl = tcp->tcp_tcph; 18906 18907 /* Header of outgoing packet */ 18908 tcp_h = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 18909 18910 /* dst and src are opaque 32-bit fields, used for copying */ 18911 dst = (uint32_t *)rptr; 18912 src = (uint32_t *)tcp->tcp_iphc; 18913 hdrlen = tcp->tcp_hdr_len; 18914 18915 /* Fill time-stamp option if needed */ 18916 if (tcp->tcp_snd_ts_ok) { 18917 U32_TO_BE32((uint32_t)now, 18918 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); 18919 U32_TO_BE32(tcp->tcp_ts_recent, 18920 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); 18921 } else { 18922 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 18923 } 18924 18925 /* 18926 * Copy the template header; is this really more efficient than 18927 * calling bcopy()? For simple IPv4/TCP, it may be the case, 18928 * but perhaps not for other scenarios. 18929 */ 18930 dst[0] = src[0]; 18931 dst[1] = src[1]; 18932 dst[2] = src[2]; 18933 dst[3] = src[3]; 18934 dst[4] = src[4]; 18935 dst[5] = src[5]; 18936 dst[6] = src[6]; 18937 dst[7] = src[7]; 18938 dst[8] = src[8]; 18939 dst[9] = src[9]; 18940 if (hdrlen -= 40) { 18941 hdrlen >>= 2; 18942 dst += 10; 18943 src += 10; 18944 do { 18945 *dst++ = *src++; 18946 } while (--hdrlen); 18947 } 18948 18949 /* 18950 * Set the ECN info in the TCP header if it is not a zero 18951 * window probe. Zero window probe is only sent in 18952 * tcp_wput_data() and tcp_timer(). 18953 */ 18954 if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 18955 SET_ECT(tcp, rptr); 18956 18957 if (tcp->tcp_ecn_echo_on) 18958 tcp_h->th_flags[0] |= TH_ECE; 18959 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 18960 tcp_h->th_flags[0] |= TH_CWR; 18961 tcp->tcp_ecn_cwr_sent = B_TRUE; 18962 } 18963 } 18964 18965 /* Fill in SACK options */ 18966 if (num_sack_blk > 0) { 18967 uchar_t *wptr = rptr + tcp->tcp_hdr_len; 18968 sack_blk_t *tmp; 18969 int32_t i; 18970 18971 wptr[0] = TCPOPT_NOP; 18972 wptr[1] = TCPOPT_NOP; 18973 wptr[2] = TCPOPT_SACK; 18974 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 18975 sizeof (sack_blk_t); 18976 wptr += TCPOPT_REAL_SACK_LEN; 18977 18978 tmp = tcp->tcp_sack_list; 18979 for (i = 0; i < num_sack_blk; i++) { 18980 U32_TO_BE32(tmp[i].begin, wptr); 18981 wptr += sizeof (tcp_seq); 18982 U32_TO_BE32(tmp[i].end, wptr); 18983 wptr += sizeof (tcp_seq); 18984 } 18985 tcp_h->th_offset_and_rsrvd[0] += 18986 ((num_sack_blk * 2 + 1) << 4); 18987 } 18988 } 18989 18990 /* 18991 * tcp_mdt_add_attrs() is called by tcp_multisend() in order to attach 18992 * the destination address and SAP attribute, and if necessary, the 18993 * hardware checksum offload attribute to a Multidata message. 
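 * Returns 0 on success and -1 if either attribute could not be added,
 * in which case the caller abandons this Multidata and falls back to
 * the legacy transmit path.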
18994 */ 18995 static int 18996 tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, 18997 const uint32_t start, const uint32_t stuff, const uint32_t end, 18998 const uint32_t flags) 18999 { 19000 /* Add global destination address & SAP attribute */ 19001 if (dlmp == NULL || !ip_md_addr_attr(mmd, NULL, dlmp)) { 19002 ip1dbg(("tcp_mdt_add_attrs: can't add global physical " 19003 "destination address+SAP\n")); 19004 19005 if (dlmp != NULL) 19006 TCP_STAT(tcp_mdt_allocfail); 19007 return (-1); 19008 } 19009 19010 /* Add global hwcksum attribute */ 19011 if (hwcksum && 19012 !ip_md_hcksum_attr(mmd, NULL, start, stuff, end, flags)) { 19013 ip1dbg(("tcp_mdt_add_attrs: can't add global hardware " 19014 "checksum attribute\n")); 19015 19016 TCP_STAT(tcp_mdt_allocfail); 19017 return (-1); 19018 } 19019 19020 return (0); 19021 } 19022 19023 /* 19024 * Smaller and private version of pdescinfo_t used specifically for TCP, 19025 * which allows for only two payload spans per packet. 19026 */ 19027 typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t; 19028 19029 /* 19030 * tcp_multisend() is called by tcp_wput_data() for Multidata Transmit 19031 * scheme, and returns one the following: 19032 * 19033 * -1 = failed allocation. 19034 * 0 = success; burst count reached, or usable send window is too small, 19035 * and that we'd rather wait until later before sending again. 19036 */ 19037 static int 19038 tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, 19039 const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, 19040 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 19041 const int mdt_thres) 19042 { 19043 mblk_t *md_mp_head, *md_mp, *md_pbuf, *md_pbuf_nxt, *md_hbuf; 19044 multidata_t *mmd; 19045 uint_t obsegs, obbytes, hdr_frag_sz; 19046 uint_t cur_hdr_off, cur_pld_off, base_pld_off, first_snxt; 19047 int num_burst_seg, max_pld; 19048 pdesc_t *pkt; 19049 tcp_pdescinfo_t tcp_pkt_info; 19050 pdescinfo_t *pkt_info; 19051 int pbuf_idx, pbuf_idx_nxt; 19052 int seg_len, len, spill, af; 19053 boolean_t add_buffer, zcopy, clusterwide; 19054 boolean_t rconfirm = B_FALSE; 19055 boolean_t done = B_FALSE; 19056 uint32_t cksum; 19057 uint32_t hwcksum_flags; 19058 ire_t *ire; 19059 ill_t *ill; 19060 ipha_t *ipha; 19061 ip6_t *ip6h; 19062 ipaddr_t src, dst; 19063 ill_zerocopy_capab_t *zc_cap = NULL; 19064 uint16_t *up; 19065 int err; 19066 conn_t *connp; 19067 19068 #ifdef _BIG_ENDIAN 19069 #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) 19070 #else 19071 #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7) 19072 #endif 19073 19074 #define PREP_NEW_MULTIDATA() { \ 19075 mmd = NULL; \ 19076 md_mp = md_hbuf = NULL; \ 19077 cur_hdr_off = 0; \ 19078 max_pld = tcp->tcp_mdt_max_pld; \ 19079 pbuf_idx = pbuf_idx_nxt = -1; \ 19080 add_buffer = B_TRUE; \ 19081 zcopy = B_FALSE; \ 19082 } 19083 19084 #define PREP_NEW_PBUF() { \ 19085 md_pbuf = md_pbuf_nxt = NULL; \ 19086 pbuf_idx = pbuf_idx_nxt = -1; \ 19087 cur_pld_off = 0; \ 19088 first_snxt = *snxt; \ 19089 ASSERT(*tail_unsent > 0); \ 19090 base_pld_off = MBLKL(*xmit_tail) - *tail_unsent; \ 19091 } 19092 19093 ASSERT(mdt_thres >= mss); 19094 ASSERT(*usable > 0 && *usable > mdt_thres); 19095 ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); 19096 ASSERT(!TCP_IS_DETACHED(tcp)); 19097 ASSERT(tcp->tcp_valid_bits == 0 || 19098 tcp->tcp_valid_bits == TCP_FSS_VALID); 19099 ASSERT((tcp->tcp_ipversion == IPV4_VERSION && 19100 tcp->tcp_ip_hdr_len == IP_SIMPLE_HDR_LENGTH) || 19101 
(tcp->tcp_ipversion == IPV6_VERSION && 19102 tcp->tcp_ip_hdr_len == IPV6_HDR_LEN)); 19103 19104 connp = tcp->tcp_connp; 19105 ASSERT(connp != NULL); 19106 ASSERT(CONN_IS_MD_FASTPATH(connp)); 19107 ASSERT(!CONN_IPSEC_OUT_ENCAPSULATED(connp)); 19108 19109 /* 19110 * Note that tcp will only declare at most 2 payload spans per 19111 * packet, which is much lower than the maximum allowable number 19112 * of packet spans per Multidata. For this reason, we use the 19113 * privately declared and smaller descriptor info structure, in 19114 * order to save some stack space. 19115 */ 19116 pkt_info = (pdescinfo_t *)&tcp_pkt_info; 19117 19118 af = (tcp->tcp_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6; 19119 if (af == AF_INET) { 19120 dst = tcp->tcp_ipha->ipha_dst; 19121 src = tcp->tcp_ipha->ipha_src; 19122 ASSERT(!CLASSD(dst)); 19123 } 19124 ASSERT(af == AF_INET || 19125 !IN6_IS_ADDR_MULTICAST(&tcp->tcp_ip6h->ip6_dst)); 19126 19127 obsegs = obbytes = 0; 19128 num_burst_seg = tcp->tcp_snd_burst; 19129 md_mp_head = NULL; 19130 PREP_NEW_MULTIDATA(); 19131 19132 /* 19133 * Before we go on further, make sure there is an IRE that we can 19134 * use, and that the ILL supports MDT. Otherwise, there's no point 19135 * in proceeding any further, and we should just hand everything 19136 * off to the legacy path. 19137 */ 19138 mutex_enter(&connp->conn_lock); 19139 ire = connp->conn_ire_cache; 19140 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 19141 if (ire != NULL && ((af == AF_INET && ire->ire_addr == dst) || 19142 (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, 19143 &tcp->tcp_ip6h->ip6_dst))) && 19144 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 19145 IRE_REFHOLD(ire); 19146 mutex_exit(&connp->conn_lock); 19147 } else { 19148 boolean_t cached = B_FALSE; 19149 ts_label_t *tsl; 19150 19151 /* force a recheck later on */ 19152 tcp->tcp_ire_ill_check_done = B_FALSE; 19153 19154 TCP_DBGSTAT(tcp_ire_null1); 19155 connp->conn_ire_cache = NULL; 19156 mutex_exit(&connp->conn_lock); 19157 19158 /* Release the old ire */ 19159 if (ire != NULL) 19160 IRE_REFRELE_NOTR(ire); 19161 19162 tsl = crgetlabel(CONN_CRED(connp)); 19163 ire = (af == AF_INET) ? 19164 ire_cache_lookup(dst, connp->conn_zoneid, tsl) : 19165 ire_cache_lookup_v6(&tcp->tcp_ip6h->ip6_dst, 19166 connp->conn_zoneid, tsl); 19167 19168 if (ire == NULL) { 19169 TCP_STAT(tcp_ire_null); 19170 goto legacy_send_no_md; 19171 } 19172 19173 IRE_REFHOLD_NOTR(ire); 19174 /* 19175 * Since we are inside the squeue, there cannot be another 19176 * thread in TCP trying to set the conn_ire_cache now. The 19177 * check for IRE_MARK_CONDEMNED ensures that an interface 19178 * unplumb thread has not yet started cleaning up the conns. 19179 * Hence we don't need to grab the conn lock. 19180 */ 19181 if (!(connp->conn_state_flags & CONN_CLOSING)) { 19182 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 19183 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 19184 connp->conn_ire_cache = ire; 19185 cached = B_TRUE; 19186 } 19187 rw_exit(&ire->ire_bucket->irb_lock); 19188 } 19189 19190 /* 19191 * We can continue to use the ire but since it was not 19192 * cached, we should drop the extra reference. 
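 * (The IRE_REFHOLD_NOTR above was taken on behalf of conn_ire_cache;
 * when we fail to cache the ire we give that hold back here and keep
 * using the reference obtained from ire_cache_lookup(), which is
 * released before this function returns.)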
19193 */ 19194 if (!cached) 19195 IRE_REFRELE_NOTR(ire); 19196 } 19197 19198 ASSERT(ire != NULL); 19199 ASSERT(af != AF_INET || ire->ire_ipversion == IPV4_VERSION); 19200 ASSERT(af == AF_INET || !IN6_IS_ADDR_V4MAPPED(&(ire->ire_addr_v6))); 19201 ASSERT(af == AF_INET || ire->ire_nce != NULL); 19202 ASSERT(!(ire->ire_type & IRE_BROADCAST)); 19203 /* 19204 * If we do support loopback for MDT (which requires modifications 19205 * to the receiving paths), the following assertions should go away, 19206 * and we would be sending the Multidata to loopback conn later on. 19207 */ 19208 ASSERT(!IRE_IS_LOCAL(ire)); 19209 ASSERT(ire->ire_stq != NULL); 19210 19211 ill = ire_to_ill(ire); 19212 ASSERT(ill != NULL); 19213 ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL); 19214 19215 if (!tcp->tcp_ire_ill_check_done) { 19216 tcp_ire_ill_check(tcp, ire, ill, B_TRUE); 19217 tcp->tcp_ire_ill_check_done = B_TRUE; 19218 } 19219 19220 /* 19221 * If the underlying interface conditions have changed, or if the 19222 * new interface does not support MDT, go back to legacy path. 19223 */ 19224 if (!ILL_MDT_USABLE(ill) || (ire->ire_flags & RTF_MULTIRT) != 0) { 19225 /* don't go through this path anymore for this connection */ 19226 TCP_STAT(tcp_mdt_conn_halted2); 19227 tcp->tcp_mdt = B_FALSE; 19228 ip1dbg(("tcp_multisend: disabling MDT for connp %p on " 19229 "interface %s\n", (void *)connp, ill->ill_name)); 19230 /* IRE will be released prior to returning */ 19231 goto legacy_send_no_md; 19232 } 19233 19234 if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) 19235 zc_cap = ill->ill_zerocopy_capab; 19236 19237 /* 19238 * Check if we can take tcp fast-path. Note that "incomplete" 19239 * ire's (where the link-layer for next hop is not resolved 19240 * or where the fast-path header in nce_fp_mp is not available 19241 * yet) are sent down the legacy (slow) path. 19242 * NOTE: We should fix ip_xmit_v4 to handle M_MULTIDATA 19243 */ 19244 if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { 19245 /* IRE will be released prior to returning */ 19246 goto legacy_send_no_md; 19247 } 19248 19249 /* go to legacy path if interface doesn't support zerocopy */ 19250 if (tcp->tcp_snd_zcopy_aware && do_tcpzcopy != 2 && 19251 (zc_cap == NULL || zc_cap->ill_zerocopy_flags == 0)) { 19252 /* IRE will be released prior to returning */ 19253 goto legacy_send_no_md; 19254 } 19255 19256 /* does the interface support hardware checksum offload? */ 19257 hwcksum_flags = 0; 19258 if (ILL_HCKSUM_CAPABLE(ill) && 19259 (ill->ill_hcksum_capab->ill_hcksum_txflags & 19260 (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL | 19261 HCKSUM_IPHDRCKSUM)) && dohwcksum) { 19262 if (ill->ill_hcksum_capab->ill_hcksum_txflags & 19263 HCKSUM_IPHDRCKSUM) 19264 hwcksum_flags = HCK_IPV4_HDRCKSUM; 19265 19266 if (ill->ill_hcksum_capab->ill_hcksum_txflags & 19267 (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) 19268 hwcksum_flags |= HCK_FULLCKSUM; 19269 else if (ill->ill_hcksum_capab->ill_hcksum_txflags & 19270 HCKSUM_INET_PARTIAL) 19271 hwcksum_flags |= HCK_PARTIALCKSUM; 19272 } 19273 19274 /* 19275 * Each header fragment consists of the leading extra space, 19276 * followed by the TCP/IP header, and the trailing extra space. 19277 * We make sure that each header fragment begins on a 32-bit 19278 * aligned memory address (tcp_mdt_hdr_head is already 32-bit 19279 * aligned in tcp_mdt_update). 
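 * (For illustration only: with a 2-byte head room, a 40-byte TCP/IPv4
 * header and no tail room, roundup(2 + 40 + 0, 4) yields a 44-byte
 * header fragment.)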
19280 */ 19281 hdr_frag_sz = roundup((tcp->tcp_mdt_hdr_head + tcp_hdr_len + 19282 tcp->tcp_mdt_hdr_tail), 4); 19283 19284 /* are we starting from the beginning of data block? */ 19285 if (*tail_unsent == 0) { 19286 *xmit_tail = (*xmit_tail)->b_cont; 19287 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= (uintptr_t)INT_MAX); 19288 *tail_unsent = (int)MBLKL(*xmit_tail); 19289 } 19290 19291 /* 19292 * Here we create one or more Multidata messages, each made up of 19293 * one header buffer and up to N payload buffers. This entire 19294 * operation is done within two loops: 19295 * 19296 * The outer loop mostly deals with creating the Multidata message, 19297 * as well as the header buffer that gets added to it. It also 19298 * links the Multidata messages together such that all of them can 19299 * be sent down to the lower layer in a single putnext call; this 19300 * linking behavior depends on the tcp_mdt_chain tunable. 19301 * 19302 * The inner loop takes an existing Multidata message, and adds 19303 * one or more (up to tcp_mdt_max_pld) payload buffers to it. It 19304 * packetizes those buffers by filling up the corresponding header 19305 * buffer fragments with the proper IP and TCP headers, and by 19306 * describing the layout of each packet in the packet descriptors 19307 * that get added to the Multidata. 19308 */ 19309 do { 19310 /* 19311 * If usable send window is too small, or data blocks in 19312 * transmit list are smaller than our threshold (i.e. app 19313 * performs large writes followed by small ones), we hand 19314 * off the control over to the legacy path. Note that we'll 19315 * get back the control once it encounters a large block. 19316 */ 19317 if (*usable < mss || (*tail_unsent <= mdt_thres && 19318 (*xmit_tail)->b_cont != NULL && 19319 MBLKL((*xmit_tail)->b_cont) <= mdt_thres)) { 19320 /* send down what we've got so far */ 19321 if (md_mp_head != NULL) { 19322 tcp_multisend_data(tcp, ire, ill, md_mp_head, 19323 obsegs, obbytes, &rconfirm); 19324 } 19325 /* 19326 * Pass control over to tcp_send(), but tell it to 19327 * return to us once a large-size transmission is 19328 * possible. 19329 */ 19330 TCP_STAT(tcp_mdt_legacy_small); 19331 if ((err = tcp_send(q, tcp, mss, tcp_hdr_len, 19332 tcp_tcp_hdr_len, num_sack_blk, usable, snxt, 19333 tail_unsent, xmit_tail, local_time, 19334 mdt_thres)) <= 0) { 19335 /* burst count reached, or alloc failed */ 19336 IRE_REFRELE(ire); 19337 return (err); 19338 } 19339 19340 /* tcp_send() may have sent everything, so check */ 19341 if (*usable <= 0) { 19342 IRE_REFRELE(ire); 19343 return (0); 19344 } 19345 19346 TCP_STAT(tcp_mdt_legacy_ret); 19347 /* 19348 * We may have delivered the Multidata, so make sure 19349 * to re-initialize before the next round. 19350 */ 19351 md_mp_head = NULL; 19352 obsegs = obbytes = 0; 19353 num_burst_seg = tcp->tcp_snd_burst; 19354 PREP_NEW_MULTIDATA(); 19355 19356 /* are we starting from the beginning of data block? */ 19357 if (*tail_unsent == 0) { 19358 *xmit_tail = (*xmit_tail)->b_cont; 19359 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 19360 (uintptr_t)INT_MAX); 19361 *tail_unsent = (int)MBLKL(*xmit_tail); 19362 } 19363 } 19364 19365 /* 19366 * max_pld limits the number of mblks in tcp's transmit 19367 * queue that can be added to a Multidata message. Once 19368 * this counter reaches zero, no more additional mblks 19369 * can be added to it. What happens afterwards depends 19370 * on whether or not we are set to chain the Multidata 19371 * messages. 
If we are to link them together, reset 19372 * max_pld to its original value (tcp_mdt_max_pld) and 19373 * prepare to create a new Multidata message which will 19374 * get linked to md_mp_head. Else, leave it alone and 19375 * let the inner loop break on its own. 19376 */ 19377 if (tcp_mdt_chain && max_pld == 0) 19378 PREP_NEW_MULTIDATA(); 19379 19380 /* adding a payload buffer; re-initialize values */ 19381 if (add_buffer) 19382 PREP_NEW_PBUF(); 19383 19384 /* 19385 * If we don't have a Multidata, either because we just 19386 * (re)entered this outer loop, or after we branched off 19387 * to tcp_send above, setup the Multidata and header 19388 * buffer to be used. 19389 */ 19390 if (md_mp == NULL) { 19391 int md_hbuflen; 19392 uint32_t start, stuff; 19393 19394 /* 19395 * Calculate Multidata header buffer size large enough 19396 * to hold all of the headers that can possibly be 19397 * sent at this moment. We'd rather over-estimate 19398 * the size than running out of space; this is okay 19399 * since this buffer is small anyway. 19400 */ 19401 md_hbuflen = (howmany(*usable, mss) + 1) * hdr_frag_sz; 19402 19403 /* 19404 * Start and stuff offset for partial hardware 19405 * checksum offload; these are currently for IPv4. 19406 * For full checksum offload, they are set to zero. 19407 */ 19408 if ((hwcksum_flags & HCK_PARTIALCKSUM)) { 19409 if (af == AF_INET) { 19410 start = IP_SIMPLE_HDR_LENGTH; 19411 stuff = IP_SIMPLE_HDR_LENGTH + 19412 TCP_CHECKSUM_OFFSET; 19413 } else { 19414 start = IPV6_HDR_LEN; 19415 stuff = IPV6_HDR_LEN + 19416 TCP_CHECKSUM_OFFSET; 19417 } 19418 } else { 19419 start = stuff = 0; 19420 } 19421 19422 /* 19423 * Create the header buffer, Multidata, as well as 19424 * any necessary attributes (destination address, 19425 * SAP and hardware checksum offload) that should 19426 * be associated with the Multidata message. 19427 */ 19428 ASSERT(cur_hdr_off == 0); 19429 if ((md_hbuf = allocb(md_hbuflen, BPRI_HI)) == NULL || 19430 ((md_hbuf->b_wptr += md_hbuflen), 19431 (mmd = mmd_alloc(md_hbuf, &md_mp, 19432 KM_NOSLEEP)) == NULL) || (tcp_mdt_add_attrs(mmd, 19433 /* fastpath mblk */ 19434 ire->ire_nce->nce_res_mp, 19435 /* hardware checksum enabled */ 19436 (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)), 19437 /* hardware checksum offsets */ 19438 start, stuff, 0, 19439 /* hardware checksum flag */ 19440 hwcksum_flags) != 0)) { 19441 legacy_send: 19442 if (md_mp != NULL) { 19443 /* Unlink message from the chain */ 19444 if (md_mp_head != NULL) { 19445 err = (intptr_t)rmvb(md_mp_head, 19446 md_mp); 19447 /* 19448 * We can't assert that rmvb 19449 * did not return -1, since we 19450 * may get here before linkb 19451 * happens. We do, however, 19452 * check if we just removed the 19453 * only element in the list. 19454 */ 19455 if (err == 0) 19456 md_mp_head = NULL; 19457 } 19458 /* md_hbuf gets freed automatically */ 19459 TCP_STAT(tcp_mdt_discarded); 19460 freeb(md_mp); 19461 } else { 19462 /* Either allocb or mmd_alloc failed */ 19463 TCP_STAT(tcp_mdt_allocfail); 19464 if (md_hbuf != NULL) 19465 freeb(md_hbuf); 19466 } 19467 19468 /* send down what we've got so far */ 19469 if (md_mp_head != NULL) { 19470 tcp_multisend_data(tcp, ire, ill, 19471 md_mp_head, obsegs, obbytes, 19472 &rconfirm); 19473 } 19474 legacy_send_no_md: 19475 if (ire != NULL) 19476 IRE_REFRELE(ire); 19477 /* 19478 * Too bad; let the legacy path handle this. 19479 * We specify INT_MAX for the threshold, since 19480 * we gave up with the Multidata processings 19481 * and let the old path have it all. 
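 * With an INT_MAX threshold tcp_send() can never return 1 to bounce
 * control back here, so whatever it does not send simply stays queued
 * for a later call.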
19482 */ 19483 TCP_STAT(tcp_mdt_legacy_all); 19484 return (tcp_send(q, tcp, mss, tcp_hdr_len, 19485 tcp_tcp_hdr_len, num_sack_blk, usable, 19486 snxt, tail_unsent, xmit_tail, local_time, 19487 INT_MAX)); 19488 } 19489 19490 /* link to any existing ones, if applicable */ 19491 TCP_STAT(tcp_mdt_allocd); 19492 if (md_mp_head == NULL) { 19493 md_mp_head = md_mp; 19494 } else if (tcp_mdt_chain) { 19495 TCP_STAT(tcp_mdt_linked); 19496 linkb(md_mp_head, md_mp); 19497 } 19498 } 19499 19500 ASSERT(md_mp_head != NULL); 19501 ASSERT(tcp_mdt_chain || md_mp_head->b_cont == NULL); 19502 ASSERT(md_mp != NULL && mmd != NULL); 19503 ASSERT(md_hbuf != NULL); 19504 19505 /* 19506 * Packetize the transmittable portion of the data block; 19507 * each data block is essentially added to the Multidata 19508 * as a payload buffer. We also deal with adding more 19509 * than one payload buffers, which happens when the remaining 19510 * packetized portion of the current payload buffer is less 19511 * than MSS, while the next data block in transmit queue 19512 * has enough data to make up for one. This "spillover" 19513 * case essentially creates a split-packet, where portions 19514 * of the packet's payload fragments may span across two 19515 * virtually discontiguous address blocks. 19516 */ 19517 seg_len = mss; 19518 do { 19519 len = seg_len; 19520 19521 ASSERT(len > 0); 19522 ASSERT(max_pld >= 0); 19523 ASSERT(!add_buffer || cur_pld_off == 0); 19524 19525 /* 19526 * First time around for this payload buffer; note 19527 * in the case of a spillover, the following has 19528 * been done prior to adding the split-packet 19529 * descriptor to Multidata, and we don't want to 19530 * repeat the process. 19531 */ 19532 if (add_buffer) { 19533 ASSERT(mmd != NULL); 19534 ASSERT(md_pbuf == NULL); 19535 ASSERT(md_pbuf_nxt == NULL); 19536 ASSERT(pbuf_idx == -1 && pbuf_idx_nxt == -1); 19537 19538 /* 19539 * Have we reached the limit? We'd get to 19540 * this case when we're not chaining the 19541 * Multidata messages together, and since 19542 * we're done, terminate this loop. 19543 */ 19544 if (max_pld == 0) 19545 break; /* done */ 19546 19547 if ((md_pbuf = dupb(*xmit_tail)) == NULL) { 19548 TCP_STAT(tcp_mdt_allocfail); 19549 goto legacy_send; /* out_of_mem */ 19550 } 19551 19552 if (IS_VMLOANED_MBLK(md_pbuf) && !zcopy && 19553 zc_cap != NULL) { 19554 if (!ip_md_zcopy_attr(mmd, NULL, 19555 zc_cap->ill_zerocopy_flags)) { 19556 freeb(md_pbuf); 19557 TCP_STAT(tcp_mdt_allocfail); 19558 /* out_of_mem */ 19559 goto legacy_send; 19560 } 19561 zcopy = B_TRUE; 19562 } 19563 19564 md_pbuf->b_rptr += base_pld_off; 19565 19566 /* 19567 * Add a payload buffer to the Multidata; this 19568 * operation must not fail, or otherwise our 19569 * logic in this routine is broken. There 19570 * is no memory allocation done by the 19571 * routine, so any returned failure simply 19572 * tells us that we've done something wrong. 19573 * 19574 * A failure tells us that either we're adding 19575 * the same payload buffer more than once, or 19576 * we're trying to add more buffers than 19577 * allowed (max_pld calculation is wrong). 19578 * None of the above cases should happen, and 19579 * we panic because either there's horrible 19580 * heap corruption, and/or programming mistake. 
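 * On success mmd_addpldbuf() returns the non-negative index of the
 * buffer within the Multidata; that index is what pbuf_idx records and
 * what the packet descriptors below refer to.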
19581 */ 19582 pbuf_idx = mmd_addpldbuf(mmd, md_pbuf); 19583 if (pbuf_idx < 0) { 19584 cmn_err(CE_PANIC, "tcp_multisend: " 19585 "payload buffer logic error " 19586 "detected for tcp %p mmd %p " 19587 "pbuf %p (%d)\n", 19588 (void *)tcp, (void *)mmd, 19589 (void *)md_pbuf, pbuf_idx); 19590 } 19591 19592 ASSERT(max_pld > 0); 19593 --max_pld; 19594 add_buffer = B_FALSE; 19595 } 19596 19597 ASSERT(md_mp_head != NULL); 19598 ASSERT(md_pbuf != NULL); 19599 ASSERT(md_pbuf_nxt == NULL); 19600 ASSERT(pbuf_idx != -1); 19601 ASSERT(pbuf_idx_nxt == -1); 19602 ASSERT(*usable > 0); 19603 19604 /* 19605 * We spillover to the next payload buffer only 19606 * if all of the following is true: 19607 * 19608 * 1. There is not enough data on the current 19609 * payload buffer to make up `len', 19610 * 2. We are allowed to send `len', 19611 * 3. The next payload buffer length is large 19612 * enough to accomodate `spill'. 19613 */ 19614 if ((spill = len - *tail_unsent) > 0 && 19615 *usable >= len && 19616 MBLKL((*xmit_tail)->b_cont) >= spill && 19617 max_pld > 0) { 19618 md_pbuf_nxt = dupb((*xmit_tail)->b_cont); 19619 if (md_pbuf_nxt == NULL) { 19620 TCP_STAT(tcp_mdt_allocfail); 19621 goto legacy_send; /* out_of_mem */ 19622 } 19623 19624 if (IS_VMLOANED_MBLK(md_pbuf_nxt) && !zcopy && 19625 zc_cap != NULL) { 19626 if (!ip_md_zcopy_attr(mmd, NULL, 19627 zc_cap->ill_zerocopy_flags)) { 19628 freeb(md_pbuf_nxt); 19629 TCP_STAT(tcp_mdt_allocfail); 19630 /* out_of_mem */ 19631 goto legacy_send; 19632 } 19633 zcopy = B_TRUE; 19634 } 19635 19636 /* 19637 * See comments above on the first call to 19638 * mmd_addpldbuf for explanation on the panic. 19639 */ 19640 pbuf_idx_nxt = mmd_addpldbuf(mmd, md_pbuf_nxt); 19641 if (pbuf_idx_nxt < 0) { 19642 panic("tcp_multisend: " 19643 "next payload buffer logic error " 19644 "detected for tcp %p mmd %p " 19645 "pbuf %p (%d)\n", 19646 (void *)tcp, (void *)mmd, 19647 (void *)md_pbuf_nxt, pbuf_idx_nxt); 19648 } 19649 19650 ASSERT(max_pld > 0); 19651 --max_pld; 19652 } else if (spill > 0) { 19653 /* 19654 * If there's a spillover, but the following 19655 * xmit_tail couldn't give us enough octets 19656 * to reach "len", then stop the current 19657 * Multidata creation and let the legacy 19658 * tcp_send() path take over. We don't want 19659 * to send the tiny segment as part of this 19660 * Multidata for performance reasons; instead, 19661 * we let the legacy path deal with grouping 19662 * it with the subsequent small mblks. 19663 */ 19664 if (*usable >= len && 19665 MBLKL((*xmit_tail)->b_cont) < spill) { 19666 max_pld = 0; 19667 break; /* done */ 19668 } 19669 19670 /* 19671 * We can't spillover, and we are near 19672 * the end of the current payload buffer, 19673 * so send what's left. 19674 */ 19675 ASSERT(*tail_unsent > 0); 19676 len = *tail_unsent; 19677 } 19678 19679 /* tail_unsent is negated if there is a spillover */ 19680 *tail_unsent -= len; 19681 *usable -= len; 19682 ASSERT(*usable >= 0); 19683 19684 if (*usable < mss) 19685 seg_len = *usable; 19686 /* 19687 * Sender SWS avoidance; see comments in tcp_send(); 19688 * everything else is the same, except that we only 19689 * do this here if there is no more data to be sent 19690 * following the current xmit_tail. We don't check 19691 * for 1-byte urgent data because we shouldn't get 19692 * here if TCP_URG_VALID is set. 
19693 */ 19694 if (*usable > 0 && *usable < mss && 19695 ((md_pbuf_nxt == NULL && 19696 (*xmit_tail)->b_cont == NULL) || 19697 (md_pbuf_nxt != NULL && 19698 (*xmit_tail)->b_cont->b_cont == NULL)) && 19699 seg_len < (tcp->tcp_max_swnd >> 1) && 19700 (tcp->tcp_unsent - 19701 ((*snxt + len) - tcp->tcp_snxt)) > seg_len && 19702 !tcp->tcp_zero_win_probe) { 19703 if ((*snxt + len) == tcp->tcp_snxt && 19704 (*snxt + len) == tcp->tcp_suna) { 19705 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 19706 } 19707 done = B_TRUE; 19708 } 19709 19710 /* 19711 * Prime pump for IP's checksumming on our behalf; 19712 * include the adjustment for a source route if any. 19713 * Do this only for software/partial hardware checksum 19714 * offload, as this field gets zeroed out later for 19715 * the full hardware checksum offload case. 19716 */ 19717 if (!(hwcksum_flags & HCK_FULLCKSUM)) { 19718 cksum = len + tcp_tcp_hdr_len + tcp->tcp_sum; 19719 cksum = (cksum >> 16) + (cksum & 0xFFFF); 19720 U16_TO_ABE16(cksum, tcp->tcp_tcph->th_sum); 19721 } 19722 19723 U32_TO_ABE32(*snxt, tcp->tcp_tcph->th_seq); 19724 *snxt += len; 19725 19726 tcp->tcp_tcph->th_flags[0] = TH_ACK; 19727 /* 19728 * We set the PUSH bit only if TCP has no more buffered 19729 * data to be transmitted (or if sender SWS avoidance 19730 * takes place), as opposed to setting it for every 19731 * last packet in the burst. 19732 */ 19733 if (done || 19734 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) == 0) 19735 tcp->tcp_tcph->th_flags[0] |= TH_PUSH; 19736 19737 /* 19738 * Set FIN bit if this is our last segment; snxt 19739 * already includes its length, and it will not 19740 * be adjusted after this point. 19741 */ 19742 if (tcp->tcp_valid_bits == TCP_FSS_VALID && 19743 *snxt == tcp->tcp_fss) { 19744 if (!tcp->tcp_fin_acked) { 19745 tcp->tcp_tcph->th_flags[0] |= TH_FIN; 19746 BUMP_MIB(&tcp_mib, tcpOutControl); 19747 } 19748 if (!tcp->tcp_fin_sent) { 19749 tcp->tcp_fin_sent = B_TRUE; 19750 /* 19751 * tcp state must be ESTABLISHED 19752 * in order for us to get here in 19753 * the first place. 19754 */ 19755 tcp->tcp_state = TCPS_FIN_WAIT_1; 19756 19757 /* 19758 * Upon returning from this routine, 19759 * tcp_wput_data() will set tcp_snxt 19760 * to be equal to snxt + tcp_fin_sent. 19761 * This is essentially the same as 19762 * setting it to tcp_fss + 1. 
19763 */ 19764 } 19765 } 19766 19767 tcp->tcp_last_sent_len = (ushort_t)len; 19768 19769 len += tcp_hdr_len; 19770 if (tcp->tcp_ipversion == IPV4_VERSION) 19771 tcp->tcp_ipha->ipha_length = htons(len); 19772 else 19773 tcp->tcp_ip6h->ip6_plen = htons(len - 19774 ((char *)&tcp->tcp_ip6h[1] - 19775 tcp->tcp_iphc)); 19776 19777 pkt_info->flags = (PDESC_HBUF_REF | PDESC_PBUF_REF); 19778 19779 /* setup header fragment */ 19780 PDESC_HDR_ADD(pkt_info, 19781 md_hbuf->b_rptr + cur_hdr_off, /* base */ 19782 tcp->tcp_mdt_hdr_head, /* head room */ 19783 tcp_hdr_len, /* len */ 19784 tcp->tcp_mdt_hdr_tail); /* tail room */ 19785 19786 ASSERT(pkt_info->hdr_lim - pkt_info->hdr_base == 19787 hdr_frag_sz); 19788 ASSERT(MBLKIN(md_hbuf, 19789 (pkt_info->hdr_base - md_hbuf->b_rptr), 19790 PDESC_HDRSIZE(pkt_info))); 19791 19792 /* setup first payload fragment */ 19793 PDESC_PLD_INIT(pkt_info); 19794 PDESC_PLD_SPAN_ADD(pkt_info, 19795 pbuf_idx, /* index */ 19796 md_pbuf->b_rptr + cur_pld_off, /* start */ 19797 tcp->tcp_last_sent_len); /* len */ 19798 19799 /* create a split-packet in case of a spillover */ 19800 if (md_pbuf_nxt != NULL) { 19801 ASSERT(spill > 0); 19802 ASSERT(pbuf_idx_nxt > pbuf_idx); 19803 ASSERT(!add_buffer); 19804 19805 md_pbuf = md_pbuf_nxt; 19806 md_pbuf_nxt = NULL; 19807 pbuf_idx = pbuf_idx_nxt; 19808 pbuf_idx_nxt = -1; 19809 cur_pld_off = spill; 19810 19811 /* trim out first payload fragment */ 19812 PDESC_PLD_SPAN_TRIM(pkt_info, 0, spill); 19813 19814 /* setup second payload fragment */ 19815 PDESC_PLD_SPAN_ADD(pkt_info, 19816 pbuf_idx, /* index */ 19817 md_pbuf->b_rptr, /* start */ 19818 spill); /* len */ 19819 19820 if ((*xmit_tail)->b_next == NULL) { 19821 /* 19822 * Store the lbolt used for RTT 19823 * estimation. We can only record one 19824 * timestamp per mblk so we do it when 19825 * we reach the end of the payload 19826 * buffer. Also we only take a new 19827 * timestamp sample when the previous 19828 * timed data from the same mblk has 19829 * been ack'ed. 19830 */ 19831 (*xmit_tail)->b_prev = local_time; 19832 (*xmit_tail)->b_next = 19833 (mblk_t *)(uintptr_t)first_snxt; 19834 } 19835 19836 first_snxt = *snxt - spill; 19837 19838 /* 19839 * Advance xmit_tail; usable could be 0 by 19840 * the time we got here, but we made sure 19841 * above that we would only spillover to 19842 * the next data block if usable includes 19843 * the spilled-over amount prior to the 19844 * subtraction. Therefore, we are sure 19845 * that xmit_tail->b_cont can't be NULL. 19846 */ 19847 ASSERT((*xmit_tail)->b_cont != NULL); 19848 *xmit_tail = (*xmit_tail)->b_cont; 19849 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 19850 (uintptr_t)INT_MAX); 19851 *tail_unsent = (int)MBLKL(*xmit_tail) - spill; 19852 } else { 19853 cur_pld_off += tcp->tcp_last_sent_len; 19854 } 19855 19856 /* 19857 * Fill in the header using the template header, and 19858 * add options such as time-stamp, ECN and/or SACK, 19859 * as needed. 19860 */ 19861 tcp_fill_header(tcp, pkt_info->hdr_rptr, 19862 (clock_t)local_time, num_sack_blk); 19863 19864 /* take care of some IP header businesses */ 19865 if (af == AF_INET) { 19866 ipha = (ipha_t *)pkt_info->hdr_rptr; 19867 19868 ASSERT(OK_32PTR((uchar_t *)ipha)); 19869 ASSERT(PDESC_HDRL(pkt_info) >= 19870 IP_SIMPLE_HDR_LENGTH); 19871 ASSERT(ipha->ipha_version_and_hdr_length == 19872 IP_SIMPLE_HDR_VERSION); 19873 19874 /* 19875 * Assign ident value for current packet; see 19876 * related comments in ip_wput_ire() about the 19877 * contract private interface with clustering 19878 * group. 
19879 */ 19880 clusterwide = B_FALSE; 19881 if (cl_inet_ipident != NULL) { 19882 ASSERT(cl_inet_isclusterwide != NULL); 19883 if ((*cl_inet_isclusterwide)(IPPROTO_IP, 19884 AF_INET, 19885 (uint8_t *)(uintptr_t)src)) { 19886 ipha->ipha_ident = 19887 (*cl_inet_ipident) 19888 (IPPROTO_IP, AF_INET, 19889 (uint8_t *)(uintptr_t)src, 19890 (uint8_t *)(uintptr_t)dst); 19891 clusterwide = B_TRUE; 19892 } 19893 } 19894 19895 if (!clusterwide) { 19896 ipha->ipha_ident = (uint16_t) 19897 atomic_add_32_nv( 19898 &ire->ire_ident, 1); 19899 } 19900 #ifndef _BIG_ENDIAN 19901 ipha->ipha_ident = (ipha->ipha_ident << 8) | 19902 (ipha->ipha_ident >> 8); 19903 #endif 19904 } else { 19905 ip6h = (ip6_t *)pkt_info->hdr_rptr; 19906 19907 ASSERT(OK_32PTR((uchar_t *)ip6h)); 19908 ASSERT(IPVER(ip6h) == IPV6_VERSION); 19909 ASSERT(ip6h->ip6_nxt == IPPROTO_TCP); 19910 ASSERT(PDESC_HDRL(pkt_info) >= 19911 (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET + 19912 TCP_CHECKSUM_SIZE)); 19913 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 19914 19915 if (tcp->tcp_ip_forward_progress) { 19916 rconfirm = B_TRUE; 19917 tcp->tcp_ip_forward_progress = B_FALSE; 19918 } 19919 } 19920 19921 /* at least one payload span, and at most two */ 19922 ASSERT(pkt_info->pld_cnt > 0 && pkt_info->pld_cnt < 3); 19923 19924 /* add the packet descriptor to Multidata */ 19925 if ((pkt = mmd_addpdesc(mmd, pkt_info, &err, 19926 KM_NOSLEEP)) == NULL) { 19927 /* 19928 * Any failure other than ENOMEM indicates 19929 * that we have passed in invalid pkt_info 19930 * or parameters to mmd_addpdesc, which must 19931 * not happen. 19932 * 19933 * EINVAL is a result of failure on boundary 19934 * checks against the pkt_info contents. It 19935 * should not happen, and we panic because 19936 * either there's horrible heap corruption, 19937 * and/or programming mistake. 
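 * ENOMEM, on the other hand, is a legitimate transient failure; in that
 * case we just account for it and retreat to the legacy path.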
19938 */ 19939 if (err != ENOMEM) { 19940 cmn_err(CE_PANIC, "tcp_multisend: " 19941 "pdesc logic error detected for " 19942 "tcp %p mmd %p pinfo %p (%d)\n", 19943 (void *)tcp, (void *)mmd, 19944 (void *)pkt_info, err); 19945 } 19946 TCP_STAT(tcp_mdt_addpdescfail); 19947 goto legacy_send; /* out_of_mem */ 19948 } 19949 ASSERT(pkt != NULL); 19950 19951 /* calculate IP header and TCP checksums */ 19952 if (af == AF_INET) { 19953 /* calculate pseudo-header checksum */ 19954 cksum = (dst >> 16) + (dst & 0xFFFF) + 19955 (src >> 16) + (src & 0xFFFF); 19956 19957 /* offset for TCP header checksum */ 19958 up = IPH_TCPH_CHECKSUMP(ipha, 19959 IP_SIMPLE_HDR_LENGTH); 19960 } else { 19961 up = (uint16_t *)&ip6h->ip6_src; 19962 19963 /* calculate pseudo-header checksum */ 19964 cksum = up[0] + up[1] + up[2] + up[3] + 19965 up[4] + up[5] + up[6] + up[7] + 19966 up[8] + up[9] + up[10] + up[11] + 19967 up[12] + up[13] + up[14] + up[15]; 19968 19969 /* Fold the initial sum */ 19970 cksum = (cksum & 0xffff) + (cksum >> 16); 19971 19972 up = (uint16_t *)(((uchar_t *)ip6h) + 19973 IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET); 19974 } 19975 19976 if (hwcksum_flags & HCK_FULLCKSUM) { 19977 /* clear checksum field for hardware */ 19978 *up = 0; 19979 } else if (hwcksum_flags & HCK_PARTIALCKSUM) { 19980 uint32_t sum; 19981 19982 /* pseudo-header checksumming */ 19983 sum = *up + cksum + IP_TCP_CSUM_COMP; 19984 sum = (sum & 0xFFFF) + (sum >> 16); 19985 *up = (sum & 0xFFFF) + (sum >> 16); 19986 } else { 19987 /* software checksumming */ 19988 TCP_STAT(tcp_out_sw_cksum); 19989 TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, 19990 tcp->tcp_hdr_len + tcp->tcp_last_sent_len); 19991 *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len, 19992 cksum + IP_TCP_CSUM_COMP); 19993 if (*up == 0) 19994 *up = 0xFFFF; 19995 } 19996 19997 /* IPv4 header checksum */ 19998 if (af == AF_INET) { 19999 ipha->ipha_fragment_offset_and_flags |= 20000 (uint32_t)htons(ire->ire_frag_flag); 20001 20002 if (hwcksum_flags & HCK_IPV4_HDRCKSUM) { 20003 ipha->ipha_hdr_checksum = 0; 20004 } else { 20005 IP_HDR_CKSUM(ipha, cksum, 20006 ((uint32_t *)ipha)[0], 20007 ((uint16_t *)ipha)[4]); 20008 } 20009 } 20010 20011 /* advance header offset */ 20012 cur_hdr_off += hdr_frag_sz; 20013 20014 obbytes += tcp->tcp_last_sent_len; 20015 ++obsegs; 20016 } while (!done && *usable > 0 && --num_burst_seg > 0 && 20017 *tail_unsent > 0); 20018 20019 if ((*xmit_tail)->b_next == NULL) { 20020 /* 20021 * Store the lbolt used for RTT estimation. We can only 20022 * record one timestamp per mblk so we do it when we 20023 * reach the end of the payload buffer. Also we only 20024 * take a new timestamp sample when the previous timed 20025 * data from the same mblk has been ack'ed. 20026 */ 20027 (*xmit_tail)->b_prev = local_time; 20028 (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)first_snxt; 20029 } 20030 20031 ASSERT(*tail_unsent >= 0); 20032 if (*tail_unsent > 0) { 20033 /* 20034 * We got here because we broke out of the above 20035 * loop due to of one of the following cases: 20036 * 20037 * 1. len < adjusted MSS (i.e. small), 20038 * 2. Sender SWS avoidance, 20039 * 3. max_pld is zero. 20040 * 20041 * We are done for this Multidata, so trim our 20042 * last payload buffer (if any) accordingly. 
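 * Trimming b_wptr by *tail_unsent keeps the payload buffer handed to
 * the Multidata covering only the bytes described by the packet
 * descriptors added above.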
20043 */ 20044 if (md_pbuf != NULL) 20045 md_pbuf->b_wptr -= *tail_unsent; 20046 } else if (*usable > 0) { 20047 *xmit_tail = (*xmit_tail)->b_cont; 20048 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 20049 (uintptr_t)INT_MAX); 20050 *tail_unsent = (int)MBLKL(*xmit_tail); 20051 add_buffer = B_TRUE; 20052 } 20053 } while (!done && *usable > 0 && num_burst_seg > 0 && 20054 (tcp_mdt_chain || max_pld > 0)); 20055 20056 /* send everything down */ 20057 tcp_multisend_data(tcp, ire, ill, md_mp_head, obsegs, obbytes, 20058 &rconfirm); 20059 20060 #undef PREP_NEW_MULTIDATA 20061 #undef PREP_NEW_PBUF 20062 #undef IPVER 20063 20064 IRE_REFRELE(ire); 20065 return (0); 20066 } 20067 20068 /* 20069 * A wrapper function for sending one or more Multidata messages down to 20070 * the module below ip; this routine does not release the reference of the 20071 * IRE (caller does that). This routine is analogous to tcp_send_data(). 20072 */ 20073 static void 20074 tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, 20075 const uint_t obsegs, const uint_t obbytes, boolean_t *rconfirm) 20076 { 20077 uint64_t delta; 20078 nce_t *nce; 20079 20080 ASSERT(ire != NULL && ill != NULL); 20081 ASSERT(ire->ire_stq != NULL); 20082 ASSERT(md_mp_head != NULL); 20083 ASSERT(rconfirm != NULL); 20084 20085 /* adjust MIBs and IRE timestamp */ 20086 TCP_RECORD_TRACE(tcp, md_mp_head, TCP_TRACE_SEND_PKT); 20087 tcp->tcp_obsegs += obsegs; 20088 UPDATE_MIB(&tcp_mib, tcpOutDataSegs, obsegs); 20089 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, obbytes); 20090 TCP_STAT_UPDATE(tcp_mdt_pkt_out, obsegs); 20091 20092 if (tcp->tcp_ipversion == IPV4_VERSION) { 20093 TCP_STAT_UPDATE(tcp_mdt_pkt_out_v4, obsegs); 20094 UPDATE_MIB(&ip_mib, ipOutRequests, obsegs); 20095 } else { 20096 TCP_STAT_UPDATE(tcp_mdt_pkt_out_v6, obsegs); 20097 UPDATE_MIB(&ip6_mib, ipv6OutRequests, obsegs); 20098 } 20099 20100 ire->ire_ob_pkt_count += obsegs; 20101 if (ire->ire_ipif != NULL) 20102 atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, obsegs); 20103 ire->ire_last_used_time = lbolt; 20104 20105 /* send it down */ 20106 putnext(ire->ire_stq, md_mp_head); 20107 20108 /* we're done for TCP/IPv4 */ 20109 if (tcp->tcp_ipversion == IPV4_VERSION) 20110 return; 20111 20112 nce = ire->ire_nce; 20113 20114 ASSERT(nce != NULL); 20115 ASSERT(!(nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT))); 20116 ASSERT(nce->nce_state != ND_INCOMPLETE); 20117 20118 /* reachability confirmation? */ 20119 if (*rconfirm) { 20120 nce->nce_last = TICK_TO_MSEC(lbolt64); 20121 if (nce->nce_state != ND_REACHABLE) { 20122 mutex_enter(&nce->nce_lock); 20123 nce->nce_state = ND_REACHABLE; 20124 nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; 20125 mutex_exit(&nce->nce_lock); 20126 (void) untimeout(nce->nce_timeout_id); 20127 if (ip_debug > 2) { 20128 /* ip1dbg */ 20129 pr_addr_dbg("tcp_multisend_data: state " 20130 "for %s changed to REACHABLE\n", 20131 AF_INET6, &ire->ire_addr_v6); 20132 } 20133 } 20134 /* reset transport reachability confirmation */ 20135 *rconfirm = B_FALSE; 20136 } 20137 20138 delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; 20139 ip1dbg(("tcp_multisend_data: delta = %" PRId64 20140 " ill_reachable_time = %d \n", delta, ill->ill_reachable_time)); 20141 20142 if (delta > (uint64_t)ill->ill_reachable_time) { 20143 mutex_enter(&nce->nce_lock); 20144 switch (nce->nce_state) { 20145 case ND_REACHABLE: 20146 case ND_STALE: 20147 /* 20148 * ND_REACHABLE is identical to ND_STALE in this 20149 * specific case. 
If reachable time has expired for 20150 * this neighbor (delta is greater than reachable 20151 * time), conceptually, the neighbor cache is no 20152 * longer in REACHABLE state, but already in STALE 20153 * state. So the correct transition here is to 20154 * ND_DELAY. 20155 */ 20156 nce->nce_state = ND_DELAY; 20157 mutex_exit(&nce->nce_lock); 20158 NDP_RESTART_TIMER(nce, delay_first_probe_time); 20159 if (ip_debug > 3) { 20160 /* ip2dbg */ 20161 pr_addr_dbg("tcp_multisend_data: state " 20162 "for %s changed to DELAY\n", 20163 AF_INET6, &ire->ire_addr_v6); 20164 } 20165 break; 20166 case ND_DELAY: 20167 case ND_PROBE: 20168 mutex_exit(&nce->nce_lock); 20169 /* Timers have already started */ 20170 break; 20171 case ND_UNREACHABLE: 20172 /* 20173 * ndp timer has detected that this nce is 20174 * unreachable and initiated deleting this nce 20175 * and all its associated IREs. This is a race 20176 * where we found the ire before it was deleted 20177 * and have just sent out a packet using this 20178 * unreachable nce. 20179 */ 20180 mutex_exit(&nce->nce_lock); 20181 break; 20182 default: 20183 ASSERT(0); 20184 } 20185 } 20186 } 20187 20188 /* 20189 * tcp_send() is called by tcp_wput_data() for non-Multidata transmission 20190 * scheme, and returns one of the following: 20191 * 20192 * -1 = failed allocation. 20193 * 0 = success; burst count reached, or usable send window is too small, 20194 * and that we'd rather wait until later before sending again. 20195 * 1 = success; we are called from tcp_multisend(), and both usable send 20196 * window and tail_unsent are greater than the MDT threshold, and thus 20197 * Multidata Transmit should be used instead. 20198 */ 20199 static int 20200 tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, 20201 const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, 20202 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 20203 const int mdt_thres) 20204 { 20205 int num_burst_seg = tcp->tcp_snd_burst; 20206 20207 for (;;) { 20208 struct datab *db; 20209 tcph_t *tcph; 20210 uint32_t sum; 20211 mblk_t *mp, *mp1; 20212 uchar_t *rptr; 20213 int len; 20214 20215 /* 20216 * If we're called by tcp_multisend(), and the amount of 20217 * sendable data as well as the size of current xmit_tail 20218 * is beyond the MDT threshold, return to the caller and 20219 * let the large data transmit be done using MDT. 20220 */ 20221 if (*usable > 0 && *usable > mdt_thres && 20222 (*tail_unsent > mdt_thres || (*tail_unsent == 0 && 20223 MBLKL((*xmit_tail)->b_cont) > mdt_thres))) { 20224 ASSERT(tcp->tcp_mdt); 20225 return (1); /* success; do large send */ 20226 } 20227 20228 if (num_burst_seg-- == 0) 20229 break; /* success; burst count reached */ 20230 20231 len = mss; 20232 if (len > *usable) { 20233 len = *usable; 20234 if (len <= 0) { 20235 /* Terminate the loop */ 20236 break; /* success; too small */ 20237 } 20238 /* 20239 * Sender silly-window avoidance. 20240 * Ignore this if we are going to send a 20241 * zero window probe out. 20242 * 20243 * TODO: force data into microscopic window? 20244 * ==> (!pushed || (unsent > usable)) 20245 */ 20246 if (len < (tcp->tcp_max_swnd >> 1) && 20247 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len && 20248 !((tcp->tcp_valid_bits & TCP_URG_VALID) && 20249 len == 1) && (! tcp->tcp_zero_win_probe)) { 20250 /* 20251 * If the retransmit timer is not running 20252 * we start it so that we will retransmit 20253 * in the case when the the receiver has 20254 * decremented the window. 
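 *
 * Rough illustration of the silly-window test above (hypothetical
 * numbers, added for clarity): with mss = 1460 and *usable = 300,
 * len becomes 300; if tcp_max_swnd is 8760 then 300 < 4380, and
 * with, say, 4096 bytes still unsent (no urgent byte, no zero
 * window probe) we decline to emit a 300-byte runt segment.  The
 * timer is armed here so that a receiver which shrank its window
 * cannot leave us stalled with data queued and no timer running.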
20255 */ 20256 if (*snxt == tcp->tcp_snxt && 20257 *snxt == tcp->tcp_suna) { 20258 /* 20259 * We are not supposed to send 20260 * anything. So let's wait a little 20261 * bit longer before breaking SWS 20262 * avoidance. 20263 * 20264 * What should the value be? 20265 * Suggestion: MAX(init rexmit time, 20266 * tcp->tcp_rto) 20267 */ 20268 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 20269 } 20270 break; /* success; too small */ 20271 } 20272 } 20273 20274 tcph = tcp->tcp_tcph; 20275 20276 *usable -= len; /* Approximate - can be adjusted later */ 20277 if (*usable > 0) 20278 tcph->th_flags[0] = TH_ACK; 20279 else 20280 tcph->th_flags[0] = (TH_ACK | TH_PUSH); 20281 20282 /* 20283 * Prime pump for IP's checksumming on our behalf 20284 * Include the adjustment for a source route if any. 20285 */ 20286 sum = len + tcp_tcp_hdr_len + tcp->tcp_sum; 20287 sum = (sum >> 16) + (sum & 0xFFFF); 20288 U16_TO_ABE16(sum, tcph->th_sum); 20289 20290 U32_TO_ABE32(*snxt, tcph->th_seq); 20291 20292 /* 20293 * Branch off to tcp_xmit_mp() if any of the VALID bits is 20294 * set. For the case when TCP_FSS_VALID is the only valid 20295 * bit (normal active close), branch off only when we think 20296 * that the FIN flag needs to be set. Note for this case, 20297 * that (snxt + len) may not reflect the actual seg_len, 20298 * as len may be further reduced in tcp_xmit_mp(). If len 20299 * gets modified, we will end up here again. 20300 */ 20301 if (tcp->tcp_valid_bits != 0 && 20302 (tcp->tcp_valid_bits != TCP_FSS_VALID || 20303 ((*snxt + len) == tcp->tcp_fss))) { 20304 uchar_t *prev_rptr; 20305 uint32_t prev_snxt = tcp->tcp_snxt; 20306 20307 if (*tail_unsent == 0) { 20308 ASSERT((*xmit_tail)->b_cont != NULL); 20309 *xmit_tail = (*xmit_tail)->b_cont; 20310 prev_rptr = (*xmit_tail)->b_rptr; 20311 *tail_unsent = (int)((*xmit_tail)->b_wptr - 20312 (*xmit_tail)->b_rptr); 20313 } else { 20314 prev_rptr = (*xmit_tail)->b_rptr; 20315 (*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr - 20316 *tail_unsent; 20317 } 20318 mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL, 20319 *snxt, B_FALSE, (uint32_t *)&len, B_FALSE); 20320 /* Restore tcp_snxt so we get amount sent right. */ 20321 tcp->tcp_snxt = prev_snxt; 20322 if (prev_rptr == (*xmit_tail)->b_rptr) { 20323 /* 20324 * If the previous timestamp is still in use, 20325 * don't stomp on it. 20326 */ 20327 if ((*xmit_tail)->b_next == NULL) { 20328 (*xmit_tail)->b_prev = local_time; 20329 (*xmit_tail)->b_next = 20330 (mblk_t *)(uintptr_t)(*snxt); 20331 } 20332 } else 20333 (*xmit_tail)->b_rptr = prev_rptr; 20334 20335 if (mp == NULL) 20336 return (-1); 20337 mp1 = mp->b_cont; 20338 20339 tcp->tcp_last_sent_len = (ushort_t)len; 20340 while (mp1->b_cont) { 20341 *xmit_tail = (*xmit_tail)->b_cont; 20342 (*xmit_tail)->b_prev = local_time; 20343 (*xmit_tail)->b_next = 20344 (mblk_t *)(uintptr_t)(*snxt); 20345 mp1 = mp1->b_cont; 20346 } 20347 *snxt += len; 20348 *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; 20349 BUMP_LOCAL(tcp->tcp_obsegs); 20350 BUMP_MIB(&tcp_mib, tcpOutDataSegs); 20351 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); 20352 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 20353 tcp_send_data(tcp, q, mp); 20354 continue; 20355 } 20356 20357 *snxt += len; /* Adjust later if we don't send all of len */ 20358 BUMP_MIB(&tcp_mib, tcpOutDataSegs); 20359 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); 20360 20361 if (*tail_unsent) { 20362 /* Are the bytes above us in flight? 
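 * That is: if rptr below has advanced past b_rptr, the leading
 * bytes of this mblk were already carried by an earlier segment,
 * so we cannot lay a header down in front of the data in place;
 * we dupb() the block and jump to must_alloc to get a separate
 * header mblk.  (Descriptive note added for clarity.)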
*/ 20363 rptr = (*xmit_tail)->b_wptr - *tail_unsent; 20364 if (rptr != (*xmit_tail)->b_rptr) { 20365 *tail_unsent -= len; 20366 tcp->tcp_last_sent_len = (ushort_t)len; 20367 len += tcp_hdr_len; 20368 if (tcp->tcp_ipversion == IPV4_VERSION) 20369 tcp->tcp_ipha->ipha_length = htons(len); 20370 else 20371 tcp->tcp_ip6h->ip6_plen = 20372 htons(len - 20373 ((char *)&tcp->tcp_ip6h[1] - 20374 tcp->tcp_iphc)); 20375 mp = dupb(*xmit_tail); 20376 if (!mp) 20377 return (-1); /* out_of_mem */ 20378 mp->b_rptr = rptr; 20379 /* 20380 * If the old timestamp is no longer in use, 20381 * sample a new timestamp now. 20382 */ 20383 if ((*xmit_tail)->b_next == NULL) { 20384 (*xmit_tail)->b_prev = local_time; 20385 (*xmit_tail)->b_next = 20386 (mblk_t *)(uintptr_t)(*snxt-len); 20387 } 20388 goto must_alloc; 20389 } 20390 } else { 20391 *xmit_tail = (*xmit_tail)->b_cont; 20392 ASSERT((uintptr_t)((*xmit_tail)->b_wptr - 20393 (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX); 20394 *tail_unsent = (int)((*xmit_tail)->b_wptr - 20395 (*xmit_tail)->b_rptr); 20396 } 20397 20398 (*xmit_tail)->b_prev = local_time; 20399 (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len); 20400 20401 *tail_unsent -= len; 20402 tcp->tcp_last_sent_len = (ushort_t)len; 20403 20404 len += tcp_hdr_len; 20405 if (tcp->tcp_ipversion == IPV4_VERSION) 20406 tcp->tcp_ipha->ipha_length = htons(len); 20407 else 20408 tcp->tcp_ip6h->ip6_plen = htons(len - 20409 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 20410 20411 mp = dupb(*xmit_tail); 20412 if (!mp) 20413 return (-1); /* out_of_mem */ 20414 20415 len = tcp_hdr_len; 20416 /* 20417 * There are four reasons to allocate a new hdr mblk: 20418 * 1) The bytes above us are in use by another packet 20419 * 2) We don't have good alignment 20420 * 3) The mblk is being shared 20421 * 4) We don't have enough room for a header 20422 */ 20423 rptr = mp->b_rptr - len; 20424 if (!OK_32PTR(rptr) || 20425 ((db = mp->b_datap), db->db_ref != 2) || 20426 rptr < db->db_base) { 20427 /* NOTE: we assume allocb returns an OK_32PTR */ 20428 20429 must_alloc:; 20430 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 20431 tcp_wroff_xtra, BPRI_MED); 20432 if (!mp1) { 20433 freemsg(mp); 20434 return (-1); /* out_of_mem */ 20435 } 20436 mp1->b_cont = mp; 20437 mp = mp1; 20438 /* Leave room for Link Level header */ 20439 len = tcp_hdr_len; 20440 rptr = &mp->b_rptr[tcp_wroff_xtra]; 20441 mp->b_wptr = &rptr[len]; 20442 } 20443 20444 /* 20445 * Fill in the header using the template header, and add 20446 * options such as time-stamp, ECN and/or SACK, as needed. 20447 */ 20448 tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); 20449 20450 mp->b_rptr = rptr; 20451 20452 if (*tail_unsent) { 20453 int spill = *tail_unsent; 20454 20455 mp1 = mp->b_cont; 20456 if (!mp1) 20457 mp1 = mp; 20458 20459 /* 20460 * If we're a little short, tack on more mblks until 20461 * there is no more spillover. 20462 */ 20463 while (spill < 0) { 20464 mblk_t *nmp; 20465 int nmpsz; 20466 20467 nmp = (*xmit_tail)->b_cont; 20468 nmpsz = MBLKL(nmp); 20469 20470 /* 20471 * Excess data in mblk; can we split it? 20472 * If MDT is enabled for the connection, 20473 * keep on splitting as this is a transient 20474 * send path. 20475 */ 20476 if (!tcp->tcp_mdt && (spill + nmpsz > 0)) { 20477 /* 20478 * Don't split if stream head was 20479 * told to break up larger writes 20480 * into smaller ones. 
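 *
 * (Added note, our reading of the check:) tcp_maxpsz > 0 means
 * tcp_maxpsz_set() asked the stream head to chop application
 * writes at a fixed size, so the mblk boundaries already reflect
 * a deliberate sizing policy; rather than split the next mblk
 * and defeat that, we stop here, send a shorter segment, and
 * make up the shortfall in a later one.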
20481 */ 20482 if (tcp->tcp_maxpsz > 0) 20483 break; 20484 20485 /* 20486 * Next mblk is less than SMSS/2 20487 * rounded up to nearest 64-byte; 20488 * let it get sent as part of the 20489 * next segment. 20490 */ 20491 if (tcp->tcp_localnet && 20492 !tcp->tcp_cork && 20493 (nmpsz < roundup((mss >> 1), 64))) 20494 break; 20495 } 20496 20497 *xmit_tail = nmp; 20498 ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX); 20499 /* Stash for rtt use later */ 20500 (*xmit_tail)->b_prev = local_time; 20501 (*xmit_tail)->b_next = 20502 (mblk_t *)(uintptr_t)(*snxt - len); 20503 mp1->b_cont = dupb(*xmit_tail); 20504 mp1 = mp1->b_cont; 20505 20506 spill += nmpsz; 20507 if (mp1 == NULL) { 20508 *tail_unsent = spill; 20509 freemsg(mp); 20510 return (-1); /* out_of_mem */ 20511 } 20512 } 20513 20514 /* Trim back any surplus on the last mblk */ 20515 if (spill >= 0) { 20516 mp1->b_wptr -= spill; 20517 *tail_unsent = spill; 20518 } else { 20519 /* 20520 * We did not send everything we could in 20521 * order to remain within the b_cont limit. 20522 */ 20523 *usable -= spill; 20524 *snxt += spill; 20525 tcp->tcp_last_sent_len += spill; 20526 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, spill); 20527 /* 20528 * Adjust the checksum 20529 */ 20530 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 20531 sum += spill; 20532 sum = (sum >> 16) + (sum & 0xFFFF); 20533 U16_TO_ABE16(sum, tcph->th_sum); 20534 if (tcp->tcp_ipversion == IPV4_VERSION) { 20535 sum = ntohs( 20536 ((ipha_t *)rptr)->ipha_length) + 20537 spill; 20538 ((ipha_t *)rptr)->ipha_length = 20539 htons(sum); 20540 } else { 20541 sum = ntohs( 20542 ((ip6_t *)rptr)->ip6_plen) + 20543 spill; 20544 ((ip6_t *)rptr)->ip6_plen = 20545 htons(sum); 20546 } 20547 *tail_unsent = 0; 20548 } 20549 } 20550 if (tcp->tcp_ip_forward_progress) { 20551 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 20552 *(uint32_t *)mp->b_rptr |= IP_FORWARD_PROG; 20553 tcp->tcp_ip_forward_progress = B_FALSE; 20554 } 20555 20556 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 20557 tcp_send_data(tcp, q, mp); 20558 BUMP_LOCAL(tcp->tcp_obsegs); 20559 } 20560 20561 return (0); 20562 } 20563 20564 /* Unlink and return any mblk that looks like it contains a MDT info */ 20565 static mblk_t * 20566 tcp_mdt_info_mp(mblk_t *mp) 20567 { 20568 mblk_t *prev_mp; 20569 20570 for (;;) { 20571 prev_mp = mp; 20572 /* no more to process? */ 20573 if ((mp = mp->b_cont) == NULL) 20574 break; 20575 20576 switch (DB_TYPE(mp)) { 20577 case M_CTL: 20578 if (*(uint32_t *)mp->b_rptr != MDT_IOC_INFO_UPDATE) 20579 continue; 20580 ASSERT(prev_mp != NULL); 20581 prev_mp->b_cont = mp->b_cont; 20582 mp->b_cont = NULL; 20583 return (mp); 20584 default: 20585 break; 20586 } 20587 } 20588 return (mp); 20589 } 20590 20591 /* MDT info update routine, called when IP notifies us about MDT */ 20592 static void 20593 tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first) 20594 { 20595 boolean_t prev_state; 20596 20597 /* 20598 * IP is telling us to abort MDT on this connection? We know 20599 * this because the capability is only turned off when IP 20600 * encounters some pathological cases, e.g. link-layer change 20601 * where the new driver doesn't support MDT, or in situation 20602 * where MDT usage on the link-layer has been switched off. 20603 * IP would not have sent us the initial MDT_IOC_INFO_UPDATE 20604 * if the link-layer doesn't support MDT, and if it does, it 20605 * will indicate that the feature is to be turned on. 
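 *
 * Sketch of the checks that follow (illustrative figures only):
 * MDT stays on only for simple TCP/IPv4 or TCP/IPv6 headers and
 * an MDT_VERSION_2 driver; a driver advertising a span limit of
 * 1 or 2 is rejected, since a worst-case packet needs one header
 * span plus payload that may straddle two buffers.  A max_pld of
 * zero means "use the default" (tcp_mdt_max_pbufs), and the
 * header head/tail room is rounded up to a 4-byte multiple, so
 * an advertised head room of, say, 10 bytes (if above the tunable
 * minimum) becomes 12.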
20606 */ 20607 prev_state = tcp->tcp_mdt; 20608 tcp->tcp_mdt = (mdt_capab->ill_mdt_on != 0); 20609 if (!tcp->tcp_mdt && !first) { 20610 TCP_STAT(tcp_mdt_conn_halted3); 20611 ip1dbg(("tcp_mdt_update: disabling MDT for connp %p\n", 20612 (void *)tcp->tcp_connp)); 20613 } 20614 20615 /* 20616 * We currently only support MDT on simple TCP/{IPv4,IPv6}, 20617 * so disable MDT otherwise. The checks are done here 20618 * and in tcp_wput_data(). 20619 */ 20620 if (tcp->tcp_mdt && 20621 (tcp->tcp_ipversion == IPV4_VERSION && 20622 tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || 20623 (tcp->tcp_ipversion == IPV6_VERSION && 20624 tcp->tcp_ip_hdr_len != IPV6_HDR_LEN)) 20625 tcp->tcp_mdt = B_FALSE; 20626 20627 if (tcp->tcp_mdt) { 20628 if (mdt_capab->ill_mdt_version != MDT_VERSION_2) { 20629 cmn_err(CE_NOTE, "tcp_mdt_update: unknown MDT " 20630 "version (%d), expected version is %d", 20631 mdt_capab->ill_mdt_version, MDT_VERSION_2); 20632 tcp->tcp_mdt = B_FALSE; 20633 return; 20634 } 20635 20636 /* 20637 * We need the driver to be able to handle at least three 20638 * spans per packet in order for tcp MDT to be utilized. 20639 * The first is for the header portion, while the rest are 20640 * needed to handle a packet that straddles across two 20641 * virtually non-contiguous buffers; a typical tcp packet 20642 * therefore consists of only two spans. Note that we take 20643 * a zero as "don't care". 20644 */ 20645 if (mdt_capab->ill_mdt_span_limit > 0 && 20646 mdt_capab->ill_mdt_span_limit < 3) { 20647 tcp->tcp_mdt = B_FALSE; 20648 return; 20649 } 20650 20651 /* a zero means driver wants default value */ 20652 tcp->tcp_mdt_max_pld = MIN(mdt_capab->ill_mdt_max_pld, 20653 tcp_mdt_max_pbufs); 20654 if (tcp->tcp_mdt_max_pld == 0) 20655 tcp->tcp_mdt_max_pld = tcp_mdt_max_pbufs; 20656 20657 /* ensure 32-bit alignment */ 20658 tcp->tcp_mdt_hdr_head = roundup(MAX(tcp_mdt_hdr_head_min, 20659 mdt_capab->ill_mdt_hdr_head), 4); 20660 tcp->tcp_mdt_hdr_tail = roundup(MAX(tcp_mdt_hdr_tail_min, 20661 mdt_capab->ill_mdt_hdr_tail), 4); 20662 20663 if (!first && !prev_state) { 20664 TCP_STAT(tcp_mdt_conn_resumed2); 20665 ip1dbg(("tcp_mdt_update: reenabling MDT for connp %p\n", 20666 (void *)tcp->tcp_connp)); 20667 } 20668 } 20669 } 20670 20671 static void 20672 tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_mdt) 20673 { 20674 conn_t *connp = tcp->tcp_connp; 20675 20676 ASSERT(ire != NULL); 20677 20678 /* 20679 * We may be in the fastpath here, and although we essentially do 20680 * similar checks as in ip_bind_connected{_v6}/ip_mdinfo_return, 20681 * we try to keep things as brief as possible. After all, these 20682 * are only best-effort checks, and we do more thorough ones prior 20683 * to calling tcp_multisend(). 20684 */ 20685 if (ip_multidata_outbound && check_mdt && 20686 !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 20687 ill != NULL && ILL_MDT_CAPABLE(ill) && 20688 !CONN_IPSEC_OUT_ENCAPSULATED(connp) && 20689 !(ire->ire_flags & RTF_MULTIRT) && 20690 !IPP_ENABLED(IPP_LOCAL_OUT) && 20691 CONN_IS_MD_FASTPATH(connp)) { 20692 /* Remember the result */ 20693 connp->conn_mdt_ok = B_TRUE; 20694 20695 ASSERT(ill->ill_mdt_capab != NULL); 20696 if (!ill->ill_mdt_capab->ill_mdt_on) { 20697 /* 20698 * If MDT has been previously turned off in the past, 20699 * and we currently can do MDT (due to IPQoS policy 20700 * removal, etc.) then enable it for this interface. 
20701 */
20702 ill->ill_mdt_capab->ill_mdt_on = 1;
20703 ip1dbg(("tcp_ire_ill_check: connp %p enables MDT for "
20704 "interface %s\n", (void *)connp, ill->ill_name));
20705 }
20706 tcp_mdt_update(tcp, ill->ill_mdt_capab, B_TRUE);
20707 }
20708
20709 /*
20710 * The goal is to reduce the number of generated tcp segments by
20711 * setting the maxpsz multiplier to 0; this will have an effect on
20712 * tcp_maxpsz_set(). With this behavior, tcp will pack more data
20713 * into each packet, up to SMSS bytes. Doing this reduces the number
20714 * of outbound segments and incoming ACKs, thus allowing for better
20715 * network and system performance. In contrast, the legacy behavior
20716 * may result in sending less than SMSS size, because the last mblk
20717 * for some packets may have more data than needed to make up SMSS,
20718 * and the legacy code refused to "split" it.
20719 *
20720 * We apply the new behavior in the following situations:
20721 *
20722 * 1) Loopback connections,
20723 * 2) Connections in which the remote peer is not on the local subnet,
20724 * 3) Local subnet connections over the bge interface (see below).
20725 *
20726 * Ideally, we would like this behavior to apply to interfaces other
20727 * than bge. However, doing so would negatively impact drivers which
20728 * perform dynamic mapping and unmapping of DMA resources, which are
20729 * increased by setting the maxpsz multiplier to 0 (more mblks per
20730 * packet will be generated by tcp). The bge driver does not suffer
20731 * from this, as it copies the mblks into pre-mapped buffers, and
20732 * therefore does not require more I/O resources than before.
20733 *
20734 * Otherwise, this behavior is present on all network interfaces when
20735 * the destination endpoint is non-local, since reducing the number
20736 * of packets in general is good for the network.
20737 *
20738 * TODO We need to remove this hard-coded conditional for bge once
20739 * a better "self-tuning" mechanism, or a way to comprehend
20740 * the driver transmit strategy, is devised. Until the solution
20741 * is found and well understood, we live with this hack.
20742 */
20743 if (!tcp_static_maxpsz &&
20744 (tcp->tcp_loopback || !tcp->tcp_localnet ||
20745 (ill->ill_name_length > 3 && bcmp(ill->ill_name, "bge", 3) == 0))) {
20746 /* override the default value */
20747 tcp->tcp_maxpsz = 0;
20748
20749 ip3dbg(("tcp_ire_ill_check: connp %p tcp_maxpsz %d on "
20750 "interface %s\n", (void *)connp, tcp->tcp_maxpsz,
20751 ill != NULL ? ill->ill_name : ipif_loopback_name));
20752 }
20753
20754 /* set the stream head parameters accordingly */
20755 (void) tcp_maxpsz_set(tcp, B_TRUE);
20756 }
20757
20758 /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */
20759 static void
20760 tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
20761 {
20762 uchar_t fval = *mp->b_rptr;
20763 mblk_t *tail;
20764 queue_t *q = tcp->tcp_wq;
20765
20766 /* TODO: How should flush interact with urgent data? */
20767 if ((fval & FLUSHW) && tcp->tcp_xmit_head &&
20768 !(tcp->tcp_valid_bits & TCP_URG_VALID)) {
20769 /*
20770 * Flush only data that has not yet been put on the wire. If
20771 * we flush data that we have already transmitted, life, as we
20772 * know it, may come to an end.
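 *
 * Illustrative note (added; hypothetical user-level call): this
 * path is normally reached when an application or a pushed module
 * issues a STREAMS flush, e.g. something like
 *	ioctl(fd, I_FLUSH, FLUSHW);
 * which shows up here as an M_FLUSH message whose first data byte
 * (fval above) has FLUSHW set.  Only bytes queued on tcp_xmit_head
 * that have not yet been transmitted are thrown away.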
20773 */ 20774 tail = tcp->tcp_xmit_tail; 20775 tail->b_wptr -= tcp->tcp_xmit_tail_unsent; 20776 tcp->tcp_xmit_tail_unsent = 0; 20777 tcp->tcp_unsent = 0; 20778 if (tail->b_wptr != tail->b_rptr) 20779 tail = tail->b_cont; 20780 if (tail) { 20781 mblk_t **excess = &tcp->tcp_xmit_head; 20782 for (;;) { 20783 mblk_t *mp1 = *excess; 20784 if (mp1 == tail) 20785 break; 20786 tcp->tcp_xmit_tail = mp1; 20787 tcp->tcp_xmit_last = mp1; 20788 excess = &mp1->b_cont; 20789 } 20790 *excess = NULL; 20791 tcp_close_mpp(&tail); 20792 if (tcp->tcp_snd_zcopy_aware) 20793 tcp_zcopy_notify(tcp); 20794 } 20795 /* 20796 * We have no unsent data, so unsent must be less than 20797 * tcp_xmit_lowater, so re-enable flow. 20798 */ 20799 if (tcp->tcp_flow_stopped) { 20800 tcp_clrqfull(tcp); 20801 } 20802 } 20803 /* 20804 * TODO: you can't just flush these, you have to increase rwnd for one 20805 * thing. For another, how should urgent data interact? 20806 */ 20807 if (fval & FLUSHR) { 20808 *mp->b_rptr = fval & ~FLUSHW; 20809 /* XXX */ 20810 qreply(q, mp); 20811 return; 20812 } 20813 freemsg(mp); 20814 } 20815 20816 /* 20817 * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA 20818 * messages. 20819 */ 20820 static void 20821 tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) 20822 { 20823 mblk_t *mp1; 20824 STRUCT_HANDLE(strbuf, sb); 20825 uint16_t port; 20826 queue_t *q = tcp->tcp_wq; 20827 in6_addr_t v6addr; 20828 ipaddr_t v4addr; 20829 uint32_t flowinfo = 0; 20830 int addrlen; 20831 20832 /* Make sure it is one of ours. */ 20833 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 20834 case TI_GETMYNAME: 20835 case TI_GETPEERNAME: 20836 break; 20837 default: 20838 CALL_IP_WPUT(tcp->tcp_connp, q, mp); 20839 return; 20840 } 20841 switch (mi_copy_state(q, mp, &mp1)) { 20842 case -1: 20843 return; 20844 case MI_COPY_CASE(MI_COPY_IN, 1): 20845 break; 20846 case MI_COPY_CASE(MI_COPY_OUT, 1): 20847 /* Copy out the strbuf. */ 20848 mi_copyout(q, mp); 20849 return; 20850 case MI_COPY_CASE(MI_COPY_OUT, 2): 20851 /* All done. */ 20852 mi_copy_done(q, mp, 0); 20853 return; 20854 default: 20855 mi_copy_done(q, mp, EPROTO); 20856 return; 20857 } 20858 /* Check alignment of the strbuf */ 20859 if (!OK_32PTR(mp1->b_rptr)) { 20860 mi_copy_done(q, mp, EINVAL); 20861 return; 20862 } 20863 20864 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 20865 (void *)mp1->b_rptr); 20866 addrlen = tcp->tcp_family == AF_INET ? 
sizeof (sin_t) : sizeof (sin6_t); 20867 20868 if (STRUCT_FGET(sb, maxlen) < addrlen) { 20869 mi_copy_done(q, mp, EINVAL); 20870 return; 20871 } 20872 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 20873 case TI_GETMYNAME: 20874 if (tcp->tcp_family == AF_INET) { 20875 if (tcp->tcp_ipversion == IPV4_VERSION) { 20876 v4addr = tcp->tcp_ipha->ipha_src; 20877 } else { 20878 /* can't return an address in this case */ 20879 v4addr = 0; 20880 } 20881 } else { 20882 /* tcp->tcp_family == AF_INET6 */ 20883 if (tcp->tcp_ipversion == IPV4_VERSION) { 20884 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 20885 &v6addr); 20886 } else { 20887 v6addr = tcp->tcp_ip6h->ip6_src; 20888 } 20889 } 20890 port = tcp->tcp_lport; 20891 break; 20892 case TI_GETPEERNAME: 20893 if (tcp->tcp_family == AF_INET) { 20894 if (tcp->tcp_ipversion == IPV4_VERSION) { 20895 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6, 20896 v4addr); 20897 } else { 20898 /* can't return an address in this case */ 20899 v4addr = 0; 20900 } 20901 } else { 20902 /* tcp->tcp_family == AF_INET6) */ 20903 v6addr = tcp->tcp_remote_v6; 20904 if (tcp->tcp_ipversion == IPV6_VERSION) { 20905 /* 20906 * No flowinfo if tcp->tcp_ipversion is v4. 20907 * 20908 * flowinfo was already initialized to zero 20909 * where it was declared above, so only 20910 * set it if ipversion is v6. 20911 */ 20912 flowinfo = tcp->tcp_ip6h->ip6_vcf & 20913 ~IPV6_VERS_AND_FLOW_MASK; 20914 } 20915 } 20916 port = tcp->tcp_fport; 20917 break; 20918 default: 20919 mi_copy_done(q, mp, EPROTO); 20920 return; 20921 } 20922 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 20923 if (!mp1) 20924 return; 20925 20926 if (tcp->tcp_family == AF_INET) { 20927 sin_t *sin; 20928 20929 STRUCT_FSET(sb, len, (int)sizeof (sin_t)); 20930 sin = (sin_t *)mp1->b_rptr; 20931 mp1->b_wptr = (uchar_t *)&sin[1]; 20932 *sin = sin_null; 20933 sin->sin_family = AF_INET; 20934 sin->sin_addr.s_addr = v4addr; 20935 sin->sin_port = port; 20936 } else { 20937 /* tcp->tcp_family == AF_INET6 */ 20938 sin6_t *sin6; 20939 20940 STRUCT_FSET(sb, len, (int)sizeof (sin6_t)); 20941 sin6 = (sin6_t *)mp1->b_rptr; 20942 mp1->b_wptr = (uchar_t *)&sin6[1]; 20943 *sin6 = sin6_null; 20944 sin6->sin6_family = AF_INET6; 20945 sin6->sin6_flowinfo = flowinfo; 20946 sin6->sin6_addr = v6addr; 20947 sin6->sin6_port = port; 20948 } 20949 /* Copy out the address */ 20950 mi_copyout(q, mp); 20951 } 20952 20953 /* 20954 * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL 20955 * messages. 20956 */ 20957 /* ARGSUSED */ 20958 static void 20959 tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) 20960 { 20961 conn_t *connp = (conn_t *)arg; 20962 tcp_t *tcp = connp->conn_tcp; 20963 queue_t *q = tcp->tcp_wq; 20964 struct iocblk *iocp; 20965 20966 ASSERT(DB_TYPE(mp) == M_IOCTL); 20967 /* 20968 * Try and ASSERT the minimum possible references on the 20969 * conn early enough. Since we are executing on write side, 20970 * the connection is obviously not detached and that means 20971 * there is a ref each for TCP and IP. Since we are behind 20972 * the squeue, the minimum references needed are 3. If the 20973 * conn is in classifier hash list, there should be an 20974 * extra ref for that (we check both the possibilities). 20975 */ 20976 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 20977 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 20978 20979 iocp = (struct iocblk *)mp->b_rptr; 20980 switch (iocp->ioc_cmd) { 20981 case TCP_IOC_DEFAULT_Q: 20982 /* Wants to be the default wq. 
*/ 20983 if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { 20984 iocp->ioc_error = EPERM; 20985 iocp->ioc_count = 0; 20986 mp->b_datap->db_type = M_IOCACK; 20987 qreply(q, mp); 20988 return; 20989 } 20990 tcp_def_q_set(tcp, mp); 20991 return; 20992 case _SIOCSOCKFALLBACK: 20993 /* 20994 * Either sockmod is about to be popped and the socket 20995 * would now be treated as a plain stream, or a module 20996 * is about to be pushed so we could no longer use read- 20997 * side synchronous streams for fused loopback tcp. 20998 * Drain any queued data and disable direct sockfs 20999 * interface from now on. 21000 */ 21001 if (!tcp->tcp_issocket) { 21002 DB_TYPE(mp) = M_IOCNAK; 21003 iocp->ioc_error = EINVAL; 21004 } else { 21005 #ifdef _ILP32 21006 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); 21007 #else 21008 tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; 21009 #endif 21010 /* 21011 * Insert this socket into the acceptor hash. 21012 * We might need it for T_CONN_RES message 21013 */ 21014 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 21015 21016 if (tcp->tcp_fused) { 21017 /* 21018 * This is a fused loopback tcp; disable 21019 * read-side synchronous streams interface 21020 * and drain any queued data. It is okay 21021 * to do this for non-synchronous streams 21022 * fused tcp as well. 21023 */ 21024 tcp_fuse_disable_pair(tcp, B_FALSE); 21025 } 21026 tcp->tcp_issocket = B_FALSE; 21027 TCP_STAT(tcp_sock_fallback); 21028 21029 DB_TYPE(mp) = M_IOCACK; 21030 iocp->ioc_error = 0; 21031 } 21032 iocp->ioc_count = 0; 21033 iocp->ioc_rval = 0; 21034 qreply(q, mp); 21035 return; 21036 } 21037 CALL_IP_WPUT(connp, q, mp); 21038 } 21039 21040 /* 21041 * This routine is called by tcp_wput() to handle all TPI requests. 21042 */ 21043 /* ARGSUSED */ 21044 static void 21045 tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) 21046 { 21047 conn_t *connp = (conn_t *)arg; 21048 tcp_t *tcp = connp->conn_tcp; 21049 union T_primitives *tprim = (union T_primitives *)mp->b_rptr; 21050 uchar_t *rptr; 21051 t_scalar_t type; 21052 int len; 21053 cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred); 21054 21055 /* 21056 * Try and ASSERT the minimum possible references on the 21057 * conn early enough. Since we are executing on write side, 21058 * the connection is obviously not detached and that means 21059 * there is a ref each for TCP and IP. Since we are behind 21060 * the squeue, the minimum references needed are 3. If the 21061 * conn is in classifier hash list, there should be an 21062 * extra ref for that (we check both the possibilities). 21063 */ 21064 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 21065 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 21066 21067 rptr = mp->b_rptr; 21068 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 21069 if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 21070 type = ((union T_primitives *)rptr)->type; 21071 if (type == T_EXDATA_REQ) { 21072 uint32_t msize = msgdsize(mp->b_cont); 21073 21074 len = msize - 1; 21075 if (len < 0) { 21076 freemsg(mp); 21077 return; 21078 } 21079 /* 21080 * Try to force urgent data out on the wire. 21081 * Even if we have unsent data this will 21082 * at least send the urgent flag. 21083 * XXX does not handle more flag correctly. 
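 *
 * Worked example (hypothetical numbers, added for clarity): a
 * 10-byte T_EXDATA_REQ arriving with tcp_snxt = 1000 and
 * tcp_unsent = 50 gives
 *	len = (10 - 1) + 50 + 1000 = 1059,
 * i.e. tcp_urg below becomes the sequence number of the last
 * byte of the expedited data, and TCP_URG_VALID flags it for the
 * header-building code.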
21084 */ 21085 len += tcp->tcp_unsent; 21086 len += tcp->tcp_snxt; 21087 tcp->tcp_urg = len; 21088 tcp->tcp_valid_bits |= TCP_URG_VALID; 21089 21090 /* Bypass tcp protocol for fused tcp loopback */ 21091 if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 21092 return; 21093 } else if (type != T_DATA_REQ) { 21094 goto non_urgent_data; 21095 } 21096 /* TODO: options, flags, ... from user */ 21097 /* Set length to zero for reclamation below */ 21098 tcp_wput_data(tcp, mp->b_cont, B_TRUE); 21099 freeb(mp); 21100 return; 21101 } else { 21102 if (tcp->tcp_debug) { 21103 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 21104 "tcp_wput_proto, dropping one..."); 21105 } 21106 freemsg(mp); 21107 return; 21108 } 21109 21110 non_urgent_data: 21111 21112 switch ((int)tprim->type) { 21113 case T_SSL_PROXY_BIND_REQ: /* an SSL proxy endpoint bind request */ 21114 /* 21115 * save the kssl_ent_t from the next block, and convert this 21116 * back to a normal bind_req. 21117 */ 21118 if (mp->b_cont != NULL) { 21119 ASSERT(MBLKL(mp->b_cont) >= sizeof (kssl_ent_t)); 21120 21121 if (tcp->tcp_kssl_ent != NULL) { 21122 kssl_release_ent(tcp->tcp_kssl_ent, NULL, 21123 KSSL_NO_PROXY); 21124 tcp->tcp_kssl_ent = NULL; 21125 } 21126 bcopy(mp->b_cont->b_rptr, &tcp->tcp_kssl_ent, 21127 sizeof (kssl_ent_t)); 21128 kssl_hold_ent(tcp->tcp_kssl_ent); 21129 freemsg(mp->b_cont); 21130 mp->b_cont = NULL; 21131 } 21132 tprim->type = T_BIND_REQ; 21133 21134 /* FALLTHROUGH */ 21135 case O_T_BIND_REQ: /* bind request */ 21136 case T_BIND_REQ: /* new semantics bind request */ 21137 tcp_bind(tcp, mp); 21138 break; 21139 case T_UNBIND_REQ: /* unbind request */ 21140 tcp_unbind(tcp, mp); 21141 break; 21142 case O_T_CONN_RES: /* old connection response XXX */ 21143 case T_CONN_RES: /* connection response */ 21144 tcp_accept(tcp, mp); 21145 break; 21146 case T_CONN_REQ: /* connection request */ 21147 tcp_connect(tcp, mp); 21148 break; 21149 case T_DISCON_REQ: /* disconnect request */ 21150 tcp_disconnect(tcp, mp); 21151 break; 21152 case T_CAPABILITY_REQ: 21153 tcp_capability_req(tcp, mp); /* capability request */ 21154 break; 21155 case T_INFO_REQ: /* information request */ 21156 tcp_info_req(tcp, mp); 21157 break; 21158 case T_SVR4_OPTMGMT_REQ: /* manage options req */ 21159 /* Only IP is allowed to return meaningful value */ 21160 (void) svr4_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj); 21161 break; 21162 case T_OPTMGMT_REQ: 21163 /* 21164 * Note: no support for snmpcom_req() through new 21165 * T_OPTMGMT_REQ. See comments in ip.c 21166 */ 21167 /* Only IP is allowed to return meaningful value */ 21168 (void) tpi_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj); 21169 break; 21170 21171 case T_UNITDATA_REQ: /* unitdata request */ 21172 tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 21173 break; 21174 case T_ORDREL_REQ: /* orderly release req */ 21175 freemsg(mp); 21176 21177 if (tcp->tcp_fused) 21178 tcp_unfuse(tcp); 21179 21180 if (tcp_xmit_end(tcp) != 0) { 21181 /* 21182 * We were crossing FINs and got a reset from 21183 * the other side. Just ignore it. 
21184 */ 21185 if (tcp->tcp_debug) { 21186 (void) strlog(TCP_MOD_ID, 0, 1, 21187 SL_ERROR|SL_TRACE, 21188 "tcp_wput_proto, T_ORDREL_REQ out of " 21189 "state %s", 21190 tcp_display(tcp, NULL, 21191 DISP_ADDR_AND_PORT)); 21192 } 21193 } 21194 break; 21195 case T_ADDR_REQ: 21196 tcp_addr_req(tcp, mp); 21197 break; 21198 default: 21199 if (tcp->tcp_debug) { 21200 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 21201 "tcp_wput_proto, bogus TPI msg, type %d", 21202 tprim->type); 21203 } 21204 /* 21205 * We used to M_ERROR. Sending TNOTSUPPORT gives the user 21206 * to recover. 21207 */ 21208 tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 21209 break; 21210 } 21211 } 21212 21213 /* 21214 * The TCP write service routine should never be called... 21215 */ 21216 /* ARGSUSED */ 21217 static void 21218 tcp_wsrv(queue_t *q) 21219 { 21220 TCP_STAT(tcp_wsrv_called); 21221 } 21222 21223 /* Non overlapping byte exchanger */ 21224 static void 21225 tcp_xchg(uchar_t *a, uchar_t *b, int len) 21226 { 21227 uchar_t uch; 21228 21229 while (len-- > 0) { 21230 uch = a[len]; 21231 a[len] = b[len]; 21232 b[len] = uch; 21233 } 21234 } 21235 21236 /* 21237 * Send out a control packet on the tcp connection specified. This routine 21238 * is typically called where we need a simple ACK or RST generated. 21239 */ 21240 static void 21241 tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) 21242 { 21243 uchar_t *rptr; 21244 tcph_t *tcph; 21245 ipha_t *ipha = NULL; 21246 ip6_t *ip6h = NULL; 21247 uint32_t sum; 21248 int tcp_hdr_len; 21249 int tcp_ip_hdr_len; 21250 mblk_t *mp; 21251 21252 /* 21253 * Save sum for use in source route later. 21254 */ 21255 ASSERT(tcp != NULL); 21256 sum = tcp->tcp_tcp_hdr_len + tcp->tcp_sum; 21257 tcp_hdr_len = tcp->tcp_hdr_len; 21258 tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; 21259 21260 /* If a text string is passed in with the request, pass it to strlog. */ 21261 if (str != NULL && tcp->tcp_debug) { 21262 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 21263 "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", 21264 str, seq, ack, ctl); 21265 } 21266 mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 21267 BPRI_MED); 21268 if (mp == NULL) { 21269 return; 21270 } 21271 rptr = &mp->b_rptr[tcp_wroff_xtra]; 21272 mp->b_rptr = rptr; 21273 mp->b_wptr = &rptr[tcp_hdr_len]; 21274 bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); 21275 21276 if (tcp->tcp_ipversion == IPV4_VERSION) { 21277 ipha = (ipha_t *)rptr; 21278 ipha->ipha_length = htons(tcp_hdr_len); 21279 } else { 21280 ip6h = (ip6_t *)rptr; 21281 ASSERT(tcp != NULL); 21282 ip6h->ip6_plen = htons(tcp->tcp_hdr_len - 21283 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 21284 } 21285 tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; 21286 tcph->th_flags[0] = (uint8_t)ctl; 21287 if (ctl & TH_RST) { 21288 BUMP_MIB(&tcp_mib, tcpOutRsts); 21289 BUMP_MIB(&tcp_mib, tcpOutControl); 21290 /* 21291 * Don't send TSopt w/ TH_RST packets per RFC 1323. 
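 *
 * The adjustment below, roughly: the template header was built
 * with the 12-byte timestamp option (two NOPs plus the 10-byte
 * kind/len/TSval/TSecr option), so for a RST we pull b_wptr back
 * by TCPOPT_REAL_TS_LEN, shrink the IP length field by the same
 * 12 bytes, drop the TCP data offset by 3 words (the 3 << 4 in
 * the offset nibble), and keep the precomputed checksum component
 * in step by subtracting the option length from sum.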
21292 */ 21293 if (tcp->tcp_snd_ts_ok && 21294 tcp->tcp_state > TCPS_SYN_SENT) { 21295 mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; 21296 *(mp->b_wptr) = TCPOPT_EOL; 21297 if (tcp->tcp_ipversion == IPV4_VERSION) { 21298 ipha->ipha_length = htons(tcp_hdr_len - 21299 TCPOPT_REAL_TS_LEN); 21300 } else { 21301 ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - 21302 TCPOPT_REAL_TS_LEN); 21303 } 21304 tcph->th_offset_and_rsrvd[0] -= (3 << 4); 21305 sum -= TCPOPT_REAL_TS_LEN; 21306 } 21307 } 21308 if (ctl & TH_ACK) { 21309 if (tcp->tcp_snd_ts_ok) { 21310 U32_TO_BE32(lbolt, 21311 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 21312 U32_TO_BE32(tcp->tcp_ts_recent, 21313 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 21314 } 21315 21316 /* Update the latest receive window size in TCP header. */ 21317 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 21318 tcph->th_win); 21319 tcp->tcp_rack = ack; 21320 tcp->tcp_rack_cnt = 0; 21321 BUMP_MIB(&tcp_mib, tcpOutAck); 21322 } 21323 BUMP_LOCAL(tcp->tcp_obsegs); 21324 U32_TO_BE32(seq, tcph->th_seq); 21325 U32_TO_BE32(ack, tcph->th_ack); 21326 /* 21327 * Include the adjustment for a source route if any. 21328 */ 21329 sum = (sum >> 16) + (sum & 0xFFFF); 21330 U16_TO_BE16(sum, tcph->th_sum); 21331 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 21332 tcp_send_data(tcp, tcp->tcp_wq, mp); 21333 } 21334 21335 /* 21336 * If this routine returns B_TRUE, TCP can generate a RST in response 21337 * to a segment. If it returns B_FALSE, TCP should not respond. 21338 */ 21339 static boolean_t 21340 tcp_send_rst_chk(void) 21341 { 21342 clock_t now; 21343 21344 /* 21345 * TCP needs to protect itself from generating too many RSTs. 21346 * This can be a DoS attack by sending us random segments 21347 * soliciting RSTs. 21348 * 21349 * What we do here is to have a limit of tcp_rst_sent_rate RSTs 21350 * in each 1 second interval. In this way, TCP still generate 21351 * RSTs in normal cases but when under attack, the impact is 21352 * limited. 21353 */ 21354 if (tcp_rst_sent_rate_enabled != 0) { 21355 now = lbolt; 21356 /* lbolt can wrap around. */ 21357 if ((tcp_last_rst_intrvl > now) || 21358 (TICK_TO_MSEC(now - tcp_last_rst_intrvl) > 1*SECONDS)) { 21359 tcp_last_rst_intrvl = now; 21360 tcp_rst_cnt = 1; 21361 } else if (++tcp_rst_cnt > tcp_rst_sent_rate) { 21362 return (B_FALSE); 21363 } 21364 } 21365 return (B_TRUE); 21366 } 21367 21368 /* 21369 * Send down the advice IP ioctl to tell IP to mark an IRE temporary. 21370 */ 21371 static void 21372 tcp_ip_ire_mark_advice(tcp_t *tcp) 21373 { 21374 mblk_t *mp; 21375 ipic_t *ipic; 21376 21377 if (tcp->tcp_ipversion == IPV4_VERSION) { 21378 mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, 21379 &ipic); 21380 } else { 21381 mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, 21382 &ipic); 21383 } 21384 if (mp == NULL) 21385 return; 21386 ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; 21387 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 21388 } 21389 21390 /* 21391 * Return an IP advice ioctl mblk and set ipic to be the pointer 21392 * to the advice structure. 
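 *
 * Rough shape of what is handed back (layout sketch):
 *
 *	M_IOCTL  (iocblk: ioc_cmd = IP_IOCTL,
 *	          ioc_count = sizeof (ipic_t) + addr_len)
 *	  b_cont -> [ ipic_t | address bytes ]
 *
 * with ipic_addr_offset pointing just past the ipic_t, so callers
 * only fill in ipic_ire_marks, ipic_rtt, etc. and hand the whole
 * thing to IP via CALL_IP_WPUT().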
21393 */ 21394 static mblk_t * 21395 tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic) 21396 { 21397 struct iocblk *ioc; 21398 mblk_t *mp, *mp1; 21399 21400 mp = allocb(sizeof (ipic_t) + addr_len, BPRI_HI); 21401 if (mp == NULL) 21402 return (NULL); 21403 bzero(mp->b_rptr, sizeof (ipic_t) + addr_len); 21404 *ipic = (ipic_t *)mp->b_rptr; 21405 (*ipic)->ipic_cmd = IP_IOC_IRE_ADVISE_NO_REPLY; 21406 (*ipic)->ipic_addr_offset = sizeof (ipic_t); 21407 21408 bcopy(addr, *ipic + 1, addr_len); 21409 21410 (*ipic)->ipic_addr_length = addr_len; 21411 mp->b_wptr = &mp->b_rptr[sizeof (ipic_t) + addr_len]; 21412 21413 mp1 = mkiocb(IP_IOCTL); 21414 if (mp1 == NULL) { 21415 freemsg(mp); 21416 return (NULL); 21417 } 21418 mp1->b_cont = mp; 21419 ioc = (struct iocblk *)mp1->b_rptr; 21420 ioc->ioc_count = sizeof (ipic_t) + addr_len; 21421 21422 return (mp1); 21423 } 21424 21425 /* 21426 * Generate a reset based on an inbound packet for which there is no active 21427 * tcp state that we can find. 21428 * 21429 * IPSEC NOTE : Try to send the reply with the same protection as it came 21430 * in. We still have the ipsec_mp that the packet was attached to. Thus 21431 * the packet will go out at the same level of protection as it came in by 21432 * converting the IPSEC_IN to IPSEC_OUT. 21433 */ 21434 static void 21435 tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, 21436 uint32_t ack, int ctl, uint_t ip_hdr_len, zoneid_t zoneid) 21437 { 21438 ipha_t *ipha = NULL; 21439 ip6_t *ip6h = NULL; 21440 ushort_t len; 21441 tcph_t *tcph; 21442 int i; 21443 mblk_t *ipsec_mp; 21444 boolean_t mctl_present; 21445 ipic_t *ipic; 21446 ipaddr_t v4addr; 21447 in6_addr_t v6addr; 21448 int addr_len; 21449 void *addr; 21450 queue_t *q = tcp_g_q; 21451 tcp_t *tcp = Q_TO_TCP(q); 21452 cred_t *cr; 21453 mblk_t *nmp; 21454 21455 if (!tcp_send_rst_chk()) { 21456 tcp_rst_unsent++; 21457 freemsg(mp); 21458 return; 21459 } 21460 21461 if (mp->b_datap->db_type == M_CTL) { 21462 ipsec_mp = mp; 21463 mp = mp->b_cont; 21464 mctl_present = B_TRUE; 21465 } else { 21466 ipsec_mp = mp; 21467 mctl_present = B_FALSE; 21468 } 21469 21470 if (str && q && tcp_dbg) { 21471 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 21472 "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 21473 "flags 0x%x", 21474 str, seq, ack, ctl); 21475 } 21476 if (mp->b_datap->db_ref != 1) { 21477 mblk_t *mp1 = copyb(mp); 21478 freemsg(mp); 21479 mp = mp1; 21480 if (!mp) { 21481 if (mctl_present) 21482 freeb(ipsec_mp); 21483 return; 21484 } else { 21485 if (mctl_present) { 21486 ipsec_mp->b_cont = mp; 21487 } else { 21488 ipsec_mp = mp; 21489 } 21490 } 21491 } else if (mp->b_cont) { 21492 freemsg(mp->b_cont); 21493 mp->b_cont = NULL; 21494 } 21495 /* 21496 * We skip reversing source route here. 21497 * (for now we replace all IP options with EOL) 21498 */ 21499 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21500 ipha = (ipha_t *)mp->b_rptr; 21501 for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 21502 mp->b_rptr[i] = IPOPT_EOL; 21503 /* 21504 * Make sure that src address isn't flagrantly invalid. 21505 * Not all broadcast address checking for the src address 21506 * is possible, since we don't know the netmask of the src 21507 * addr. No check for destination address is done, since 21508 * IP will not pass up a packet with a broadcast dest 21509 * address to TCP. Similar checks are done below for IPv6. 
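 * Concretely, the tests below reject a source of 0.0.0.0, the
 * limited broadcast 255.255.255.255, and any class D (multicast,
 * 224.0.0.0/4) source; a unicast source that merely happens to be
 * a subnet broadcast cannot be caught here since, as noted, the
 * netmask is unknown.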
21510 */ 21511 if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || 21512 CLASSD(ipha->ipha_src)) { 21513 freemsg(ipsec_mp); 21514 BUMP_MIB(&ip_mib, ipInDiscards); 21515 return; 21516 } 21517 } else { 21518 ip6h = (ip6_t *)mp->b_rptr; 21519 21520 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || 21521 IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { 21522 freemsg(ipsec_mp); 21523 BUMP_MIB(&ip6_mib, ipv6InDiscards); 21524 return; 21525 } 21526 21527 /* Remove any extension headers assuming partial overlay */ 21528 if (ip_hdr_len > IPV6_HDR_LEN) { 21529 uint8_t *to; 21530 21531 to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; 21532 ovbcopy(ip6h, to, IPV6_HDR_LEN); 21533 mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; 21534 ip_hdr_len = IPV6_HDR_LEN; 21535 ip6h = (ip6_t *)mp->b_rptr; 21536 ip6h->ip6_nxt = IPPROTO_TCP; 21537 } 21538 } 21539 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 21540 if (tcph->th_flags[0] & TH_RST) { 21541 freemsg(ipsec_mp); 21542 return; 21543 } 21544 tcph->th_offset_and_rsrvd[0] = (5 << 4); 21545 len = ip_hdr_len + sizeof (tcph_t); 21546 mp->b_wptr = &mp->b_rptr[len]; 21547 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21548 ipha->ipha_length = htons(len); 21549 /* Swap addresses */ 21550 v4addr = ipha->ipha_src; 21551 ipha->ipha_src = ipha->ipha_dst; 21552 ipha->ipha_dst = v4addr; 21553 ipha->ipha_ident = 0; 21554 ipha->ipha_ttl = (uchar_t)tcp_ipv4_ttl; 21555 addr_len = IP_ADDR_LEN; 21556 addr = &v4addr; 21557 } else { 21558 /* No ip6i_t in this case */ 21559 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 21560 /* Swap addresses */ 21561 v6addr = ip6h->ip6_src; 21562 ip6h->ip6_src = ip6h->ip6_dst; 21563 ip6h->ip6_dst = v6addr; 21564 ip6h->ip6_hops = (uchar_t)tcp_ipv6_hoplimit; 21565 addr_len = IPV6_ADDR_LEN; 21566 addr = &v6addr; 21567 } 21568 tcp_xchg(tcph->th_fport, tcph->th_lport, 2); 21569 U32_TO_BE32(ack, tcph->th_ack); 21570 U32_TO_BE32(seq, tcph->th_seq); 21571 U16_TO_BE16(0, tcph->th_win); 21572 U16_TO_BE16(sizeof (tcph_t), tcph->th_sum); 21573 tcph->th_flags[0] = (uint8_t)ctl; 21574 if (ctl & TH_RST) { 21575 BUMP_MIB(&tcp_mib, tcpOutRsts); 21576 BUMP_MIB(&tcp_mib, tcpOutControl); 21577 } 21578 21579 /* IP trusts us to set up labels when required. 
*/ 21580 if (is_system_labeled() && (cr = DB_CRED(mp)) != NULL && 21581 crgetlabel(cr) != NULL) { 21582 int err, adjust; 21583 21584 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) 21585 err = tsol_check_label(cr, &mp, &adjust, 21586 tcp->tcp_connp->conn_mac_exempt); 21587 else 21588 err = tsol_check_label_v6(cr, &mp, &adjust, 21589 tcp->tcp_connp->conn_mac_exempt); 21590 if (mctl_present) 21591 ipsec_mp->b_cont = mp; 21592 else 21593 ipsec_mp = mp; 21594 if (err != 0) { 21595 freemsg(ipsec_mp); 21596 return; 21597 } 21598 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21599 ipha = (ipha_t *)mp->b_rptr; 21600 adjust += ntohs(ipha->ipha_length); 21601 ipha->ipha_length = htons(adjust); 21602 } else { 21603 ip6h = (ip6_t *)mp->b_rptr; 21604 } 21605 } 21606 21607 if (mctl_present) { 21608 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 21609 21610 ASSERT(ii->ipsec_in_type == IPSEC_IN); 21611 if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h)) { 21612 return; 21613 } 21614 } 21615 if (zoneid == ALL_ZONES) 21616 zoneid = GLOBAL_ZONEID; 21617 21618 /* Add the zoneid so ip_output routes it properly */ 21619 if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid)) == NULL) { 21620 freemsg(ipsec_mp); 21621 return; 21622 } 21623 ipsec_mp = nmp; 21624 21625 /* 21626 * NOTE: one might consider tracing a TCP packet here, but 21627 * this function has no active TCP state and no tcp structure 21628 * that has a trace buffer. If we traced here, we would have 21629 * to keep a local trace buffer in tcp_record_trace(). 21630 * 21631 * TSol note: The mblk that contains the incoming packet was 21632 * reused by tcp_xmit_listener_reset, so it already contains 21633 * the right credentials and we don't need to call mblk_setcred. 21634 * Also the conn's cred is not right since it is associated 21635 * with tcp_g_q. 21636 */ 21637 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp); 21638 21639 /* 21640 * Tell IP to mark the IRE used for this destination temporary. 21641 * This way, we can limit our exposure to DoS attack because IP 21642 * creates an IRE for each destination. If there are too many, 21643 * the time to do any routing lookup will be extremely long. And 21644 * the lookup can be in interrupt context. 21645 * 21646 * Note that in normal circumstances, this marking should not 21647 * affect anything. It would be nice if only 1 message is 21648 * needed to inform IP that the IRE created for this RST should 21649 * not be added to the cache table. But there is currently 21650 * not such communication mechanism between TCP and IP. So 21651 * the best we can do now is to send the advice ioctl to IP 21652 * to mark the IRE temporary. 21653 */ 21654 if ((mp = tcp_ip_advise_mblk(addr, addr_len, &ipic)) != NULL) { 21655 ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; 21656 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 21657 } 21658 } 21659 21660 /* 21661 * Initiate closedown sequence on an active connection. (May be called as 21662 * writer.) Return value zero for OK return, non-zero for error return. 21663 */ 21664 static int 21665 tcp_xmit_end(tcp_t *tcp) 21666 { 21667 ipic_t *ipic; 21668 mblk_t *mp; 21669 21670 if (tcp->tcp_state < TCPS_SYN_RCVD || 21671 tcp->tcp_state > TCPS_CLOSE_WAIT) { 21672 /* 21673 * Invalid state, only states TCPS_SYN_RCVD, 21674 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid 21675 */ 21676 return (-1); 21677 } 21678 21679 tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 21680 tcp->tcp_valid_bits |= TCP_FSS_VALID; 21681 /* 21682 * If there is nothing more unsent, send the FIN now. 
21683 * Otherwise, it will go out with the last segment. 21684 */ 21685 if (tcp->tcp_unsent == 0) { 21686 mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 21687 tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 21688 21689 if (mp) { 21690 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 21691 tcp_send_data(tcp, tcp->tcp_wq, mp); 21692 } else { 21693 /* 21694 * Couldn't allocate msg. Pretend we got it out. 21695 * Wait for rexmit timeout. 21696 */ 21697 tcp->tcp_snxt = tcp->tcp_fss + 1; 21698 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 21699 } 21700 21701 /* 21702 * If needed, update tcp_rexmit_snxt as tcp_snxt is 21703 * changed. 21704 */ 21705 if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 21706 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 21707 } 21708 } else { 21709 /* 21710 * If tcp->tcp_cork is set, then the data will not get sent, 21711 * so we have to check that and unset it first. 21712 */ 21713 if (tcp->tcp_cork) 21714 tcp->tcp_cork = B_FALSE; 21715 tcp_wput_data(tcp, NULL, B_FALSE); 21716 } 21717 21718 /* 21719 * If TCP does not get enough samples of RTT or tcp_rtt_updates 21720 * is 0, don't update the cache. 21721 */ 21722 if (tcp_rtt_updates == 0 || tcp->tcp_rtt_update < tcp_rtt_updates) 21723 return (0); 21724 21725 /* 21726 * NOTE: should not update if source routes i.e. if tcp_remote if 21727 * different from the destination. 21728 */ 21729 if (tcp->tcp_ipversion == IPV4_VERSION) { 21730 if (tcp->tcp_remote != tcp->tcp_ipha->ipha_dst) { 21731 return (0); 21732 } 21733 mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, 21734 &ipic); 21735 } else { 21736 if (!(IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, 21737 &tcp->tcp_ip6h->ip6_dst))) { 21738 return (0); 21739 } 21740 mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, 21741 &ipic); 21742 } 21743 21744 /* Record route attributes in the IRE for use by future connections. */ 21745 if (mp == NULL) 21746 return (0); 21747 21748 /* 21749 * We do not have a good algorithm to update ssthresh at this time. 21750 * So don't do any update. 21751 */ 21752 ipic->ipic_rtt = tcp->tcp_rtt_sa; 21753 ipic->ipic_rtt_sd = tcp->tcp_rtt_sd; 21754 21755 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 21756 return (0); 21757 } 21758 21759 /* 21760 * Generate a "no listener here" RST in response to an "unknown" segment. 21761 * Note that we are reusing the incoming mp to construct the outgoing 21762 * RST. 
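 *
 * The sequence/ack numbers chosen below follow the usual RFC 793
 * reset rules: if the offending segment carried an ACK we send
 * <SEQ = its ACK value, CTL = RST>; otherwise we send <SEQ = 0,
 * ACK = its SEQ plus the segment length (plus one for a SYN),
 * CTL = RST|ACK>; and a segment that itself carries RST is never
 * answered.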
21763 */ 21764 void 21765 tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid) 21766 { 21767 uchar_t *rptr; 21768 uint32_t seg_len; 21769 tcph_t *tcph; 21770 uint32_t seg_seq; 21771 uint32_t seg_ack; 21772 uint_t flags; 21773 mblk_t *ipsec_mp; 21774 ipha_t *ipha; 21775 ip6_t *ip6h; 21776 boolean_t mctl_present = B_FALSE; 21777 boolean_t check = B_TRUE; 21778 boolean_t policy_present; 21779 21780 TCP_STAT(tcp_no_listener); 21781 21782 ipsec_mp = mp; 21783 21784 if (mp->b_datap->db_type == M_CTL) { 21785 ipsec_in_t *ii; 21786 21787 mctl_present = B_TRUE; 21788 mp = mp->b_cont; 21789 21790 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 21791 ASSERT(ii->ipsec_in_type == IPSEC_IN); 21792 if (ii->ipsec_in_dont_check) { 21793 check = B_FALSE; 21794 if (!ii->ipsec_in_secure) { 21795 freeb(ipsec_mp); 21796 mctl_present = B_FALSE; 21797 ipsec_mp = mp; 21798 } 21799 } 21800 } 21801 21802 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21803 policy_present = ipsec_inbound_v4_policy_present; 21804 ipha = (ipha_t *)mp->b_rptr; 21805 ip6h = NULL; 21806 } else { 21807 policy_present = ipsec_inbound_v6_policy_present; 21808 ipha = NULL; 21809 ip6h = (ip6_t *)mp->b_rptr; 21810 } 21811 21812 if (check && policy_present) { 21813 /* 21814 * The conn_t parameter is NULL because we already know 21815 * nobody's home. 21816 */ 21817 ipsec_mp = ipsec_check_global_policy( 21818 ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present); 21819 if (ipsec_mp == NULL) 21820 return; 21821 } 21822 if (is_system_labeled() && !tsol_can_reply_error(mp)) { 21823 DTRACE_PROBE2( 21824 tx__ip__log__error__nolistener__tcp, 21825 char *, "Could not reply with RST to mp(1)", 21826 mblk_t *, mp); 21827 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); 21828 freemsg(ipsec_mp); 21829 return; 21830 } 21831 21832 rptr = mp->b_rptr; 21833 21834 tcph = (tcph_t *)&rptr[ip_hdr_len]; 21835 seg_seq = BE32_TO_U32(tcph->th_seq); 21836 seg_ack = BE32_TO_U32(tcph->th_ack); 21837 flags = tcph->th_flags[0]; 21838 21839 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); 21840 if (flags & TH_RST) { 21841 freemsg(ipsec_mp); 21842 } else if (flags & TH_ACK) { 21843 tcp_xmit_early_reset("no tcp, reset", 21844 ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid); 21845 } else { 21846 if (flags & TH_SYN) { 21847 seg_len++; 21848 } else { 21849 /* 21850 * Here we violate the RFC. Note that a normal 21851 * TCP will never send a segment without the ACK 21852 * flag, except for RST or SYN segment. This 21853 * segment is neither. Just drop it on the 21854 * floor. 21855 */ 21856 freemsg(ipsec_mp); 21857 tcp_rst_unsent++; 21858 return; 21859 } 21860 21861 tcp_xmit_early_reset("no tcp, reset/ack", 21862 ipsec_mp, 0, seg_seq + seg_len, 21863 TH_RST | TH_ACK, ip_hdr_len, zoneid); 21864 } 21865 } 21866 21867 /* 21868 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 21869 * ip and tcp header ready to pass down to IP. If the mp passed in is 21870 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 21871 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 21872 * otherwise it will dup partial mblks.) 21873 * Otherwise, an appropriate ACK packet will be generated. This 21874 * routine is not usually called to send new data for the first time. It 21875 * is mostly called out of the timer for retransmits, and to generate ACKs. 21876 * 21877 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 21878 * be adjusted by *offset. 
And after dupb(), the offset and the ending mblk 21879 * of the original mblk chain will be returned in *offset and *end_mp. 21880 */ 21881 static mblk_t * 21882 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 21883 mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 21884 boolean_t rexmit) 21885 { 21886 int data_length; 21887 int32_t off = 0; 21888 uint_t flags; 21889 mblk_t *mp1; 21890 mblk_t *mp2; 21891 uchar_t *rptr; 21892 tcph_t *tcph; 21893 int32_t num_sack_blk = 0; 21894 int32_t sack_opt_len = 0; 21895 21896 /* Allocate for our maximum TCP header + link-level */ 21897 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 21898 BPRI_MED); 21899 if (!mp1) 21900 return (NULL); 21901 data_length = 0; 21902 21903 /* 21904 * Note that tcp_mss has been adjusted to take into account the 21905 * timestamp option if applicable. Because SACK options do not 21906 * appear in every TCP segments and they are of variable lengths, 21907 * they cannot be included in tcp_mss. Thus we need to calculate 21908 * the actual segment length when we need to send a segment which 21909 * includes SACK options. 21910 */ 21911 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 21912 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 21913 tcp->tcp_num_sack_blk); 21914 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 21915 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 21916 if (max_to_send + sack_opt_len > tcp->tcp_mss) 21917 max_to_send -= sack_opt_len; 21918 } 21919 21920 if (offset != NULL) { 21921 off = *offset; 21922 /* We use offset as an indicator that end_mp is not NULL. */ 21923 *end_mp = NULL; 21924 } 21925 for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { 21926 /* This could be faster with cooperation from downstream */ 21927 if (mp2 != mp1 && !sendall && 21928 data_length + (int)(mp->b_wptr - mp->b_rptr) > 21929 max_to_send) 21930 /* 21931 * Don't send the next mblk since the whole mblk 21932 * does not fit. 21933 */ 21934 break; 21935 mp2->b_cont = dupb(mp); 21936 mp2 = mp2->b_cont; 21937 if (!mp2) { 21938 freemsg(mp1); 21939 return (NULL); 21940 } 21941 mp2->b_rptr += off; 21942 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 21943 (uintptr_t)INT_MAX); 21944 21945 data_length += (int)(mp2->b_wptr - mp2->b_rptr); 21946 if (data_length > max_to_send) { 21947 mp2->b_wptr -= data_length - max_to_send; 21948 data_length = max_to_send; 21949 off = mp2->b_wptr - mp->b_rptr; 21950 break; 21951 } else { 21952 off = 0; 21953 } 21954 } 21955 if (offset != NULL) { 21956 *offset = off; 21957 *end_mp = mp; 21958 } 21959 if (seg_len != NULL) { 21960 *seg_len = data_length; 21961 } 21962 21963 /* Update the latest receive window size in TCP header. */ 21964 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 21965 tcp->tcp_tcph->th_win); 21966 21967 rptr = mp1->b_rptr + tcp_wroff_xtra; 21968 mp1->b_rptr = rptr; 21969 mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; 21970 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 21971 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 21972 U32_TO_ABE32(seq, tcph->th_seq); 21973 21974 /* 21975 * Use tcp_unsent to determine if the PUSH bit should be used assumes 21976 * that this function was called from tcp_wput_data. Thus, when called 21977 * to retransmit data the setting of the PUSH bit may appear some 21978 * what random in that it might get set when it should not. This 21979 * should not pose any performance issues. 
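 *
 * (Illustrative note, editorial sketch not in the original source: the
 * SACK sizing near the top of this function works out as, e.g., with
 * three SACK blocks
 *
 *	sack_opt_len = 3 * 8 + 2 * 1 + 2 = 28 bytes
 *
 * i.e. three 8-byte blocks, two NOP pads and the 2-byte kind/length
 * header, so a 1460-byte max_to_send is trimmed to 1432 to keep the
 * segment within the MSS.  The dupb() loop above then copies whole
 * mblks until the next one would overshoot max_to_send, splitting an
 * mblk and handing the leftover back through *offset/*end_mp only for
 * the first data mblk or when sendall is set.)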
21980 */ 21981 if (data_length != 0 && (tcp->tcp_unsent == 0 || 21982 tcp->tcp_unsent == data_length)) { 21983 flags = TH_ACK | TH_PUSH; 21984 } else { 21985 flags = TH_ACK; 21986 } 21987 21988 if (tcp->tcp_ecn_ok) { 21989 if (tcp->tcp_ecn_echo_on) 21990 flags |= TH_ECE; 21991 21992 /* 21993 * Only set ECT bit and ECN_CWR if a segment contains new data. 21994 * There is no TCP flow control for non-data segments, and 21995 * only data segment is transmitted reliably. 21996 */ 21997 if (data_length > 0 && !rexmit) { 21998 SET_ECT(tcp, rptr); 21999 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 22000 flags |= TH_CWR; 22001 tcp->tcp_ecn_cwr_sent = B_TRUE; 22002 } 22003 } 22004 } 22005 22006 if (tcp->tcp_valid_bits) { 22007 uint32_t u1; 22008 22009 if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && 22010 seq == tcp->tcp_iss) { 22011 uchar_t *wptr; 22012 22013 /* 22014 * If TCP_ISS_VALID and the seq number is tcp_iss, 22015 * TCP can only be in SYN-SENT, SYN-RCVD or 22016 * FIN-WAIT-1 state. It can be FIN-WAIT-1 if 22017 * our SYN is not ack'ed but the app closes this 22018 * TCP connection. 22019 */ 22020 ASSERT(tcp->tcp_state == TCPS_SYN_SENT || 22021 tcp->tcp_state == TCPS_SYN_RCVD || 22022 tcp->tcp_state == TCPS_FIN_WAIT_1); 22023 22024 /* 22025 * Tack on the MSS option. It is always needed 22026 * for both active and passive open. 22027 * 22028 * MSS option value should be interface MTU - MIN 22029 * TCP/IP header according to RFC 793 as it means 22030 * the maximum segment size TCP can receive. But 22031 * to get around some broken middle boxes/end hosts 22032 * out there, we allow the option value to be the 22033 * same as the MSS option size on the peer side. 22034 * In this way, the other side will not send 22035 * anything larger than they can receive. 22036 * 22037 * Note that for SYN_SENT state, the ndd param 22038 * tcp_use_smss_as_mss_opt has no effect as we 22039 * don't know the peer's MSS option value. So 22040 * the only case we need to take care of is in 22041 * SYN_RCVD state, which is done later. 22042 */ 22043 wptr = mp1->b_wptr; 22044 wptr[0] = TCPOPT_MAXSEG; 22045 wptr[1] = TCPOPT_MAXSEG_LEN; 22046 wptr += 2; 22047 u1 = tcp->tcp_if_mtu - 22048 (tcp->tcp_ipversion == IPV4_VERSION ? 22049 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - 22050 TCP_MIN_HEADER_LENGTH; 22051 U16_TO_BE16(u1, wptr); 22052 mp1->b_wptr = wptr + 2; 22053 /* Update the offset to cover the additional word */ 22054 tcph->th_offset_and_rsrvd[0] += (1 << 4); 22055 22056 /* 22057 * Note that the following way of filling in 22058 * TCP options are not optimal. Some NOPs can 22059 * be saved. But there is no need at this time 22060 * to optimize it. When it is needed, we will 22061 * do it. 22062 */ 22063 switch (tcp->tcp_state) { 22064 case TCPS_SYN_SENT: 22065 flags = TH_SYN; 22066 22067 if (tcp->tcp_snd_ts_ok) { 22068 uint32_t llbolt = (uint32_t)lbolt; 22069 22070 wptr = mp1->b_wptr; 22071 wptr[0] = TCPOPT_NOP; 22072 wptr[1] = TCPOPT_NOP; 22073 wptr[2] = TCPOPT_TSTAMP; 22074 wptr[3] = TCPOPT_TSTAMP_LEN; 22075 wptr += 4; 22076 U32_TO_BE32(llbolt, wptr); 22077 wptr += 4; 22078 ASSERT(tcp->tcp_ts_recent == 0); 22079 U32_TO_BE32(0L, wptr); 22080 mp1->b_wptr += TCPOPT_REAL_TS_LEN; 22081 tcph->th_offset_and_rsrvd[0] += 22082 (3 << 4); 22083 } 22084 22085 /* 22086 * Set up all the bits to tell other side 22087 * we are ECN capable. 
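 *
 * (Illustrative note, editorial sketch not in the original source: the
 * MSS option written just above is the 4-byte kind/len/value form; for
 * a 1500-byte IPv4 interface MTU,
 *
 *	u1 = 1500 - 20 (IP) - 20 (TCP) = 1460
 *	wptr[0] = TCPOPT_MAXSEG (2), wptr[1] = TCPOPT_MAXSEG_LEN (4),
 *	wptr[2..3] = 0x05 0xb4
 *
 * and each 4-byte option added bumps th_offset_and_rsrvd[0] by
 * (1 << 4), i.e. one more 32-bit word of TCP header.)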
22088 */ 22089 if (tcp->tcp_ecn_ok) { 22090 flags |= (TH_ECE | TH_CWR); 22091 } 22092 break; 22093 case TCPS_SYN_RCVD: 22094 flags |= TH_SYN; 22095 22096 /* 22097 * Reset the MSS option value to be SMSS 22098 * We should probably add back the bytes 22099 * for timestamp option and IPsec. We 22100 * don't do that as this is a workaround 22101 * for broken middle boxes/end hosts, it 22102 * is better for us to be more cautious. 22103 * They may not take these things into 22104 * account in their SMSS calculation. Thus 22105 * the peer's calculated SMSS may be smaller 22106 * than what it can be. This should be OK. 22107 */ 22108 if (tcp_use_smss_as_mss_opt) { 22109 u1 = tcp->tcp_mss; 22110 U16_TO_BE16(u1, wptr); 22111 } 22112 22113 /* 22114 * If the other side is ECN capable, reply 22115 * that we are also ECN capable. 22116 */ 22117 if (tcp->tcp_ecn_ok) 22118 flags |= TH_ECE; 22119 break; 22120 default: 22121 /* 22122 * The above ASSERT() makes sure that this 22123 * must be FIN-WAIT-1 state. Our SYN has 22124 * not been ack'ed so retransmit it. 22125 */ 22126 flags |= TH_SYN; 22127 break; 22128 } 22129 22130 if (tcp->tcp_snd_ws_ok) { 22131 wptr = mp1->b_wptr; 22132 wptr[0] = TCPOPT_NOP; 22133 wptr[1] = TCPOPT_WSCALE; 22134 wptr[2] = TCPOPT_WS_LEN; 22135 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 22136 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 22137 tcph->th_offset_and_rsrvd[0] += (1 << 4); 22138 } 22139 22140 if (tcp->tcp_snd_sack_ok) { 22141 wptr = mp1->b_wptr; 22142 wptr[0] = TCPOPT_NOP; 22143 wptr[1] = TCPOPT_NOP; 22144 wptr[2] = TCPOPT_SACK_PERMITTED; 22145 wptr[3] = TCPOPT_SACK_OK_LEN; 22146 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 22147 tcph->th_offset_and_rsrvd[0] += (1 << 4); 22148 } 22149 22150 /* allocb() of adequate mblk assures space */ 22151 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 22152 (uintptr_t)INT_MAX); 22153 u1 = (int)(mp1->b_wptr - mp1->b_rptr); 22154 /* 22155 * Get IP set to checksum on our behalf 22156 * Include the adjustment for a source route if any. 22157 */ 22158 u1 += tcp->tcp_sum; 22159 u1 = (u1 >> 16) + (u1 & 0xFFFF); 22160 U16_TO_BE16(u1, tcph->th_sum); 22161 BUMP_MIB(&tcp_mib, tcpOutControl); 22162 } 22163 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 22164 (seq + data_length) == tcp->tcp_fss) { 22165 if (!tcp->tcp_fin_acked) { 22166 flags |= TH_FIN; 22167 BUMP_MIB(&tcp_mib, tcpOutControl); 22168 } 22169 if (!tcp->tcp_fin_sent) { 22170 tcp->tcp_fin_sent = B_TRUE; 22171 switch (tcp->tcp_state) { 22172 case TCPS_SYN_RCVD: 22173 case TCPS_ESTABLISHED: 22174 tcp->tcp_state = TCPS_FIN_WAIT_1; 22175 break; 22176 case TCPS_CLOSE_WAIT: 22177 tcp->tcp_state = TCPS_LAST_ACK; 22178 break; 22179 } 22180 if (tcp->tcp_suna == tcp->tcp_snxt) 22181 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 22182 tcp->tcp_snxt = tcp->tcp_fss + 1; 22183 } 22184 } 22185 /* 22186 * Note the trick here. u1 is unsigned. When tcp_urg 22187 * is smaller than seq, u1 will become a very huge value. 22188 * So the comparison will fail. Also note that tcp_urp 22189 * should be positive, see RFC 793 page 17. 
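 *
 * (Illustrative note, editorial sketch not in the original source:
 * with the usual old-URP adjustment of one, seq = 1000 and
 * tcp_urg = 980 (urgent data already behind us) gives
 *
 *	u1 = 980 - 1000 + 1 = (uint32_t)-19 = 0xffffffed
 *
 * which fails the 64K test below, so TH_URG stays clear; tcp_urg = 1500
 * gives u1 = 501 and the urgent pointer is emitted.)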
22190 */ 22191 u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION; 22192 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 && 22193 u1 < (uint32_t)(64 * 1024)) { 22194 flags |= TH_URG; 22195 BUMP_MIB(&tcp_mib, tcpOutUrg); 22196 U32_TO_ABE16(u1, tcph->th_urp); 22197 } 22198 } 22199 tcph->th_flags[0] = (uchar_t)flags; 22200 tcp->tcp_rack = tcp->tcp_rnxt; 22201 tcp->tcp_rack_cnt = 0; 22202 22203 if (tcp->tcp_snd_ts_ok) { 22204 if (tcp->tcp_state != TCPS_SYN_SENT) { 22205 uint32_t llbolt = (uint32_t)lbolt; 22206 22207 U32_TO_BE32(llbolt, 22208 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 22209 U32_TO_BE32(tcp->tcp_ts_recent, 22210 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 22211 } 22212 } 22213 22214 if (num_sack_blk > 0) { 22215 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 22216 sack_blk_t *tmp; 22217 int32_t i; 22218 22219 wptr[0] = TCPOPT_NOP; 22220 wptr[1] = TCPOPT_NOP; 22221 wptr[2] = TCPOPT_SACK; 22222 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 22223 sizeof (sack_blk_t); 22224 wptr += TCPOPT_REAL_SACK_LEN; 22225 22226 tmp = tcp->tcp_sack_list; 22227 for (i = 0; i < num_sack_blk; i++) { 22228 U32_TO_BE32(tmp[i].begin, wptr); 22229 wptr += sizeof (tcp_seq); 22230 U32_TO_BE32(tmp[i].end, wptr); 22231 wptr += sizeof (tcp_seq); 22232 } 22233 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); 22234 } 22235 ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 22236 data_length += (int)(mp1->b_wptr - rptr); 22237 if (tcp->tcp_ipversion == IPV4_VERSION) { 22238 ((ipha_t *)rptr)->ipha_length = htons(data_length); 22239 } else { 22240 ip6_t *ip6 = (ip6_t *)(rptr + 22241 (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 22242 sizeof (ip6i_t) : 0)); 22243 22244 ip6->ip6_plen = htons(data_length - 22245 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 22246 } 22247 22248 /* 22249 * Prime pump for IP 22250 * Include the adjustment for a source route if any. 22251 */ 22252 data_length -= tcp->tcp_ip_hdr_len; 22253 data_length += tcp->tcp_sum; 22254 data_length = (data_length >> 16) + (data_length & 0xFFFF); 22255 U16_TO_ABE16(data_length, tcph->th_sum); 22256 if (tcp->tcp_ip_forward_progress) { 22257 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 22258 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 22259 tcp->tcp_ip_forward_progress = B_FALSE; 22260 } 22261 return (mp1); 22262 } 22263 22264 /* This function handles the push timeout. */ 22265 void 22266 tcp_push_timer(void *arg) 22267 { 22268 conn_t *connp = (conn_t *)arg; 22269 tcp_t *tcp = connp->conn_tcp; 22270 22271 TCP_DBGSTAT(tcp_push_timer_cnt); 22272 22273 ASSERT(tcp->tcp_listener == NULL); 22274 22275 /* 22276 * We need to plug synchronous streams during our drain to prevent 22277 * a race with tcp_fuse_rrw() or tcp_fusion_rinfop(). 22278 */ 22279 TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); 22280 tcp->tcp_push_tid = 0; 22281 if ((tcp->tcp_rcv_list != NULL) && 22282 (tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED)) 22283 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 22284 TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); 22285 } 22286 22287 /* 22288 * This function handles delayed ACK timeout. 22289 */ 22290 static void 22291 tcp_ack_timer(void *arg) 22292 { 22293 conn_t *connp = (conn_t *)arg; 22294 tcp_t *tcp = connp->conn_tcp; 22295 mblk_t *mp; 22296 22297 TCP_DBGSTAT(tcp_ack_timer_cnt); 22298 22299 tcp->tcp_ack_tid = 0; 22300 22301 if (tcp->tcp_fused) 22302 return; 22303 22304 /* 22305 * Do not send ACK if there is no outstanding unack'ed data. 
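 *
 * (Illustrative note, editorial sketch not in the original source:
 * tcp_rack is the highest sequence already ACKed and tcp_rnxt the next
 * sequence expected, so rack == rnxt == 5000 means everything received
 * has been acknowledged and the timer can simply return, while
 * rack == 4000, rnxt == 5000 means 1000 bytes still owe an ACK, which
 * is sent below after the possible back-off adjustment.)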
22306 */ 22307 if (tcp->tcp_rnxt == tcp->tcp_rack) { 22308 return; 22309 } 22310 22311 if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) { 22312 /* 22313 * Make sure we don't allow deferred ACKs to result in 22314 * timer-based ACKing. If we have held off an ACK 22315 * when there was more than an mss here, and the timer 22316 * goes off, we have to worry about the possibility 22317 * that the sender isn't doing slow-start, or is out 22318 * of step with us for some other reason. We fall 22319 * permanently back in the direction of 22320 * ACK-every-other-packet as suggested in RFC 1122. 22321 */ 22322 if (tcp->tcp_rack_abs_max > 2) 22323 tcp->tcp_rack_abs_max--; 22324 tcp->tcp_rack_cur_max = 2; 22325 } 22326 mp = tcp_ack_mp(tcp); 22327 22328 if (mp != NULL) { 22329 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 22330 BUMP_LOCAL(tcp->tcp_obsegs); 22331 BUMP_MIB(&tcp_mib, tcpOutAck); 22332 BUMP_MIB(&tcp_mib, tcpOutAckDelayed); 22333 tcp_send_data(tcp, tcp->tcp_wq, mp); 22334 } 22335 } 22336 22337 22338 /* Generate an ACK-only (no data) segment for a TCP endpoint */ 22339 static mblk_t * 22340 tcp_ack_mp(tcp_t *tcp) 22341 { 22342 uint32_t seq_no; 22343 22344 /* 22345 * There are a few cases to be considered while setting the sequence no. 22346 * Essentially, we can come here while processing an unacceptable pkt 22347 * in the TCPS_SYN_RCVD state, in which case we set the sequence number 22348 * to snxt (per RFC 793), note the swnd wouldn't have been set yet. 22349 * If we are here for a zero window probe, stick with suna. In all 22350 * other cases, we check if suna + swnd encompasses snxt and set 22351 * the sequence number to snxt, if so. If snxt falls outside the 22352 * window (the receiver probably shrunk its window), we will go with 22353 * suna + swnd, otherwise the sequence no will be unacceptable to the 22354 * receiver. 22355 */ 22356 if (tcp->tcp_zero_win_probe) { 22357 seq_no = tcp->tcp_suna; 22358 } else if (tcp->tcp_state == TCPS_SYN_RCVD) { 22359 ASSERT(tcp->tcp_swnd == 0); 22360 seq_no = tcp->tcp_snxt; 22361 } else { 22362 seq_no = SEQ_GT(tcp->tcp_snxt, 22363 (tcp->tcp_suna + tcp->tcp_swnd)) ? 22364 (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt; 22365 } 22366 22367 if (tcp->tcp_valid_bits) { 22368 /* 22369 * For the complex case where we have to send some 22370 * controls (FIN or SYN), let tcp_xmit_mp do it. 22371 */ 22372 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE, 22373 NULL, B_FALSE)); 22374 } else { 22375 /* Generate a simple ACK */ 22376 int data_length; 22377 uchar_t *rptr; 22378 tcph_t *tcph; 22379 mblk_t *mp1; 22380 int32_t tcp_hdr_len; 22381 int32_t tcp_tcp_hdr_len; 22382 int32_t num_sack_blk = 0; 22383 int32_t sack_opt_len; 22384 22385 /* 22386 * Allocate space for TCP + IP headers 22387 * and link-level header 22388 */ 22389 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 22390 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 22391 tcp->tcp_num_sack_blk); 22392 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 22393 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 22394 tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; 22395 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + sack_opt_len; 22396 } else { 22397 tcp_hdr_len = tcp->tcp_hdr_len; 22398 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; 22399 } 22400 mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, BPRI_MED); 22401 if (!mp1) 22402 return (NULL); 22403 22404 /* Update the latest receive window size in TCP header. 
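 *
 * (Illustrative note, editorial sketch not in the original source: for
 * the sequence number chosen above, take suna = 1000, swnd = 500,
 * snxt = 1600, i.e. the peer shrank its window below what is in
 * flight: SEQ_GT(1600, 1500) holds, so the ACK is sent with sequence
 * 1500 (suna + swnd) to stay acceptable to the receiver; with
 * snxt = 1400 the ACK simply carries 1400.)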
*/ 22405 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 22406 tcp->tcp_tcph->th_win); 22407 /* copy in prototype TCP + IP header */ 22408 rptr = mp1->b_rptr + tcp_wroff_xtra; 22409 mp1->b_rptr = rptr; 22410 mp1->b_wptr = rptr + tcp_hdr_len; 22411 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 22412 22413 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 22414 22415 /* Set the TCP sequence number. */ 22416 U32_TO_ABE32(seq_no, tcph->th_seq); 22417 22418 /* Set up the TCP flag field. */ 22419 tcph->th_flags[0] = (uchar_t)TH_ACK; 22420 if (tcp->tcp_ecn_echo_on) 22421 tcph->th_flags[0] |= TH_ECE; 22422 22423 tcp->tcp_rack = tcp->tcp_rnxt; 22424 tcp->tcp_rack_cnt = 0; 22425 22426 /* fill in timestamp option if in use */ 22427 if (tcp->tcp_snd_ts_ok) { 22428 uint32_t llbolt = (uint32_t)lbolt; 22429 22430 U32_TO_BE32(llbolt, 22431 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 22432 U32_TO_BE32(tcp->tcp_ts_recent, 22433 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 22434 } 22435 22436 /* Fill in SACK options */ 22437 if (num_sack_blk > 0) { 22438 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 22439 sack_blk_t *tmp; 22440 int32_t i; 22441 22442 wptr[0] = TCPOPT_NOP; 22443 wptr[1] = TCPOPT_NOP; 22444 wptr[2] = TCPOPT_SACK; 22445 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 22446 sizeof (sack_blk_t); 22447 wptr += TCPOPT_REAL_SACK_LEN; 22448 22449 tmp = tcp->tcp_sack_list; 22450 for (i = 0; i < num_sack_blk; i++) { 22451 U32_TO_BE32(tmp[i].begin, wptr); 22452 wptr += sizeof (tcp_seq); 22453 U32_TO_BE32(tmp[i].end, wptr); 22454 wptr += sizeof (tcp_seq); 22455 } 22456 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 22457 << 4); 22458 } 22459 22460 if (tcp->tcp_ipversion == IPV4_VERSION) { 22461 ((ipha_t *)rptr)->ipha_length = htons(tcp_hdr_len); 22462 } else { 22463 /* Check for ip6i_t header in sticky hdrs */ 22464 ip6_t *ip6 = (ip6_t *)(rptr + 22465 (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 22466 sizeof (ip6i_t) : 0)); 22467 22468 ip6->ip6_plen = htons(tcp_hdr_len - 22469 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 22470 } 22471 22472 /* 22473 * Prime pump for checksum calculation in IP. Include the 22474 * adjustment for a source route if any. 22475 */ 22476 data_length = tcp_tcp_hdr_len + tcp->tcp_sum; 22477 data_length = (data_length >> 16) + (data_length & 0xFFFF); 22478 U16_TO_ABE16(data_length, tcph->th_sum); 22479 22480 if (tcp->tcp_ip_forward_progress) { 22481 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 22482 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 22483 tcp->tcp_ip_forward_progress = B_FALSE; 22484 } 22485 return (mp1); 22486 } 22487 } 22488 22489 /* 22490 * To create a temporary tcp structure for inserting into bind hash list. 22491 * The parameter is assumed to be in network byte order, ready for use. 22492 */ 22493 /* ARGSUSED */ 22494 static tcp_t * 22495 tcp_alloc_temp_tcp(in_port_t port) 22496 { 22497 conn_t *connp; 22498 tcp_t *tcp; 22499 22500 connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP); 22501 if (connp == NULL) 22502 return (NULL); 22503 22504 tcp = connp->conn_tcp; 22505 22506 /* 22507 * Only initialize the necessary info in those structures. Note 22508 * that since INADDR_ANY is all 0, we do not need to set 22509 * tcp_bound_source to INADDR_ANY here. 22510 */ 22511 tcp->tcp_state = TCPS_BOUND; 22512 tcp->tcp_lport = port; 22513 tcp->tcp_exclbind = 1; 22514 tcp->tcp_reserved_port = 1; 22515 22516 /* Just for place holding... 
*/ 22517 tcp->tcp_ipversion = IPV4_VERSION; 22518 22519 return (tcp); 22520 } 22521 22522 /* 22523 * To remove a port range specified by lo_port and hi_port from the 22524 * reserved port ranges. This is one of the three public functions of 22525 * the reserved port interface. Note that a port range has to be removed 22526 * as a whole. Ports in a range cannot be removed individually. 22527 * 22528 * Params: 22529 * in_port_t lo_port: the beginning port of the reserved port range to 22530 * be deleted. 22531 * in_port_t hi_port: the ending port of the reserved port range to 22532 * be deleted. 22533 * 22534 * Return: 22535 * B_TRUE if the deletion is successful, B_FALSE otherwise. 22536 */ 22537 boolean_t 22538 tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port) 22539 { 22540 int i, j; 22541 int size; 22542 tcp_t **temp_tcp_array; 22543 tcp_t *tcp; 22544 22545 rw_enter(&tcp_reserved_port_lock, RW_WRITER); 22546 22547 /* First make sure that the port ranage is indeed reserved. */ 22548 for (i = 0; i < tcp_reserved_port_array_size; i++) { 22549 if (tcp_reserved_port[i].lo_port == lo_port) { 22550 hi_port = tcp_reserved_port[i].hi_port; 22551 temp_tcp_array = tcp_reserved_port[i].temp_tcp_array; 22552 break; 22553 } 22554 } 22555 if (i == tcp_reserved_port_array_size) { 22556 rw_exit(&tcp_reserved_port_lock); 22557 return (B_FALSE); 22558 } 22559 22560 /* 22561 * Remove the range from the array. This simple loop is possible 22562 * because port ranges are inserted in ascending order. 22563 */ 22564 for (j = i; j < tcp_reserved_port_array_size - 1; j++) { 22565 tcp_reserved_port[j].lo_port = tcp_reserved_port[j+1].lo_port; 22566 tcp_reserved_port[j].hi_port = tcp_reserved_port[j+1].hi_port; 22567 tcp_reserved_port[j].temp_tcp_array = 22568 tcp_reserved_port[j+1].temp_tcp_array; 22569 } 22570 22571 /* Remove all the temporary tcp structures. */ 22572 size = hi_port - lo_port + 1; 22573 while (size > 0) { 22574 tcp = temp_tcp_array[size - 1]; 22575 ASSERT(tcp != NULL); 22576 tcp_bind_hash_remove(tcp); 22577 CONN_DEC_REF(tcp->tcp_connp); 22578 size--; 22579 } 22580 kmem_free(temp_tcp_array, (hi_port - lo_port + 1) * sizeof (tcp_t *)); 22581 tcp_reserved_port_array_size--; 22582 rw_exit(&tcp_reserved_port_lock); 22583 return (B_TRUE); 22584 } 22585 22586 /* 22587 * Macro to remove temporary tcp structure from the bind hash list. The 22588 * first parameter is the list of tcp to be removed. The second parameter 22589 * is the number of tcps in the array. 22590 */ 22591 #define TCP_TMP_TCP_REMOVE(tcp_array, num) \ 22592 { \ 22593 while ((num) > 0) { \ 22594 tcp_t *tcp = (tcp_array)[(num) - 1]; \ 22595 tf_t *tbf; \ 22596 tcp_t *tcpnext; \ 22597 tbf = &tcp_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)]; \ 22598 mutex_enter(&tbf->tf_lock); \ 22599 tcpnext = tcp->tcp_bind_hash; \ 22600 if (tcpnext) { \ 22601 tcpnext->tcp_ptpbhn = \ 22602 tcp->tcp_ptpbhn; \ 22603 } \ 22604 *tcp->tcp_ptpbhn = tcpnext; \ 22605 mutex_exit(&tbf->tf_lock); \ 22606 kmem_free(tcp, sizeof (tcp_t)); \ 22607 (tcp_array)[(num) - 1] = NULL; \ 22608 (num)--; \ 22609 } \ 22610 } 22611 22612 /* 22613 * The public interface for other modules to call to reserve a port range 22614 * in TCP. The caller passes in how large a port range it wants. TCP 22615 * will try to find a range and return it via lo_port and hi_port. This is 22616 * used by NCA's nca_conn_init. 22617 * NCA can only be used in the global zone so this only affects the global 22618 * zone's ports. 
22619 * 22620 * Params: 22621 * int size: the size of the port range to be reserved. 22622 * in_port_t *lo_port (referenced): returns the beginning port of the 22623 * reserved port range added. 22624 * in_port_t *hi_port (referenced): returns the ending port of the 22625 * reserved port range added. 22626 * 22627 * Return: 22628 * B_TRUE if the port reservation is successful, B_FALSE otherwise. 22629 */ 22630 boolean_t 22631 tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) 22632 { 22633 tcp_t *tcp; 22634 tcp_t *tmp_tcp; 22635 tcp_t **temp_tcp_array; 22636 tf_t *tbf; 22637 in_port_t net_port; 22638 in_port_t port; 22639 int32_t cur_size; 22640 int i, j; 22641 boolean_t used; 22642 tcp_rport_t tmp_ports[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE]; 22643 zoneid_t zoneid = GLOBAL_ZONEID; 22644 22645 /* Sanity check. */ 22646 if (size <= 0 || size > TCP_RESERVED_PORTS_RANGE_MAX) { 22647 return (B_FALSE); 22648 } 22649 22650 rw_enter(&tcp_reserved_port_lock, RW_WRITER); 22651 if (tcp_reserved_port_array_size == TCP_RESERVED_PORTS_ARRAY_MAX_SIZE) { 22652 rw_exit(&tcp_reserved_port_lock); 22653 return (B_FALSE); 22654 } 22655 22656 /* 22657 * Find the starting port to try. Since the port ranges are ordered 22658 * in the reserved port array, we can do a simple search here. 22659 */ 22660 *lo_port = TCP_SMALLEST_RESERVED_PORT; 22661 *hi_port = TCP_LARGEST_RESERVED_PORT; 22662 for (i = 0; i < tcp_reserved_port_array_size; 22663 *lo_port = tcp_reserved_port[i].hi_port + 1, i++) { 22664 if (tcp_reserved_port[i].lo_port - *lo_port >= size) { 22665 *hi_port = tcp_reserved_port[i].lo_port - 1; 22666 break; 22667 } 22668 } 22669 /* No available port range. */ 22670 if (i == tcp_reserved_port_array_size && *hi_port - *lo_port < size) { 22671 rw_exit(&tcp_reserved_port_lock); 22672 return (B_FALSE); 22673 } 22674 22675 temp_tcp_array = kmem_zalloc(size * sizeof (tcp_t *), KM_NOSLEEP); 22676 if (temp_tcp_array == NULL) { 22677 rw_exit(&tcp_reserved_port_lock); 22678 return (B_FALSE); 22679 } 22680 22681 /* Go thru the port range to see if some ports are already bound. */ 22682 for (port = *lo_port, cur_size = 0; 22683 cur_size < size && port <= *hi_port; 22684 cur_size++, port++) { 22685 used = B_FALSE; 22686 net_port = htons(port); 22687 tbf = &tcp_bind_fanout[TCP_BIND_HASH(net_port)]; 22688 mutex_enter(&tbf->tf_lock); 22689 for (tcp = tbf->tf_tcp; tcp != NULL; 22690 tcp = tcp->tcp_bind_hash) { 22691 if (IPCL_ZONE_MATCH(tcp->tcp_connp, zoneid) && 22692 net_port == tcp->tcp_lport) { 22693 /* 22694 * A port is already bound. Search again 22695 * starting from port + 1. Release all 22696 * temporary tcps. 22697 */ 22698 mutex_exit(&tbf->tf_lock); 22699 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); 22700 *lo_port = port + 1; 22701 cur_size = -1; 22702 used = B_TRUE; 22703 break; 22704 } 22705 } 22706 if (!used) { 22707 if ((tmp_tcp = tcp_alloc_temp_tcp(net_port)) == NULL) { 22708 /* 22709 * Allocation failure. Just fail the request. 22710 * Need to remove all those temporary tcp 22711 * structures. 22712 */ 22713 mutex_exit(&tbf->tf_lock); 22714 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); 22715 rw_exit(&tcp_reserved_port_lock); 22716 kmem_free(temp_tcp_array, 22717 (hi_port - lo_port + 1) * 22718 sizeof (tcp_t *)); 22719 return (B_FALSE); 22720 } 22721 temp_tcp_array[cur_size] = tmp_tcp; 22722 tcp_bind_hash_insert(tbf, tmp_tcp, B_TRUE); 22723 mutex_exit(&tbf->tf_lock); 22724 } 22725 } 22726 22727 /* 22728 * The current range is not large enough. 
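 *
 * (Illustrative note, editorial sketch not in the original source: in
 * the gap search above, suppose the scan has moved past a reservation
 * ending at port 7009 so *lo_port is 7010; if the next reservation
 * starts at 7050 and the caller asked for size 20, then
 * 7050 - 7010 >= 20, *hi_port becomes 7049, and the 7010-7049 window
 * is then probed port by port against the bind hash.)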
We can actually do another 22729 * search if this search is done between 2 reserved port ranges. But 22730 * for first release, we just stop here and return saying that no port 22731 * range is available. 22732 */ 22733 if (cur_size < size) { 22734 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); 22735 rw_exit(&tcp_reserved_port_lock); 22736 kmem_free(temp_tcp_array, size * sizeof (tcp_t *)); 22737 return (B_FALSE); 22738 } 22739 *hi_port = port - 1; 22740 22741 /* 22742 * Insert range into array in ascending order. Since this function 22743 * must not be called often, we choose to use the simplest method. 22744 * The above array should not consume excessive stack space as 22745 * the size must be very small. If in future releases, we find 22746 * that we should provide more reserved port ranges, this function 22747 * has to be modified to be more efficient. 22748 */ 22749 if (tcp_reserved_port_array_size == 0) { 22750 tcp_reserved_port[0].lo_port = *lo_port; 22751 tcp_reserved_port[0].hi_port = *hi_port; 22752 tcp_reserved_port[0].temp_tcp_array = temp_tcp_array; 22753 } else { 22754 for (i = 0, j = 0; i < tcp_reserved_port_array_size; i++, j++) { 22755 if (*lo_port < tcp_reserved_port[i].lo_port && i == j) { 22756 tmp_ports[j].lo_port = *lo_port; 22757 tmp_ports[j].hi_port = *hi_port; 22758 tmp_ports[j].temp_tcp_array = temp_tcp_array; 22759 j++; 22760 } 22761 tmp_ports[j].lo_port = tcp_reserved_port[i].lo_port; 22762 tmp_ports[j].hi_port = tcp_reserved_port[i].hi_port; 22763 tmp_ports[j].temp_tcp_array = 22764 tcp_reserved_port[i].temp_tcp_array; 22765 } 22766 if (j == i) { 22767 tmp_ports[j].lo_port = *lo_port; 22768 tmp_ports[j].hi_port = *hi_port; 22769 tmp_ports[j].temp_tcp_array = temp_tcp_array; 22770 } 22771 bcopy(tmp_ports, tcp_reserved_port, sizeof (tmp_ports)); 22772 } 22773 tcp_reserved_port_array_size++; 22774 rw_exit(&tcp_reserved_port_lock); 22775 return (B_TRUE); 22776 } 22777 22778 /* 22779 * Check to see if a port is in any reserved port range. 22780 * 22781 * Params: 22782 * in_port_t port: the port to be verified. 22783 * 22784 * Return: 22785 * B_TRUE is the port is inside a reserved port range, B_FALSE otherwise. 22786 */ 22787 boolean_t 22788 tcp_reserved_port_check(in_port_t port) 22789 { 22790 int i; 22791 22792 rw_enter(&tcp_reserved_port_lock, RW_READER); 22793 for (i = 0; i < tcp_reserved_port_array_size; i++) { 22794 if (port >= tcp_reserved_port[i].lo_port || 22795 port <= tcp_reserved_port[i].hi_port) { 22796 rw_exit(&tcp_reserved_port_lock); 22797 return (B_TRUE); 22798 } 22799 } 22800 rw_exit(&tcp_reserved_port_lock); 22801 return (B_FALSE); 22802 } 22803 22804 /* 22805 * To list all reserved port ranges. This is the function to handle 22806 * ndd tcp_reserved_port_list. 22807 */ 22808 /* ARGSUSED */ 22809 static int 22810 tcp_reserved_port_list(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 22811 { 22812 int i; 22813 22814 rw_enter(&tcp_reserved_port_lock, RW_READER); 22815 if (tcp_reserved_port_array_size > 0) 22816 (void) mi_mpprintf(mp, "The following ports are reserved:"); 22817 else 22818 (void) mi_mpprintf(mp, "No port is reserved."); 22819 for (i = 0; i < tcp_reserved_port_array_size; i++) { 22820 (void) mi_mpprintf(mp, "%d-%d", 22821 tcp_reserved_port[i].lo_port, tcp_reserved_port[i].hi_port); 22822 } 22823 rw_exit(&tcp_reserved_port_lock); 22824 return (0); 22825 } 22826 22827 /* 22828 * Hash list insertion routine for tcp_t structures. 
22829 * Inserts entries with the ones bound to a specific IP address first 22830 * followed by those bound to INADDR_ANY. 22831 */ 22832 static void 22833 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) 22834 { 22835 tcp_t **tcpp; 22836 tcp_t *tcpnext; 22837 22838 if (tcp->tcp_ptpbhn != NULL) { 22839 ASSERT(!caller_holds_lock); 22840 tcp_bind_hash_remove(tcp); 22841 } 22842 tcpp = &tbf->tf_tcp; 22843 if (!caller_holds_lock) { 22844 mutex_enter(&tbf->tf_lock); 22845 } else { 22846 ASSERT(MUTEX_HELD(&tbf->tf_lock)); 22847 } 22848 tcpnext = tcpp[0]; 22849 if (tcpnext) { 22850 /* 22851 * If the new tcp bound to the INADDR_ANY address 22852 * and the first one in the list is not bound to 22853 * INADDR_ANY we skip all entries until we find the 22854 * first one bound to INADDR_ANY. 22855 * This makes sure that applications binding to a 22856 * specific address get preference over those binding to 22857 * INADDR_ANY. 22858 */ 22859 if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) && 22860 !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) { 22861 while ((tcpnext = tcpp[0]) != NULL && 22862 !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) 22863 tcpp = &(tcpnext->tcp_bind_hash); 22864 if (tcpnext) 22865 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash; 22866 } else 22867 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash; 22868 } 22869 tcp->tcp_bind_hash = tcpnext; 22870 tcp->tcp_ptpbhn = tcpp; 22871 tcpp[0] = tcp; 22872 if (!caller_holds_lock) 22873 mutex_exit(&tbf->tf_lock); 22874 } 22875 22876 /* 22877 * Hash list removal routine for tcp_t structures. 22878 */ 22879 static void 22880 tcp_bind_hash_remove(tcp_t *tcp) 22881 { 22882 tcp_t *tcpnext; 22883 kmutex_t *lockp; 22884 22885 if (tcp->tcp_ptpbhn == NULL) 22886 return; 22887 22888 /* 22889 * Extract the lock pointer in case there are concurrent 22890 * hash_remove's for this instance. 22891 */ 22892 ASSERT(tcp->tcp_lport != 0); 22893 lockp = &tcp_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock; 22894 22895 ASSERT(lockp != NULL); 22896 mutex_enter(lockp); 22897 if (tcp->tcp_ptpbhn) { 22898 tcpnext = tcp->tcp_bind_hash; 22899 if (tcpnext) { 22900 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 22901 tcp->tcp_bind_hash = NULL; 22902 } 22903 *tcp->tcp_ptpbhn = tcpnext; 22904 tcp->tcp_ptpbhn = NULL; 22905 } 22906 mutex_exit(lockp); 22907 } 22908 22909 22910 /* 22911 * Hash list lookup routine for tcp_t structures. 22912 * Returns with a CONN_INC_REF tcp structure. Caller must do a CONN_DEC_REF. 22913 */ 22914 static tcp_t * 22915 tcp_acceptor_hash_lookup(t_uscalar_t id) 22916 { 22917 tf_t *tf; 22918 tcp_t *tcp; 22919 22920 tf = &tcp_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 22921 mutex_enter(&tf->tf_lock); 22922 for (tcp = tf->tf_tcp; tcp != NULL; 22923 tcp = tcp->tcp_acceptor_hash) { 22924 if (tcp->tcp_acceptor_id == id) { 22925 CONN_INC_REF(tcp->tcp_connp); 22926 mutex_exit(&tf->tf_lock); 22927 return (tcp); 22928 } 22929 } 22930 mutex_exit(&tf->tf_lock); 22931 return (NULL); 22932 } 22933 22934 22935 /* 22936 * Hash list insertion routine for tcp_t structures. 
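 *
 * Editorial sketch, not part of the original source: both the bind and
 * acceptor hash chains rely on the "pointer to the previous element's
 * next pointer" (tcp_ptpbhn/tcp_ptpahn) idiom used above, which lets an
 * entry unlink itself in O(1) without knowing its predecessor.  A
 * minimal illustration with invented names (wrapped so it is not
 * compiled):
 */

#ifdef TCP_EDITORIAL_SKETCH
typedef struct sketch_node {
	struct sketch_node *next;
	struct sketch_node **ptpn;	/* &prev->next, or &head if first */
} sketch_node_t;

static void
sketch_insert_head(sketch_node_t **head, sketch_node_t *n)
{
	n->next = *head;
	if (n->next != NULL)
		n->next->ptpn = &n->next;
	n->ptpn = head;
	*head = n;
}

static void
sketch_remove(sketch_node_t *n)
{
	if (n->next != NULL)
		n->next->ptpn = n->ptpn;
	*n->ptpn = n->next;		/* predecessor (or head) skips n */
	n->next = NULL;
	n->ptpn = NULL;
}
#endif	/* TCP_EDITORIAL_SKETCH */

/*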
22937 */ 22938 void 22939 tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp) 22940 { 22941 tf_t *tf; 22942 tcp_t **tcpp; 22943 tcp_t *tcpnext; 22944 22945 tf = &tcp_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 22946 22947 if (tcp->tcp_ptpahn != NULL) 22948 tcp_acceptor_hash_remove(tcp); 22949 tcpp = &tf->tf_tcp; 22950 mutex_enter(&tf->tf_lock); 22951 tcpnext = tcpp[0]; 22952 if (tcpnext) 22953 tcpnext->tcp_ptpahn = &tcp->tcp_acceptor_hash; 22954 tcp->tcp_acceptor_hash = tcpnext; 22955 tcp->tcp_ptpahn = tcpp; 22956 tcpp[0] = tcp; 22957 tcp->tcp_acceptor_lockp = &tf->tf_lock; /* For tcp_*_hash_remove */ 22958 mutex_exit(&tf->tf_lock); 22959 } 22960 22961 /* 22962 * Hash list removal routine for tcp_t structures. 22963 */ 22964 static void 22965 tcp_acceptor_hash_remove(tcp_t *tcp) 22966 { 22967 tcp_t *tcpnext; 22968 kmutex_t *lockp; 22969 22970 /* 22971 * Extract the lock pointer in case there are concurrent 22972 * hash_remove's for this instance. 22973 */ 22974 lockp = tcp->tcp_acceptor_lockp; 22975 22976 if (tcp->tcp_ptpahn == NULL) 22977 return; 22978 22979 ASSERT(lockp != NULL); 22980 mutex_enter(lockp); 22981 if (tcp->tcp_ptpahn) { 22982 tcpnext = tcp->tcp_acceptor_hash; 22983 if (tcpnext) { 22984 tcpnext->tcp_ptpahn = tcp->tcp_ptpahn; 22985 tcp->tcp_acceptor_hash = NULL; 22986 } 22987 *tcp->tcp_ptpahn = tcpnext; 22988 tcp->tcp_ptpahn = NULL; 22989 } 22990 mutex_exit(lockp); 22991 tcp->tcp_acceptor_lockp = NULL; 22992 } 22993 22994 /* ARGSUSED */ 22995 static int 22996 tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) 22997 { 22998 int error = 0; 22999 int retval; 23000 char *end; 23001 23002 tcp_hsp_t *hsp; 23003 tcp_hsp_t *hspprev; 23004 23005 ipaddr_t addr = 0; /* Address we're looking for */ 23006 in6_addr_t v6addr; /* Address we're looking for */ 23007 uint32_t hash; /* Hash of that address */ 23008 23009 /* 23010 * If the following variables are still zero after parsing the input 23011 * string, the user didn't specify them and we don't change them in 23012 * the HSP. 
23013 */ 23014 23015 ipaddr_t mask = 0; /* Subnet mask */ 23016 in6_addr_t v6mask; 23017 long sendspace = 0; /* Send buffer size */ 23018 long recvspace = 0; /* Receive buffer size */ 23019 long timestamp = 0; /* Originate TCP TSTAMP option, 1 = yes */ 23020 boolean_t delete = B_FALSE; /* User asked to delete this HSP */ 23021 23022 rw_enter(&tcp_hsp_lock, RW_WRITER); 23023 23024 /* Parse and validate address */ 23025 if (af == AF_INET) { 23026 retval = inet_pton(af, value, &addr); 23027 if (retval == 1) 23028 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 23029 } else if (af == AF_INET6) { 23030 retval = inet_pton(af, value, &v6addr); 23031 } else { 23032 error = EINVAL; 23033 goto done; 23034 } 23035 if (retval == 0) { 23036 error = EINVAL; 23037 goto done; 23038 } 23039 23040 while ((*value) && *value != ' ') 23041 value++; 23042 23043 /* Parse individual keywords, set variables if found */ 23044 while (*value) { 23045 /* Skip leading blanks */ 23046 23047 while (*value == ' ' || *value == '\t') 23048 value++; 23049 23050 /* If at end of string, we're done */ 23051 23052 if (!*value) 23053 break; 23054 23055 /* We have a word, figure out what it is */ 23056 23057 if (strncmp("mask", value, 4) == 0) { 23058 value += 4; 23059 while (*value == ' ' || *value == '\t') 23060 value++; 23061 /* Parse subnet mask */ 23062 if (af == AF_INET) { 23063 retval = inet_pton(af, value, &mask); 23064 if (retval == 1) { 23065 V4MASK_TO_V6(mask, v6mask); 23066 } 23067 } else if (af == AF_INET6) { 23068 retval = inet_pton(af, value, &v6mask); 23069 } 23070 if (retval != 1) { 23071 error = EINVAL; 23072 goto done; 23073 } 23074 while ((*value) && *value != ' ') 23075 value++; 23076 } else if (strncmp("sendspace", value, 9) == 0) { 23077 value += 9; 23078 23079 if (ddi_strtol(value, &end, 0, &sendspace) != 0 || 23080 sendspace < TCP_XMIT_HIWATER || 23081 sendspace >= (1L<<30)) { 23082 error = EINVAL; 23083 goto done; 23084 } 23085 value = end; 23086 } else if (strncmp("recvspace", value, 9) == 0) { 23087 value += 9; 23088 23089 if (ddi_strtol(value, &end, 0, &recvspace) != 0 || 23090 recvspace < TCP_RECV_HIWATER || 23091 recvspace >= (1L<<30)) { 23092 error = EINVAL; 23093 goto done; 23094 } 23095 value = end; 23096 } else if (strncmp("timestamp", value, 9) == 0) { 23097 value += 9; 23098 23099 if (ddi_strtol(value, &end, 0, &timestamp) != 0 || 23100 timestamp < 0 || timestamp > 1) { 23101 error = EINVAL; 23102 goto done; 23103 } 23104 23105 /* 23106 * We increment timestamp so we know it's been set; 23107 * this is undone when we put it in the HSP 23108 */ 23109 timestamp++; 23110 value = end; 23111 } else if (strncmp("delete", value, 6) == 0) { 23112 value += 6; 23113 delete = B_TRUE; 23114 } else { 23115 error = EINVAL; 23116 goto done; 23117 } 23118 } 23119 23120 /* Hash address for lookup */ 23121 23122 hash = TCP_HSP_HASH(addr); 23123 23124 if (delete) { 23125 /* 23126 * Note that deletes don't return an error if the thing 23127 * we're trying to delete isn't there.
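 *
 * (Illustrative note, editorial sketch not in the original source: the
 * parser above consumes an ndd-style value string of the form
 * "<addr> [mask <m>] [sendspace <n>] [recvspace <n>] [timestamp 0|1]
 * [delete]"; for example "192.0.2.10 sendspace 65536 timestamp 1"
 * updates the send buffer and timestamp setting for that host, subject
 * to the minimum and maximum checks above, while "192.0.2.10 delete"
 * takes the removal path handled just below.)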
23128 */ 23129 if (tcp_hsp_hash == NULL) 23130 goto done; 23131 hsp = tcp_hsp_hash[hash]; 23132 23133 if (hsp) { 23134 if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, 23135 &v6addr)) { 23136 tcp_hsp_hash[hash] = hsp->tcp_hsp_next; 23137 mi_free((char *)hsp); 23138 } else { 23139 hspprev = hsp; 23140 while ((hsp = hsp->tcp_hsp_next) != NULL) { 23141 if (IN6_ARE_ADDR_EQUAL( 23142 &hsp->tcp_hsp_addr_v6, &v6addr)) { 23143 hspprev->tcp_hsp_next = 23144 hsp->tcp_hsp_next; 23145 mi_free((char *)hsp); 23146 break; 23147 } 23148 hspprev = hsp; 23149 } 23150 } 23151 } 23152 } else { 23153 /* 23154 * We're adding/modifying an HSP. If we haven't already done 23155 * so, allocate the hash table. 23156 */ 23157 23158 if (!tcp_hsp_hash) { 23159 tcp_hsp_hash = (tcp_hsp_t **) 23160 mi_zalloc(sizeof (tcp_hsp_t *) * TCP_HSP_HASH_SIZE); 23161 if (!tcp_hsp_hash) { 23162 error = EINVAL; 23163 goto done; 23164 } 23165 } 23166 23167 /* Get head of hash chain */ 23168 23169 hsp = tcp_hsp_hash[hash]; 23170 23171 /* Try to find pre-existing hsp on hash chain */ 23172 /* Doesn't handle CIDR prefixes. */ 23173 while (hsp) { 23174 if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, &v6addr)) 23175 break; 23176 hsp = hsp->tcp_hsp_next; 23177 } 23178 23179 /* 23180 * If we didn't, create one with default values and put it 23181 * at head of hash chain 23182 */ 23183 23184 if (!hsp) { 23185 hsp = (tcp_hsp_t *)mi_zalloc(sizeof (tcp_hsp_t)); 23186 if (!hsp) { 23187 error = EINVAL; 23188 goto done; 23189 } 23190 hsp->tcp_hsp_next = tcp_hsp_hash[hash]; 23191 tcp_hsp_hash[hash] = hsp; 23192 } 23193 23194 /* Set values that the user asked us to change */ 23195 23196 hsp->tcp_hsp_addr_v6 = v6addr; 23197 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) 23198 hsp->tcp_hsp_vers = IPV4_VERSION; 23199 else 23200 hsp->tcp_hsp_vers = IPV6_VERSION; 23201 hsp->tcp_hsp_subnet_v6 = v6mask; 23202 if (sendspace > 0) 23203 hsp->tcp_hsp_sendspace = sendspace; 23204 if (recvspace > 0) 23205 hsp->tcp_hsp_recvspace = recvspace; 23206 if (timestamp > 0) 23207 hsp->tcp_hsp_tstamp = timestamp - 1; 23208 } 23209 23210 done: 23211 rw_exit(&tcp_hsp_lock); 23212 return (error); 23213 } 23214 23215 /* Set callback routine passed to nd_load by tcp_param_register. */ 23216 /* ARGSUSED */ 23217 static int 23218 tcp_host_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 23219 { 23220 return (tcp_host_param_setvalue(q, mp, value, cp, AF_INET)); 23221 } 23222 /* ARGSUSED */ 23223 static int 23224 tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 23225 cred_t *cr) 23226 { 23227 return (tcp_host_param_setvalue(q, mp, value, cp, AF_INET6)); 23228 } 23229 23230 /* TCP host parameters report triggered via the Named Dispatch mechanism. 
*/ 23231 /* ARGSUSED */ 23232 static int 23233 tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 23234 { 23235 tcp_hsp_t *hsp; 23236 int i; 23237 char addrbuf[INET6_ADDRSTRLEN], subnetbuf[INET6_ADDRSTRLEN]; 23238 23239 rw_enter(&tcp_hsp_lock, RW_READER); 23240 (void) mi_mpprintf(mp, 23241 "Hash HSP " MI_COL_HDRPAD_STR 23242 "Address Subnet Mask Send Receive TStamp"); 23243 if (tcp_hsp_hash) { 23244 for (i = 0; i < TCP_HSP_HASH_SIZE; i++) { 23245 hsp = tcp_hsp_hash[i]; 23246 while (hsp) { 23247 if (hsp->tcp_hsp_vers == IPV4_VERSION) { 23248 (void) inet_ntop(AF_INET, 23249 &hsp->tcp_hsp_addr, 23250 addrbuf, sizeof (addrbuf)); 23251 (void) inet_ntop(AF_INET, 23252 &hsp->tcp_hsp_subnet, 23253 subnetbuf, sizeof (subnetbuf)); 23254 } else { 23255 (void) inet_ntop(AF_INET6, 23256 &hsp->tcp_hsp_addr_v6, 23257 addrbuf, sizeof (addrbuf)); 23258 (void) inet_ntop(AF_INET6, 23259 &hsp->tcp_hsp_subnet_v6, 23260 subnetbuf, sizeof (subnetbuf)); 23261 } 23262 (void) mi_mpprintf(mp, 23263 " %03d " MI_COL_PTRFMT_STR 23264 "%s %s %010d %010d %d", 23265 i, 23266 (void *)hsp, 23267 addrbuf, 23268 subnetbuf, 23269 hsp->tcp_hsp_sendspace, 23270 hsp->tcp_hsp_recvspace, 23271 hsp->tcp_hsp_tstamp); 23272 23273 hsp = hsp->tcp_hsp_next; 23274 } 23275 } 23276 } 23277 rw_exit(&tcp_hsp_lock); 23278 return (0); 23279 } 23280 23281 23282 /* Data for fast netmask macro used by tcp_hsp_lookup */ 23283 23284 static ipaddr_t netmasks[] = { 23285 IN_CLASSA_NET, IN_CLASSA_NET, IN_CLASSB_NET, 23286 IN_CLASSC_NET | IN_CLASSD_NET /* Class C,D,E */ 23287 }; 23288 23289 #define netmask(addr) (netmasks[(ipaddr_t)(addr) >> 30]) 23290 23291 /* 23292 * XXX This routine should go away and instead we should use the metrics 23293 * associated with the routes to determine the default sndspace and rcvspace. 23294 */ 23295 static tcp_hsp_t * 23296 tcp_hsp_lookup(ipaddr_t addr) 23297 { 23298 tcp_hsp_t *hsp = NULL; 23299 23300 /* Quick check without acquiring the lock. */ 23301 if (tcp_hsp_hash == NULL) 23302 return (NULL); 23303 23304 rw_enter(&tcp_hsp_lock, RW_READER); 23305 23306 /* This routine finds the best-matching HSP for address addr. */ 23307 23308 if (tcp_hsp_hash) { 23309 int i; 23310 ipaddr_t srchaddr; 23311 tcp_hsp_t *hsp_net; 23312 23313 /* We do three passes: host, network, and subnet. */ 23314 23315 srchaddr = addr; 23316 23317 for (i = 1; i <= 3; i++) { 23318 /* Look for exact match on srchaddr */ 23319 23320 hsp = tcp_hsp_hash[TCP_HSP_HASH(srchaddr)]; 23321 while (hsp) { 23322 if (hsp->tcp_hsp_vers == IPV4_VERSION && 23323 hsp->tcp_hsp_addr == srchaddr) 23324 break; 23325 hsp = hsp->tcp_hsp_next; 23326 } 23327 ASSERT(hsp == NULL || 23328 hsp->tcp_hsp_vers == IPV4_VERSION); 23329 23330 /* 23331 * If this is the first pass: 23332 * If we found a match, great, return it. 23333 * If not, search for the network on the second pass. 23334 */ 23335 23336 if (i == 1) 23337 if (hsp) 23338 break; 23339 else 23340 { 23341 srchaddr = addr & netmask(addr); 23342 continue; 23343 } 23344 23345 /* 23346 * If this is the second pass: 23347 * If we found a match, but there's a subnet mask, 23348 * save the match but try again using the subnet 23349 * mask on the third pass. 23350 * Otherwise, return whatever we found. 23351 */ 23352 23353 if (i == 2) { 23354 if (hsp && hsp->tcp_hsp_subnet) { 23355 hsp_net = hsp; 23356 srchaddr = addr & hsp->tcp_hsp_subnet; 23357 continue; 23358 } else { 23359 break; 23360 } 23361 } 23362 23363 /* 23364 * This must be the third pass. 
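 *
 * (Illustrative note, editorial sketch not in the original source: the
 * netmask() macro above indexes by the top two address bits, so
 * 10.1.2.3 (0x0a010203 >> 30 == 0) selects IN_CLASSA_NET; pass one
 * looks up 10.1.2.3 itself, pass two the classful network 10.0.0.0,
 * and pass three, if that entry carries a 255.255.0.0 subnet mask,
 * 10.1.0.0.)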
If we didn't find 23365 * anything, return the saved network HSP instead. 23366 */ 23367 23368 if (!hsp) 23369 hsp = hsp_net; 23370 } 23371 } 23372 23373 rw_exit(&tcp_hsp_lock); 23374 return (hsp); 23375 } 23376 23377 /* 23378 * XXX Equally broken as the IPv4 routine. Doesn't handle longest 23379 * match lookup. 23380 */ 23381 static tcp_hsp_t * 23382 tcp_hsp_lookup_ipv6(in6_addr_t *v6addr) 23383 { 23384 tcp_hsp_t *hsp = NULL; 23385 23386 /* Quick check without acquiring the lock. */ 23387 if (tcp_hsp_hash == NULL) 23388 return (NULL); 23389 23390 rw_enter(&tcp_hsp_lock, RW_READER); 23391 23392 /* This routine finds the best-matching HSP for address addr. */ 23393 23394 if (tcp_hsp_hash) { 23395 int i; 23396 in6_addr_t v6srchaddr; 23397 tcp_hsp_t *hsp_net; 23398 23399 /* We do three passes: host, network, and subnet. */ 23400 23401 v6srchaddr = *v6addr; 23402 23403 for (i = 1; i <= 3; i++) { 23404 /* Look for exact match on srchaddr */ 23405 23406 hsp = tcp_hsp_hash[TCP_HSP_HASH( 23407 V4_PART_OF_V6(v6srchaddr))]; 23408 while (hsp) { 23409 if (hsp->tcp_hsp_vers == IPV6_VERSION && 23410 IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, 23411 &v6srchaddr)) 23412 break; 23413 hsp = hsp->tcp_hsp_next; 23414 } 23415 23416 /* 23417 * If this is the first pass: 23418 * If we found a match, great, return it. 23419 * If not, search for the network on the second pass. 23420 */ 23421 23422 if (i == 1) 23423 if (hsp) 23424 break; 23425 else { 23426 /* Assume a 64 bit mask */ 23427 v6srchaddr.s6_addr32[0] = 23428 v6addr->s6_addr32[0]; 23429 v6srchaddr.s6_addr32[1] = 23430 v6addr->s6_addr32[1]; 23431 v6srchaddr.s6_addr32[2] = 0; 23432 v6srchaddr.s6_addr32[3] = 0; 23433 continue; 23434 } 23435 23436 /* 23437 * If this is the second pass: 23438 * If we found a match, but there's a subnet mask, 23439 * save the match but try again using the subnet 23440 * mask on the third pass. 23441 * Otherwise, return whatever we found. 23442 */ 23443 23444 if (i == 2) { 23445 ASSERT(hsp == NULL || 23446 hsp->tcp_hsp_vers == IPV6_VERSION); 23447 if (hsp && 23448 !IN6_IS_ADDR_UNSPECIFIED( 23449 &hsp->tcp_hsp_subnet_v6)) { 23450 hsp_net = hsp; 23451 V6_MASK_COPY(*v6addr, 23452 hsp->tcp_hsp_subnet_v6, v6srchaddr); 23453 continue; 23454 } else { 23455 break; 23456 } 23457 } 23458 23459 /* 23460 * This must be the third pass. If we didn't find 23461 * anything, return the saved network HSP instead. 23462 */ 23463 23464 if (!hsp) 23465 hsp = hsp_net; 23466 } 23467 } 23468 23469 rw_exit(&tcp_hsp_lock); 23470 return (hsp); 23471 } 23472 23473 /* 23474 * Type three generator adapted from the random() function in 4.4 BSD: 23475 */ 23476 23477 /* 23478 * Copyright (c) 1983, 1993 23479 * The Regents of the University of California. All rights reserved. 23480 * 23481 * Redistribution and use in source and binary forms, with or without 23482 * modification, are permitted provided that the following conditions 23483 * are met: 23484 * 1. Redistributions of source code must retain the above copyright 23485 * notice, this list of conditions and the following disclaimer. 23486 * 2. Redistributions in binary form must reproduce the above copyright 23487 * notice, this list of conditions and the following disclaimer in the 23488 * documentation and/or other materials provided with the distribution. 23489 * 3. All advertising materials mentioning features or use of this software 23490 * must display the following acknowledgement: 23491 * This product includes software developed by the University of 23492 * California, Berkeley and its contributors. 
23493 * 4. Neither the name of the University nor the names of its contributors 23494 * may be used to endorse or promote products derived from this software 23495 * without specific prior written permission. 23496 * 23497 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23498 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23499 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23500 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23501 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23502 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23503 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23504 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23505 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23506 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23507 * SUCH DAMAGE. 23508 */ 23509 23510 /* Type 3 -- x**31 + x**3 + 1 */ 23511 #define DEG_3 31 23512 #define SEP_3 3 23513 23514 23515 /* Protected by tcp_random_lock */ 23516 static int tcp_randtbl[DEG_3 + 1]; 23517 23518 static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 23519 static int *tcp_random_rptr = &tcp_randtbl[1]; 23520 23521 static int *tcp_random_state = &tcp_randtbl[1]; 23522 static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 23523 23524 kmutex_t tcp_random_lock; 23525 23526 void 23527 tcp_random_init(void) 23528 { 23529 int i; 23530 hrtime_t hrt; 23531 time_t wallclock; 23532 uint64_t result; 23533 23534 /* 23535 * Use high-res timer and current time for seed. Gethrtime() returns 23536 * a longlong, which may contain resolution down to nanoseconds. 23537 * The current time will either be a 32-bit or a 64-bit quantity. 23538 * XOR the two together in a 64-bit result variable. 23539 * Convert the result to a 32-bit value by multiplying the high-order 23540 * 32-bits by the low-order 32-bits. 23541 */ 23542 23543 hrt = gethrtime(); 23544 (void) drv_getparm(TIME, &wallclock); 23545 result = (uint64_t)wallclock ^ (uint64_t)hrt; 23546 mutex_enter(&tcp_random_lock); 23547 tcp_random_state[0] = ((result >> 32) & 0xffffffff) * 23548 (result & 0xffffffff); 23549 23550 for (i = 1; i < DEG_3; i++) 23551 tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 23552 + 12345; 23553 tcp_random_fptr = &tcp_random_state[SEP_3]; 23554 tcp_random_rptr = &tcp_random_state[0]; 23555 mutex_exit(&tcp_random_lock); 23556 for (i = 0; i < 10 * DEG_3; i++) 23557 (void) tcp_random(); 23558 } 23559 23560 /* 23561 * tcp_random: Return a random number in the range [1 - (128K + 1)]. 23562 * This range is selected to be approximately centered on TCP_ISS / 2, 23563 * and easy to compute. We get this value by generating a 32-bit random 23564 * number, selecting out the high-order 17 bits, and then adding one so 23565 * that we never return zero. 23566 */ 23567 int 23568 tcp_random(void) 23569 { 23570 int i; 23571 23572 mutex_enter(&tcp_random_lock); 23573 *tcp_random_fptr += *tcp_random_rptr; 23574 23575 /* 23576 * The high-order bits are more random than the low-order bits, 23577 * so we select out the high-order 17 bits and add one so that 23578 * we never return zero. 
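 *
 * (Illustrative note, editorial sketch not in the original source:
 * e.g. if the additive step above leaves *tcp_random_fptr at
 * 0x7abcdef0, the extraction below yields
 *
 *	((0x7abcdef0 >> 15) & 0x1ffff) + 1 = 0xf579 + 1 = 0xf57a
 *
 * so the result always falls in [1, 2^17].)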
23579 */ 23580 i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 23581 if (++tcp_random_fptr >= tcp_random_end_ptr) { 23582 tcp_random_fptr = tcp_random_state; 23583 ++tcp_random_rptr; 23584 } else if (++tcp_random_rptr >= tcp_random_end_ptr) 23585 tcp_random_rptr = tcp_random_state; 23586 23587 mutex_exit(&tcp_random_lock); 23588 return (i); 23589 } 23590 23591 /* 23592 * XXX This will go away when TPI is extended to send 23593 * info reqs to sockfs/timod ..... 23594 * Given a queue, set the max packet size for the write 23595 * side of the queue below stream head. This value is 23596 * cached on the stream head. 23597 * Returns 1 on success, 0 otherwise. 23598 */ 23599 static int 23600 setmaxps(queue_t *q, int maxpsz) 23601 { 23602 struct stdata *stp; 23603 queue_t *wq; 23604 stp = STREAM(q); 23605 23606 /* 23607 * At this point change of a queue parameter is not allowed 23608 * when a multiplexor is sitting on top. 23609 */ 23610 if (stp->sd_flag & STPLEX) 23611 return (0); 23612 23613 claimstr(stp->sd_wrq); 23614 wq = stp->sd_wrq->q_next; 23615 ASSERT(wq != NULL); 23616 (void) strqset(wq, QMAXPSZ, 0, maxpsz); 23617 releasestr(stp->sd_wrq); 23618 return (1); 23619 } 23620 23621 static int 23622 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, 23623 int *t_errorp, int *sys_errorp) 23624 { 23625 int error; 23626 int is_absreq_failure; 23627 t_scalar_t *opt_lenp; 23628 t_scalar_t opt_offset; 23629 int prim_type; 23630 struct T_conn_req *tcreqp; 23631 struct T_conn_res *tcresp; 23632 cred_t *cr; 23633 23634 cr = DB_CREDDEF(mp, tcp->tcp_cred); 23635 23636 prim_type = ((union T_primitives *)mp->b_rptr)->type; 23637 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || 23638 prim_type == T_CONN_RES); 23639 23640 switch (prim_type) { 23641 case T_CONN_REQ: 23642 tcreqp = (struct T_conn_req *)mp->b_rptr; 23643 opt_offset = tcreqp->OPT_offset; 23644 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; 23645 break; 23646 case O_T_CONN_RES: 23647 case T_CONN_RES: 23648 tcresp = (struct T_conn_res *)mp->b_rptr; 23649 opt_offset = tcresp->OPT_offset; 23650 opt_lenp = (t_scalar_t *)&tcresp->OPT_length; 23651 break; 23652 } 23653 23654 *t_errorp = 0; 23655 *sys_errorp = 0; 23656 *do_disconnectp = 0; 23657 23658 error = tpi_optcom_buf(tcp->tcp_wq, mp, opt_lenp, 23659 opt_offset, cr, &tcp_opt_obj, 23660 NULL, &is_absreq_failure); 23661 23662 switch (error) { 23663 case 0: /* no error */ 23664 ASSERT(is_absreq_failure == 0); 23665 return (0); 23666 case ENOPROTOOPT: 23667 *t_errorp = TBADOPT; 23668 break; 23669 case EACCES: 23670 *t_errorp = TACCES; 23671 break; 23672 default: 23673 *t_errorp = TSYSERR; *sys_errorp = error; 23674 break; 23675 } 23676 if (is_absreq_failure != 0) { 23677 /* 23678 * The connection request should get the local ack 23679 * T_OK_ACK and then a T_DISCON_IND. 23680 */ 23681 *do_disconnectp = 1; 23682 } 23683 return (-1); 23684 } 23685 23686 /* 23687 * Split this function out so that if the secret changes, I'm okay. 23688 * 23689 * Initialize the tcp_iss_cookie and tcp_iss_key. 23690 */ 23691 23692 #define PASSWD_SIZE 16 /* MUST be multiple of 4 */ 23693 23694 static void 23695 tcp_iss_key_init(uint8_t *phrase, int len) 23696 { 23697 struct { 23698 int32_t current_time; 23699 uint32_t randnum; 23700 uint16_t pad; 23701 uint8_t ether[6]; 23702 uint8_t passwd[PASSWD_SIZE]; 23703 } tcp_iss_cookie; 23704 time_t t; 23705 23706 /* 23707 * Start with the current absolute time. 
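 *
 * (Illustrative note, editorial sketch not in the original source: the
 * cookie filled in below is, assuming no compiler padding,
 * 4 + 4 + 2 + 6 + 16 = 32 bytes of time, random value, pad, local
 * ethernet address and pass phrase; it is MD5-hashed into tcp_iss_key,
 * which tcp_iss_init() later extends with the connection's addresses
 * and ports per RFC 1948.)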
23708 */ 23709 (void) drv_getparm(TIME, &t); 23710 tcp_iss_cookie.current_time = t; 23711 23712 /* 23713 * XXX - Need a more random number per RFC 1750, not this crap. 23714 * OTOH, if what follows is pretty random, then I'm in better shape. 23715 */ 23716 tcp_iss_cookie.randnum = (uint32_t)(gethrtime() + tcp_random()); 23717 tcp_iss_cookie.pad = 0x365c; /* Picked from HMAC pad values. */ 23718 23719 /* 23720 * The cpu_type_info is pretty non-random. Ugggh. It does serve 23721 * as a good template. 23722 */ 23723 bcopy(&cpu_list->cpu_type_info, &tcp_iss_cookie.passwd, 23724 min(PASSWD_SIZE, sizeof (cpu_list->cpu_type_info))); 23725 23726 /* 23727 * The pass-phrase. Normally this is supplied by user-called NDD. 23728 */ 23729 bcopy(phrase, &tcp_iss_cookie.passwd, min(PASSWD_SIZE, len)); 23730 23731 /* 23732 * See 4010593 if this section becomes a problem again, 23733 * but the local ethernet address is useful here. 23734 */ 23735 (void) localetheraddr(NULL, 23736 (struct ether_addr *)&tcp_iss_cookie.ether); 23737 23738 /* 23739 * Hash 'em all together. The MD5Final is called per-connection. 23740 */ 23741 mutex_enter(&tcp_iss_key_lock); 23742 MD5Init(&tcp_iss_key); 23743 MD5Update(&tcp_iss_key, (uchar_t *)&tcp_iss_cookie, 23744 sizeof (tcp_iss_cookie)); 23745 mutex_exit(&tcp_iss_key_lock); 23746 } 23747 23748 /* 23749 * Set the RFC 1948 pass phrase 23750 */ 23751 /* ARGSUSED */ 23752 static int 23753 tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 23754 cred_t *cr) 23755 { 23756 /* 23757 * Basically, value contains a new pass phrase. Pass it along! 23758 */ 23759 tcp_iss_key_init((uint8_t *)value, strlen(value)); 23760 return (0); 23761 } 23762 23763 /* ARGSUSED */ 23764 static int 23765 tcp_sack_info_constructor(void *buf, void *cdrarg, int kmflags) 23766 { 23767 bzero(buf, sizeof (tcp_sack_info_t)); 23768 return (0); 23769 } 23770 23771 /* ARGSUSED */ 23772 static int 23773 tcp_iphc_constructor(void *buf, void *cdrarg, int kmflags) 23774 { 23775 bzero(buf, TCP_MAX_COMBINED_HEADER_LENGTH); 23776 return (0); 23777 } 23778 23779 void 23780 tcp_ddi_init(void) 23781 { 23782 int i; 23783 23784 /* Initialize locks */ 23785 rw_init(&tcp_hsp_lock, NULL, RW_DEFAULT, NULL); 23786 mutex_init(&tcp_g_q_lock, NULL, MUTEX_DEFAULT, NULL); 23787 mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); 23788 mutex_init(&tcp_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); 23789 mutex_init(&tcp_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); 23790 rw_init(&tcp_reserved_port_lock, NULL, RW_DEFAULT, NULL); 23791 23792 for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { 23793 mutex_init(&tcp_bind_fanout[i].tf_lock, NULL, 23794 MUTEX_DEFAULT, NULL); 23795 } 23796 23797 for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { 23798 mutex_init(&tcp_acceptor_fanout[i].tf_lock, NULL, 23799 MUTEX_DEFAULT, NULL); 23800 } 23801 23802 /* TCP's IPsec code calls the packet dropper. */ 23803 ip_drop_register(&tcp_dropper, "TCP IPsec policy enforcement"); 23804 23805 if (!tcp_g_nd) { 23806 if (!tcp_param_register(tcp_param_arr, A_CNT(tcp_param_arr))) { 23807 nd_free(&tcp_g_nd); 23808 } 23809 } 23810 23811 /* 23812 * Note: To really walk the device tree you need the devinfo 23813 * pointer to your device which is only available after probe/attach. 
23814 * The following is safe only because it uses ddi_root_node() 23815 */ 23816 tcp_max_optsize = optcom_max_optsize(tcp_opt_obj.odb_opt_des_arr, 23817 tcp_opt_obj.odb_opt_arr_cnt); 23818 23819 tcp_timercache = kmem_cache_create("tcp_timercache", 23820 sizeof (tcp_timer_t) + sizeof (mblk_t), 0, 23821 NULL, NULL, NULL, NULL, NULL, 0); 23822 23823 tcp_sack_info_cache = kmem_cache_create("tcp_sack_info_cache", 23824 sizeof (tcp_sack_info_t), 0, 23825 tcp_sack_info_constructor, NULL, NULL, NULL, NULL, 0); 23826 23827 tcp_iphc_cache = kmem_cache_create("tcp_iphc_cache", 23828 TCP_MAX_COMBINED_HEADER_LENGTH, 0, 23829 tcp_iphc_constructor, NULL, NULL, NULL, NULL, 0); 23830 23831 tcp_squeue_wput_proc = tcp_squeue_switch(tcp_squeue_wput); 23832 tcp_squeue_close_proc = tcp_squeue_switch(tcp_squeue_close); 23833 23834 ip_squeue_init(tcp_squeue_add); 23835 23836 /* Initialize the random number generator */ 23837 tcp_random_init(); 23838 23839 /* 23840 * Initialize RFC 1948 secret values. This will probably be reset once 23841 * by the boot scripts. 23842 * 23843 * Use NULL name, as the name is caught by the new lockstats. 23844 * 23845 * Initialize with some random, non-guessable string, like the global 23846 * T_INFO_ACK. 23847 */ 23848 23849 tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack, 23850 sizeof (tcp_g_t_info_ack)); 23851 23852 if ((tcp_kstat = kstat_create(TCP_MOD_NAME, 0, "tcpstat", 23853 "net", KSTAT_TYPE_NAMED, 23854 sizeof (tcp_statistics) / sizeof (kstat_named_t), 23855 KSTAT_FLAG_VIRTUAL)) != NULL) { 23856 tcp_kstat->ks_data = &tcp_statistics; 23857 kstat_install(tcp_kstat); 23858 } 23859 23860 tcp_kstat_init(); 23861 } 23862 23863 void 23864 tcp_ddi_destroy(void) 23865 { 23866 int i; 23867 23868 nd_free(&tcp_g_nd); 23869 23870 for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { 23871 mutex_destroy(&tcp_bind_fanout[i].tf_lock); 23872 } 23873 23874 for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { 23875 mutex_destroy(&tcp_acceptor_fanout[i].tf_lock); 23876 } 23877 23878 mutex_destroy(&tcp_iss_key_lock); 23879 rw_destroy(&tcp_hsp_lock); 23880 mutex_destroy(&tcp_g_q_lock); 23881 mutex_destroy(&tcp_random_lock); 23882 mutex_destroy(&tcp_epriv_port_lock); 23883 rw_destroy(&tcp_reserved_port_lock); 23884 23885 ip_drop_unregister(&tcp_dropper); 23886 23887 kmem_cache_destroy(tcp_timercache); 23888 kmem_cache_destroy(tcp_sack_info_cache); 23889 kmem_cache_destroy(tcp_iphc_cache); 23890 23891 tcp_kstat_fini(); 23892 } 23893 23894 /* 23895 * Generate ISS, taking into account NDD changes may happen halfway through. 23896 * (If the iss is not zero, set it.) 
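 *
 * A rough sketch of the strongest setting (tcp_strong_iss == 2), which
 * folds the RFC 1948 secret set up in tcp_iss_key_init() into the
 * connection's own 4-tuple (pseudo-code only, not a literal excerpt):
 *
 *	iss = tcp_iss_incr_extra + ISS_INCR/2 +
 *	    MD5(tcp_iss_key || ports || local addr || remote addr) +
 *	    (gethrtime() >> ISS_NSEC_SHT) + tcp_random();
 *
 * tcp_strong_iss == 1 keeps only the time/random increment, and the
 * default (0) adds gethrestime_sec() * ISS_INCR instead.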
23897 */ 23898 23899 static void 23900 tcp_iss_init(tcp_t *tcp) 23901 { 23902 MD5_CTX context; 23903 struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; 23904 uint32_t answer[4]; 23905 23906 tcp_iss_incr_extra += (ISS_INCR >> 1); 23907 tcp->tcp_iss = tcp_iss_incr_extra; 23908 switch (tcp_strong_iss) { 23909 case 2: 23910 mutex_enter(&tcp_iss_key_lock); 23911 context = tcp_iss_key; 23912 mutex_exit(&tcp_iss_key_lock); 23913 arg.ports = tcp->tcp_ports; 23914 if (tcp->tcp_ipversion == IPV4_VERSION) { 23915 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 23916 &arg.src); 23917 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_dst, 23918 &arg.dst); 23919 } else { 23920 arg.src = tcp->tcp_ip6h->ip6_src; 23921 arg.dst = tcp->tcp_ip6h->ip6_dst; 23922 } 23923 MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); 23924 MD5Final((uchar_t *)answer, &context); 23925 tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3]; 23926 /* 23927 * Now that we've hashed into a unique per-connection sequence 23928 * space, add a random increment per strong_iss == 1. So I 23929 * guess we'll have to... 23930 */ 23931 /* FALLTHRU */ 23932 case 1: 23933 tcp->tcp_iss += (gethrtime() >> ISS_NSEC_SHT) + tcp_random(); 23934 break; 23935 default: 23936 tcp->tcp_iss += (uint32_t)gethrestime_sec() * ISS_INCR; 23937 break; 23938 } 23939 tcp->tcp_valid_bits = TCP_ISS_VALID; 23940 tcp->tcp_fss = tcp->tcp_iss - 1; 23941 tcp->tcp_suna = tcp->tcp_iss; 23942 tcp->tcp_snxt = tcp->tcp_iss + 1; 23943 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 23944 tcp->tcp_csuna = tcp->tcp_snxt; 23945 } 23946 23947 /* 23948 * Exported routine for extracting active tcp connection status. 23949 * 23950 * This is used by the Solaris Cluster Networking software to 23951 * gather a list of connections that need to be forwarded to 23952 * specific nodes in the cluster when configuration changes occur. 23953 * 23954 * The callback is invoked for each tcp_t structure. Returning 23955 * non-zero from the callback routine terminates the search. 23956 */ 23957 int 23958 cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg) 23959 { 23960 tcp_t *tcp; 23961 cl_tcp_info_t cl_tcpi; 23962 connf_t *connfp; 23963 conn_t *connp; 23964 int i; 23965 23966 ASSERT(callback != NULL); 23967 23968 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 23969 23970 connfp = &ipcl_globalhash_fanout[i]; 23971 connp = NULL; 23972 23973 while ((connp = 23974 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 23975 23976 tcp = connp->conn_tcp; 23977 cl_tcpi.cl_tcpi_version = CL_TCPI_V1; 23978 cl_tcpi.cl_tcpi_ipversion = tcp->tcp_ipversion; 23979 cl_tcpi.cl_tcpi_state = tcp->tcp_state; 23980 cl_tcpi.cl_tcpi_lport = tcp->tcp_lport; 23981 cl_tcpi.cl_tcpi_fport = tcp->tcp_fport; 23982 /* 23983 * The macros tcp_laddr and tcp_faddr give the IPv4 23984 * addresses. They are copied implicitly below as 23985 * mapped addresses. 23986 */ 23987 cl_tcpi.cl_tcpi_laddr_v6 = tcp->tcp_ip_src_v6; 23988 if (tcp->tcp_ipversion == IPV4_VERSION) { 23989 cl_tcpi.cl_tcpi_faddr = 23990 tcp->tcp_ipha->ipha_dst; 23991 } else { 23992 cl_tcpi.cl_tcpi_faddr_v6 = 23993 tcp->tcp_ip6h->ip6_dst; 23994 } 23995 23996 /* 23997 * If the callback returns non-zero 23998 * we terminate the traversal. 23999 */ 24000 if ((*callback)(&cl_tcpi, arg) != 0) { 24001 CONN_DEC_REF(tcp->tcp_connp); 24002 return (1); 24003 } 24004 } 24005 } 24006 24007 return (0); 24008 } 24009 24010 /* 24011 * Macros used for accessing the different types of sockaddr 24012 * structures inside a tcp_ioc_abort_conn_t. 
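 *
 * Usage sketch (illustrative; mirrors tcp_ioctl_abort_dump() below):
 * callers check acp->ac_local.ss_family first and then use the matching
 * family of accessors, e.g.
 *
 *	if (acp->ac_local.ss_family == AF_INET)
 *		lport = ntohs(TCP_AC_V4LPORT(acp));
 *	else
 *		lport = ntohs(TCP_AC_V6LPORT(acp));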
24013 */ 24014 #define TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local) 24015 #define TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote) 24016 #define TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr) 24017 #define TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr) 24018 #define TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port) 24019 #define TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port) 24020 #define TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local) 24021 #define TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote) 24022 #define TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr) 24023 #define TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr) 24024 #define TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port) 24025 #define TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port) 24026 24027 /* 24028 * Return the correct error code to mimic the behavior 24029 * of a connection reset. 24030 */ 24031 #define TCP_AC_GET_ERRCODE(state, err) { \ 24032 switch ((state)) { \ 24033 case TCPS_SYN_SENT: \ 24034 case TCPS_SYN_RCVD: \ 24035 (err) = ECONNREFUSED; \ 24036 break; \ 24037 case TCPS_ESTABLISHED: \ 24038 case TCPS_FIN_WAIT_1: \ 24039 case TCPS_FIN_WAIT_2: \ 24040 case TCPS_CLOSE_WAIT: \ 24041 (err) = ECONNRESET; \ 24042 break; \ 24043 case TCPS_CLOSING: \ 24044 case TCPS_LAST_ACK: \ 24045 case TCPS_TIME_WAIT: \ 24046 (err) = 0; \ 24047 break; \ 24048 default: \ 24049 (err) = ENXIO; \ 24050 } \ 24051 } 24052 24053 /* 24054 * Check if a tcp structure matches the info in acp. 24055 */ 24056 #define TCP_AC_ADDR_MATCH(acp, tcp) \ 24057 (((acp)->ac_local.ss_family == AF_INET) ? \ 24058 ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ 24059 TCP_AC_V4LOCAL((acp)) == (tcp)->tcp_ip_src) && \ 24060 (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ 24061 TCP_AC_V4REMOTE((acp)) == (tcp)->tcp_remote) && \ 24062 (TCP_AC_V4LPORT((acp)) == 0 || \ 24063 TCP_AC_V4LPORT((acp)) == (tcp)->tcp_lport) && \ 24064 (TCP_AC_V4RPORT((acp)) == 0 || \ 24065 TCP_AC_V4RPORT((acp)) == (tcp)->tcp_fport) && \ 24066 (acp)->ac_start <= (tcp)->tcp_state && \ 24067 (acp)->ac_end >= (tcp)->tcp_state) : \ 24068 ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ 24069 IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ 24070 &(tcp)->tcp_ip_src_v6)) && \ 24071 (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ 24072 IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ 24073 &(tcp)->tcp_remote_v6)) && \ 24074 (TCP_AC_V6LPORT((acp)) == 0 || \ 24075 TCP_AC_V6LPORT((acp)) == (tcp)->tcp_lport) && \ 24076 (TCP_AC_V6RPORT((acp)) == 0 || \ 24077 TCP_AC_V6RPORT((acp)) == (tcp)->tcp_fport) && \ 24078 (acp)->ac_start <= (tcp)->tcp_state && \ 24079 (acp)->ac_end >= (tcp)->tcp_state)) 24080 24081 #define TCP_AC_MATCH(acp, tcp) \ 24082 (((acp)->ac_zoneid == ALL_ZONES || \ 24083 (acp)->ac_zoneid == tcp->tcp_connp->conn_zoneid) ? \ 24084 TCP_AC_ADDR_MATCH(acp, tcp) : 0) 24085 24086 /* 24087 * Build a message containing a tcp_ioc_abort_conn_t structure 24088 * which is filled in with information from acp and tp. 
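 *
 * Layout of the resulting M_CTL message (summary of the code below):
 *
 *	b_rptr -> uint32_t		TCP_IOC_ABORT_CONN
 *		  tcp_ioc_abort_conn_t	copy of *acp, but with the address
 *					and port fields taken from tp, so
 *					the receiving squeue sees no
 *					wildcards
 *	b_wptr -> end of message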
24089 */ 24090 static mblk_t * 24091 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) 24092 { 24093 mblk_t *mp; 24094 tcp_ioc_abort_conn_t *tacp; 24095 24096 mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO); 24097 if (mp == NULL) 24098 return (NULL); 24099 24100 mp->b_datap->db_type = M_CTL; 24101 24102 *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; 24103 tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + 24104 sizeof (uint32_t)); 24105 24106 tacp->ac_start = acp->ac_start; 24107 tacp->ac_end = acp->ac_end; 24108 tacp->ac_zoneid = acp->ac_zoneid; 24109 24110 if (acp->ac_local.ss_family == AF_INET) { 24111 tacp->ac_local.ss_family = AF_INET; 24112 tacp->ac_remote.ss_family = AF_INET; 24113 TCP_AC_V4LOCAL(tacp) = tp->tcp_ip_src; 24114 TCP_AC_V4REMOTE(tacp) = tp->tcp_remote; 24115 TCP_AC_V4LPORT(tacp) = tp->tcp_lport; 24116 TCP_AC_V4RPORT(tacp) = tp->tcp_fport; 24117 } else { 24118 tacp->ac_local.ss_family = AF_INET6; 24119 tacp->ac_remote.ss_family = AF_INET6; 24120 TCP_AC_V6LOCAL(tacp) = tp->tcp_ip_src_v6; 24121 TCP_AC_V6REMOTE(tacp) = tp->tcp_remote_v6; 24122 TCP_AC_V6LPORT(tacp) = tp->tcp_lport; 24123 TCP_AC_V6RPORT(tacp) = tp->tcp_fport; 24124 } 24125 mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); 24126 return (mp); 24127 } 24128 24129 /* 24130 * Print a tcp_ioc_abort_conn_t structure. 24131 */ 24132 static void 24133 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) 24134 { 24135 char lbuf[128]; 24136 char rbuf[128]; 24137 sa_family_t af; 24138 in_port_t lport, rport; 24139 ushort_t logflags; 24140 24141 af = acp->ac_local.ss_family; 24142 24143 if (af == AF_INET) { 24144 (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp), 24145 lbuf, 128); 24146 (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp), 24147 rbuf, 128); 24148 lport = ntohs(TCP_AC_V4LPORT(acp)); 24149 rport = ntohs(TCP_AC_V4RPORT(acp)); 24150 } else { 24151 (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp), 24152 lbuf, 128); 24153 (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp), 24154 rbuf, 128); 24155 lport = ntohs(TCP_AC_V6LPORT(acp)); 24156 rport = ntohs(TCP_AC_V6RPORT(acp)); 24157 } 24158 24159 logflags = SL_TRACE | SL_NOTE; 24160 /* 24161 * Don't print this message to the console if the operation was done 24162 * to a non-global zone. 24163 */ 24164 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 24165 logflags |= SL_CONSOLE; 24166 (void) strlog(TCP_MOD_ID, 0, 1, logflags, 24167 "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " 24168 "start = %d, end = %d\n", lbuf, lport, rbuf, rport, 24169 acp->ac_start, acp->ac_end); 24170 } 24171 24172 /* 24173 * Called inside tcp_rput when a message built using 24174 * tcp_ioctl_abort_build_msg is put into a queue. 24175 * Note that when we get here there is no wildcard in acp any more. 24176 */ 24177 static void 24178 tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp) 24179 { 24180 tcp_ioc_abort_conn_t *acp; 24181 24182 acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); 24183 if (tcp->tcp_state <= acp->ac_end) { 24184 /* 24185 * If we get here, we are already on the correct 24186 * squeue. 
This ioctl follows the following path 24187 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn 24188 * ->tcp_ioctl_abort->squeue_fill (if on a 24189 * different squeue) 24190 */ 24191 int errcode; 24192 24193 TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode); 24194 (void) tcp_clean_death(tcp, errcode, 26); 24195 } 24196 freemsg(mp); 24197 } 24198 24199 /* 24200 * Abort all matching connections on a hash chain. 24201 */ 24202 static int 24203 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, 24204 boolean_t exact) 24205 { 24206 int nmatch, err = 0; 24207 tcp_t *tcp; 24208 MBLKP mp, last, listhead = NULL; 24209 conn_t *tconnp; 24210 connf_t *connfp = &ipcl_conn_fanout[index]; 24211 24212 startover: 24213 nmatch = 0; 24214 24215 mutex_enter(&connfp->connf_lock); 24216 for (tconnp = connfp->connf_head; tconnp != NULL; 24217 tconnp = tconnp->conn_next) { 24218 tcp = tconnp->conn_tcp; 24219 if (TCP_AC_MATCH(acp, tcp)) { 24220 CONN_INC_REF(tcp->tcp_connp); 24221 mp = tcp_ioctl_abort_build_msg(acp, tcp); 24222 if (mp == NULL) { 24223 err = ENOMEM; 24224 CONN_DEC_REF(tcp->tcp_connp); 24225 break; 24226 } 24227 mp->b_prev = (mblk_t *)tcp; 24228 24229 if (listhead == NULL) { 24230 listhead = mp; 24231 last = mp; 24232 } else { 24233 last->b_next = mp; 24234 last = mp; 24235 } 24236 nmatch++; 24237 if (exact) 24238 break; 24239 } 24240 24241 /* Avoid holding lock for too long. */ 24242 if (nmatch >= 500) 24243 break; 24244 } 24245 mutex_exit(&connfp->connf_lock); 24246 24247 /* Pass mp into the correct tcp */ 24248 while ((mp = listhead) != NULL) { 24249 listhead = listhead->b_next; 24250 tcp = (tcp_t *)mp->b_prev; 24251 mp->b_next = mp->b_prev = NULL; 24252 squeue_fill(tcp->tcp_connp->conn_sqp, mp, 24253 tcp_input, tcp->tcp_connp, SQTAG_TCP_ABORT_BUCKET); 24254 } 24255 24256 *count += nmatch; 24257 if (nmatch >= 500 && err == 0) 24258 goto startover; 24259 return (err); 24260 } 24261 24262 /* 24263 * Abort all connections that matches the attributes specified in acp. 24264 */ 24265 static int 24266 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp) 24267 { 24268 sa_family_t af; 24269 uint32_t ports; 24270 uint16_t *pports; 24271 int err = 0, count = 0; 24272 boolean_t exact = B_FALSE; /* set when there is no wildcard */ 24273 int index = -1; 24274 ushort_t logflags; 24275 24276 af = acp->ac_local.ss_family; 24277 24278 if (af == AF_INET) { 24279 if (TCP_AC_V4REMOTE(acp) != INADDR_ANY && 24280 TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) { 24281 pports = (uint16_t *)&ports; 24282 pports[1] = TCP_AC_V4LPORT(acp); 24283 pports[0] = TCP_AC_V4RPORT(acp); 24284 exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY); 24285 } 24286 } else { 24287 if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) && 24288 TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) { 24289 pports = (uint16_t *)&ports; 24290 pports[1] = TCP_AC_V6LPORT(acp); 24291 pports[0] = TCP_AC_V6RPORT(acp); 24292 exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp)); 24293 } 24294 } 24295 24296 /* 24297 * For cases where remote addr, local port, and remote port are non- 24298 * wildcards, tcp_ioctl_abort_bucket will only be called once. 
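 *
 * Note on "exact" (summarizing tcp_ioctl_abort_bucket() above): it is set
 * only when the local address is a non-wildcard as well, i.e. the full
 * 4-tuple is specified.  In that case at most one connection can match,
 * so the bucket scan stops at the first hit; otherwise every matching
 * connection in the chain is collected and aborted.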
24299 */ 24300 if (index != -1) { 24301 err = tcp_ioctl_abort_bucket(acp, index, 24302 &count, exact); 24303 } else { 24304 /* 24305 * loop through all entries for wildcard case 24306 */ 24307 for (index = 0; index < ipcl_conn_fanout_size; index++) { 24308 err = tcp_ioctl_abort_bucket(acp, index, 24309 &count, exact); 24310 if (err != 0) 24311 break; 24312 } 24313 } 24314 24315 logflags = SL_TRACE | SL_NOTE; 24316 /* 24317 * Don't print this message to the console if the operation was done 24318 * to a non-global zone. 24319 */ 24320 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 24321 logflags |= SL_CONSOLE; 24322 (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " 24323 "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); 24324 if (err == 0 && count == 0) 24325 err = ENOENT; 24326 return (err); 24327 } 24328 24329 /* 24330 * Process the TCP_IOC_ABORT_CONN ioctl request. 24331 */ 24332 static void 24333 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) 24334 { 24335 int err; 24336 IOCP iocp; 24337 MBLKP mp1; 24338 sa_family_t laf, raf; 24339 tcp_ioc_abort_conn_t *acp; 24340 zone_t *zptr; 24341 zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid; 24342 24343 iocp = (IOCP)mp->b_rptr; 24344 24345 if ((mp1 = mp->b_cont) == NULL || 24346 iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) { 24347 err = EINVAL; 24348 goto out; 24349 } 24350 24351 /* check permissions */ 24352 if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { 24353 err = EPERM; 24354 goto out; 24355 } 24356 24357 if (mp1->b_cont != NULL) { 24358 freemsg(mp1->b_cont); 24359 mp1->b_cont = NULL; 24360 } 24361 24362 acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr; 24363 laf = acp->ac_local.ss_family; 24364 raf = acp->ac_remote.ss_family; 24365 24366 /* check that a zone with the supplied zoneid exists */ 24367 if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) { 24368 zptr = zone_find_by_id(zoneid); 24369 if (zptr != NULL) { 24370 zone_rele(zptr); 24371 } else { 24372 err = EINVAL; 24373 goto out; 24374 } 24375 } 24376 24377 if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || 24378 acp->ac_start > acp->ac_end || laf != raf || 24379 (laf != AF_INET && laf != AF_INET6)) { 24380 err = EINVAL; 24381 goto out; 24382 } 24383 24384 tcp_ioctl_abort_dump(acp); 24385 err = tcp_ioctl_abort(acp); 24386 24387 out: 24388 if (mp1 != NULL) { 24389 freemsg(mp1); 24390 mp->b_cont = NULL; 24391 } 24392 24393 if (err != 0) 24394 miocnak(q, mp, 0, err); 24395 else 24396 miocack(q, mp, 0, 0); 24397 } 24398 24399 /* 24400 * tcp_time_wait_processing() handles processing of incoming packets when 24401 * the tcp is in the TIME_WAIT state. 24402 * A TIME_WAIT tcp that has an associated open TCP stream is never put 24403 * on the time wait list. 24404 */ 24405 void 24406 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, 24407 uint32_t seg_ack, int seg_len, tcph_t *tcph) 24408 { 24409 int32_t bytes_acked; 24410 int32_t gap; 24411 int32_t rgap; 24412 tcp_opt_t tcpopt; 24413 uint_t flags; 24414 uint32_t new_swnd = 0; 24415 conn_t *connp; 24416 24417 BUMP_LOCAL(tcp->tcp_ibsegs); 24418 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT); 24419 24420 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 24421 new_swnd = BE16_TO_U16(tcph->th_win) << 24422 ((tcph->th_flags[0] & TH_SYN) ? 
0 : tcp->tcp_snd_ws); 24423 if (tcp->tcp_snd_ts_ok) { 24424 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 24425 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 24426 tcp->tcp_rnxt, TH_ACK); 24427 goto done; 24428 } 24429 } 24430 gap = seg_seq - tcp->tcp_rnxt; 24431 rgap = tcp->tcp_rwnd - (gap + seg_len); 24432 if (gap < 0) { 24433 BUMP_MIB(&tcp_mib, tcpInDataDupSegs); 24434 UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, 24435 (seg_len > -gap ? -gap : seg_len)); 24436 seg_len += gap; 24437 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 24438 if (flags & TH_RST) { 24439 goto done; 24440 } 24441 if ((flags & TH_FIN) && seg_len == -1) { 24442 /* 24443 * When TCP receives a duplicate FIN in 24444 * TIME_WAIT state, restart the 2 MSL timer. 24445 * See page 73 in RFC 793. Make sure this TCP 24446 * is already on the TIME_WAIT list. If not, 24447 * just restart the timer. 24448 */ 24449 if (TCP_IS_DETACHED(tcp)) { 24450 tcp_time_wait_remove(tcp, NULL); 24451 tcp_time_wait_append(tcp); 24452 TCP_DBGSTAT(tcp_rput_time_wait); 24453 } else { 24454 ASSERT(tcp != NULL); 24455 TCP_TIMER_RESTART(tcp, 24456 tcp_time_wait_interval); 24457 } 24458 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 24459 tcp->tcp_rnxt, TH_ACK); 24460 goto done; 24461 } 24462 flags |= TH_ACK_NEEDED; 24463 seg_len = 0; 24464 goto process_ack; 24465 } 24466 24467 /* Fix seg_seq, and chew the gap off the front. */ 24468 seg_seq = tcp->tcp_rnxt; 24469 } 24470 24471 if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 24472 /* 24473 * Make sure that when we accept the connection, pick 24474 * an ISS greater than (tcp_snxt + ISS_INCR/2) for the 24475 * old connection. 24476 * 24477 * The next ISS generated is equal to tcp_iss_incr_extra 24478 * + ISS_INCR/2 + other components depending on the 24479 * value of tcp_strong_iss. We pre-calculate the new 24480 * ISS here and compare with tcp_snxt to determine if 24481 * we need to make adjustment to tcp_iss_incr_extra. 24482 * 24483 * The above calculation is ugly and is a 24484 * waste of CPU cycles... 24485 */ 24486 uint32_t new_iss = tcp_iss_incr_extra; 24487 int32_t adj; 24488 24489 switch (tcp_strong_iss) { 24490 case 2: { 24491 /* Add time and MD5 components. */ 24492 uint32_t answer[4]; 24493 struct { 24494 uint32_t ports; 24495 in6_addr_t src; 24496 in6_addr_t dst; 24497 } arg; 24498 MD5_CTX context; 24499 24500 mutex_enter(&tcp_iss_key_lock); 24501 context = tcp_iss_key; 24502 mutex_exit(&tcp_iss_key_lock); 24503 arg.ports = tcp->tcp_ports; 24504 /* We use MAPPED addresses in tcp_iss_init */ 24505 arg.src = tcp->tcp_ip_src_v6; 24506 if (tcp->tcp_ipversion == IPV4_VERSION) { 24507 IN6_IPADDR_TO_V4MAPPED( 24508 tcp->tcp_ipha->ipha_dst, 24509 &arg.dst); 24510 } else { 24511 arg.dst = 24512 tcp->tcp_ip6h->ip6_dst; 24513 } 24514 MD5Update(&context, (uchar_t *)&arg, 24515 sizeof (arg)); 24516 MD5Final((uchar_t *)answer, &context); 24517 answer[0] ^= answer[1] ^ answer[2] ^ answer[3]; 24518 new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0]; 24519 break; 24520 } 24521 case 1: 24522 /* Add time component and min random (i.e. 1). */ 24523 new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1; 24524 break; 24525 default: 24526 /* Add only time component. */ 24527 new_iss += (uint32_t)gethrestime_sec() * ISS_INCR; 24528 break; 24529 } 24530 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 24531 /* 24532 * New ISS not guaranteed to be ISS_INCR/2 24533 * ahead of the current tcp_snxt, so add the 24534 * difference to tcp_iss_incr_extra. 
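 *
 * Worked example (hypothetical numbers): if the old connection
 * had tcp_snxt == 5000 and the pre-computed new_iss comes out
 * at 4200, then adj == 800 and tcp_iss_incr_extra is bumped by
 * 800, so the ISS generated later in tcp_iss_init() lands
 * safely above the old send sequence space.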
24535 */ 24536 tcp_iss_incr_extra += adj; 24537 } 24538 /* 24539 * If tcp_clean_death() can not perform the task now, 24540 * drop the SYN packet and let the other side re-xmit. 24541 * Otherwise pass the SYN packet back in, since the 24542 * old tcp state has been cleaned up or freed. 24543 */ 24544 if (tcp_clean_death(tcp, 0, 27) == -1) 24545 goto done; 24546 /* 24547 * We will come back to tcp_rput_data 24548 * on the global queue. Packets destined 24549 * for the global queue will be checked 24550 * with global policy. But the policy for 24551 * this packet has already been checked as 24552 * this was destined for the detached 24553 * connection. We need to bypass policy 24554 * check this time by attaching a dummy 24555 * ipsec_in with ipsec_in_dont_check set. 24556 */ 24557 if ((connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid)) != 24558 NULL) { 24559 TCP_STAT(tcp_time_wait_syn_success); 24560 tcp_reinput(connp, mp, tcp->tcp_connp->conn_sqp); 24561 return; 24562 } 24563 goto done; 24564 } 24565 24566 /* 24567 * rgap is the amount of stuff received out of window. A negative 24568 * value is the amount out of window. 24569 */ 24570 if (rgap < 0) { 24571 BUMP_MIB(&tcp_mib, tcpInDataPastWinSegs); 24572 UPDATE_MIB(&tcp_mib, tcpInDataPastWinBytes, -rgap); 24573 /* Fix seg_len and make sure there is something left. */ 24574 seg_len += rgap; 24575 if (seg_len <= 0) { 24576 if (flags & TH_RST) { 24577 goto done; 24578 } 24579 flags |= TH_ACK_NEEDED; 24580 seg_len = 0; 24581 goto process_ack; 24582 } 24583 } 24584 /* 24585 * Check whether we can update tcp_ts_recent. This test is 24586 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 24587 * Extensions for High Performance: An Update", Internet Draft. 24588 */ 24589 if (tcp->tcp_snd_ts_ok && 24590 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 24591 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 24592 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 24593 tcp->tcp_last_rcv_lbolt = lbolt64; 24594 } 24595 24596 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 24597 /* Always ack out of order packets */ 24598 flags |= TH_ACK_NEEDED; 24599 seg_len = 0; 24600 } else if (seg_len > 0) { 24601 BUMP_MIB(&tcp_mib, tcpInClosed); 24602 BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); 24603 UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, seg_len); 24604 } 24605 if (flags & TH_RST) { 24606 (void) tcp_clean_death(tcp, 0, 28); 24607 goto done; 24608 } 24609 if (flags & TH_SYN) { 24610 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 24611 TH_RST|TH_ACK); 24612 /* 24613 * Do not delete the TCP structure if it is in 24614 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 24615 */ 24616 goto done; 24617 } 24618 process_ack: 24619 if (flags & TH_ACK) { 24620 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 24621 if (bytes_acked <= 0) { 24622 if (bytes_acked == 0 && seg_len == 0 && 24623 new_swnd == tcp->tcp_swnd) 24624 BUMP_MIB(&tcp_mib, tcpInDupAck); 24625 } else { 24626 /* Acks something not sent */ 24627 flags |= TH_ACK_NEEDED; 24628 } 24629 } 24630 if (flags & TH_ACK_NEEDED) { 24631 /* 24632 * Time to send an ack for some reason. 24633 */ 24634 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 24635 tcp->tcp_rnxt, TH_ACK); 24636 } 24637 done: 24638 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 24639 DB_CKSUMSTART(mp) = 0; 24640 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 24641 TCP_STAT(tcp_time_wait_syn_fail); 24642 } 24643 freemsg(mp); 24644 } 24645 24646 /* 24647 * Allocate a T_SVR4_OPTMGMT_REQ. 
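 * The message built below is laid out as follows (summary, not a literal
 * excerpt):
 *
 *	struct T_optmgmt_req	PRIM_type = T_SVR4_OPTMGMT_REQ,
 *				MGMT_flags = T_NEGOTIATE
 *	struct opthdr		level/name/len describing the option
 *	option value		optlen bytes copied from "opt"
 *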
24648 * The caller needs to increment tcp_drop_opt_ack_cnt when sending these so 24649 * that tcp_rput_other can drop the acks. 24650 */ 24651 static mblk_t * 24652 tcp_setsockopt_mp(int level, int cmd, char *opt, int optlen) 24653 { 24654 mblk_t *mp; 24655 struct T_optmgmt_req *tor; 24656 struct opthdr *oh; 24657 uint_t size; 24658 char *optptr; 24659 24660 size = sizeof (*tor) + sizeof (*oh) + optlen; 24661 mp = allocb(size, BPRI_MED); 24662 if (mp == NULL) 24663 return (NULL); 24664 24665 mp->b_wptr += size; 24666 mp->b_datap->db_type = M_PROTO; 24667 tor = (struct T_optmgmt_req *)mp->b_rptr; 24668 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 24669 tor->MGMT_flags = T_NEGOTIATE; 24670 tor->OPT_length = sizeof (*oh) + optlen; 24671 tor->OPT_offset = (t_scalar_t)sizeof (*tor); 24672 24673 oh = (struct opthdr *)&tor[1]; 24674 oh->level = level; 24675 oh->name = cmd; 24676 oh->len = optlen; 24677 if (optlen != 0) { 24678 optptr = (char *)&oh[1]; 24679 bcopy(opt, optptr, optlen); 24680 } 24681 return (mp); 24682 } 24683 24684 /* 24685 * TCP Timers Implementation. 24686 */ 24687 timeout_id_t 24688 tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim) 24689 { 24690 mblk_t *mp; 24691 tcp_timer_t *tcpt; 24692 tcp_t *tcp = connp->conn_tcp; 24693 24694 ASSERT(connp->conn_sqp != NULL); 24695 24696 TCP_DBGSTAT(tcp_timeout_calls); 24697 24698 if (tcp->tcp_timercache == NULL) { 24699 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); 24700 } else { 24701 TCP_DBGSTAT(tcp_timeout_cached_alloc); 24702 mp = tcp->tcp_timercache; 24703 tcp->tcp_timercache = mp->b_next; 24704 mp->b_next = NULL; 24705 ASSERT(mp->b_wptr == NULL); 24706 } 24707 24708 CONN_INC_REF(connp); 24709 tcpt = (tcp_timer_t *)mp->b_rptr; 24710 tcpt->connp = connp; 24711 tcpt->tcpt_proc = f; 24712 tcpt->tcpt_tid = timeout(tcp_timer_callback, mp, tim); 24713 return ((timeout_id_t)mp); 24714 } 24715 24716 static void 24717 tcp_timer_callback(void *arg) 24718 { 24719 mblk_t *mp = (mblk_t *)arg; 24720 tcp_timer_t *tcpt; 24721 conn_t *connp; 24722 24723 tcpt = (tcp_timer_t *)mp->b_rptr; 24724 connp = tcpt->connp; 24725 squeue_fill(connp->conn_sqp, mp, 24726 tcp_timer_handler, connp, SQTAG_TCP_TIMER); 24727 } 24728 24729 static void 24730 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) 24731 { 24732 tcp_timer_t *tcpt; 24733 conn_t *connp = (conn_t *)arg; 24734 tcp_t *tcp = connp->conn_tcp; 24735 24736 tcpt = (tcp_timer_t *)mp->b_rptr; 24737 ASSERT(connp == tcpt->connp); 24738 ASSERT((squeue_t *)arg2 == connp->conn_sqp); 24739 24740 /* 24741 * If the TCP has reached the closed state, don't proceed any 24742 * further. This TCP logically does not exist on the system. 24743 * tcpt_proc could, for example, access queues that have already 24744 * been qprocsoff'ed. Also see the comments at the start of tcp_input. 24745 */ 24746 if (tcp->tcp_state != TCPS_CLOSED) { 24747 (*tcpt->tcpt_proc)(connp); 24748 } else { 24749 tcp->tcp_timer_tid = 0; 24750 } 24751 tcp_timer_free(connp->conn_tcp, mp); 24752 } 24753 24754 /* 24755 * There is a potential race between untimeout and the handler firing at the same 24756 * time. The mblk may be freed by the handler while we are trying to use 24757 * it. But since both should execute on the same squeue, this race should not 24758 * occur.
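 *
 * Return value (summarizing the untimeout(9F) semantics as used below):
 *
 *	delta >= 0	the callout was cancelled before firing; the timer
 *			mblk is freed here and the conn_t reference taken
 *			in tcp_timeout() is dropped.
 *	delta == -1	the id was NULL or the callout has already fired;
 *			nothing is freed here, the mblk and the reference
 *			are released on the timer-firing path instead.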
24759 */ 24760 clock_t 24761 tcp_timeout_cancel(conn_t *connp, timeout_id_t id) 24762 { 24763 mblk_t *mp = (mblk_t *)id; 24764 tcp_timer_t *tcpt; 24765 clock_t delta; 24766 24767 TCP_DBGSTAT(tcp_timeout_cancel_reqs); 24768 24769 if (mp == NULL) 24770 return (-1); 24771 24772 tcpt = (tcp_timer_t *)mp->b_rptr; 24773 ASSERT(tcpt->connp == connp); 24774 24775 delta = untimeout(tcpt->tcpt_tid); 24776 24777 if (delta >= 0) { 24778 TCP_DBGSTAT(tcp_timeout_canceled); 24779 tcp_timer_free(connp->conn_tcp, mp); 24780 CONN_DEC_REF(connp); 24781 } 24782 24783 return (delta); 24784 } 24785 24786 /* 24787 * Allocate space for the timer event. The allocation looks like an mblk, but it is 24788 * not a proper mblk. To avoid confusion we set b_wptr to NULL. 24789 * 24790 * Dealing with failures: If we can't allocate from the timer cache we try 24791 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr 24792 * points to b_rptr. 24793 * If we can't allocate anything using allocb_tryhard(), we perform a last 24794 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and 24795 * save the actual allocation size in b_datap. 24796 */ 24797 mblk_t * 24798 tcp_timermp_alloc(int kmflags) 24799 { 24800 mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache, 24801 kmflags & ~KM_PANIC); 24802 24803 if (mp != NULL) { 24804 mp->b_next = mp->b_prev = NULL; 24805 mp->b_rptr = (uchar_t *)(&mp[1]); 24806 mp->b_wptr = NULL; 24807 mp->b_datap = NULL; 24808 mp->b_queue = NULL; 24809 } else if (kmflags & KM_PANIC) { 24810 /* 24811 * Failed to allocate memory for the timer. Try allocating from 24812 * dblock caches. 24813 */ 24814 TCP_STAT(tcp_timermp_allocfail); 24815 mp = allocb_tryhard(sizeof (tcp_timer_t)); 24816 if (mp == NULL) { 24817 size_t size = 0; 24818 /* 24819 * Memory is really low. Try tryhard allocation. 24820 */ 24821 TCP_STAT(tcp_timermp_allocdblfail); 24822 mp = kmem_alloc_tryhard(sizeof (mblk_t) + 24823 sizeof (tcp_timer_t), &size, kmflags); 24824 mp->b_rptr = (uchar_t *)(&mp[1]); 24825 mp->b_next = mp->b_prev = NULL; 24826 mp->b_wptr = (uchar_t *)-1; 24827 mp->b_datap = (dblk_t *)size; 24828 mp->b_queue = NULL; 24829 } 24830 ASSERT(mp->b_wptr != NULL); 24831 } 24832 TCP_DBGSTAT(tcp_timermp_alloced); 24833 24834 return (mp); 24835 } 24836 24837 /* 24838 * Free per-tcp timer cache. 24839 * It can only contain entries from tcp_timercache. 24840 */ 24841 void 24842 tcp_timermp_free(tcp_t *tcp) 24843 { 24844 mblk_t *mp; 24845 24846 while ((mp = tcp->tcp_timercache) != NULL) { 24847 ASSERT(mp->b_wptr == NULL); 24848 tcp->tcp_timercache = tcp->tcp_timercache->b_next; 24849 kmem_cache_free(tcp_timercache, mp); 24850 } 24851 } 24852 24853 /* 24854 * Free timer event. Put it on the per-tcp timer cache if there are not too many 24855 * events there already (currently at most two events are cached). 24856 * If the event is not allocated from the timer cache, free it right away. 24857 */ 24858 static void 24859 tcp_timer_free(tcp_t *tcp, mblk_t *mp) 24860 { 24861 mblk_t *mp1 = tcp->tcp_timercache; 24862 24863 if (mp->b_wptr != NULL) { 24864 /* 24865 * This allocation is not from a timer cache, free it right 24866 * away.
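 *
 * Recap of the b_wptr convention set up in tcp_timermp_alloc() above:
 *
 *	b_wptr == NULL		came from the tcp_timercache kmem cache
 *	b_wptr == (uchar_t *)-1	came from kmem_alloc_tryhard(); the
 *				allocation size was stashed in b_datap
 *	anything else		a real mblk from allocb_tryhard(); freeb()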
24867 */ 24868 if (mp->b_wptr != (uchar_t *)-1) 24869 freeb(mp); 24870 else 24871 kmem_free(mp, (size_t)mp->b_datap); 24872 } else if (mp1 == NULL || mp1->b_next == NULL) { 24873 /* Cache this timer block for future allocations */ 24874 mp->b_rptr = (uchar_t *)(&mp[1]); 24875 mp->b_next = mp1; 24876 tcp->tcp_timercache = mp; 24877 } else { 24878 kmem_cache_free(tcp_timercache, mp); 24879 TCP_DBGSTAT(tcp_timermp_freed); 24880 } 24881 } 24882 24883 /* 24884 * End of TCP Timers implementation. 24885 */ 24886 24887 /* 24888 * tcp_{set,clr}qfull() functions are used to either set or clear QFULL 24889 * on the specified backing STREAMS q. Note, the caller may make the 24890 * decision to call based on the tcp_t.tcp_flow_stopped value which 24891 * when check outside the q's lock is only an advisory check ... 24892 */ 24893 24894 void 24895 tcp_setqfull(tcp_t *tcp) 24896 { 24897 queue_t *q = tcp->tcp_wq; 24898 24899 if (!(q->q_flag & QFULL)) { 24900 mutex_enter(QLOCK(q)); 24901 if (!(q->q_flag & QFULL)) { 24902 /* still need to set QFULL */ 24903 q->q_flag |= QFULL; 24904 tcp->tcp_flow_stopped = B_TRUE; 24905 mutex_exit(QLOCK(q)); 24906 TCP_STAT(tcp_flwctl_on); 24907 } else { 24908 mutex_exit(QLOCK(q)); 24909 } 24910 } 24911 } 24912 24913 void 24914 tcp_clrqfull(tcp_t *tcp) 24915 { 24916 queue_t *q = tcp->tcp_wq; 24917 24918 if (q->q_flag & QFULL) { 24919 mutex_enter(QLOCK(q)); 24920 if (q->q_flag & QFULL) { 24921 q->q_flag &= ~QFULL; 24922 tcp->tcp_flow_stopped = B_FALSE; 24923 mutex_exit(QLOCK(q)); 24924 if (q->q_flag & QWANTW) 24925 qbackenable(q, 0); 24926 } else { 24927 mutex_exit(QLOCK(q)); 24928 } 24929 } 24930 } 24931 24932 /* 24933 * TCP Kstats implementation 24934 */ 24935 static void 24936 tcp_kstat_init(void) 24937 { 24938 tcp_named_kstat_t template = { 24939 { "rtoAlgorithm", KSTAT_DATA_INT32, 0 }, 24940 { "rtoMin", KSTAT_DATA_INT32, 0 }, 24941 { "rtoMax", KSTAT_DATA_INT32, 0 }, 24942 { "maxConn", KSTAT_DATA_INT32, 0 }, 24943 { "activeOpens", KSTAT_DATA_UINT32, 0 }, 24944 { "passiveOpens", KSTAT_DATA_UINT32, 0 }, 24945 { "attemptFails", KSTAT_DATA_UINT32, 0 }, 24946 { "estabResets", KSTAT_DATA_UINT32, 0 }, 24947 { "currEstab", KSTAT_DATA_UINT32, 0 }, 24948 { "inSegs", KSTAT_DATA_UINT32, 0 }, 24949 { "outSegs", KSTAT_DATA_UINT32, 0 }, 24950 { "retransSegs", KSTAT_DATA_UINT32, 0 }, 24951 { "connTableSize", KSTAT_DATA_INT32, 0 }, 24952 { "outRsts", KSTAT_DATA_UINT32, 0 }, 24953 { "outDataSegs", KSTAT_DATA_UINT32, 0 }, 24954 { "outDataBytes", KSTAT_DATA_UINT32, 0 }, 24955 { "retransBytes", KSTAT_DATA_UINT32, 0 }, 24956 { "outAck", KSTAT_DATA_UINT32, 0 }, 24957 { "outAckDelayed", KSTAT_DATA_UINT32, 0 }, 24958 { "outUrg", KSTAT_DATA_UINT32, 0 }, 24959 { "outWinUpdate", KSTAT_DATA_UINT32, 0 }, 24960 { "outWinProbe", KSTAT_DATA_UINT32, 0 }, 24961 { "outControl", KSTAT_DATA_UINT32, 0 }, 24962 { "outFastRetrans", KSTAT_DATA_UINT32, 0 }, 24963 { "inAckSegs", KSTAT_DATA_UINT32, 0 }, 24964 { "inAckBytes", KSTAT_DATA_UINT32, 0 }, 24965 { "inDupAck", KSTAT_DATA_UINT32, 0 }, 24966 { "inAckUnsent", KSTAT_DATA_UINT32, 0 }, 24967 { "inDataInorderSegs", KSTAT_DATA_UINT32, 0 }, 24968 { "inDataInorderBytes", KSTAT_DATA_UINT32, 0 }, 24969 { "inDataUnorderSegs", KSTAT_DATA_UINT32, 0 }, 24970 { "inDataUnorderBytes", KSTAT_DATA_UINT32, 0 }, 24971 { "inDataDupSegs", KSTAT_DATA_UINT32, 0 }, 24972 { "inDataDupBytes", KSTAT_DATA_UINT32, 0 }, 24973 { "inDataPartDupSegs", KSTAT_DATA_UINT32, 0 }, 24974 { "inDataPartDupBytes", KSTAT_DATA_UINT32, 0 }, 24975 { "inDataPastWinSegs", KSTAT_DATA_UINT32, 0 }, 24976 { 
"inDataPastWinBytes", KSTAT_DATA_UINT32, 0 }, 24977 { "inWinProbe", KSTAT_DATA_UINT32, 0 }, 24978 { "inWinUpdate", KSTAT_DATA_UINT32, 0 }, 24979 { "inClosed", KSTAT_DATA_UINT32, 0 }, 24980 { "rttUpdate", KSTAT_DATA_UINT32, 0 }, 24981 { "rttNoUpdate", KSTAT_DATA_UINT32, 0 }, 24982 { "timRetrans", KSTAT_DATA_UINT32, 0 }, 24983 { "timRetransDrop", KSTAT_DATA_UINT32, 0 }, 24984 { "timKeepalive", KSTAT_DATA_UINT32, 0 }, 24985 { "timKeepaliveProbe", KSTAT_DATA_UINT32, 0 }, 24986 { "timKeepaliveDrop", KSTAT_DATA_UINT32, 0 }, 24987 { "listenDrop", KSTAT_DATA_UINT32, 0 }, 24988 { "listenDropQ0", KSTAT_DATA_UINT32, 0 }, 24989 { "halfOpenDrop", KSTAT_DATA_UINT32, 0 }, 24990 { "outSackRetransSegs", KSTAT_DATA_UINT32, 0 }, 24991 { "connTableSize6", KSTAT_DATA_INT32, 0 } 24992 }; 24993 24994 tcp_mibkp = kstat_create(TCP_MOD_NAME, 0, TCP_MOD_NAME, 24995 "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0); 24996 24997 if (tcp_mibkp == NULL) 24998 return; 24999 25000 template.rtoAlgorithm.value.ui32 = 4; 25001 template.rtoMin.value.ui32 = tcp_rexmit_interval_min; 25002 template.rtoMax.value.ui32 = tcp_rexmit_interval_max; 25003 template.maxConn.value.i32 = -1; 25004 25005 bcopy(&template, tcp_mibkp->ks_data, sizeof (template)); 25006 25007 tcp_mibkp->ks_update = tcp_kstat_update; 25008 25009 kstat_install(tcp_mibkp); 25010 } 25011 25012 static void 25013 tcp_kstat_fini(void) 25014 { 25015 25016 if (tcp_mibkp != NULL) { 25017 kstat_delete(tcp_mibkp); 25018 tcp_mibkp = NULL; 25019 } 25020 } 25021 25022 static int 25023 tcp_kstat_update(kstat_t *kp, int rw) 25024 { 25025 tcp_named_kstat_t *tcpkp; 25026 tcp_t *tcp; 25027 connf_t *connfp; 25028 conn_t *connp; 25029 int i; 25030 25031 if (!kp || !kp->ks_data) 25032 return (EIO); 25033 25034 if (rw == KSTAT_WRITE) 25035 return (EACCES); 25036 25037 tcpkp = (tcp_named_kstat_t *)kp->ks_data; 25038 25039 tcpkp->currEstab.value.ui32 = 0; 25040 25041 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 25042 connfp = &ipcl_globalhash_fanout[i]; 25043 connp = NULL; 25044 while ((connp = 25045 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 25046 tcp = connp->conn_tcp; 25047 switch (tcp_snmp_state(tcp)) { 25048 case MIB2_TCP_established: 25049 case MIB2_TCP_closeWait: 25050 tcpkp->currEstab.value.ui32++; 25051 break; 25052 } 25053 } 25054 } 25055 25056 tcpkp->activeOpens.value.ui32 = tcp_mib.tcpActiveOpens; 25057 tcpkp->passiveOpens.value.ui32 = tcp_mib.tcpPassiveOpens; 25058 tcpkp->attemptFails.value.ui32 = tcp_mib.tcpAttemptFails; 25059 tcpkp->estabResets.value.ui32 = tcp_mib.tcpEstabResets; 25060 tcpkp->inSegs.value.ui32 = tcp_mib.tcpInSegs; 25061 tcpkp->outSegs.value.ui32 = tcp_mib.tcpOutSegs; 25062 tcpkp->retransSegs.value.ui32 = tcp_mib.tcpRetransSegs; 25063 tcpkp->connTableSize.value.i32 = tcp_mib.tcpConnTableSize; 25064 tcpkp->outRsts.value.ui32 = tcp_mib.tcpOutRsts; 25065 tcpkp->outDataSegs.value.ui32 = tcp_mib.tcpOutDataSegs; 25066 tcpkp->outDataBytes.value.ui32 = tcp_mib.tcpOutDataBytes; 25067 tcpkp->retransBytes.value.ui32 = tcp_mib.tcpRetransBytes; 25068 tcpkp->outAck.value.ui32 = tcp_mib.tcpOutAck; 25069 tcpkp->outAckDelayed.value.ui32 = tcp_mib.tcpOutAckDelayed; 25070 tcpkp->outUrg.value.ui32 = tcp_mib.tcpOutUrg; 25071 tcpkp->outWinUpdate.value.ui32 = tcp_mib.tcpOutWinUpdate; 25072 tcpkp->outWinProbe.value.ui32 = tcp_mib.tcpOutWinProbe; 25073 tcpkp->outControl.value.ui32 = tcp_mib.tcpOutControl; 25074 tcpkp->outFastRetrans.value.ui32 = tcp_mib.tcpOutFastRetrans; 25075 tcpkp->inAckSegs.value.ui32 = tcp_mib.tcpInAckSegs; 25076 
tcpkp->inAckBytes.value.ui32 = tcp_mib.tcpInAckBytes; 25077 tcpkp->inDupAck.value.ui32 = tcp_mib.tcpInDupAck; 25078 tcpkp->inAckUnsent.value.ui32 = tcp_mib.tcpInAckUnsent; 25079 tcpkp->inDataInorderSegs.value.ui32 = tcp_mib.tcpInDataInorderSegs; 25080 tcpkp->inDataInorderBytes.value.ui32 = tcp_mib.tcpInDataInorderBytes; 25081 tcpkp->inDataUnorderSegs.value.ui32 = tcp_mib.tcpInDataUnorderSegs; 25082 tcpkp->inDataUnorderBytes.value.ui32 = tcp_mib.tcpInDataUnorderBytes; 25083 tcpkp->inDataDupSegs.value.ui32 = tcp_mib.tcpInDataDupSegs; 25084 tcpkp->inDataDupBytes.value.ui32 = tcp_mib.tcpInDataDupBytes; 25085 tcpkp->inDataPartDupSegs.value.ui32 = tcp_mib.tcpInDataPartDupSegs; 25086 tcpkp->inDataPartDupBytes.value.ui32 = tcp_mib.tcpInDataPartDupBytes; 25087 tcpkp->inDataPastWinSegs.value.ui32 = tcp_mib.tcpInDataPastWinSegs; 25088 tcpkp->inDataPastWinBytes.value.ui32 = tcp_mib.tcpInDataPastWinBytes; 25089 tcpkp->inWinProbe.value.ui32 = tcp_mib.tcpInWinProbe; 25090 tcpkp->inWinUpdate.value.ui32 = tcp_mib.tcpInWinUpdate; 25091 tcpkp->inClosed.value.ui32 = tcp_mib.tcpInClosed; 25092 tcpkp->rttNoUpdate.value.ui32 = tcp_mib.tcpRttNoUpdate; 25093 tcpkp->rttUpdate.value.ui32 = tcp_mib.tcpRttUpdate; 25094 tcpkp->timRetrans.value.ui32 = tcp_mib.tcpTimRetrans; 25095 tcpkp->timRetransDrop.value.ui32 = tcp_mib.tcpTimRetransDrop; 25096 tcpkp->timKeepalive.value.ui32 = tcp_mib.tcpTimKeepalive; 25097 tcpkp->timKeepaliveProbe.value.ui32 = tcp_mib.tcpTimKeepaliveProbe; 25098 tcpkp->timKeepaliveDrop.value.ui32 = tcp_mib.tcpTimKeepaliveDrop; 25099 tcpkp->listenDrop.value.ui32 = tcp_mib.tcpListenDrop; 25100 tcpkp->listenDropQ0.value.ui32 = tcp_mib.tcpListenDropQ0; 25101 tcpkp->halfOpenDrop.value.ui32 = tcp_mib.tcpHalfOpenDrop; 25102 tcpkp->outSackRetransSegs.value.ui32 = tcp_mib.tcpOutSackRetransSegs; 25103 tcpkp->connTableSize6.value.i32 = tcp_mib.tcp6ConnTableSize; 25104 25105 return (0); 25106 } 25107 25108 void 25109 tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) 25110 { 25111 uint16_t hdr_len; 25112 ipha_t *ipha; 25113 uint8_t *nexthdrp; 25114 tcph_t *tcph; 25115 25116 /* Already has an eager */ 25117 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 25118 TCP_STAT(tcp_reinput_syn); 25119 squeue_enter(connp->conn_sqp, mp, connp->conn_recv, 25120 connp, SQTAG_TCP_REINPUT_EAGER); 25121 return; 25122 } 25123 25124 switch (IPH_HDR_VERSION(mp->b_rptr)) { 25125 case IPV4_VERSION: 25126 ipha = (ipha_t *)mp->b_rptr; 25127 hdr_len = IPH_HDR_LENGTH(ipha); 25128 break; 25129 case IPV6_VERSION: 25130 if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, 25131 &hdr_len, &nexthdrp)) { 25132 CONN_DEC_REF(connp); 25133 freemsg(mp); 25134 return; 25135 } 25136 break; 25137 } 25138 25139 tcph = (tcph_t *)&mp->b_rptr[hdr_len]; 25140 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 25141 mp->b_datap->db_struioflag |= STRUIO_EAGER; 25142 DB_CKSUMSTART(mp) = (intptr_t)sqp; 25143 } 25144 25145 squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp, 25146 SQTAG_TCP_REINPUT); 25147 } 25148 25149 static squeue_func_t 25150 tcp_squeue_switch(int val) 25151 { 25152 squeue_func_t rval = squeue_fill; 25153 25154 switch (val) { 25155 case 1: 25156 rval = squeue_enter_nodrain; 25157 break; 25158 case 2: 25159 rval = squeue_enter; 25160 break; 25161 default: 25162 break; 25163 } 25164 return (rval); 25165 } 25166 25167 static void 25168 tcp_squeue_add(squeue_t *sqp) 25169 { 25170 tcp_squeue_priv_t *tcp_time_wait = kmem_zalloc( 25171 sizeof (tcp_squeue_priv_t), KM_SLEEP); 25172 25173 *squeue_getprivate(sqp, 
SQPRIVATE_TCP) = (intptr_t)tcp_time_wait; 25174 tcp_time_wait->tcp_time_wait_tid = timeout(tcp_time_wait_collector, 25175 sqp, TCP_TIME_WAIT_DELAY); 25176 if (tcp_free_list_max_cnt == 0) { 25177 int tcp_ncpus = ((boot_max_ncpus == -1) ? 25178 max_ncpus : boot_max_ncpus); 25179 25180 /* 25181 * Limit number of entries to 1% of available memory / tcp_ncpus 25182 */ 25183 tcp_free_list_max_cnt = (freemem * PAGESIZE) / 25184 (tcp_ncpus * sizeof (tcp_t) * 100); 25185 } 25186 tcp_time_wait->tcp_free_list_cnt = 0; 25187 } 25188
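/*
 * Illustrative sizing for the tcp_free_list_max_cnt default computed in
 * tcp_squeue_add() above (the numbers are hypothetical, not from this file):
 * with 4 GB of free memory, 8 CPUs and sizeof (tcp_t) on the order of 4 KB,
 *
 *	tcp_free_list_max_cnt = (freemem * PAGESIZE) /
 *	    (tcp_ncpus * sizeof (tcp_t) * 100)
 *	                      ~= 4G / (8 * 4K * 100) ~= 1300
 *
 * i.e. each squeue may cache on the order of a thousand tcp_t's, keeping
 * the aggregate cache at roughly 1% of memory.
 */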