/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/* Copyright (c) 1990 Mentat Inc. */

#pragma ident "%Z%%M% %I% %E% SMI"
const char tcp_version[] = "%Z%%M% %I% %E% SMI";


#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/ethernet.h>
#include <sys/cpuvar.h>
#include <sys/dlpi.h>
#include <sys/multidata.h>
#include <sys/multidata_impl.h>
#include <sys/pattr.h>
#include <sys/policy.h>
#include <sys/priv.h>
#include <sys/zone.h>

#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/isa_defs.h>
#include <sys/md5.h>
#include <sys/random.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <net/if.h>
#include <net/route.h>
#include <inet/ipsec_impl.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/ipdrop.h>
#include <inet/tcp_trace.h>

#include <inet/ipclassifier.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_if.h>
#include <inet/ipp_common.h>
#include <sys/squeue.h>
#include <inet/kssl/ksslapi.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
#include <sys/sdt.h>
#include <rpc/pmap_prot.h>

/*
 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
 *
 * (Read the detailed design doc in the PSARC case directory.)
 *
 * The entire tcp state is contained in the tcp_t and conn_t structures,
 * which are allocated in tandem using ipcl_conn_create() and passing
 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
 * the references on the tcp_t.
 * The tcp_t structure is never compressed and packets always land on the
 * correct TCP perimeter from the time the eager is created till the time
 * the tcp_t dies (as such the old mentat TCP global queue is not used for
 * the detached state and no IPsec checking is required). The global queue
 * is still allocated to send out resets for connections which have no
 * listeners, and IP directly calls tcp_xmit_listeners_reset(), which does
 * any policy check.
 *
 * Protection and Synchronisation mechanism:
 *
 * The tcp data structure does not use any kind of lock for protecting
 * its state but instead uses 'squeues' for mutual exclusion from various
 * read and write side threads. To access a tcp member, the thread should
 * always be behind a squeue (via squeue_enter, squeue_enter_nodrain, or
 * squeue_fill). Since the squeues allow a direct function call, the caller
 * can pass any tcp function having the prototype of edesc_t as an argument
 * (different from the traditional STREAMS model where packets come in only
 * through designated entry points). The list of functions that can be
 * directly called via squeue are listed before the usual function
 * prototypes.
 *
 * Referencing:
 *
 * TCP is MT-Hot and we use a reference based scheme to make sure that the
 * tcp structure doesn't disappear while it is needed. When the application
 * creates an outgoing connection or accepts an incoming connection, we
 * start out with 2 references on 'conn_ref': one for TCP and one for IP.
 * The IP reference is just a symbolic reference since ip_tcpclose()
 * looks at the tcp structure after tcp_close_output() returns, which could
 * have dropped the last TCP reference. So as long as the connection is
 * in the attached state, i.e. !TCP_IS_DETACHED, we have 2 references on the
 * conn_t. The classifier puts its own reference when the connection is
 * inserted in the listen or connected hash. Anytime a thread needs to enter
 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
 * on the write side or by doing a classify on the read side and then puts
 * a reference on the conn before doing squeue_enter/tryenter/fill. On the
 * read side, the classifier itself puts the reference under the fanout lock
 * to make sure that the tcp can't disappear before it gets processed. The
 * squeue will drop this reference automatically so the called function
 * doesn't have to do a DEC_REF.
 *
 * Opening a new connection:
 *
 * The outgoing connection open is pretty simple. ip_tcpopen() does the
 * work of creating the conn/tcp structure and initializing it. The
 * squeue assignment is done based on the CPU the application
 * is running on. So for outbound connections, processing is always done
 * on the application CPU, which might be different from the incoming CPU
 * being interrupted by the NIC. An optimal way would be to figure out
 * the NIC <-> CPU binding at listen time, and assign the outgoing
 * connection to the squeue attached to the CPU that will be interrupted
 * for incoming packets (we know the NIC based on the bind IP address).
 * This might seem like a problem if more data is going out, but in most
 * cases the transmit is ACK driven and the outgoing data normally sits
 * on TCP's xmit queue waiting to be transmitted.
 *
 * Accepting a connection:
 *
 * This is a more interesting case because of the various races involved in
 * establishing an eager in its own perimeter. Read the meta comment on
 * top of tcp_conn_request(). But briefly, the squeue is picked by
 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
 *
 * Closing a connection:
 *
 * The close is fairly straightforward. tcp_close() calls tcp_close_output()
 * via squeue to do the close and mark the tcp as detached if the connection
 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps
 * its reference but tcp_close() always drops IP's reference. So if the tcp
 * was not killed, it is sitting in the time_wait list with 2 references - 1
 * for TCP and 1 because it is in the classifier's connected hash. This is
 * the condition we use to determine that it is OK to clean up the tcp
 * outside of the squeue when the time wait expires (check the ref under the
 * fanout and conn_lock and if it is 2, remove it from the fanout hash and
 * kill it).
 *
 * Although close just drops the necessary references and marks the
 * tcp_detached state, tcp_close needs to know that tcp_detached has been
 * set (under squeue) before letting the STREAM go away (because an
 * inbound packet might attempt to go up the STREAM while the close
 * has happened and tcp_detached is not set). So a special lock and
 * flag is used along with a condition variable (tcp_closelock, tcp_closed,
 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
 * tcp_detached.
 *
 * Special provisions and fast paths:
 *
 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
 * can't have 'ipv6_recvpktinfo' set, and for these types of sockets, IP
 * will never send an M_CTL to TCP. As such, ip_tcp_input(), which handles
 * all TCP packets from the wire, makes an IPCL_IS_TCP4_CONNECTED_NO_POLICY
 * check to send packets directly to tcp_rput_data via squeue. Everyone
 * else comes through tcp_input() on the read side.
 *
 * We also make special provisions for sockfs by marking tcp_issocket
 * whenever we have only sockfs on top of TCP. This allows us to skip
 * putting the tcp in the acceptor hash since a sockfs listener can never
 * become an acceptor, and also avoids allocating a tcp_t for the acceptor
 * STREAM since the eager has already been allocated and the accept now
 * happens on the acceptor STREAM. There is a big blob of comment on top of
 * tcp_conn_request explaining the new accept. When the socket is POP'd,
 * sockfs sends us an ioctl to mark the fact and we go back to the old
 * behaviour. Once tcp_issocket is unset, it is never set again for the
 * life of that connection.
 *
 * IPsec notes:
 *
 * Since a packet is always executed on the correct TCP perimeter,
 * all IPsec processing is deferred to IP, including checking new
 * connections and setting IPsec policies for new connections. The
 * only exception is tcp_xmit_listeners_reset(), which is called
 * directly from IP and needs to do a policy check to see if TH_RST
 * can be sent out.
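 *
 * As an illustrative sketch of the squeue discipline described above (not
 * taken verbatim from this file; check <sys/squeue.h> for the exact
 * squeue_enter() argument list), a caller that wants to run a TCP function
 * inside the connection's perimeter does roughly:
 *
 *	CONN_INC_REF(connp);
 *	squeue_enter(connp->conn_sqp, mp, tcp_input, connp, tag);
 *
 * where tcp_input (or any other function with an edesc_t prototype) is
 * invoked by the squeue, which then drops the reference on behalf of the
 * caller.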
 */

extern major_t TCP6_MAJ;

/*
 * Values for squeue switch:
 *	1: squeue_enter_nodrain
 *	2: squeue_enter
 *	3: squeue_fill
 */
int tcp_squeue_close = 2;
int tcp_squeue_wput = 2;

squeue_func_t tcp_squeue_close_proc;
squeue_func_t tcp_squeue_wput_proc;

/*
 * This controls how tiny a write must be before we try to copy it
 * into the mblk on the tail of the transmit queue. Not much
 * speedup is observed for values larger than sixteen. Zero will
 * disable the optimisation.
 */
int tcp_tx_pull_len = 16;

/*
 * TCP Statistics.
 *
 * How TCP statistics work.
 *
 * There are two types of statistics invoked by two macros.
 *
 * TCP_STAT(name) does a non-atomic increment of a named stat counter. It is
 * supposed to be used in non MT-hot paths of the code.
 *
 * TCP_DBGSTAT(name) does an atomic increment of a named stat counter. It is
 * supposed to be used for DEBUG purposes and may be used on a hot path.
 *
 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
 * (use "kstat tcp" to get them).
 *
 * There is also an additional debugging facility that marks tcp_clean_death()
 * instances and saves them in the tcp_t structure. It is triggered by the
 * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
 * tcp_clean_death() calls that counts the number of times each tag was hit.
 * It is triggered by the TCP_CLD_COUNTERS define.
 *
 * How to add new counters.
 *
 * 1) Add a field in the tcp_stat structure describing your counter.
 * 2) Add a line in tcp_statistics with the name of the counter.
 *
 *    IMPORTANT!! - make sure that both are in sync !!
 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
 *
 * Please avoid using private counters which are not kstat-exported.
 *
 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
 * in the tcp_t structure.
 *
 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
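 *
 * For example (illustrative only; tcp_example_cnt is a hypothetical name,
 * not an existing counter), adding a new counter would look like:
 *
 *	1) in the tcp_stat_t structure:	kstat_named_t tcp_example_cnt;
 *	2) in tcp_statistics below:	{ "tcp_example_cnt", KSTAT_DATA_UINT64 },
 *	3) at the point of interest:	TCP_STAT(tcp_example_cnt);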
 */

#ifndef TCP_DEBUG_COUNTER
#ifdef DEBUG
#define TCP_DEBUG_COUNTER 1
#else
#define TCP_DEBUG_COUNTER 0
#endif
#endif

#define TCP_CLD_COUNTERS 0

#define TCP_TAG_CLEAN_DEATH 1
#define TCP_MAX_CLEAN_DEATH_TAG 32

#ifdef lint
static int _lint_dummy_;
#endif

#if TCP_CLD_COUNTERS
static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
#define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
#elif defined(lint)
#define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define TCP_CLD_STAT(x)
#endif

#if TCP_DEBUG_COUNTER
#define TCP_DBGSTAT(x) atomic_add_64(&(tcp_statistics.x.value.ui64), 1)
#elif defined(lint)
#define TCP_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define TCP_DBGSTAT(x)
#endif

tcp_stat_t tcp_statistics = {
	{ "tcp_time_wait", KSTAT_DATA_UINT64 },
	{ "tcp_time_wait_syn", KSTAT_DATA_UINT64 },
	{ "tcp_time_wait_success", KSTAT_DATA_UINT64 },
	{ "tcp_time_wait_fail", KSTAT_DATA_UINT64 },
	{ "tcp_reinput_syn", KSTAT_DATA_UINT64 },
	{ "tcp_ip_output", KSTAT_DATA_UINT64 },
	{ "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 },
	{ "tcp_detach_time_wait", KSTAT_DATA_UINT64 },
	{ "tcp_time_wait_reap", KSTAT_DATA_UINT64 },
	{ "tcp_clean_death_nondetached", KSTAT_DATA_UINT64 },
	{ "tcp_reinit_calls", KSTAT_DATA_UINT64 },
	{ "tcp_eager_err1", KSTAT_DATA_UINT64 },
	{ "tcp_eager_err2", KSTAT_DATA_UINT64 },
	{ "tcp_eager_blowoff_calls", KSTAT_DATA_UINT64 },
	{ "tcp_eager_blowoff_q", KSTAT_DATA_UINT64 },
	{ "tcp_eager_blowoff_q0", KSTAT_DATA_UINT64 },
	{ "tcp_not_hard_bound", KSTAT_DATA_UINT64 },
	{ "tcp_no_listener", KSTAT_DATA_UINT64 },
	{ "tcp_found_eager", KSTAT_DATA_UINT64 },
	{ "tcp_wrong_queue", KSTAT_DATA_UINT64 },
	{ "tcp_found_eager_binding1", KSTAT_DATA_UINT64 },
	{ "tcp_found_eager_bound1", KSTAT_DATA_UINT64 },
	{ "tcp_eager_has_listener1", KSTAT_DATA_UINT64 },
	{ "tcp_open_alloc", KSTAT_DATA_UINT64 },
	{ "tcp_open_detached_alloc", KSTAT_DATA_UINT64 },
	{ "tcp_rput_time_wait", KSTAT_DATA_UINT64 },
	{ "tcp_listendrop", KSTAT_DATA_UINT64 },
	{ "tcp_listendropq0", KSTAT_DATA_UINT64 },
	{ "tcp_wrong_rq", KSTAT_DATA_UINT64 },
	{ "tcp_rsrv_calls", KSTAT_DATA_UINT64 },
	{ "tcp_eagerfree2", KSTAT_DATA_UINT64 },
	{ "tcp_eagerfree3", KSTAT_DATA_UINT64 },
	{ "tcp_eagerfree4", KSTAT_DATA_UINT64 },
	{ "tcp_eagerfree5", KSTAT_DATA_UINT64 },
	{ "tcp_timewait_syn_fail", KSTAT_DATA_UINT64 },
	{ "tcp_listen_badflags", KSTAT_DATA_UINT64 },
	{ "tcp_timeout_calls", KSTAT_DATA_UINT64 },
	{ "tcp_timeout_cached_alloc", KSTAT_DATA_UINT64 },
	{ "tcp_timeout_cancel_reqs", KSTAT_DATA_UINT64 },
	{ "tcp_timeout_canceled", KSTAT_DATA_UINT64 },
	{ "tcp_timermp_alloced", KSTAT_DATA_UINT64 },
	{ "tcp_timermp_freed", KSTAT_DATA_UINT64 },
	{ "tcp_timermp_allocfail", KSTAT_DATA_UINT64 },
	{ "tcp_timermp_allocdblfail", KSTAT_DATA_UINT64 },
	{ "tcp_push_timer_cnt", KSTAT_DATA_UINT64 },
	{ "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 },
	{ "tcp_ire_null1", KSTAT_DATA_UINT64 },
	{ "tcp_ire_null", KSTAT_DATA_UINT64 },
	{ "tcp_ip_send", KSTAT_DATA_UINT64 },
	{ "tcp_ip_ire_send", KSTAT_DATA_UINT64 },
	{ "tcp_wsrv_called", KSTAT_DATA_UINT64 },
	{ "tcp_flwctl_on", KSTAT_DATA_UINT64 },
	{ "tcp_timer_fire_early", KSTAT_DATA_UINT64 },
	{ "tcp_timer_fire_miss", KSTAT_DATA_UINT64 },
	{ "tcp_freelist_cleanup", KSTAT_DATA_UINT64 },
	{
"tcp_rput_v6_error", KSTAT_DATA_UINT64 }, 378 { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, 379 { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, 380 { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, 381 { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, 382 { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, 383 { "tcp_zcopy_disable", KSTAT_DATA_UINT64 }, 384 { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 }, 385 { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 }, 386 { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 }, 387 { "tcp_mdt_discarded", KSTAT_DATA_UINT64 }, 388 { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 }, 389 { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 }, 390 { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 }, 391 { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 }, 392 { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 }, 393 { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 }, 394 { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 }, 395 { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 }, 396 { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 }, 397 { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 }, 398 { "tcp_mdt_allocd", KSTAT_DATA_UINT64 }, 399 { "tcp_mdt_linked", KSTAT_DATA_UINT64 }, 400 { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, 401 { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, 402 { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, 403 { "tcp_fusion_putnext", KSTAT_DATA_UINT64 }, 404 { "tcp_fusion_unfusable", KSTAT_DATA_UINT64 }, 405 { "tcp_fusion_aborted", KSTAT_DATA_UINT64 }, 406 { "tcp_fusion_unqualified", KSTAT_DATA_UINT64 }, 407 { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64 }, 408 { "tcp_fusion_rrw_msgcnt", KSTAT_DATA_UINT64 }, 409 { "tcp_fusion_rrw_plugged", KSTAT_DATA_UINT64 }, 410 { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 }, 411 { "tcp_sock_fallback", KSTAT_DATA_UINT64 }, 412 }; 413 414 static kstat_t *tcp_kstat; 415 416 /* 417 * Call either ip_output or ip_output_v6. This replaces putnext() calls on the 418 * tcp write side. 419 */ 420 #define CALL_IP_WPUT(connp, q, mp) { \ 421 ASSERT(((q)->q_flag & QREADR) == 0); \ 422 TCP_DBGSTAT(tcp_ip_output); \ 423 connp->conn_send(connp, (mp), (q), IP_WPUT); \ 424 } 425 426 /* Macros for timestamp comparisons */ 427 #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) 428 #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) 429 430 /* 431 * Parameters for TCP Initial Send Sequence number (ISS) generation. When 432 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated 433 * by adding three components: a time component which grows by 1 every 4096 434 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); 435 * a per-connection component which grows by 125000 for every new connection; 436 * and an "extra" component that grows by a random amount centered 437 * approximately on 64000. This causes the the ISS generator to cycle every 438 * 4.89 hours if no TCP connections are made, and faster if connections are 439 * made. 440 * 441 * When tcp_strong_iss is set to 0, ISS is calculated by adding two 442 * components: a time component which grows by 250000 every second; and 443 * a per-connection component which grows by 125000 for every new connections. 444 * 445 * A third method, when tcp_strong_iss is set to 2, for generating ISS is 446 * prescribed by Steve Bellovin. This involves adding time, the 125000 per 447 * connection, and a one-way hash (MD5) of the connection ID <sport, dport, 448 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered 449 * password. 
 */
#define ISS_INCR	250000
#define ISS_NSEC_SHT	12

static uint32_t tcp_iss_incr_extra;	/* Incremented for each connection */
static kmutex_t tcp_iss_key_lock;
static MD5_CTX tcp_iss_key;
static sin_t sin_null;		/* Zero address for quick clears */
static sin6_t sin6_null;	/* Zero address for quick clears */

/* Packet dropper for TCP IPsec policy drops. */
static ipdropper_t tcp_dropper;

/*
 * This implementation follows the 4.3BSD interpretation of the urgent
 * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
 * incompatible changes in protocols like telnet and rlogin.
 */
#define TCP_OLD_URP_INTERPRETATION	1

#define TCP_IS_DETACHED_NONEAGER(tcp)	\
	(TCP_IS_DETACHED(tcp) &&	\
	    (!(tcp)->tcp_hard_binding))

/*
 * TCP reassembly macros. We hide starting and ending sequence numbers in
 * b_next and b_prev of messages on the reassembly queue. The messages are
 * chained using b_cont. These macros are used in tcp_reass() so we don't
 * have to see the ugly casts and assignments.
 */
#define TCP_REASS_SEQ(mp)		((uint32_t)(uintptr_t)((mp)->b_next))
#define TCP_REASS_SET_SEQ(mp, u)	((mp)->b_next = \
					(mblk_t *)(uintptr_t)(u))
#define TCP_REASS_END(mp)		((uint32_t)(uintptr_t)((mp)->b_prev))
#define TCP_REASS_SET_END(mp, u)	((mp)->b_prev = \
					(mblk_t *)(uintptr_t)(u))

/*
 * Implementation of TCP Timers.
 * =============================
 *
 * INTERFACE:
 *
 * There are two basic functions dealing with tcp timers:
 *
 *	timeout_id_t	tcp_timeout(connp, func, time)
 *	clock_t		tcp_timeout_cancel(connp, timeout_id)
 *	TCP_TIMER_RESTART(tcp, intvl)
 *
 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call
 * 'func' after 'time' ticks have passed. The function called by timeout()
 * must adhere to the same restrictions as a driver soft interrupt handler -
 * it must not sleep or call other functions that might sleep. The value
 * returned is the opaque non-zero timeout identifier that can be passed to
 * tcp_timeout_cancel() to cancel the request. The call to tcp_timeout() may
 * fail, in which case it returns zero. This is different from the timeout(9F)
 * function which never fails.
 *
 * The call-back function 'func' always receives 'connp' as its single
 * argument. It is always executed in the squeue corresponding to the tcp
 * structure. The tcp structure is guaranteed to be present at the time the
 * call-back is called.
 *
 * NOTE: The call-back function 'func' is never called if tcp is in
 * the TCPS_CLOSED state.
 *
 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
 * request. Locks acquired by the call-back routine should not be held across
 * the call to tcp_timeout_cancel() or a deadlock may result.
 *
 * tcp_timeout_cancel() returns -1 if it cannot cancel the timeout request.
 * Otherwise, it returns an integer value greater than or equal to 0. In
 * particular, if the call-back function is already placed on the squeue, it
 * cannot be canceled.
 *
 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
 *	within squeue context corresponding to the tcp instance. Since the
 *	call-back is also called via the same squeue, there are no race
 *	conditions of the kind described in the untimeout(9F) manual page,
 *	since all calls are strictly serialized.
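 *
 *	A minimal usage sketch (illustrative only; tcp_example_handler is a
 *	hypothetical call-back, not a function in this file):
 *
 *		tid = tcp_timeout(connp, tcp_example_handler,
 *		    MSEC_TO_TICK(100));
 *		...
 *		if (tid != 0)
 *			(void) tcp_timeout_cancel(connp, tid);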
 *
 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
 * stored in tcp_timer_tid and starts a new one using
 * MSEC_TO_TICK(intvl). It always uses the tcp_timer() function as a
 * call-back and stores the return value of tcp_timeout() in the
 * tcp->tcp_timer_tid field.
 *
 * NOTE: since the timeout cancellation is not guaranteed, the cancelled
 *	call-back may still be called, so it is possible tcp_timer() will be
 *	called several times. This should not be a problem since tcp_timer()
 *	should always check the tcp instance state.
 *
 *
 * IMPLEMENTATION:
 *
 * TCP timers are implemented using a three-stage process. The call to
 * tcp_timeout() uses the timeout(9F) function to call tcp_timer_callback()
 * when the timer expires. The tcp_timer_callback() arranges the call of the
 * tcp_timer_handler() function via the squeue corresponding to the tcp
 * instance. The tcp_timer_handler() calls the actual requested timeout
 * call-back and passes the tcp instance as an argument to it. Information is
 * passed between stages using the tcp_timer_t structure which contains the
 * connp pointer, the tcp call-back to call and the timeout id returned by
 * timeout(9F).
 *
 * The tcp_timer_t structure is not used directly, it is embedded in an
 * mblk_t-like structure that is used to enter a squeue. The mp->b_rptr of
 * this pseudo mblk points to the beginning of the tcp_timer_t structure.
 * tcp_timeout() returns a pointer to this mblk.
 *
 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
 * looks like a normal mblk without an actual dblk attached to it.
 *
 * To optimize performance each tcp instance holds a small cache of timer
 * mblocks. In the current implementation it caches up to two timer mblocks
 * per tcp instance. The cache is preserved over tcp frees and is only freed
 * when the whole tcp structure is destroyed by its kmem destructor. Since all
 * tcp timer processing happens on a corresponding squeue, the cache
 * manipulation does not require any locks. Experiments show that the majority
 * of timer mblock allocations are satisfied from the tcp cache and do not
 * involve kmem calls.
 *
 * tcp_timeout() places a refhold on the connp instance which guarantees
 * that it will be present at the time the call-back function fires. The
 * tcp_timer_handler() drops the reference after calling the call-back, so the
 * call-back function does not need to manipulate the references explicitly.
 */

typedef struct tcp_timer_s {
	conn_t		*connp;
	void		(*tcpt_proc)(void *);
	timeout_id_t	tcpt_tid;
} tcp_timer_t;

static kmem_cache_t *tcp_timercache;
kmem_cache_t *tcp_sack_info_cache;
kmem_cache_t *tcp_iphc_cache;

/*
 * For scalability, we must not run a timer for every TCP connection
 * in TIME_WAIT state. To see why, consider (for time wait interval of
 * 4 minutes):
 *	1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
 *
 * This list is ordered by time, so you need only delete from the head
 * until you get to entries which aren't old enough to delete yet.
 * The list consists of only the detached TIME_WAIT connections.
 *
 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
 * becomes detached TIME_WAIT (either by changing the state and already
 * being detached or the other way around).
 * This means that the TIME_WAIT state can be extended (up to doubled) if the
 * connection doesn't become detached for a long time.
 *
 * The list manipulations (including tcp_time_wait_next/prev)
 * are protected by the tcp_time_wait_lock. The content of the
 * detached TIME_WAIT connections is protected by the normal perimeters.
 */

typedef struct tcp_squeue_priv_s {
	kmutex_t	tcp_time_wait_lock;
				/* Protects the next 3 globals */
	timeout_id_t	tcp_time_wait_tid;
	tcp_t		*tcp_time_wait_head;
	tcp_t		*tcp_time_wait_tail;
	tcp_t		*tcp_free_list;
	uint_t		tcp_free_list_cnt;
} tcp_squeue_priv_t;

/*
 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
 * Running it every 5 seconds seems to give the best results.
 */
#define TCP_TIME_WAIT_DELAY drv_usectohz(5000000)

/*
 * To prevent memory hogging, limit the number of entries in tcp_free_list
 * to 1% of available memory / number of cpus.
 */
uint_t tcp_free_list_max_cnt = 0;

#define TCP_XMIT_LOWATER	4096
#define TCP_XMIT_HIWATER	49152
#define TCP_RECV_LOWATER	2048
#define TCP_RECV_HIWATER	49152

/*
 * PAWS needs a timer for 24 days. This is the number of ticks in 24 days.
 */
#define PAWS_TIMEOUT	((clock_t)(24*24*60*60*hz))

#define TIDUSZ	4096	/* transport interface data unit size */

/*
 * Bind hash list size and hash function. The size has to be a power of 2
 * for hashing.
 */
#define TCP_BIND_FANOUT_SIZE	512
#define TCP_BIND_HASH(lport)	(ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
/*
 * Size of the listen and acceptor hash lists. It has to be a power of 2
 * for hashing.
 */
#define TCP_FANOUT_SIZE		256

#ifdef _ILP32
#define TCP_ACCEPTOR_HASH(accid)					\
		(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
#else
#define TCP_ACCEPTOR_HASH(accid)					\
		((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
#endif	/* _ILP32 */

#define IP_ADDR_CACHE_SIZE	2048
#define IP_ADDR_CACHE_HASH(faddr)					\
	(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))

/* Hash for HSPs uses all 32 bits, since both networks and hosts are in table */
#define TCP_HSP_HASH_SIZE 256

#define TCP_HSP_HASH(addr)					\
	(((addr>>24) ^ (addr >>16) ^			\
	    (addr>>8) ^ (addr)) % TCP_HSP_HASH_SIZE)

/*
 * TCP options struct returned from tcp_parse_options.
 */
typedef struct tcp_opt_s {
	uint32_t	tcp_opt_mss;
	uint32_t	tcp_opt_wscale;
	uint32_t	tcp_opt_ts_val;
	uint32_t	tcp_opt_ts_ecr;
	tcp_t		*tcp;
} tcp_opt_t;

/*
 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
 */

#ifdef _BIG_ENDIAN
#define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
	(TCPOPT_TSTAMP << 8) | 10)
#else
#define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
	(TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif

/*
 * Flags returned from tcp_parse_options.
 */
#define TCP_OPT_MSS_PRESENT	1
#define TCP_OPT_WSCALE_PRESENT	2
#define TCP_OPT_TSTAMP_PRESENT	4
#define TCP_OPT_SACK_OK_PRESENT	8
#define TCP_OPT_SACK_PRESENT	16

/* TCP option length */
#define TCPOPT_NOP_LEN		1
#define TCPOPT_MAXSEG_LEN	4
#define TCPOPT_WS_LEN		3
#define TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN+1)
#define TCPOPT_TSTAMP_LEN	10
#define TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN+2)
#define TCPOPT_SACK_OK_LEN	2
#define TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN+2)
#define TCPOPT_REAL_SACK_LEN	4
#define TCPOPT_MAX_SACK_LEN	36
#define TCPOPT_HEADER_LEN	2

/* TCP cwnd burst factor. */
#define TCP_CWND_INFINITE	65535
#define TCP_CWND_SS		3
#define TCP_CWND_NORMAL		5

/* Maximum TCP initial cwin (start/restart). */
#define TCP_MAX_INIT_CWND	8

/*
 * Initialize cwnd according to RFC 3390. def_max_init_cwnd is
 * either tcp_slow_start_initial or tcp_slow_start_after_idle
 * depending on the caller. If the upper layer has not used the
 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
 * If the upper layer has set tcp_init_cwnd, just use it to
 * calculate the tcp_cwnd.
 */
#define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
{									\
	if ((tcp)->tcp_init_cwnd == 0) {				\
		(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss),	\
		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
	} else {							\
		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
	}								\
	tcp->tcp_cwnd_cnt = 0;						\
}

/* TCP Timer control structure */
typedef struct tcpt_s {
	pfv_t	tcpt_pfv;	/* The routine we are to call */
	tcp_t	*tcpt_tcp;	/* The parameter we are to pass in */
} tcpt_t;

/* Host Specific Parameter structure */
typedef struct tcp_hsp {
	struct tcp_hsp	*tcp_hsp_next;
	in6_addr_t	tcp_hsp_addr_v6;
	in6_addr_t	tcp_hsp_subnet_v6;
	uint_t		tcp_hsp_vers;	/* IPV4_VERSION | IPV6_VERSION */
	int32_t		tcp_hsp_sendspace;
	int32_t		tcp_hsp_recvspace;
	int32_t		tcp_hsp_tstamp;
} tcp_hsp_t;
#define tcp_hsp_addr	V4_PART_OF_V6(tcp_hsp_addr_v6)
#define tcp_hsp_subnet	V4_PART_OF_V6(tcp_hsp_subnet_v6)

/*
 * Functions called directly via squeue having a prototype of edesc_t.
 */
void		tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
static void	tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
void		tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
static void	tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
static void	tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
void		tcp_input(void *arg, mblk_t *mp, void *arg2);
void		tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
static void	tcp_close_output(void *arg, mblk_t *mp, void *arg2);
void		tcp_output(void *arg, mblk_t *mp, void *arg2);
static void	tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
static void	tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);


/* Prototypes for TCP functions */
static void	tcp_random_init(void);
int		tcp_random(void);
static void	tcp_accept(tcp_t *tcp, mblk_t *mp);
static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
		    tcp_t *eager);
static int	tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    boolean_t user_specified);
static void	tcp_closei_local(tcp_t *tcp);
static void	tcp_close_detached(tcp_t *tcp);
static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
		    mblk_t *idmp, mblk_t **defermp);
static void	tcp_connect(tcp_t *tcp, mblk_t *mp);
static void	tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
		    in_port_t dstport, uint_t srcid);
static void	tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
		    in_port_t dstport, uint32_t flowinfo, uint_t srcid,
		    uint32_t scope_id);
static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
static void	tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
static char	*tcp_display(tcp_t *tcp, char *, char);
static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
static void	tcp_eager_unlink(tcp_t *tcp);
static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
		    int unixerr);
static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
		    int tlierr, int unixerr);
static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *cr);
static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *cr);
static int	tcp_tpistate(tcp_t *tcp);
static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    int caller_holds_lock);
static void	tcp_bind_hash_remove(tcp_t *tcp);
static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id);
void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
static void	tcp_acceptor_hash_remove(tcp_t *tcp);
static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
static int	tcp_header_init_ipv4(tcp_t *tcp);
static int	tcp_header_init_ipv6(tcp_t *tcp);
int		tcp_init(tcp_t *tcp, queue_t *q);
static int	tcp_init_values(tcp_t *tcp);
static mblk_t	*tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
static mblk_t	*tcp_ip_bind_mp(tcp_t *tcp,
		    t_scalar_t bind_prim, t_scalar_t addr_length);
static void	tcp_ip_ire_mark_advice(tcp_t *tcp);
static void	tcp_ip_notify(tcp_t *tcp);
static mblk_t	*tcp_ire_mp(mblk_t *mp);
static void	tcp_iss_init(tcp_t *tcp);
static void	tcp_keepalive_killer(void *arg);
static int	tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
static void	tcp_mss_set(tcp_t *tcp, uint32_t size);
static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
static boolean_t tcp_allow_connopt_set(int level, int name);
int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
int		tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
int		tcp_opt_set(queue_t *q, uint_t optset_context, int level,
		    int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
		    uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
		    mblk_t *mblk);
static void	tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
static int	tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
		    uchar_t *ptr, uint_t len);
static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t tcp_param_register(tcpparam_t *tcppa, int cnt);
static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static void	tcp_iss_key_init(uint8_t *phrase, int len);
static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
static void	tcp_reinit(tcp_t *tcp);
static void	tcp_reinit_values(tcp_t *tcp);
static void	tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
		    tcp_t *thisstream, cred_t *cr);

static uint_t	tcp_rcv_drain(queue_t *q, tcp_t *tcp);
static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(void);
static void	tcp_ss_rexmit(tcp_t *tcp);
static mblk_t	*tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
static void	tcp_process_options(tcp_t *, tcph_t *);
static void	tcp_rput_common(tcp_t *tcp, mblk_t *mp);
static void	tcp_rsrv(queue_t *q);
static int	tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
static int	tcp_snmp_state(tcp_t *tcp);
static int	tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_host_param_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static void	tcp_timer(void *arg);
static void	tcp_timer_callback(void *);
static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
    boolean_t random);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
static void
		tcp_wput_sock(queue_t *q, mblk_t *mp);
void		tcp_wput_accept(queue_t *q, mblk_t *mp);
static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
static int	tcp_send(queue_t *q, tcp_t *tcp, const int mss,
		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
		    const int num_sack_blk, int *usable, uint_t *snxt,
		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
		    const int mdt_thres);
static int	tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
		    const int num_sack_blk, int *usable, uint_t *snxt,
		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
		    const int mdt_thres);
static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
		    int num_sack_blk);
static void	tcp_wsrv(queue_t *q);
static int	tcp_xmit_end(tcp_t *tcp);
static mblk_t	*tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send,
		    int32_t *offset, mblk_t **end_mp, uint32_t seq,
		    boolean_t sendall, uint32_t *seg_len, boolean_t rexmit);
static void	tcp_ack_timer(void *arg);
static mblk_t	*tcp_ack_mp(tcp_t *tcp);
static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
		    uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len);
static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
		    uint32_t ack, int ctl);
static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr);
static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr);
static int	setmaxps(queue_t *q, int maxpsz);
static void	tcp_set_rto(tcp_t *, time_t);
static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
		    boolean_t, boolean_t);
static void	tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
		    boolean_t ipsec_mctl);
static mblk_t	*tcp_setsockopt_mp(int level, int cmd,
		    char *opt, int optlen);
static int	tcp_build_hdrs(queue_t *, tcp_t *);
static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
		    uint32_t seg_seq, uint32_t seg_ack, int seg_len,
		    tcph_t *tcph);
boolean_t	tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
boolean_t	tcp_reserved_port_add(int, in_port_t *, in_port_t *);
boolean_t	tcp_reserved_port_del(in_port_t, in_port_t);
boolean_t	tcp_reserved_port_check(in_port_t);
static tcp_t	*tcp_alloc_temp_tcp(in_port_t);
static int	tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
static mblk_t	*tcp_mdt_info_mp(mblk_t *);
static void	tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
static int	tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
		    const boolean_t, const uint32_t, const uint32_t,
		    const uint32_t, const uint32_t);
static void	tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
		    const uint_t, const uint_t, boolean_t *);
static void	tcp_send_data(tcp_t *, queue_t *, mblk_t *);
extern mblk_t	*tcp_timermp_alloc(int);
extern void	tcp_timermp_free(tcp_t *);
static void	tcp_timer_free(tcp_t *tcp, mblk_t *mp);
static void	tcp_stop_lingering(tcp_t *tcp);
static void	tcp_close_linger_timeout(void *arg);
void		tcp_ddi_init(void);
void		tcp_ddi_destroy(void);
static void	tcp_kstat_init(void);
static void	tcp_kstat_fini(void);
static int	tcp_kstat_update(kstat_t *kp, int rw);
void		tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
static int	tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
		    tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
static int	tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
		    tcph_t *tcph, mblk_t *idmp);
static squeue_func_t tcp_squeue_switch(int);

static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *);
static int	tcp_close(queue_t *, int);
static int	tcpclose_accept(queue_t *);
static int	tcp_modclose(queue_t *);
static void	tcp_wput_mod(queue_t *, mblk_t *);

static void	tcp_squeue_add(squeue_t *);
static boolean_t tcp_zcopy_check(tcp_t *);
static void	tcp_zcopy_notify(tcp_t *);
static mblk_t	*tcp_zcopy_disable(tcp_t *, mblk_t *);
static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
static void	tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);

extern void	tcp_kssl_input(tcp_t *, mblk_t *);

/*
 * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
 *
 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
 * (defined in tcp.h) needs to be filled in and passed into the kernel
 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
 * structure contains the four-tuple of a TCP connection and a range of TCP
 * states (specified by ac_start and ac_end). The use of wildcard addresses
 * and ports is allowed. Connections with a matching four-tuple and a state
 * within the specified range will be aborted. The valid states for the
 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to
 * TCPS_TIME_WAIT, inclusive. (An illustrative user-level sketch follows the
 * module entry points below.)
 *
 * An application which has its connection aborted by this ioctl will receive
 * an error that is dependent on the connection state at the time of the
 * abort. If the connection state is < TCPS_TIME_WAIT, an application should
 * behave as though a RST packet has been received. If the connection state
 * is equal to TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled
 * by the kernel and all resources associated with the connection will be
 * freed.
 */
static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
static void	tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *);
static void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
    boolean_t);

static struct module_info tcp_rinfo =  {
	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
};

static struct module_info tcp_winfo =  {
	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
};

/*
 * Entry points for TCP as a module. It only allows SNMP requests
 * to pass through.
 */
struct qinit tcp_mod_rinit = {
	(pfi_t)putnext, NULL, tcp_open, ip_snmpmod_close, NULL, &tcp_rinfo,
};

struct qinit tcp_mod_winit = {
	(pfi_t)ip_snmpmod_wput, NULL, tcp_open, ip_snmpmod_close, NULL,
	&tcp_rinfo
};
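
/*
 * Illustrative (hypothetical) user-level usage of TCP_IOC_ABORT_CONN as
 * described above: 'fd' is assumed to be a descriptor opened on /dev/tcp,
 * and the address field names of tcp_ioc_abort_conn_t should be checked
 * against tcp.h; only ac_start and ac_end are named in the comment above.
 *
 *	tcp_ioc_abort_conn_t ac;
 *	struct strioctl si;
 *
 *	(void) memset(&ac, 0, sizeof (ac));
 *	... fill in the local/remote four-tuple (wildcards allowed) ...
 *	ac.ac_start = TCPS_SYN_SENT;
 *	ac.ac_end = TCPS_TIME_WAIT;
 *
 *	si.ic_cmd = TCP_IOC_ABORT_CONN;
 *	si.ic_timout = -1;
 *	si.ic_len = sizeof (ac);
 *	si.ic_dp = (char *)&ac;
 *	if (ioctl(fd, I_STR, &si) < 0)
 *		perror("TCP_IOC_ABORT_CONN");
 */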
/*
 * Entry points for TCP as a device. This is the normal case, which
 * supports the full TCP functionality.
 */
struct qinit tcp_rinit = {
	NULL, (pfi_t)tcp_rsrv, tcp_open, tcp_close, NULL, &tcp_rinfo
};

struct qinit tcp_winit = {
	(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};

/* Initial entry point for TCP in socket mode. */
struct qinit tcp_sock_winit = {
	(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};

/*
 * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
 * an accept. Avoid allocating data structures since the eager has already
 * been created.
 */
struct qinit tcp_acceptor_rinit = {
	NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo
};

struct qinit tcp_acceptor_winit = {
	(pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
};

/*
 * Entry points for TCP loopback (read side only)
 */
struct qinit tcp_loopback_rinit = {
	(pfi_t)0, (pfi_t)tcp_rsrv, tcp_open, tcp_close, (pfi_t)0,
	&tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
};

struct streamtab tcpinfo = {
	&tcp_rinit, &tcp_winit
};

extern squeue_func_t tcp_squeue_wput_proc;
extern squeue_func_t tcp_squeue_timer_proc;

/* Protected by tcp_g_q_lock */
static queue_t *tcp_g_q;	/* Default queue used during detached closes */
kmutex_t tcp_g_q_lock;

/* Protected by tcp_hsp_lock */
/*
 * XXX The host param mechanism should go away and instead we should use
 * the metrics associated with the routes to determine the default sndspace
 * and rcvspace.
 */
static tcp_hsp_t **tcp_hsp_hash;	/* Hash table for HSPs */
krwlock_t tcp_hsp_lock;

/*
 * Extra privileged ports. In host byte order.
 * Protected by tcp_epriv_port_lock.
 */
#define TCP_NUM_EPRIV_PORTS	64
static int tcp_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS;
static uint16_t tcp_g_epriv_ports[TCP_NUM_EPRIV_PORTS] = { 2049, 4045 };
kmutex_t tcp_epriv_port_lock;

/*
 * The smallest anonymous port in the privileged port range from which TCP
 * looks for a free port. Used with the option TCP_ANONPRIVBIND.
 */
static in_port_t tcp_min_anonpriv_port = 512;

/* Only modified during _init and _fini thus no locking is needed. */
static caddr_t	tcp_g_nd;	/* Head of 'named dispatch' variable list */

/* Hint not protected by any lock */
static uint_t	tcp_next_port_to_try;


/* TCP bind hash list - all tcp_t with state >= BOUND. */
tf_t	tcp_bind_fanout[TCP_BIND_FANOUT_SIZE];

/* TCP queue hash list - all tcp_t in case they will be an acceptor. */
static tf_t	tcp_acceptor_fanout[TCP_FANOUT_SIZE];

/*
 * TCP has a private interface for other kernel modules to reserve a
 * port range for them to use. Once reserved, TCP will not use any ports
 * in the range. This interface relies on the TCP_EXCLBIND feature. If
 * the semantics of TCP_EXCLBIND is changed, implementation of this interface
 * has to be verified.
 *
 * There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges. Each port
 * range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports. A port
 * range is [port a, port b] inclusive. And each port range is between
 * TCP_SMALLEST_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive.
 *
 * Note that the default anonymous port range starts from 32768.
 * There is no port "collision" between that and the reserved port range. If
 * there is a port collision (because the default smallest anonymous port is
 * lowered or some apps specifically bind to ports in the reserved port
 * range), the system may not be able to reserve a port range even if there
 * are enough unbound ports, since a reserved port range must consist of
 * consecutive ports.
 */
#define TCP_RESERVED_PORTS_ARRAY_MAX_SIZE	5
#define TCP_RESERVED_PORTS_RANGE_MAX		1000
#define TCP_SMALLEST_RESERVED_PORT		10240
#define TCP_LARGEST_RESERVED_PORT		20480

/* Structure to represent those reserved port ranges. */
typedef struct tcp_rport_s {
	in_port_t	lo_port;
	in_port_t	hi_port;
	tcp_t		**temp_tcp_array;
} tcp_rport_t;

/* The reserved port array. */
static tcp_rport_t tcp_reserved_port[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE];

/* Locks to protect the tcp_reserved_ports array. */
static krwlock_t tcp_reserved_port_lock;

/* The number of ranges in the array. */
uint32_t tcp_reserved_port_array_size = 0;

/*
 * MIB-2 stuff for SNMP
 * Note: tcpInErrs {tcp 15} is accumulated in ip.c
 */
mib2_tcp_t	tcp_mib;	/* SNMP fixed size info */
kstat_t		*tcp_mibkp;	/* kstat exporting tcp_mib data */

boolean_t tcp_icmp_source_quench = B_FALSE;
/*
 * The following assumes that TPI alignment requirements stay on 32 bit
 * boundaries.
 */
#define ROUNDUP32(x) \
	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))

/* Template for response to info request. */
static struct T_info_ack tcp_g_t_info_ack = {
	T_INFO_ACK,		/* PRIM_type */
	0,			/* TSDU_size */
	T_INFINITE,		/* ETSDU_size */
	T_INVALID,		/* CDATA_size */
	T_INVALID,		/* DDATA_size */
	sizeof (sin_t),		/* ADDR_size */
	0,			/* OPT_size - not initialized here */
	TIDUSZ,			/* TIDU_size */
	T_COTS_ORD,		/* SERV_type */
	TCPS_IDLE,		/* CURRENT_state */
	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
};

static struct T_info_ack tcp_g_t_info_ack_v6 = {
	T_INFO_ACK,		/* PRIM_type */
	0,			/* TSDU_size */
	T_INFINITE,		/* ETSDU_size */
	T_INVALID,		/* CDATA_size */
	T_INVALID,		/* DDATA_size */
	sizeof (sin6_t),	/* ADDR_size */
	0,			/* OPT_size - not initialized here */
	TIDUSZ,			/* TIDU_size */
	T_COTS_ORD,		/* SERV_type */
	TCPS_IDLE,		/* CURRENT_state */
	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
};

#define MS	1L
#define SECONDS	(1000 * MS)
#define MINUTES	(60 * SECONDS)
#define HOURS	(60 * MINUTES)
#define DAYS	(24 * HOURS)

#define PARAM_MAX (~(uint32_t)0)

/* Max size IP datagram is 64k - 1 */
#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
/* Max of the above */
#define TCP_MSS_MAX	TCP_MSS_MAX_IPV4

/* Largest TCP port number */
#define TCP_MAX_PORT	(64 * 1024 - 1)

/*
 * tcp_wroff_xtra is the extra space in front of the TCP/IP header for the
 * link layer header. It has to be a multiple of 4.
 */
static tcpparam_t tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
#define tcp_wroff_xtra	tcp_wroff_xtra_param.tcp_param_val

/*
 * All of these are alterable, within the min/max values given, at run time.
 * Note that the TCP spec calls for a four minute TIME_WAIT (2*MSL); the
 * default value of "tcp_time_wait_interval" here is one minute.
 */
/* BEGIN CSTYLED */
tcpparam_t	tcp_param_arr[] = {
 /* min		max		value		name */
 { 1*SECONDS,	10*MINUTES,	1*MINUTES,	"tcp_time_wait_interval"},
 { 1,		PARAM_MAX,	128,		"tcp_conn_req_max_q" },
 { 0,		PARAM_MAX,	1024,		"tcp_conn_req_max_q0" },
 { 1,		1024,		1,		"tcp_conn_req_min" },
 { 0*MS,	20*SECONDS,	0*MS,		"tcp_conn_grace_period" },
 { 128,		(1<<30),	1024*1024,	"tcp_cwnd_max" },
 { 0,		10,		0,		"tcp_debug" },
 { 1024,	(32*1024),	1024,		"tcp_smallest_nonpriv_port"},
 { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_cinterval"},
 { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_linterval"},
 { 500*MS,	PARAM_MAX,	8*MINUTES,	"tcp_ip_abort_interval"},
 { 1*SECONDS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_cinterval"},
 { 500*MS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_interval"},
 { 1,		255,		64,		"tcp_ipv4_ttl"},
 { 10*SECONDS,	10*DAYS,	2*HOURS,	"tcp_keepalive_interval"},
 { 0,		100,		10,		"tcp_maxpsz_multiplier" },
 { 1,		TCP_MSS_MAX_IPV4, 536,		"tcp_mss_def_ipv4"},
 { 1,		TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
 { 1,		TCP_MSS_MAX,	108,		"tcp_mss_min"},
 { 1,		(64*1024)-1,	(4*1024)-1,	"tcp_naglim_def"},
 { 1*MS,	20*SECONDS,	3*SECONDS,	"tcp_rexmit_interval_initial"},
 { 1*MS,	2*HOURS,	60*SECONDS,	"tcp_rexmit_interval_max"},
 { 1*MS,	2*HOURS,	400*MS,		"tcp_rexmit_interval_min"},
 { 1*MS,	1*MINUTES,	100*MS,		"tcp_deferred_ack_interval" },
 { 0,		16,		0,		"tcp_snd_lowat_fraction" },
 { 0,		128000,		0,		"tcp_sth_rcv_hiwat" },
 { 0,		128000,		0,		"tcp_sth_rcv_lowat" },
 { 1,		10000,		3,		"tcp_dupack_fast_retransmit" },
 { 0,		1,		0,		"tcp_ignore_path_mtu" },
 { 1024,	TCP_MAX_PORT,	32*1024,	"tcp_smallest_anon_port"},
 { 1024,	TCP_MAX_PORT,	TCP_MAX_PORT,	"tcp_largest_anon_port"},
 { TCP_XMIT_LOWATER, (1<<30),	TCP_XMIT_HIWATER, "tcp_xmit_hiwat"},
 { TCP_XMIT_LOWATER, (1<<30),	TCP_XMIT_LOWATER, "tcp_xmit_lowat"},
 { TCP_RECV_LOWATER, (1<<30),	TCP_RECV_HIWATER, "tcp_recv_hiwat"},
 { 1,		65536,		4,		"tcp_recv_hiwat_minmss"},
 { 1*SECONDS,	PARAM_MAX,	675*SECONDS,	"tcp_fin_wait_2_flush_interval"},
 { 0,		TCP_MSS_MAX,	64,		"tcp_co_min"},
 { 8192,	(1<<30),	1024*1024,	"tcp_max_buf"},
/*
 * Question:  What default value should I set for tcp_strong_iss?
 */
 { 0,		2,		1,		"tcp_strong_iss"},
 { 0,		65536,		20,		"tcp_rtt_updates"},
 { 0,		1,		1,		"tcp_wscale_always"},
 { 0,		1,		0,		"tcp_tstamp_always"},
 { 0,		1,		1,		"tcp_tstamp_if_wscale"},
 { 0*MS,	2*HOURS,	0*MS,		"tcp_rexmit_interval_extra"},
 { 0,		16,		2,		"tcp_deferred_acks_max"},
 { 1,		16384,		4,		"tcp_slow_start_after_idle"},
 { 1,		4,		4,		"tcp_slow_start_initial"},
 { 10*MS,	50*MS,		20*MS,		"tcp_co_timer_interval"},
 { 0,		2,		2,		"tcp_sack_permitted"},
 { 0,		1,		0,		"tcp_trace"},
 { 0,		1,		1,		"tcp_compression_enabled"},
 { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"},
 { 1,		TCP_MSS_MAX_IPV6, 1220,		"tcp_mss_def_ipv6"},
 { 1,		TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
 { 0,		1,		0,		"tcp_rev_src_routes"},
 { 10*MS,	500*MS,		50*MS,		"tcp_local_dack_interval"},
 { 100*MS,	60*SECONDS,	1*SECONDS,	"tcp_ndd_get_info_interval"},
 { 0,		16,		8,		"tcp_local_dacks_max"},
 { 0,		2,		1,		"tcp_ecn_permitted"},
 { 0,		1,		1,		"tcp_rst_sent_rate_enabled"},
 { 0,		PARAM_MAX,	40,		"tcp_rst_sent_rate"},
 { 0,		100*MS,		50*MS,		"tcp_push_timer_interval"},
 { 0,		1,		0,		"tcp_use_smss_as_mss_opt"},
 { 0,		PARAM_MAX,	8*MINUTES,	"tcp_keepalive_abort_interval"},
};
/* END CSTYLED */

/*
 * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
 * each header fragment in the header buffer. Each parameter value has
 * to be a multiple of 4 (32-bit aligned).
 */
static tcpparam_t tcp_mdt_head_param = { 32, 256, 32, "tcp_mdt_hdr_head_min" };
static tcpparam_t tcp_mdt_tail_param = { 0, 256, 32, "tcp_mdt_hdr_tail_min" };
#define tcp_mdt_hdr_head_min	tcp_mdt_head_param.tcp_param_val
#define tcp_mdt_hdr_tail_min	tcp_mdt_tail_param.tcp_param_val

/*
 * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out
 * the maximum number of payload buffers associated per Multidata.
 */
static tcpparam_t tcp_mdt_max_pbufs_param =
	{ 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" };
#define tcp_mdt_max_pbufs	tcp_mdt_max_pbufs_param.tcp_param_val

/* Round up the value to the nearest mss. */
#define MSS_ROUNDUP(value, mss)		((((value) - 1) / (mss) + 1) * (mss))

/*
 * Set ECN capable transport (ECT) code point in IP header.
 *
 * Note that there are 2 ECT code points '01' and '10', which are called
 * ECT(1) and ECT(0) respectively. Here we follow the original ECT code
 * point ECT(0) for TCP as described in RFC 2481.
 */
#define SET_ECT(tcp, iph) \
	if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
		/* We need to clear the code point first. */ \
		((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
		((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
	} else { \
		((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
		((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
	}

/*
 * The format argument to pass to tcp_display().
 * DISP_PORT_ONLY means that the returned string has only port info.
 * DISP_ADDR_AND_PORT means that the returned string also contains the
 * remote and local IP address.
 */
#define DISP_PORT_ONLY		1
#define DISP_ADDR_AND_PORT	2

/*
 * This controls the rate at which some ndd info report functions can be
 * used by non-privileged users. It stores the last time such info was
 * requested.
When those report functions are called again, this 1363 * is checked against the current time and compared with the ndd param 1364 * tcp_ndd_get_info_interval. 1365 */ 1366 static clock_t tcp_last_ndd_get_info_time = 0; 1367 #define NDD_TOO_QUICK_MSG \ 1368 "ndd get info rate too high for non-privileged users, try again " \ 1369 "later.\n" 1370 #define NDD_OUT_OF_BUF_MSG "<< Out of buffer >>\n" 1371 1372 #define IS_VMLOANED_MBLK(mp) \ 1373 (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) 1374 1375 /* 1376 * These two variables control the rate for TCP to generate RSTs in 1377 * response to segments not belonging to any connections. We limit 1378 * TCP to send out at most tcp_rst_sent_rate (ndd param) RSTs in 1379 * each 1 second interval. This is to protect TCP against DoS attacks. 1380 */ 1381 static clock_t tcp_last_rst_intrvl; 1382 static uint32_t tcp_rst_cnt; 1383 1384 /* The number of RSTs not sent because of the rate limit. */ 1385 static uint32_t tcp_rst_unsent; 1386 1387 /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ 1388 boolean_t tcp_mdt_chain = B_TRUE; 1389 1390 /* 1391 * MDT threshold in the form of effective send MSS multiplier; we take 1392 * the MDT path if the amount of unsent data exceeds the threshold value 1393 * (default threshold is 1*SMSS). 1394 */ 1395 uint_t tcp_mdt_smss_threshold = 1; 1396 1397 uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ 1398 1399 /* 1400 * Forces all connections to obey the value of the tcp_maxpsz_multiplier 1401 * tunable settable via NDD. Otherwise, the per-connection behavior is 1402 * determined dynamically during tcp_adapt_ire(), which is the default. 1403 */ 1404 boolean_t tcp_static_maxpsz = B_FALSE; 1405 1406 /* If set to 0, pick ephemeral ports sequentially; otherwise randomly. */ 1407 uint32_t tcp_random_anon_port = 1; 1408 1409 /* 1410 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more 1411 * than tcp_drop_ack_unsent_cnt ACKs which acknowledge unsent 1412 * data, TCP will not respond with an ACK. RFC 793 requires that 1413 * TCP respond with an ACK for such a bogus ACK. By not following 1414 * the RFC, we prevent TCP from getting into an ACK storm if somehow 1415 * an attacker successfully spoofs an acceptable segment to our 1416 * peer; or when our peer is "confused." 1417 */ 1418 uint32_t tcp_drop_ack_unsent_cnt = 10; 1419 1420 /* 1421 * Hook functions to enable cluster networking. 1422 * On non-clustered systems these vectors must always be NULL.
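 * A clustering module is expected to plug its callbacks into these
 * vectors when it loads; illustratively (my_cl_listen/my_cl_unlisten
 * are made-up names, the real registration lives in the cluster code):
 *
 *	cl_inet_listen = my_cl_listen;
 *	cl_inet_unlisten = my_cl_unlisten;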
1423 */ 1424 1425 void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family, 1426 uint8_t *laddrp, in_port_t lport) = NULL; 1427 void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family, 1428 uint8_t *laddrp, in_port_t lport) = NULL; 1429 void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family, 1430 uint8_t *laddrp, in_port_t lport, 1431 uint8_t *faddrp, in_port_t fport) = NULL; 1432 void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family, 1433 uint8_t *laddrp, in_port_t lport, 1434 uint8_t *faddrp, in_port_t fport) = NULL; 1435 1436 /* 1437 * The following are defined in ip.c 1438 */ 1439 extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family, 1440 uint8_t *laddrp); 1441 extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, 1442 uint8_t *laddrp, uint8_t *faddrp); 1443 1444 #define CL_INET_CONNECT(tcp) { \ 1445 if (cl_inet_connect != NULL) { \ 1446 /* \ 1447 * Running in cluster mode - register active connection \ 1448 * information \ 1449 */ \ 1450 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1451 if ((tcp)->tcp_ipha->ipha_src != 0) { \ 1452 (*cl_inet_connect)(IPPROTO_TCP, AF_INET,\ 1453 (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\ 1454 (in_port_t)(tcp)->tcp_lport, \ 1455 (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ 1456 (in_port_t)(tcp)->tcp_fport); \ 1457 } \ 1458 } else { \ 1459 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1460 &(tcp)->tcp_ip6h->ip6_src)) {\ 1461 (*cl_inet_connect)(IPPROTO_TCP, AF_INET6,\ 1462 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\ 1463 (in_port_t)(tcp)->tcp_lport, \ 1464 (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ 1465 (in_port_t)(tcp)->tcp_fport); \ 1466 } \ 1467 } \ 1468 } \ 1469 } 1470 1471 #define CL_INET_DISCONNECT(tcp) { \ 1472 if (cl_inet_disconnect != NULL) { \ 1473 /* \ 1474 * Running in cluster mode - deregister active \ 1475 * connection information \ 1476 */ \ 1477 if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ 1478 if ((tcp)->tcp_ip_src != 0) { \ 1479 (*cl_inet_disconnect)(IPPROTO_TCP, \ 1480 AF_INET, \ 1481 (uint8_t *)(&((tcp)->tcp_ip_src)),\ 1482 (in_port_t)(tcp)->tcp_lport, \ 1483 (uint8_t *) \ 1484 (&((tcp)->tcp_ipha->ipha_dst)),\ 1485 (in_port_t)(tcp)->tcp_fport); \ 1486 } \ 1487 } else { \ 1488 if (!IN6_IS_ADDR_UNSPECIFIED( \ 1489 &(tcp)->tcp_ip_src_v6)) { \ 1490 (*cl_inet_disconnect)(IPPROTO_TCP, AF_INET6,\ 1491 (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\ 1492 (in_port_t)(tcp)->tcp_lport, \ 1493 (uint8_t *) \ 1494 (&((tcp)->tcp_ip6h->ip6_dst)),\ 1495 (in_port_t)(tcp)->tcp_fport); \ 1496 } \ 1497 } \ 1498 } \ 1499 } 1500 1501 /* 1502 * Cluster networking hook for traversing current connection list. 1503 * This routine is used to extract the current list of live connections 1504 * which must continue to to be dispatched to this node. 1505 */ 1506 int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg); 1507 1508 /* 1509 * Figure out the value of window scale opton. Note that the rwnd is 1510 * ASSUMED to be rounded up to the nearest MSS before the calculation. 1511 * We cannot find the scale value and then do a round up of tcp_rwnd 1512 * because the scale value may not be correct after that. 1513 * 1514 * Set the compiler flag to make this function inline. 
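 * For example, with an MSS-rounded tcp_rwnd of 1048576 bytes the loop
 * shifts five times before the window fits (1048576 >> 5 == 32768 <=
 * TCP_MAXWIN), so tcp_rcv_ws ends up as 5.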
1515 */ 1516 static void 1517 tcp_set_ws_value(tcp_t *tcp) 1518 { 1519 int i; 1520 uint32_t rwnd = tcp->tcp_rwnd; 1521 1522 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT; 1523 i++, rwnd >>= 1) 1524 ; 1525 tcp->tcp_rcv_ws = i; 1526 } 1527 1528 /* 1529 * Remove a connection from the list of detached TIME_WAIT connections. 1530 */ 1531 static void 1532 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) 1533 { 1534 boolean_t locked = B_FALSE; 1535 1536 if (tcp_time_wait == NULL) { 1537 tcp_time_wait = *((tcp_squeue_priv_t **) 1538 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); 1539 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1540 locked = B_TRUE; 1541 } 1542 1543 if (tcp->tcp_time_wait_expire == 0) { 1544 ASSERT(tcp->tcp_time_wait_next == NULL); 1545 ASSERT(tcp->tcp_time_wait_prev == NULL); 1546 if (locked) 1547 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1548 return; 1549 } 1550 ASSERT(TCP_IS_DETACHED(tcp)); 1551 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1552 1553 if (tcp == tcp_time_wait->tcp_time_wait_head) { 1554 ASSERT(tcp->tcp_time_wait_prev == NULL); 1555 tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; 1556 if (tcp_time_wait->tcp_time_wait_head != NULL) { 1557 tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = 1558 NULL; 1559 } else { 1560 tcp_time_wait->tcp_time_wait_tail = NULL; 1561 } 1562 } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { 1563 ASSERT(tcp != tcp_time_wait->tcp_time_wait_head); 1564 ASSERT(tcp->tcp_time_wait_next == NULL); 1565 tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; 1566 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1567 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; 1568 } else { 1569 ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); 1570 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); 1571 tcp->tcp_time_wait_prev->tcp_time_wait_next = 1572 tcp->tcp_time_wait_next; 1573 tcp->tcp_time_wait_next->tcp_time_wait_prev = 1574 tcp->tcp_time_wait_prev; 1575 } 1576 tcp->tcp_time_wait_next = NULL; 1577 tcp->tcp_time_wait_prev = NULL; 1578 tcp->tcp_time_wait_expire = 0; 1579 1580 if (locked) 1581 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1582 } 1583 1584 /* 1585 * Add a connection to the list of detached TIME_WAIT connections 1586 * and set its time to expire. 1587 */ 1588 static void 1589 tcp_time_wait_append(tcp_t *tcp) 1590 { 1591 tcp_squeue_priv_t *tcp_time_wait = 1592 *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, 1593 SQPRIVATE_TCP)); 1594 1595 tcp_timers_stop(tcp); 1596 1597 /* Freed above */ 1598 ASSERT(tcp->tcp_timer_tid == 0); 1599 ASSERT(tcp->tcp_ack_tid == 0); 1600 1601 /* must have happened at the time of detaching the tcp */ 1602 ASSERT(tcp->tcp_ptpahn == NULL); 1603 ASSERT(tcp->tcp_flow_stopped == 0); 1604 ASSERT(tcp->tcp_time_wait_next == NULL); 1605 ASSERT(tcp->tcp_time_wait_prev == NULL); 1606 ASSERT(tcp->tcp_time_wait_expire == NULL); 1607 ASSERT(tcp->tcp_listener == NULL); 1608 1609 tcp->tcp_time_wait_expire = ddi_get_lbolt(); 1610 /* 1611 * The value computed below in tcp->tcp_time_wait_expire may 1612 * appear negative or wrap around. That is ok since our 1613 * interest is only in the difference between the current lbolt 1614 * value and tcp->tcp_time_wait_expire. But the value should not 1615 * be zero, since it means the tcp is not in the TIME_WAIT list. 1616 * The corresponding comparison in tcp_time_wait_collector() uses 1617 * modular arithmetic. 
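 * In other words, expiry is detected with a signed lbolt difference,
 * roughly
 *
 *	if ((ddi_get_lbolt() - tcp->tcp_time_wait_expire) >= 0)
 *		the entry is due for reaping;
 *
 * which remains correct across lbolt wraparound as long as the interval
 * is much smaller than half the clock_t range.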
1618 */ 1619 tcp->tcp_time_wait_expire += 1620 drv_usectohz(tcp_time_wait_interval * 1000); 1621 if (tcp->tcp_time_wait_expire == 0) 1622 tcp->tcp_time_wait_expire = 1; 1623 1624 ASSERT(TCP_IS_DETACHED(tcp)); 1625 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); 1626 ASSERT(tcp->tcp_time_wait_next == NULL); 1627 ASSERT(tcp->tcp_time_wait_prev == NULL); 1628 TCP_DBGSTAT(tcp_time_wait); 1629 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1630 if (tcp_time_wait->tcp_time_wait_head == NULL) { 1631 ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); 1632 tcp_time_wait->tcp_time_wait_head = tcp; 1633 } else { 1634 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); 1635 ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == 1636 TCPS_TIME_WAIT); 1637 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; 1638 tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; 1639 } 1640 tcp_time_wait->tcp_time_wait_tail = tcp; 1641 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1642 } 1643 1644 /* ARGSUSED */ 1645 void 1646 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) 1647 { 1648 conn_t *connp = (conn_t *)arg; 1649 tcp_t *tcp = connp->conn_tcp; 1650 1651 ASSERT(tcp != NULL); 1652 if (tcp->tcp_state == TCPS_CLOSED) { 1653 return; 1654 } 1655 1656 ASSERT((tcp->tcp_family == AF_INET && 1657 tcp->tcp_ipversion == IPV4_VERSION) || 1658 (tcp->tcp_family == AF_INET6 && 1659 (tcp->tcp_ipversion == IPV4_VERSION || 1660 tcp->tcp_ipversion == IPV6_VERSION))); 1661 ASSERT(!tcp->tcp_listener); 1662 1663 TCP_STAT(tcp_time_wait_reap); 1664 ASSERT(TCP_IS_DETACHED(tcp)); 1665 1666 /* 1667 * Because they have no upstream client to rebind or tcp_close() 1668 * them later, we axe the connection here and now. 1669 */ 1670 tcp_close_detached(tcp); 1671 } 1672 1673 void 1674 tcp_cleanup(tcp_t *tcp) 1675 { 1676 mblk_t *mp; 1677 char *tcp_iphc; 1678 int tcp_iphc_len; 1679 int tcp_hdr_grown; 1680 tcp_sack_info_t *tcp_sack_info; 1681 conn_t *connp = tcp->tcp_connp; 1682 1683 tcp_bind_hash_remove(tcp); 1684 tcp_free(tcp); 1685 1686 /* Release any SSL context */ 1687 if (tcp->tcp_kssl_ent != NULL) { 1688 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 1689 tcp->tcp_kssl_ent = NULL; 1690 } 1691 1692 if (tcp->tcp_kssl_ctx != NULL) { 1693 kssl_release_ctx(tcp->tcp_kssl_ctx); 1694 tcp->tcp_kssl_ctx = NULL; 1695 } 1696 tcp->tcp_kssl_pending = B_FALSE; 1697 1698 conn_delete_ire(connp, NULL); 1699 if (connp->conn_flags & IPCL_TCPCONN) { 1700 if (connp->conn_latch != NULL) 1701 IPLATCH_REFRELE(connp->conn_latch); 1702 if (connp->conn_policy != NULL) 1703 IPPH_REFRELE(connp->conn_policy); 1704 } 1705 1706 /* 1707 * Since we will bzero the entire structure, we need to 1708 * remove it and reinsert it in global hash list. We 1709 * know the walkers can't get to this conn because we 1710 * had set CONDEMNED flag earlier and checked reference 1711 * under conn_lock so walker won't pick it and when we 1712 * go the ipcl_globalhash_remove() below, no walker 1713 * can get to it. 
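 * A conforming walker would do roughly the following (sketch only, the
 * real code lives in the ipclassifier):
 *
 *	mutex_enter(&connp->conn_lock);
 *	if (connp->conn_state_flags & CONN_CONDEMNED) {
 *		mutex_exit(&connp->conn_lock);
 *		continue;	(skip conns being torn down)
 *	}
 *	CONN_INC_REF_LOCKED(connp);
 *	mutex_exit(&connp->conn_lock);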
1714 */ 1715 ipcl_globalhash_remove(connp); 1716 1717 /* Save some state */ 1718 mp = tcp->tcp_timercache; 1719 1720 tcp_sack_info = tcp->tcp_sack_info; 1721 tcp_iphc = tcp->tcp_iphc; 1722 tcp_iphc_len = tcp->tcp_iphc_len; 1723 tcp_hdr_grown = tcp->tcp_hdr_grown; 1724 1725 if (connp->conn_cred != NULL) 1726 crfree(connp->conn_cred); 1727 if (connp->conn_peercred != NULL) 1728 crfree(connp->conn_peercred); 1729 bzero(connp, sizeof (conn_t)); 1730 bzero(tcp, sizeof (tcp_t)); 1731 1732 /* restore the state */ 1733 tcp->tcp_timercache = mp; 1734 1735 tcp->tcp_sack_info = tcp_sack_info; 1736 tcp->tcp_iphc = tcp_iphc; 1737 tcp->tcp_iphc_len = tcp_iphc_len; 1738 tcp->tcp_hdr_grown = tcp_hdr_grown; 1739 1740 1741 tcp->tcp_connp = connp; 1742 1743 connp->conn_tcp = tcp; 1744 connp->conn_flags = IPCL_TCPCONN; 1745 connp->conn_state_flags = CONN_INCIPIENT; 1746 connp->conn_ulp = IPPROTO_TCP; 1747 connp->conn_ref = 1; 1748 1749 ipcl_globalhash_insert(connp); 1750 } 1751 1752 /* 1753 * Blows away all tcps whose TIME_WAIT has expired. List traversal 1754 * is done forwards from the head. 1755 */ 1756 /* ARGSUSED */ 1757 void 1758 tcp_time_wait_collector(void *arg) 1759 { 1760 tcp_t *tcp; 1761 clock_t now; 1762 mblk_t *mp; 1763 conn_t *connp; 1764 kmutex_t *lock; 1765 1766 squeue_t *sqp = (squeue_t *)arg; 1767 tcp_squeue_priv_t *tcp_time_wait = 1768 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 1769 1770 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1771 tcp_time_wait->tcp_time_wait_tid = 0; 1772 1773 if (tcp_time_wait->tcp_free_list != NULL && 1774 tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { 1775 TCP_STAT(tcp_freelist_cleanup); 1776 while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { 1777 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 1778 CONN_DEC_REF(tcp->tcp_connp); 1779 } 1780 tcp_time_wait->tcp_free_list_cnt = 0; 1781 } 1782 1783 /* 1784 * In order to reap time waits reliably, we should use a 1785 * source of time that is not adjustable by the user -- hence 1786 * the call to ddi_get_lbolt(). 1787 */ 1788 now = ddi_get_lbolt(); 1789 while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { 1790 /* 1791 * Compare times using modular arithmetic, since 1792 * lbolt can wrapover. 1793 */ 1794 if ((now - tcp->tcp_time_wait_expire) < 0) { 1795 break; 1796 } 1797 1798 tcp_time_wait_remove(tcp, tcp_time_wait); 1799 1800 connp = tcp->tcp_connp; 1801 ASSERT(connp->conn_fanout != NULL); 1802 lock = &connp->conn_fanout->connf_lock; 1803 /* 1804 * This is essentially a TW reclaim fast path optimization for 1805 * performance where the timewait collector checks under the 1806 * fanout lock (so that no one else can get access to the 1807 * conn_t) that the refcnt is 2 i.e. one for TCP and one for 1808 * the classifier hash list. If ref count is indeed 2, we can 1809 * just remove the conn under the fanout lock and avoid 1810 * cleaning up the conn under the squeue, provided that 1811 * clustering callbacks are not enabled. If clustering is 1812 * enabled, we need to make the clustering callback before 1813 * setting the CONDEMNED flag and after dropping all locks and 1814 * so we forego this optimization and fall back to the slow 1815 * path. Also please see the comments in tcp_closei_local 1816 * regarding the refcnt logic. 1817 * 1818 * Since we are holding the tcp_time_wait_lock, its better 1819 * not to block on the fanout_lock because other connections 1820 * can't add themselves to time_wait list. So we do a 1821 * tryenter instead of mutex_enter. 
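 * The loop body below therefore has roughly this shape:
 *
 *	if (mutex_tryenter(lock)) {
 *		(fast path: recycle or free the conn directly)
 *	} else {
 *		CONN_INC_REF(connp);
 *		squeue_fill(connp->conn_sqp, &tcp->tcp_closemp,
 *		    tcp_timewait_output, connp, SQTAG_TCP_TIMEWAIT);
 *	}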
1822 */ 1823 if (mutex_tryenter(lock)) { 1824 mutex_enter(&connp->conn_lock); 1825 if ((connp->conn_ref == 2) && 1826 (cl_inet_disconnect == NULL)) { 1827 ipcl_hash_remove_locked(connp, 1828 connp->conn_fanout); 1829 /* 1830 * Set the CONDEMNED flag now itself so that 1831 * the refcnt cannot increase due to any 1832 * walker. But we have still not cleaned up 1833 * conn_ire_cache. This is still ok since 1834 * we are going to clean it up in tcp_cleanup 1835 * immediately and any interface unplumb 1836 * thread will wait till the ire is blown away 1837 */ 1838 connp->conn_state_flags |= CONN_CONDEMNED; 1839 mutex_exit(lock); 1840 mutex_exit(&connp->conn_lock); 1841 if (tcp_time_wait->tcp_free_list_cnt < 1842 tcp_free_list_max_cnt) { 1843 /* Add to head of tcp_free_list */ 1844 mutex_exit( 1845 &tcp_time_wait->tcp_time_wait_lock); 1846 tcp_cleanup(tcp); 1847 mutex_enter( 1848 &tcp_time_wait->tcp_time_wait_lock); 1849 tcp->tcp_time_wait_next = 1850 tcp_time_wait->tcp_free_list; 1851 tcp_time_wait->tcp_free_list = tcp; 1852 tcp_time_wait->tcp_free_list_cnt++; 1853 continue; 1854 } else { 1855 /* Do not add to tcp_free_list */ 1856 mutex_exit( 1857 &tcp_time_wait->tcp_time_wait_lock); 1858 tcp_bind_hash_remove(tcp); 1859 conn_delete_ire(tcp->tcp_connp, NULL); 1860 CONN_DEC_REF(tcp->tcp_connp); 1861 } 1862 } else { 1863 CONN_INC_REF_LOCKED(connp); 1864 mutex_exit(lock); 1865 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1866 mutex_exit(&connp->conn_lock); 1867 /* 1868 * We can reuse the closemp here since conn has 1869 * detached (otherwise we wouldn't even be in 1870 * time_wait list). 1871 */ 1872 mp = &tcp->tcp_closemp; 1873 squeue_fill(connp->conn_sqp, mp, 1874 tcp_timewait_output, connp, 1875 SQTAG_TCP_TIMEWAIT); 1876 } 1877 } else { 1878 mutex_enter(&connp->conn_lock); 1879 CONN_INC_REF_LOCKED(connp); 1880 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1881 mutex_exit(&connp->conn_lock); 1882 /* 1883 * We can reuse the closemp here since conn has 1884 * detached (otherwise we wouldn't even be in 1885 * time_wait list). 1886 */ 1887 mp = &tcp->tcp_closemp; 1888 squeue_fill(connp->conn_sqp, mp, 1889 tcp_timewait_output, connp, 0); 1890 } 1891 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 1892 } 1893 1894 if (tcp_time_wait->tcp_free_list != NULL) 1895 tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; 1896 1897 tcp_time_wait->tcp_time_wait_tid = 1898 timeout(tcp_time_wait_collector, sqp, TCP_TIME_WAIT_DELAY); 1899 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 1900 } 1901 1902 /* 1903 * Reply to a clients T_CONN_RES TPI message. This function 1904 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES 1905 * on the acceptor STREAM and processed in tcp_wput_accept(). 1906 * Read the block comment on top of tcp_conn_request(). 1907 */ 1908 static void 1909 tcp_accept(tcp_t *listener, mblk_t *mp) 1910 { 1911 tcp_t *acceptor; 1912 tcp_t *eager; 1913 tcp_t *tcp; 1914 struct T_conn_res *tcr; 1915 t_uscalar_t acceptor_id; 1916 t_scalar_t seqnum; 1917 mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ 1918 mblk_t *ok_mp; 1919 mblk_t *mp1; 1920 1921 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 1922 tcp_err_ack(listener, mp, TPROTO, 0); 1923 return; 1924 } 1925 tcr = (struct T_conn_res *)mp->b_rptr; 1926 1927 /* 1928 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the 1929 * read side queue of the streams device underneath us i.e. the 1930 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we 1931 * look it up in the queue_hash. 
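 * (That lookup is the tcp_acceptor_hash_lookup(acceptor_id) call further
 * down, which also returns with an extra reference held on the
 * acceptor's conn_t.)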
Under LP64 it sends down the 1932 * minor_t of the accepting endpoint. 1933 * 1934 * Once the acceptor/eager are modified (in tcp_accept_swap) the 1935 * fanout hash lock is held. 1936 * This prevents any thread from entering the acceptor queue from 1937 * below (since it has not been hard bound yet i.e. any inbound 1938 * packets will arrive on the listener or default tcp queue and 1939 * go through tcp_lookup). 1940 * The CONN_INC_REF will prevent the acceptor from closing. 1941 * 1942 * XXX It is still possible for a tli application to send down data 1943 * on the accepting stream while another thread calls t_accept. 1944 * This should not be a problem for well-behaved applications since 1945 * the T_OK_ACK is sent after the queue swapping is completed. 1946 * 1947 * If the accepting fd is the same as the listening fd, avoid 1948 * queue hash lookup since that will return an eager listener in a 1949 * already established state. 1950 */ 1951 acceptor_id = tcr->ACCEPTOR_id; 1952 mutex_enter(&listener->tcp_eager_lock); 1953 if (listener->tcp_acceptor_id == acceptor_id) { 1954 eager = listener->tcp_eager_next_q; 1955 /* only count how many T_CONN_INDs so don't count q0 */ 1956 if ((listener->tcp_conn_req_cnt_q != 1) || 1957 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { 1958 mutex_exit(&listener->tcp_eager_lock); 1959 tcp_err_ack(listener, mp, TBADF, 0); 1960 return; 1961 } 1962 if (listener->tcp_conn_req_cnt_q0 != 0) { 1963 /* Throw away all the eagers on q0. */ 1964 tcp_eager_cleanup(listener, 1); 1965 } 1966 if (listener->tcp_syn_defense) { 1967 listener->tcp_syn_defense = B_FALSE; 1968 if (listener->tcp_ip_addr_cache != NULL) { 1969 kmem_free(listener->tcp_ip_addr_cache, 1970 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 1971 listener->tcp_ip_addr_cache = NULL; 1972 } 1973 } 1974 /* 1975 * Transfer tcp_conn_req_max to the eager so that when 1976 * a disconnect occurs we can revert the endpoint to the 1977 * listen state. 1978 */ 1979 eager->tcp_conn_req_max = listener->tcp_conn_req_max; 1980 ASSERT(listener->tcp_conn_req_cnt_q0 == 0); 1981 /* 1982 * Get a reference on the acceptor just like the 1983 * tcp_acceptor_hash_lookup below. 1984 */ 1985 acceptor = listener; 1986 CONN_INC_REF(acceptor->tcp_connp); 1987 } else { 1988 acceptor = tcp_acceptor_hash_lookup(acceptor_id); 1989 if (acceptor == NULL) { 1990 if (listener->tcp_debug) { 1991 (void) strlog(TCP_MOD_ID, 0, 1, 1992 SL_ERROR|SL_TRACE, 1993 "tcp_accept: did not find acceptor 0x%x\n", 1994 acceptor_id); 1995 } 1996 mutex_exit(&listener->tcp_eager_lock); 1997 tcp_err_ack(listener, mp, TPROVMISMATCH, 0); 1998 return; 1999 } 2000 /* 2001 * Verify acceptor state. The acceptable states for an acceptor 2002 * include TCPS_IDLE and TCPS_BOUND. 2003 */ 2004 switch (acceptor->tcp_state) { 2005 case TCPS_IDLE: 2006 /* FALLTHRU */ 2007 case TCPS_BOUND: 2008 break; 2009 default: 2010 CONN_DEC_REF(acceptor->tcp_connp); 2011 mutex_exit(&listener->tcp_eager_lock); 2012 tcp_err_ack(listener, mp, TOUTSTATE, 0); 2013 return; 2014 } 2015 } 2016 2017 /* The listener must be in TCPS_LISTEN */ 2018 if (listener->tcp_state != TCPS_LISTEN) { 2019 CONN_DEC_REF(acceptor->tcp_connp); 2020 mutex_exit(&listener->tcp_eager_lock); 2021 tcp_err_ack(listener, mp, TOUTSTATE, 0); 2022 return; 2023 } 2024 2025 /* 2026 * Rendezvous with an eager connection request packet hanging off 2027 * 'tcp' that has the 'seqnum' tag. We tagged the detached open 2028 * tcp structure when the connection packet arrived in 2029 * tcp_conn_request(). 
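 * Seen from an XTI client, the tag simply round-trips through the
 * library; a sketch with made-up descriptor names:
 *
 *	struct t_call *call = t_alloc(listen_fd, T_CALL, T_ALL);
 *	t_listen(listen_fd, call);	(call->sequence == SEQ_number)
 *	t_accept(listen_fd, accept_fd, call);	(comes back in T_CONN_RES)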
2030 */ 2031 seqnum = tcr->SEQ_number; 2032 eager = listener; 2033 do { 2034 eager = eager->tcp_eager_next_q; 2035 if (eager == NULL) { 2036 CONN_DEC_REF(acceptor->tcp_connp); 2037 mutex_exit(&listener->tcp_eager_lock); 2038 tcp_err_ack(listener, mp, TBADSEQ, 0); 2039 return; 2040 } 2041 } while (eager->tcp_conn_req_seqnum != seqnum); 2042 mutex_exit(&listener->tcp_eager_lock); 2043 2044 /* 2045 * At this point, both acceptor and listener have 2 ref 2046 * that they begin with. Acceptor has one additional ref 2047 * we placed in lookup while listener has 3 additional 2048 * ref for being behind the squeue (tcp_accept() is 2049 * done on listener's squeue); being in classifier hash; 2050 * and eager's ref on listener. 2051 */ 2052 ASSERT(listener->tcp_connp->conn_ref >= 5); 2053 ASSERT(acceptor->tcp_connp->conn_ref >= 3); 2054 2055 /* 2056 * The eager at this point is set in its own squeue and 2057 * could easily have been killed (tcp_accept_finish will 2058 * deal with that) because of a TH_RST so we can only 2059 * ASSERT for a single ref. 2060 */ 2061 ASSERT(eager->tcp_connp->conn_ref >= 1); 2062 2063 /* Pre allocate the stroptions mblk also */ 2064 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI); 2065 if (opt_mp == NULL) { 2066 CONN_DEC_REF(acceptor->tcp_connp); 2067 CONN_DEC_REF(eager->tcp_connp); 2068 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2069 return; 2070 } 2071 DB_TYPE(opt_mp) = M_SETOPTS; 2072 opt_mp->b_wptr += sizeof (struct stroptions); 2073 2074 /* 2075 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO 2076 * from listener to acceptor. The message is chained on opt_mp 2077 * which will be sent onto eager's squeue. 2078 */ 2079 if (listener->tcp_bound_if != 0) { 2080 /* allocate optmgmt req */ 2081 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6, 2082 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if, 2083 sizeof (int)); 2084 if (mp1 != NULL) 2085 linkb(opt_mp, mp1); 2086 } 2087 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { 2088 uint_t on = 1; 2089 2090 /* allocate optmgmt req */ 2091 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6, 2092 IPV6_RECVPKTINFO, (char *)&on, sizeof (on)); 2093 if (mp1 != NULL) 2094 linkb(opt_mp, mp1); 2095 } 2096 2097 /* Re-use mp1 to hold a copy of mp, in case reallocb fails */ 2098 if ((mp1 = copymsg(mp)) == NULL) { 2099 CONN_DEC_REF(acceptor->tcp_connp); 2100 CONN_DEC_REF(eager->tcp_connp); 2101 freemsg(opt_mp); 2102 tcp_err_ack(listener, mp, TSYSERR, ENOMEM); 2103 return; 2104 } 2105 2106 tcr = (struct T_conn_res *)mp1->b_rptr; 2107 2108 /* 2109 * This is an expanded version of mi_tpi_ok_ack_alloc() 2110 * which allocates a larger mblk and appends the new 2111 * local address to the ok_ack. The address is copied by 2112 * soaccept() for getsockname(). 2113 */ 2114 { 2115 int extra; 2116 2117 extra = (eager->tcp_family == AF_INET) ? 2118 sizeof (sin_t) : sizeof (sin6_t); 2119 2120 /* 2121 * Try to re-use mp, if possible. Otherwise, allocate 2122 * an mblk and return it as ok_mp. In any case, mp 2123 * is no longer usable upon return. 
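 * The resulting ok_mp is laid out as
 *
 *	[ struct T_ok_ack | sin_t or sin6_t ]
 *
 * and b_wptr is advanced past the appended address in the switch below.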
2124 */ 2125 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { 2126 CONN_DEC_REF(acceptor->tcp_connp); 2127 CONN_DEC_REF(eager->tcp_connp); 2128 freemsg(opt_mp); 2129 /* Original mp has been freed by now, so use mp1 */ 2130 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); 2131 return; 2132 } 2133 2134 mp = NULL; /* We should never use mp after this point */ 2135 2136 switch (extra) { 2137 case sizeof (sin_t): { 2138 sin_t *sin = (sin_t *)ok_mp->b_wptr; 2139 2140 ok_mp->b_wptr += extra; 2141 sin->sin_family = AF_INET; 2142 sin->sin_port = eager->tcp_lport; 2143 sin->sin_addr.s_addr = 2144 eager->tcp_ipha->ipha_src; 2145 break; 2146 } 2147 case sizeof (sin6_t): { 2148 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; 2149 2150 ok_mp->b_wptr += extra; 2151 sin6->sin6_family = AF_INET6; 2152 sin6->sin6_port = eager->tcp_lport; 2153 if (eager->tcp_ipversion == IPV4_VERSION) { 2154 sin6->sin6_flowinfo = 0; 2155 IN6_IPADDR_TO_V4MAPPED( 2156 eager->tcp_ipha->ipha_src, 2157 &sin6->sin6_addr); 2158 } else { 2159 ASSERT(eager->tcp_ip6h != NULL); 2160 sin6->sin6_flowinfo = 2161 eager->tcp_ip6h->ip6_vcf & 2162 ~IPV6_VERS_AND_FLOW_MASK; 2163 sin6->sin6_addr = 2164 eager->tcp_ip6h->ip6_src; 2165 } 2166 break; 2167 } 2168 default: 2169 break; 2170 } 2171 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); 2172 } 2173 2174 /* 2175 * If there are no options we know that the T_CONN_RES will 2176 * succeed. However, we can't send the T_OK_ACK upstream until 2177 * the tcp_accept_swap is done since it would be dangerous to 2178 * let the application start using the new fd prior to the swap. 2179 */ 2180 tcp_accept_swap(listener, acceptor, eager); 2181 2182 /* 2183 * tcp_accept_swap unlinks eager from listener but does not drop 2184 * the eager's reference on the listener. 2185 */ 2186 ASSERT(eager->tcp_listener == NULL); 2187 ASSERT(listener->tcp_connp->conn_ref >= 5); 2188 2189 /* 2190 * The eager is now associated with its own queue. Insert in 2191 * the hash so that the connection can be reused for a future 2192 * T_CONN_RES. 2193 */ 2194 tcp_acceptor_hash_insert(acceptor_id, eager); 2195 2196 /* 2197 * We now do the processing of options with T_CONN_RES. 2198 * We delay till now since we wanted to have queue to pass to 2199 * option processing routines that points back to the right 2200 * instance structure which does not happen until after 2201 * tcp_accept_swap(). 2202 * 2203 * Note: 2204 * The sanity of the logic here assumes that whatever options 2205 * are appropriate to inherit from listner=>eager are done 2206 * before this point, and whatever were to be overridden (or not) 2207 * in transfer logic from eager=>acceptor in tcp_accept_swap(). 2208 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it 2209 * before its ACCEPTOR_id comes down in T_CONN_RES ] 2210 * This may not be true at this point in time but can be fixed 2211 * independently. This option processing code starts with 2212 * the instantiated acceptor instance and the final queue at 2213 * this point. 2214 */ 2215 2216 if (tcr->OPT_length != 0) { 2217 /* Options to process */ 2218 int t_error = 0; 2219 int sys_error = 0; 2220 int do_disconnect = 0; 2221 2222 if (tcp_conprim_opt_process(eager, mp1, 2223 &do_disconnect, &t_error, &sys_error) < 0) { 2224 eager->tcp_accept_error = 1; 2225 if (do_disconnect) { 2226 /* 2227 * An option failed which does not allow 2228 * connection to be accepted. 2229 * 2230 * We allow T_CONN_RES to succeed and 2231 * put a T_DISCON_IND on the eager queue. 
2232 */ 2233 ASSERT(t_error == 0 && sys_error == 0); 2234 eager->tcp_send_discon_ind = 1; 2235 } else { 2236 ASSERT(t_error != 0); 2237 freemsg(ok_mp); 2238 /* 2239 * Original mp was either freed or set 2240 * to ok_mp above, so use mp1 instead. 2241 */ 2242 tcp_err_ack(listener, mp1, t_error, sys_error); 2243 goto finish; 2244 } 2245 } 2246 /* 2247 * Most likely success in setting options (except if 2248 * eager->tcp_send_discon_ind set). 2249 * mp1 option buffer represented by OPT_length/offset 2250 * potentially modified and contains results of setting 2251 * options at this point 2252 */ 2253 } 2254 2255 /* We no longer need mp1, since all options processing has passed */ 2256 freemsg(mp1); 2257 2258 putnext(listener->tcp_rq, ok_mp); 2259 2260 mutex_enter(&listener->tcp_eager_lock); 2261 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 2262 tcp_t *tail; 2263 mblk_t *conn_ind; 2264 2265 /* 2266 * This path should not be executed if listener and 2267 * acceptor streams are the same. 2268 */ 2269 ASSERT(listener != acceptor); 2270 2271 tcp = listener->tcp_eager_prev_q0; 2272 /* 2273 * listener->tcp_eager_prev_q0 points to the TAIL of the 2274 * deferred T_conn_ind queue. We need to get to the head of 2275 * the queue in order to send up T_conn_ind the same order as 2276 * how the 3WHS is completed. 2277 */ 2278 while (tcp != listener) { 2279 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) 2280 break; 2281 else 2282 tcp = tcp->tcp_eager_prev_q0; 2283 } 2284 ASSERT(tcp != listener); 2285 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; 2286 ASSERT(conn_ind != NULL); 2287 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 2288 2289 /* Move from q0 to q */ 2290 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 2291 listener->tcp_conn_req_cnt_q0--; 2292 listener->tcp_conn_req_cnt_q++; 2293 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 2294 tcp->tcp_eager_prev_q0; 2295 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 2296 tcp->tcp_eager_next_q0; 2297 tcp->tcp_eager_prev_q0 = NULL; 2298 tcp->tcp_eager_next_q0 = NULL; 2299 tcp->tcp_conn_def_q0 = B_FALSE; 2300 2301 /* 2302 * Insert at end of the queue because sockfs sends 2303 * down T_CONN_RES in chronological order. Leaving 2304 * the older conn indications at front of the queue 2305 * helps reducing search time. 2306 */ 2307 tail = listener->tcp_eager_last_q; 2308 if (tail != NULL) 2309 tail->tcp_eager_next_q = tcp; 2310 else 2311 listener->tcp_eager_next_q = tcp; 2312 listener->tcp_eager_last_q = tcp; 2313 tcp->tcp_eager_next_q = NULL; 2314 mutex_exit(&listener->tcp_eager_lock); 2315 putnext(tcp->tcp_rq, conn_ind); 2316 } else { 2317 mutex_exit(&listener->tcp_eager_lock); 2318 } 2319 2320 /* 2321 * Done with the acceptor - free it 2322 * 2323 * Note: from this point on, no access to listener should be made 2324 * as listener can be equal to acceptor. 2325 */ 2326 finish: 2327 ASSERT(acceptor->tcp_detached); 2328 acceptor->tcp_rq = tcp_g_q; 2329 acceptor->tcp_wq = WR(tcp_g_q); 2330 (void) tcp_clean_death(acceptor, 0, 2); 2331 CONN_DEC_REF(acceptor->tcp_connp); 2332 2333 /* 2334 * In case we already received a FIN we have to make tcp_rput send 2335 * the ordrel_ind. This will also send up a window update if the window 2336 * has opened up. 2337 * 2338 * In the normal case of a successful connection acceptance 2339 * we give the O_T_BIND_REQ to the read side put procedure as an 2340 * indication that this was just accepted. This tells tcp_rput to 2341 * pass up any data queued in tcp_rcv_list. 
2342 * 2343 * In the fringe case where options sent with T_CONN_RES failed and 2344 * we required, we would be indicating a T_DISCON_IND to blow 2345 * away this connection. 2346 */ 2347 2348 /* 2349 * XXX: we currently have a problem if XTI application closes the 2350 * acceptor stream in between. This problem exists in on10-gate also 2351 * and is well know but nothing can be done short of major rewrite 2352 * to fix it. Now it is possible to take care of it by assigning TLI/XTI 2353 * eager same squeue as listener (we can distinguish non socket 2354 * listeners at the time of handling a SYN in tcp_conn_request) 2355 * and do most of the work that tcp_accept_finish does here itself 2356 * and then get behind the acceptor squeue to access the acceptor 2357 * queue. 2358 */ 2359 /* 2360 * We already have a ref on tcp so no need to do one before squeue_fill 2361 */ 2362 squeue_fill(eager->tcp_connp->conn_sqp, opt_mp, 2363 tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH); 2364 } 2365 2366 /* 2367 * Swap information between the eager and acceptor for a TLI/XTI client. 2368 * The sockfs accept is done on the acceptor stream and control goes 2369 * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not 2370 * called. In either case, both the eager and listener are in their own 2371 * perimeter (squeue) and the code has to deal with potential race. 2372 * 2373 * See the block comment on top of tcp_accept() and tcp_wput_accept(). 2374 */ 2375 static void 2376 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) 2377 { 2378 conn_t *econnp, *aconnp; 2379 2380 ASSERT(eager->tcp_rq == listener->tcp_rq); 2381 ASSERT(eager->tcp_detached && !acceptor->tcp_detached); 2382 ASSERT(!eager->tcp_hard_bound); 2383 ASSERT(!TCP_IS_SOCKET(acceptor)); 2384 ASSERT(!TCP_IS_SOCKET(eager)); 2385 ASSERT(!TCP_IS_SOCKET(listener)); 2386 2387 acceptor->tcp_detached = B_TRUE; 2388 /* 2389 * To permit stream re-use by TLI/XTI, the eager needs a copy of 2390 * the acceptor id. 2391 */ 2392 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; 2393 2394 /* remove eager from listen list... */ 2395 mutex_enter(&listener->tcp_eager_lock); 2396 tcp_eager_unlink(eager); 2397 ASSERT(eager->tcp_eager_next_q == NULL && 2398 eager->tcp_eager_last_q == NULL); 2399 ASSERT(eager->tcp_eager_next_q0 == NULL && 2400 eager->tcp_eager_prev_q0 == NULL); 2401 mutex_exit(&listener->tcp_eager_lock); 2402 eager->tcp_rq = acceptor->tcp_rq; 2403 eager->tcp_wq = acceptor->tcp_wq; 2404 2405 econnp = eager->tcp_connp; 2406 aconnp = acceptor->tcp_connp; 2407 2408 eager->tcp_rq->q_ptr = econnp; 2409 eager->tcp_wq->q_ptr = econnp; 2410 2411 /* 2412 * In the TLI/XTI loopback case, we are inside the listener's squeue, 2413 * which might be a different squeue from our peer TCP instance. 2414 * For TCP Fusion, the peer expects that whenever tcp_detached is 2415 * clear, our TCP queues point to the acceptor's queues. Thus, use 2416 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq 2417 * above reach global visibility prior to the clearing of tcp_detached. 
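 * Conceptually this is the producer half of the usual pairing; a reader
 * on the peer side would be expected to do the mirror image (sketch
 * only, the fusion code itself is elsewhere):
 *
 *	if (!eager->tcp_detached) {
 *		membar_consumer();
 *		(safe to use eager->tcp_rq / eager->tcp_wq)
 *	}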
2418 */ 2419 membar_producer(); 2420 eager->tcp_detached = B_FALSE; 2421 2422 ASSERT(eager->tcp_ack_tid == 0); 2423 2424 econnp->conn_dev = aconnp->conn_dev; 2425 if (eager->tcp_cred != NULL) 2426 crfree(eager->tcp_cred); 2427 eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred; 2428 econnp->conn_zoneid = aconnp->conn_zoneid; 2429 aconnp->conn_cred = NULL; 2430 2431 econnp->conn_mac_exempt = aconnp->conn_mac_exempt; 2432 aconnp->conn_mac_exempt = B_FALSE; 2433 2434 ASSERT(aconnp->conn_peercred == NULL); 2435 2436 /* Do the IPC initialization */ 2437 CONN_INC_REF(econnp); 2438 2439 econnp->conn_multicast_loop = aconnp->conn_multicast_loop; 2440 econnp->conn_af_isv6 = aconnp->conn_af_isv6; 2441 econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6; 2442 econnp->conn_ulp = aconnp->conn_ulp; 2443 2444 /* Done with old IPC. Drop its ref on its connp */ 2445 CONN_DEC_REF(aconnp); 2446 } 2447 2448 2449 /* 2450 * Adapt to the information, such as rtt and rtt_sd, provided from the 2451 * ire cached in conn_cache_ire. If no ire cached, do a ire lookup. 2452 * 2453 * Checks for multicast and broadcast destination address. 2454 * Returns zero on failure; non-zero if ok. 2455 * 2456 * Note that the MSS calculation here is based on the info given in 2457 * the IRE. We do not do any calculation based on TCP options. They 2458 * will be handled in tcp_rput_other() and tcp_rput_data() when TCP 2459 * knows which options to use. 2460 * 2461 * Note on how TCP gets its parameters for a connection. 2462 * 2463 * When a tcp_t structure is allocated, it gets all the default parameters. 2464 * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd, 2465 * spipe, rpipe, ... from the route metrics. Route metric overrides the 2466 * default. But if there is an associated tcp_host_param, it will override 2467 * the metrics. 2468 * 2469 * An incoming SYN with a multicast or broadcast destination address, is dropped 2470 * in 1 of 2 places. 2471 * 2472 * 1. If the packet was received over the wire it is dropped in 2473 * ip_rput_process_broadcast() 2474 * 2475 * 2. If the packet was received through internal IP loopback, i.e. the packet 2476 * was generated and received on the same machine, it is dropped in 2477 * ip_wput_local() 2478 * 2479 * An incoming SYN with a multicast or broadcast source address is always 2480 * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to 2481 * reject an attempt to connect to a broadcast or multicast (destination) 2482 * address. 2483 */ 2484 static int 2485 tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) 2486 { 2487 tcp_hsp_t *hsp; 2488 ire_t *ire; 2489 ire_t *sire = NULL; 2490 iulp_t *ire_uinfo = NULL; 2491 uint32_t mss_max; 2492 uint32_t mss; 2493 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 2494 conn_t *connp = tcp->tcp_connp; 2495 boolean_t ire_cacheable = B_FALSE; 2496 zoneid_t zoneid = connp->conn_zoneid; 2497 int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | 2498 MATCH_IRE_SECATTR; 2499 ts_label_t *tsl = crgetlabel(CONN_CRED(connp)); 2500 ill_t *ill = NULL; 2501 boolean_t incoming = (ire_mp == NULL); 2502 2503 ASSERT(connp->conn_ire_cache == NULL); 2504 2505 if (tcp->tcp_ipversion == IPV4_VERSION) { 2506 2507 if (CLASSD(tcp->tcp_connp->conn_rem)) { 2508 BUMP_MIB(&ip_mib, ipInDiscards); 2509 return (0); 2510 } 2511 /* 2512 * If IP_NEXTHOP is set, then look for an IRE_CACHE 2513 * for the destination with the nexthop as gateway. 2514 * ire_ctable_lookup() is used because this particular 2515 * ire, if it exists, will be marked private. 
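 * (conn_nexthop_set gets turned on by the application's IP_NEXTHOP
 * socket option, roughly setsockopt(fd, IPPROTO_IP, IP_NEXTHOP,
 * &nexthop, sizeof (ipaddr_t)), with "nexthop" standing in for the
 * caller's onlink router address.)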
2516 * If that is not available, use the interface ire 2517 * for the nexthop. 2518 * 2519 * TSol: tcp_update_label will detect label mismatches based 2520 * only on the destination's label, but that would not 2521 * detect label mismatches based on the security attributes 2522 * of routes or next hop gateway. Hence we need to pass the 2523 * label to ire_ftable_lookup below in order to locate the 2524 * right prefix (and/or) ire cache. Similarly we also need 2525 * pass the label to the ire_cache_lookup below to locate 2526 * the right ire that also matches on the label. 2527 */ 2528 if (tcp->tcp_connp->conn_nexthop_set) { 2529 ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem, 2530 tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid, 2531 tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW); 2532 if (ire == NULL) { 2533 ire = ire_ftable_lookup( 2534 tcp->tcp_connp->conn_nexthop_v4, 2535 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0, 2536 tsl, match_flags); 2537 if (ire == NULL) 2538 return (0); 2539 } else { 2540 ire_uinfo = &ire->ire_uinfo; 2541 } 2542 } else { 2543 ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, 2544 zoneid, tsl); 2545 if (ire != NULL) { 2546 ire_cacheable = B_TRUE; 2547 ire_uinfo = (ire_mp != NULL) ? 2548 &((ire_t *)ire_mp->b_rptr)->ire_uinfo: 2549 &ire->ire_uinfo; 2550 2551 } else { 2552 if (ire_mp == NULL) { 2553 ire = ire_ftable_lookup( 2554 tcp->tcp_connp->conn_rem, 2555 0, 0, 0, NULL, &sire, zoneid, 0, 2556 tsl, (MATCH_IRE_RECURSIVE | 2557 MATCH_IRE_DEFAULT)); 2558 if (ire == NULL) 2559 return (0); 2560 ire_uinfo = (sire != NULL) ? 2561 &sire->ire_uinfo : 2562 &ire->ire_uinfo; 2563 } else { 2564 ire = (ire_t *)ire_mp->b_rptr; 2565 ire_uinfo = 2566 &((ire_t *) 2567 ire_mp->b_rptr)->ire_uinfo; 2568 } 2569 } 2570 } 2571 ASSERT(ire != NULL); 2572 2573 if ((ire->ire_src_addr == INADDR_ANY) || 2574 (ire->ire_type & IRE_BROADCAST)) { 2575 /* 2576 * ire->ire_mp is non null when ire_mp passed in is used 2577 * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 2578 */ 2579 if (ire->ire_mp == NULL) 2580 ire_refrele(ire); 2581 if (sire != NULL) 2582 ire_refrele(sire); 2583 return (0); 2584 } 2585 2586 if (tcp->tcp_ipha->ipha_src == INADDR_ANY) { 2587 ipaddr_t src_addr; 2588 2589 /* 2590 * ip_bind_connected() has stored the correct source 2591 * address in conn_src. 2592 */ 2593 src_addr = tcp->tcp_connp->conn_src; 2594 tcp->tcp_ipha->ipha_src = src_addr; 2595 /* 2596 * Copy of the src addr. in tcp_t is needed 2597 * for the lookup funcs. 2598 */ 2599 IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6); 2600 } 2601 /* 2602 * Set the fragment bit so that IP will tell us if the MTU 2603 * should change. IP tells us the latest setting of 2604 * ip_path_mtu_discovery through ire_frag_flag. 2605 */ 2606 if (ip_path_mtu_discovery) { 2607 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 2608 htons(IPH_DF); 2609 } 2610 /* 2611 * If ire_uinfo is NULL, this is the IRE_INTERFACE case 2612 * for IP_NEXTHOP. No cache ire has been found for the 2613 * destination and we are working with the nexthop's 2614 * interface ire. Since we need to forward all packets 2615 * to the nexthop first, we "blindly" set tcp_localnet 2616 * to false, eventhough the destination may also be 2617 * onlink. 
2618 */ 2619 if (ire_uinfo == NULL) 2620 tcp->tcp_localnet = 0; 2621 else 2622 tcp->tcp_localnet = (ire->ire_gateway_addr == 0); 2623 } else { 2624 /* 2625 * For incoming connection ire_mp = NULL 2626 * For outgoing connection ire_mp != NULL 2627 * Technically we should check conn_incoming_ill 2628 * when ire_mp is NULL and conn_outgoing_ill when 2629 * ire_mp is non-NULL. But this is performance 2630 * critical path and for IPV*_BOUND_IF, outgoing 2631 * and incoming ill are always set to the same value. 2632 */ 2633 ill_t *dst_ill = NULL; 2634 ipif_t *dst_ipif = NULL; 2635 2636 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); 2637 2638 if (connp->conn_outgoing_ill != NULL) { 2639 /* Outgoing or incoming path */ 2640 int err; 2641 2642 dst_ill = conn_get_held_ill(connp, 2643 &connp->conn_outgoing_ill, &err); 2644 if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) { 2645 ip1dbg(("tcp_adapt_ire: ill_lookup failed\n")); 2646 return (0); 2647 } 2648 match_flags |= MATCH_IRE_ILL; 2649 dst_ipif = dst_ill->ill_ipif; 2650 } 2651 ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6, 2652 0, 0, dst_ipif, zoneid, tsl, match_flags); 2653 2654 if (ire != NULL) { 2655 ire_cacheable = B_TRUE; 2656 ire_uinfo = (ire_mp != NULL) ? 2657 &((ire_t *)ire_mp->b_rptr)->ire_uinfo: 2658 &ire->ire_uinfo; 2659 } else { 2660 if (ire_mp == NULL) { 2661 ire = ire_ftable_lookup_v6( 2662 &tcp->tcp_connp->conn_remv6, 2663 0, 0, 0, dst_ipif, &sire, zoneid, 2664 0, tsl, match_flags); 2665 if (ire == NULL) { 2666 if (dst_ill != NULL) 2667 ill_refrele(dst_ill); 2668 return (0); 2669 } 2670 ire_uinfo = (sire != NULL) ? &sire->ire_uinfo : 2671 &ire->ire_uinfo; 2672 } else { 2673 ire = (ire_t *)ire_mp->b_rptr; 2674 ire_uinfo = 2675 &((ire_t *)ire_mp->b_rptr)->ire_uinfo; 2676 } 2677 } 2678 if (dst_ill != NULL) 2679 ill_refrele(dst_ill); 2680 2681 ASSERT(ire != NULL); 2682 ASSERT(ire_uinfo != NULL); 2683 2684 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) || 2685 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { 2686 /* 2687 * ire->ire_mp is non null when ire_mp passed in is used 2688 * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 2689 */ 2690 if (ire->ire_mp == NULL) 2691 ire_refrele(ire); 2692 if (sire != NULL) 2693 ire_refrele(sire); 2694 return (0); 2695 } 2696 2697 if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { 2698 in6_addr_t src_addr; 2699 2700 /* 2701 * ip_bind_connected_v6() has stored the correct source 2702 * address per IPv6 addr. selection policy in 2703 * conn_src_v6. 2704 */ 2705 src_addr = tcp->tcp_connp->conn_srcv6; 2706 2707 tcp->tcp_ip6h->ip6_src = src_addr; 2708 /* 2709 * Copy of the src addr. in tcp_t is needed 2710 * for the lookup funcs. 2711 */ 2712 tcp->tcp_ip_src_v6 = src_addr; 2713 ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src, 2714 &connp->conn_srcv6)); 2715 } 2716 tcp->tcp_localnet = 2717 IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6); 2718 } 2719 2720 /* 2721 * This allows applications to fail quickly when connections are made 2722 * to dead hosts. Hosts can be labeled dead by adding a reject route 2723 * with both the RTF_REJECT and RTF_PRIVATE flags set. 2724 */ 2725 if ((ire->ire_flags & RTF_REJECT) && 2726 (ire->ire_flags & RTF_PRIVATE)) 2727 goto error; 2728 2729 /* 2730 * Make use of the cached rtt and rtt_sd values to calculate the 2731 * initial RTO. Note that they are already initialized in 2732 * tcp_init_values(). 
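 * The computation below follows the familiar shape
 *
 *	rto = (tcp_rtt_sa >> 3) + tcp_rtt_sd +
 *	    tcp_rexmit_interval_extra + (tcp_rtt_sa >> 5);
 *
 * clamped to [tcp_rexmit_interval_min, tcp_rexmit_interval_max].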
2733 * If ire_uinfo is NULL, i.e., we do not have a cache ire for 2734 * IP_NEXTHOP, but instead are using the interface ire for the 2735 * nexthop, then we do not use the ire_uinfo from that ire to 2736 * do any initializations. 2737 */ 2738 if (ire_uinfo != NULL) { 2739 if (ire_uinfo->iulp_rtt != 0) { 2740 clock_t rto; 2741 2742 tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt; 2743 tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd; 2744 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 2745 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5); 2746 2747 if (rto > tcp_rexmit_interval_max) { 2748 tcp->tcp_rto = tcp_rexmit_interval_max; 2749 } else if (rto < tcp_rexmit_interval_min) { 2750 tcp->tcp_rto = tcp_rexmit_interval_min; 2751 } else { 2752 tcp->tcp_rto = rto; 2753 } 2754 } 2755 if (ire_uinfo->iulp_ssthresh != 0) 2756 tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh; 2757 else 2758 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 2759 if (ire_uinfo->iulp_spipe > 0) { 2760 tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe, 2761 tcp_max_buf); 2762 if (tcp_snd_lowat_fraction != 0) 2763 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / 2764 tcp_snd_lowat_fraction; 2765 (void) tcp_maxpsz_set(tcp, B_TRUE); 2766 } 2767 /* 2768 * Note that up till now, acceptor always inherits receive 2769 * window from the listener. But if there is a metrics 2770 * associated with a host, we should use that instead of 2771 * inheriting it from listener. Thus we need to pass this 2772 * info back to the caller. 2773 */ 2774 if (ire_uinfo->iulp_rpipe > 0) { 2775 tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, tcp_max_buf); 2776 } 2777 2778 if (ire_uinfo->iulp_rtomax > 0) { 2779 tcp->tcp_second_timer_threshold = 2780 ire_uinfo->iulp_rtomax; 2781 } 2782 2783 /* 2784 * Use the metric option settings, iulp_tstamp_ok and 2785 * iulp_wscale_ok, only for active open. What this means 2786 * is that if the other side uses timestamp or window 2787 * scale option, TCP will also use those options. That 2788 * is for passive open. If the application sets a 2789 * large window, window scale is enabled regardless of 2790 * the value in iulp_wscale_ok. This is the behavior 2791 * since 2.6. So we keep it. 2792 * The only case left in passive open processing is the 2793 * check for SACK. 2794 * For ECN, it should probably be like SACK. But the 2795 * current value is binary, so we treat it like the other 2796 * cases. The metric only controls active open.For passive 2797 * open, the ndd param, tcp_ecn_permitted, controls the 2798 * behavior. 2799 */ 2800 if (!tcp_detached) { 2801 /* 2802 * The if check means that the following can only 2803 * be turned on by the metrics only IRE, but not off. 2804 */ 2805 if (ire_uinfo->iulp_tstamp_ok) 2806 tcp->tcp_snd_ts_ok = B_TRUE; 2807 if (ire_uinfo->iulp_wscale_ok) 2808 tcp->tcp_snd_ws_ok = B_TRUE; 2809 if (ire_uinfo->iulp_sack == 2) 2810 tcp->tcp_snd_sack_ok = B_TRUE; 2811 if (ire_uinfo->iulp_ecn_ok) 2812 tcp->tcp_ecn_ok = B_TRUE; 2813 } else { 2814 /* 2815 * Passive open. 2816 * 2817 * As above, the if check means that SACK can only be 2818 * turned on by the metric only IRE. 2819 */ 2820 if (ire_uinfo->iulp_sack > 0) { 2821 tcp->tcp_snd_sack_ok = B_TRUE; 2822 } 2823 } 2824 } 2825 2826 2827 /* 2828 * XXX: Note that currently, ire_max_frag can be as small as 68 2829 * because of PMTUd. So tcp_mss may go to negative if combined 2830 * length of all those options exceeds 28 bytes. But because 2831 * of the tcp_mss_min check below, we may not have a problem if 2832 * tcp_mss_min is of a reasonable value. 
The default is 1 so 2833 * the negative problem still exists. And the check defeats PMTUd. 2834 * In fact, if PMTUd finds that the MSS should be smaller than 2835 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min 2836 * value. 2837 * 2838 * We do not deal with that now. All those problems related to 2839 * PMTUd will be fixed later. 2840 */ 2841 ASSERT(ire->ire_max_frag != 0); 2842 mss = tcp->tcp_if_mtu = ire->ire_max_frag; 2843 if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) { 2844 if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) { 2845 mss = MIN(mss, IPV6_MIN_MTU); 2846 } 2847 } 2848 2849 /* Sanity check for MSS value. */ 2850 if (tcp->tcp_ipversion == IPV4_VERSION) 2851 mss_max = tcp_mss_max_ipv4; 2852 else 2853 mss_max = tcp_mss_max_ipv6; 2854 2855 if (tcp->tcp_ipversion == IPV6_VERSION && 2856 (ire->ire_frag_flag & IPH_FRAG_HDR)) { 2857 /* 2858 * After receiving an ICMPv6 "packet too big" message with a 2859 * MTU < 1280, and for multirouted IPv6 packets, the IP layer 2860 * will insert a 8-byte fragment header in every packet; we 2861 * reduce the MSS by that amount here. 2862 */ 2863 mss -= sizeof (ip6_frag_t); 2864 } 2865 2866 if (tcp->tcp_ipsec_overhead == 0) 2867 tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); 2868 2869 mss -= tcp->tcp_ipsec_overhead; 2870 2871 if (mss < tcp_mss_min) 2872 mss = tcp_mss_min; 2873 if (mss > mss_max) 2874 mss = mss_max; 2875 2876 /* Note that this is the maximum MSS, excluding all options. */ 2877 tcp->tcp_mss = mss; 2878 2879 /* 2880 * Initialize the ISS here now that we have the full connection ID. 2881 * The RFC 1948 method of initial sequence number generation requires 2882 * knowledge of the full connection ID before setting the ISS. 2883 */ 2884 2885 tcp_iss_init(tcp); 2886 2887 if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL)) 2888 tcp->tcp_loopback = B_TRUE; 2889 2890 if (tcp->tcp_ipversion == IPV4_VERSION) { 2891 hsp = tcp_hsp_lookup(tcp->tcp_remote); 2892 } else { 2893 hsp = tcp_hsp_lookup_ipv6(&tcp->tcp_remote_v6); 2894 } 2895 2896 if (hsp != NULL) { 2897 /* Only modify if we're going to make them bigger */ 2898 if (hsp->tcp_hsp_sendspace > tcp->tcp_xmit_hiwater) { 2899 tcp->tcp_xmit_hiwater = hsp->tcp_hsp_sendspace; 2900 if (tcp_snd_lowat_fraction != 0) 2901 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / 2902 tcp_snd_lowat_fraction; 2903 } 2904 2905 if (hsp->tcp_hsp_recvspace > tcp->tcp_rwnd) { 2906 tcp->tcp_rwnd = hsp->tcp_hsp_recvspace; 2907 } 2908 2909 /* Copy timestamp flag only for active open */ 2910 if (!tcp_detached) 2911 tcp->tcp_snd_ts_ok = hsp->tcp_hsp_tstamp; 2912 } 2913 2914 if (sire != NULL) 2915 IRE_REFRELE(sire); 2916 2917 /* 2918 * If we got an IRE_CACHE and an ILL, go through their properties; 2919 * otherwise, this is deferred until later when we have an IRE_CACHE. 2920 */ 2921 if (tcp->tcp_loopback || 2922 (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) { 2923 /* 2924 * For incoming, see if this tcp may be MDT-capable. For 2925 * outgoing, this process has been taken care of through 2926 * tcp_rput_other. 2927 */ 2928 tcp_ire_ill_check(tcp, ire, ill, incoming); 2929 tcp->tcp_ire_ill_check_done = B_TRUE; 2930 } 2931 2932 mutex_enter(&connp->conn_lock); 2933 /* 2934 * Make sure that conn is not marked incipient 2935 * for incoming connections. A blind 2936 * removal of incipient flag is cheaper than 2937 * check and removal. 2938 */ 2939 connp->conn_state_flags &= ~CONN_INCIPIENT; 2940 2941 /* Must not cache forwarding table routes. 
*/ 2942 if (ire_cacheable) { 2943 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 2944 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 2945 connp->conn_ire_cache = ire; 2946 IRE_UNTRACE_REF(ire); 2947 rw_exit(&ire->ire_bucket->irb_lock); 2948 mutex_exit(&connp->conn_lock); 2949 return (1); 2950 } 2951 rw_exit(&ire->ire_bucket->irb_lock); 2952 } 2953 mutex_exit(&connp->conn_lock); 2954 2955 if (ire->ire_mp == NULL) 2956 ire_refrele(ire); 2957 return (1); 2958 2959 error: 2960 if (ire->ire_mp == NULL) 2961 ire_refrele(ire); 2962 if (sire != NULL) 2963 ire_refrele(sire); 2964 return (0); 2965 } 2966 2967 /* 2968 * tcp_bind is called (holding the writer lock) by tcp_wput_proto to process a 2969 * O_T_BIND_REQ/T_BIND_REQ message. 2970 */ 2971 static void 2972 tcp_bind(tcp_t *tcp, mblk_t *mp) 2973 { 2974 sin_t *sin; 2975 sin6_t *sin6; 2976 mblk_t *mp1; 2977 in_port_t requested_port; 2978 in_port_t allocated_port; 2979 struct T_bind_req *tbr; 2980 boolean_t bind_to_req_port_only; 2981 boolean_t backlog_update = B_FALSE; 2982 boolean_t user_specified; 2983 in6_addr_t v6addr; 2984 ipaddr_t v4addr; 2985 uint_t origipversion; 2986 int err; 2987 queue_t *q = tcp->tcp_wq; 2988 conn_t *connp; 2989 mlp_type_t addrtype, mlptype; 2990 zone_t *zone; 2991 cred_t *cr; 2992 in_port_t mlp_port; 2993 2994 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 2995 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { 2996 if (tcp->tcp_debug) { 2997 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 2998 "tcp_bind: bad req, len %u", 2999 (uint_t)(mp->b_wptr - mp->b_rptr)); 3000 } 3001 tcp_err_ack(tcp, mp, TPROTO, 0); 3002 return; 3003 } 3004 /* Make sure the largest address fits */ 3005 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); 3006 if (mp1 == NULL) { 3007 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3008 return; 3009 } 3010 mp = mp1; 3011 tbr = (struct T_bind_req *)mp->b_rptr; 3012 if (tcp->tcp_state >= TCPS_BOUND) { 3013 if ((tcp->tcp_state == TCPS_BOUND || 3014 tcp->tcp_state == TCPS_LISTEN) && 3015 tcp->tcp_conn_req_max != tbr->CONIND_number && 3016 tbr->CONIND_number > 0) { 3017 /* 3018 * Handle listen() increasing CONIND_number. 3019 * This is more "liberal" then what the TPI spec 3020 * requires but is needed to avoid a t_unbind 3021 * when handling listen() since the port number 3022 * might be "stolen" between the unbind and bind. 
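 * For a socket endpoint this typically corresponds to nothing more than
 * the application calling listen() again with a larger backlog, e.g.
 *
 *	listen(s, 5);	(first bind/listen, CONIND_number 5)
 *	listen(s, 128);	(arrives here with CONIND_number 128)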
3023 */ 3024 backlog_update = B_TRUE; 3025 goto do_bind; 3026 } 3027 if (tcp->tcp_debug) { 3028 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3029 "tcp_bind: bad state, %d", tcp->tcp_state); 3030 } 3031 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 3032 return; 3033 } 3034 origipversion = tcp->tcp_ipversion; 3035 3036 switch (tbr->ADDR_length) { 3037 case 0: /* request for a generic port */ 3038 tbr->ADDR_offset = sizeof (struct T_bind_req); 3039 if (tcp->tcp_family == AF_INET) { 3040 tbr->ADDR_length = sizeof (sin_t); 3041 sin = (sin_t *)&tbr[1]; 3042 *sin = sin_null; 3043 sin->sin_family = AF_INET; 3044 mp->b_wptr = (uchar_t *)&sin[1]; 3045 tcp->tcp_ipversion = IPV4_VERSION; 3046 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr); 3047 } else { 3048 ASSERT(tcp->tcp_family == AF_INET6); 3049 tbr->ADDR_length = sizeof (sin6_t); 3050 sin6 = (sin6_t *)&tbr[1]; 3051 *sin6 = sin6_null; 3052 sin6->sin6_family = AF_INET6; 3053 mp->b_wptr = (uchar_t *)&sin6[1]; 3054 tcp->tcp_ipversion = IPV6_VERSION; 3055 V6_SET_ZERO(v6addr); 3056 } 3057 requested_port = 0; 3058 break; 3059 3060 case sizeof (sin_t): /* Complete IPv4 address */ 3061 sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset, 3062 sizeof (sin_t)); 3063 if (sin == NULL || !OK_32PTR((char *)sin)) { 3064 if (tcp->tcp_debug) { 3065 (void) strlog(TCP_MOD_ID, 0, 1, 3066 SL_ERROR|SL_TRACE, 3067 "tcp_bind: bad address parameter, " 3068 "offset %d, len %d", 3069 tbr->ADDR_offset, tbr->ADDR_length); 3070 } 3071 tcp_err_ack(tcp, mp, TPROTO, 0); 3072 return; 3073 } 3074 /* 3075 * With sockets sockfs will accept bogus sin_family in 3076 * bind() and replace it with the family used in the socket 3077 * call. 3078 */ 3079 if (sin->sin_family != AF_INET || 3080 tcp->tcp_family != AF_INET) { 3081 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 3082 return; 3083 } 3084 requested_port = ntohs(sin->sin_port); 3085 tcp->tcp_ipversion = IPV4_VERSION; 3086 v4addr = sin->sin_addr.s_addr; 3087 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); 3088 break; 3089 3090 case sizeof (sin6_t): /* Complete IPv6 address */ 3091 sin6 = (sin6_t *)mi_offset_param(mp, 3092 tbr->ADDR_offset, sizeof (sin6_t)); 3093 if (sin6 == NULL || !OK_32PTR((char *)sin6)) { 3094 if (tcp->tcp_debug) { 3095 (void) strlog(TCP_MOD_ID, 0, 1, 3096 SL_ERROR|SL_TRACE, 3097 "tcp_bind: bad IPv6 address parameter, " 3098 "offset %d, len %d", tbr->ADDR_offset, 3099 tbr->ADDR_length); 3100 } 3101 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 3102 return; 3103 } 3104 if (sin6->sin6_family != AF_INET6 || 3105 tcp->tcp_family != AF_INET6) { 3106 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 3107 return; 3108 } 3109 requested_port = ntohs(sin6->sin6_port); 3110 tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? 3111 IPV4_VERSION : IPV6_VERSION; 3112 v6addr = sin6->sin6_addr; 3113 break; 3114 3115 default: 3116 if (tcp->tcp_debug) { 3117 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 3118 "tcp_bind: bad address length, %d", 3119 tbr->ADDR_length); 3120 } 3121 tcp_err_ack(tcp, mp, TBADADDR, 0); 3122 return; 3123 } 3124 tcp->tcp_bound_source_v6 = v6addr; 3125 3126 /* Check for change in ipversion */ 3127 if (origipversion != tcp->tcp_ipversion) { 3128 ASSERT(tcp->tcp_family == AF_INET6); 3129 err = tcp->tcp_ipversion == IPV6_VERSION ? 3130 tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp); 3131 if (err) { 3132 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3133 return; 3134 } 3135 } 3136 3137 /* 3138 * Initialize family specific fields. Copy of the src addr. 3139 * in tcp_t is needed for the lookup funcs. 
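 * Note that an IPv4 bind address is kept in IPv4-mapped form here, e.g.
 * 10.1.2.3 is stored in tcp_ip_src_v6 as ::ffff:10.1.2.3 (that is what
 * the IN6_IPADDR_TO_V4MAPPED() calls above produced).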
3140 */ 3141 if (tcp->tcp_ipversion == IPV6_VERSION) { 3142 tcp->tcp_ip6h->ip6_src = v6addr; 3143 } else { 3144 IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src); 3145 } 3146 tcp->tcp_ip_src_v6 = v6addr; 3147 3148 /* 3149 * For O_T_BIND_REQ: 3150 * Verify that the target port/addr is available, or choose 3151 * another. 3152 * For T_BIND_REQ: 3153 * Verify that the target port/addr is available or fail. 3154 * In both cases when it succeeds the tcp is inserted in the 3155 * bind hash table. This ensures that the operation is atomic 3156 * under the lock on the hash bucket. 3157 */ 3158 bind_to_req_port_only = requested_port != 0 && 3159 tbr->PRIM_type != O_T_BIND_REQ; 3160 /* 3161 * Get a valid port (within the anonymous range and should not 3162 * be a privileged one) to use if the user has not given a port. 3163 * If multiple threads are here, they may all start with 3164 * with the same initial port. But, it should be fine as long as 3165 * tcp_bindi will ensure that no two threads will be assigned 3166 * the same port. 3167 * 3168 * NOTE: XXX If a privileged process asks for an anonymous port, we 3169 * still check for ports only in the range > tcp_smallest_non_priv_port, 3170 * unless TCP_ANONPRIVBIND option is set. 3171 */ 3172 mlptype = mlptSingle; 3173 mlp_port = requested_port; 3174 if (requested_port == 0) { 3175 requested_port = tcp->tcp_anon_priv_bind ? 3176 tcp_get_next_priv_port(tcp) : 3177 tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); 3178 if (requested_port == 0) { 3179 tcp_err_ack(tcp, mp, TNOADDR, 0); 3180 return; 3181 } 3182 user_specified = B_FALSE; 3183 3184 /* 3185 * If the user went through one of the RPC interfaces to create 3186 * this socket and RPC is MLP in this zone, then give him an 3187 * anonymous MLP. 3188 */ 3189 cr = DB_CREDDEF(mp, tcp->tcp_cred); 3190 connp = tcp->tcp_connp; 3191 if (connp->conn_anon_mlp && is_system_labeled()) { 3192 zone = crgetzone(cr); 3193 addrtype = tsol_mlp_addr_type(zone->zone_id, 3194 IPV6_VERSION, &v6addr); 3195 if (addrtype == mlptSingle) { 3196 tcp_err_ack(tcp, mp, TNOADDR, 0); 3197 return; 3198 } 3199 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 3200 PMAPPORT, addrtype); 3201 mlp_port = PMAPPORT; 3202 } 3203 } else { 3204 int i; 3205 boolean_t priv = B_FALSE; 3206 3207 /* 3208 * If the requested_port is in the well-known privileged range, 3209 * verify that the stream was opened by a privileged user. 
3210 * Note: No locks are held when inspecting tcp_g_*epriv_ports 3211 * but instead the code relies on: 3212 * - the fact that the address of the array and its size never 3213 * changes 3214 * - the atomic assignment of the elements of the array 3215 */ 3216 cr = DB_CREDDEF(mp, tcp->tcp_cred); 3217 if (requested_port < tcp_smallest_nonpriv_port) { 3218 priv = B_TRUE; 3219 } else { 3220 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 3221 if (requested_port == 3222 tcp_g_epriv_ports[i]) { 3223 priv = B_TRUE; 3224 break; 3225 } 3226 } 3227 } 3228 if (priv) { 3229 if (secpolicy_net_privaddr(cr, requested_port) != 0) { 3230 if (tcp->tcp_debug) { 3231 (void) strlog(TCP_MOD_ID, 0, 1, 3232 SL_ERROR|SL_TRACE, 3233 "tcp_bind: no priv for port %d", 3234 requested_port); 3235 } 3236 tcp_err_ack(tcp, mp, TACCES, 0); 3237 return; 3238 } 3239 } 3240 user_specified = B_TRUE; 3241 3242 connp = tcp->tcp_connp; 3243 if (is_system_labeled()) { 3244 zone = crgetzone(cr); 3245 addrtype = tsol_mlp_addr_type(zone->zone_id, 3246 IPV6_VERSION, &v6addr); 3247 if (addrtype == mlptSingle) { 3248 tcp_err_ack(tcp, mp, TNOADDR, 0); 3249 return; 3250 } 3251 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 3252 requested_port, addrtype); 3253 } 3254 } 3255 3256 if (mlptype != mlptSingle) { 3257 if (secpolicy_net_bindmlp(cr) != 0) { 3258 if (tcp->tcp_debug) { 3259 (void) strlog(TCP_MOD_ID, 0, 1, 3260 SL_ERROR|SL_TRACE, 3261 "tcp_bind: no priv for multilevel port %d", 3262 requested_port); 3263 } 3264 tcp_err_ack(tcp, mp, TACCES, 0); 3265 return; 3266 } 3267 3268 /* 3269 * If we're specifically binding a shared IP address and the 3270 * port is MLP on shared addresses, then check to see if this 3271 * zone actually owns the MLP. Reject if not. 3272 */ 3273 if (mlptype == mlptShared && addrtype == mlptShared) { 3274 zoneid_t mlpzone; 3275 3276 mlpzone = tsol_mlp_findzone(IPPROTO_TCP, 3277 htons(mlp_port)); 3278 if (connp->conn_zoneid != mlpzone) { 3279 if (tcp->tcp_debug) { 3280 (void) strlog(TCP_MOD_ID, 0, 1, 3281 SL_ERROR|SL_TRACE, 3282 "tcp_bind: attempt to bind port " 3283 "%d on shared addr in zone %d " 3284 "(should be %d)", 3285 mlp_port, connp->conn_zoneid, 3286 mlpzone); 3287 } 3288 tcp_err_ack(tcp, mp, TACCES, 0); 3289 return; 3290 } 3291 } 3292 3293 if (!user_specified) { 3294 err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3295 requested_port, B_TRUE); 3296 if (err != 0) { 3297 if (tcp->tcp_debug) { 3298 (void) strlog(TCP_MOD_ID, 0, 1, 3299 SL_ERROR|SL_TRACE, 3300 "tcp_bind: cannot establish anon " 3301 "MLP for port %d", 3302 requested_port); 3303 } 3304 tcp_err_ack(tcp, mp, TSYSERR, err); 3305 return; 3306 } 3307 connp->conn_anon_port = B_TRUE; 3308 } 3309 connp->conn_mlp_type = mlptype; 3310 } 3311 3312 allocated_port = tcp_bindi(tcp, requested_port, &v6addr, 3313 tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified); 3314 3315 if (allocated_port == 0) { 3316 connp->conn_mlp_type = mlptSingle; 3317 if (connp->conn_anon_port) { 3318 connp->conn_anon_port = B_FALSE; 3319 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3320 requested_port, B_FALSE); 3321 } 3322 if (bind_to_req_port_only) { 3323 if (tcp->tcp_debug) { 3324 (void) strlog(TCP_MOD_ID, 0, 1, 3325 SL_ERROR|SL_TRACE, 3326 "tcp_bind: requested addr busy"); 3327 } 3328 tcp_err_ack(tcp, mp, TADDRBUSY, 0); 3329 } else { 3330 /* If we are out of ports, fail the bind. 
*/ 3331 if (tcp->tcp_debug) { 3332 (void) strlog(TCP_MOD_ID, 0, 1, 3333 SL_ERROR|SL_TRACE, 3334 "tcp_bind: out of ports?"); 3335 } 3336 tcp_err_ack(tcp, mp, TNOADDR, 0); 3337 } 3338 return; 3339 } 3340 ASSERT(tcp->tcp_state == TCPS_BOUND); 3341 do_bind: 3342 if (!backlog_update) { 3343 if (tcp->tcp_family == AF_INET) 3344 sin->sin_port = htons(allocated_port); 3345 else 3346 sin6->sin6_port = htons(allocated_port); 3347 } 3348 if (tcp->tcp_family == AF_INET) { 3349 if (tbr->CONIND_number != 0) { 3350 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3351 sizeof (sin_t)); 3352 } else { 3353 /* Just verify the local IP address */ 3354 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, IP_ADDR_LEN); 3355 } 3356 } else { 3357 if (tbr->CONIND_number != 0) { 3358 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3359 sizeof (sin6_t)); 3360 } else { 3361 /* Just verify the local IP address */ 3362 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, 3363 IPV6_ADDR_LEN); 3364 } 3365 } 3366 if (mp1 == NULL) { 3367 if (connp->conn_anon_port) { 3368 connp->conn_anon_port = B_FALSE; 3369 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, 3370 requested_port, B_FALSE); 3371 } 3372 connp->conn_mlp_type = mlptSingle; 3373 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 3374 return; 3375 } 3376 3377 tbr->PRIM_type = T_BIND_ACK; 3378 mp->b_datap->db_type = M_PCPROTO; 3379 3380 /* Chain in the reply mp for tcp_rput() */ 3381 mp1->b_cont = mp; 3382 mp = mp1; 3383 3384 tcp->tcp_conn_req_max = tbr->CONIND_number; 3385 if (tcp->tcp_conn_req_max) { 3386 if (tcp->tcp_conn_req_max < tcp_conn_req_min) 3387 tcp->tcp_conn_req_max = tcp_conn_req_min; 3388 if (tcp->tcp_conn_req_max > tcp_conn_req_max_q) 3389 tcp->tcp_conn_req_max = tcp_conn_req_max_q; 3390 /* 3391 * If this is already a listener, do not reset the eager list 3392 * and other things. Note that we don't check if the 3393 * existing eager list meets the new tcp_conn_req_max 3394 * requirement. 3395 */ 3396 if (tcp->tcp_state != TCPS_LISTEN) { 3397 tcp->tcp_state = TCPS_LISTEN; 3398 /* Initialize the chain. Don't need the eager_lock */ 3399 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 3400 tcp->tcp_second_ctimer_threshold = 3401 tcp_ip_abort_linterval; 3402 } 3403 } 3404 3405 /* 3406 * We can call ip_bind directly which returns a T_BIND_ACK mp. The 3407 * processing continues in tcp_rput_other(). 3408 */ 3409 if (tcp->tcp_family == AF_INET6) { 3410 ASSERT(tcp->tcp_connp->conn_af_isv6); 3411 mp = ip_bind_v6(q, mp, tcp->tcp_connp, &tcp->tcp_sticky_ipp); 3412 } else { 3413 ASSERT(!tcp->tcp_connp->conn_af_isv6); 3414 mp = ip_bind_v4(q, mp, tcp->tcp_connp); 3415 } 3416 /* 3417 * If the bind cannot complete immediately 3418 * IP will arrange to call tcp_rput_other 3419 * when the bind completes. 3420 */ 3421 if (mp != NULL) { 3422 tcp_rput_other(tcp, mp); 3423 } else { 3424 /* 3425 * Bind will be resumed later. Need to ensure 3426 * that conn doesn't disappear when that happens. 3427 * This will be decremented in ip_resume_tcp_bind(). 3428 */ 3429 CONN_INC_REF(tcp->tcp_connp); 3430 } 3431 } 3432 3433 3434 /* 3435 * If the "bind_to_req_port_only" parameter is set and the requested port 3436 * number is available, return it. If not, return 0. 3437 * 3438 * If the "bind_to_req_port_only" parameter is not set and 3439 * the requested port number is available, return it. If not, return 3440 * the first anonymous port we happen across. If no anonymous ports are 3441 * available, return 0. addr is the requested local address, if any.
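 *
 * A rough sketch of the two modes (port numbers here are
 * illustrative only):
 *
 *	requested port 8080 free:
 *		either mode returns 8080
 *	requested port 8080 busy, bind_to_req_port_only set:
 *		return 0 (tcp_bind answers with TADDRBUSY)
 *	requested port 8080 busy, bind_to_req_port_only clear:
 *		return a free port from the anonymous range, or 0 if
 *		none is left (tcp_bind answers with TNOADDR)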
3442 * 3443 * In either case, when succeeding update the tcp_t to record the port number 3444 * and insert it in the bind hash table. 3445 * 3446 * Note that TCP over IPv4 and IPv6 sockets can use the same port number 3447 * without setting SO_REUSEADDR. This is needed so that they 3448 * can be viewed as two independent transport protocols. 3449 */ 3450 static in_port_t 3451 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 3452 int reuseaddr, boolean_t quick_connect, 3453 boolean_t bind_to_req_port_only, boolean_t user_specified) 3454 { 3455 /* number of times we have run around the loop */ 3456 int count = 0; 3457 /* maximum number of times to run around the loop */ 3458 int loopmax; 3459 conn_t *connp = tcp->tcp_connp; 3460 zoneid_t zoneid = connp->conn_zoneid; 3461 3462 /* 3463 * Lookup for free addresses is done in a loop and "loopmax" 3464 * influences how long we spin in the loop 3465 */ 3466 if (bind_to_req_port_only) { 3467 /* 3468 * If the requested port is busy, don't bother to look 3469 * for a new one. Setting loop maximum count to 1 has 3470 * that effect. 3471 */ 3472 loopmax = 1; 3473 } else { 3474 /* 3475 * If the requested port is busy, look for a free one 3476 * in the anonymous port range. 3477 * Set loopmax appropriately so that one does not look 3478 * forever in the case all of the anonymous ports are in use. 3479 */ 3480 if (tcp->tcp_anon_priv_bind) { 3481 /* 3482 * loopmax = 3483 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 3484 */ 3485 loopmax = IPPORT_RESERVED - tcp_min_anonpriv_port; 3486 } else { 3487 loopmax = (tcp_largest_anon_port - 3488 tcp_smallest_anon_port + 1); 3489 } 3490 } 3491 do { 3492 uint16_t lport; 3493 tf_t *tbf; 3494 tcp_t *ltcp; 3495 conn_t *lconnp; 3496 3497 lport = htons(port); 3498 3499 /* 3500 * Ensure that the tcp_t is not currently in the bind hash. 3501 * Hold the lock on the hash bucket to ensure that 3502 * the duplicate check plus the insertion is an atomic 3503 * operation. 3504 * 3505 * This function does an inline lookup on the bind hash list 3506 * Make sure that we access only members of tcp_t 3507 * and that we don't look at tcp_tcp, since we are not 3508 * doing a CONN_INC_REF. 3509 */ 3510 tcp_bind_hash_remove(tcp); 3511 tbf = &tcp_bind_fanout[TCP_BIND_HASH(lport)]; 3512 mutex_enter(&tbf->tf_lock); 3513 for (ltcp = tbf->tf_tcp; ltcp != NULL; 3514 ltcp = ltcp->tcp_bind_hash) { 3515 boolean_t not_socket; 3516 boolean_t exclbind; 3517 3518 if (lport != ltcp->tcp_lport) 3519 continue; 3520 3521 lconnp = ltcp->tcp_connp; 3522 3523 /* 3524 * On a labeled system, we must treat bindings to ports 3525 * on shared IP addresses by sockets with MAC exemption 3526 * privilege as being in all zones, as there's 3527 * otherwise no way to identify the right receiver. 3528 */ 3529 if (!IPCL_ZONE_MATCH(ltcp->tcp_connp, zoneid) && 3530 !lconnp->conn_mac_exempt && 3531 !connp->conn_mac_exempt) 3532 continue; 3533 3534 /* 3535 * If TCP_EXCLBIND is set for either the bound or 3536 * binding endpoint, the semantics of bind 3537 * is changed according to the following. 3538 * 3539 * spec = specified address (v4 or v6) 3540 * unspec = unspecified address (v4 or v6) 3541 * A = specified addresses are different for endpoints 3542 * 3543 * bound bind to allowed 3544 * ------------------------------------- 3545 * unspec unspec no 3546 * unspec spec no 3547 * spec unspec no 3548 * spec spec yes if A 3549 * 3550 * For labeled systems, SO_MAC_EXEMPT behaves the same 3551 * as TCP_EXCLBIND, except that zoneid is ignored. 
3552 * 3553 * Note: 3554 * 3555 * 1. Because of TLI semantics, an endpoint can go 3556 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or 3557 * TCPS_BOUND, depending on whether it is originally 3558 * a listener or not. That is why we need to check 3559 * for states greater than or equal to TCPS_BOUND 3560 * here. 3561 * 3562 * 2. Ideally, we should only check for state equals 3563 * to TCPS_LISTEN. And the following check should be 3564 * added. 3565 * 3566 * if (ltcp->tcp_state == TCPS_LISTEN || 3567 * !reuseaddr || !ltcp->tcp_reuseaddr) { 3568 * ... 3569 * } 3570 * 3571 * The semantics will be changed to this. If the 3572 * endpoint on the list is in state not equal to 3573 * TCPS_LISTEN and both endpoints have SO_REUSEADDR 3574 * set, let the bind succeed. 3575 * 3576 * Because of (1), we cannot do that for TLI 3577 * endpoints. But we can do that for socket endpoints. 3578 * If in future, we can change this going back 3579 * semantics, we can use the above check for TLI also. 3580 */ 3581 not_socket = !(TCP_IS_SOCKET(ltcp) && 3582 TCP_IS_SOCKET(tcp)); 3583 exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind; 3584 3585 if (lconnp->conn_mac_exempt || connp->conn_mac_exempt || 3586 (exclbind && (not_socket || 3587 ltcp->tcp_state <= TCPS_ESTABLISHED))) { 3588 if (V6_OR_V4_INADDR_ANY( 3589 ltcp->tcp_bound_source_v6) || 3590 V6_OR_V4_INADDR_ANY(*laddr) || 3591 IN6_ARE_ADDR_EQUAL(laddr, 3592 <cp->tcp_bound_source_v6)) { 3593 break; 3594 } 3595 continue; 3596 } 3597 3598 /* 3599 * Check ipversion to allow IPv4 and IPv6 sockets to 3600 * have disjoint port number spaces, if *_EXCLBIND 3601 * is not set and only if the application binds to a 3602 * specific port. We use the same autoassigned port 3603 * number space for IPv4 and IPv6 sockets. 3604 */ 3605 if (tcp->tcp_ipversion != ltcp->tcp_ipversion && 3606 bind_to_req_port_only) 3607 continue; 3608 3609 /* 3610 * Ideally, we should make sure that the source 3611 * address, remote address, and remote port in the 3612 * four tuple for this tcp-connection is unique. 3613 * However, trying to find out the local source 3614 * address would require too much code duplication 3615 * with IP, since IP needs needs to have that code 3616 * to support userland TCP implementations. 3617 */ 3618 if (quick_connect && 3619 (ltcp->tcp_state > TCPS_LISTEN) && 3620 ((tcp->tcp_fport != ltcp->tcp_fport) || 3621 !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, 3622 <cp->tcp_remote_v6))) 3623 continue; 3624 3625 if (!reuseaddr) { 3626 /* 3627 * No socket option SO_REUSEADDR. 3628 * If existing port is bound to 3629 * a non-wildcard IP address 3630 * and the requesting stream is 3631 * bound to a distinct 3632 * different IP addresses 3633 * (non-wildcard, also), keep 3634 * going. 3635 */ 3636 if (!V6_OR_V4_INADDR_ANY(*laddr) && 3637 !V6_OR_V4_INADDR_ANY( 3638 ltcp->tcp_bound_source_v6) && 3639 !IN6_ARE_ADDR_EQUAL(laddr, 3640 <cp->tcp_bound_source_v6)) 3641 continue; 3642 if (ltcp->tcp_state >= TCPS_BOUND) { 3643 /* 3644 * This port is being used and 3645 * its state is >= TCPS_BOUND, 3646 * so we can't bind to it. 3647 */ 3648 break; 3649 } 3650 } else { 3651 /* 3652 * socket option SO_REUSEADDR is set on the 3653 * binding tcp_t. 3654 * 3655 * If two streams are bound to 3656 * same IP address or both addr 3657 * and bound source are wildcards 3658 * (INADDR_ANY), we want to stop 3659 * searching. 3660 * We have found a match of IP source 3661 * address and source port, which is 3662 * refused regardless of the 3663 * SO_REUSEADDR setting, so we break. 
*/ 3665 if (IN6_ARE_ADDR_EQUAL(laddr, 3666 &ltcp->tcp_bound_source_v6) && 3667 (ltcp->tcp_state == TCPS_LISTEN || 3668 ltcp->tcp_state == TCPS_BOUND)) 3669 break; 3670 } 3671 } 3672 if (ltcp != NULL) { 3673 /* The port number is busy */ 3674 mutex_exit(&tbf->tf_lock); 3675 } else { 3676 /* 3677 * This port is ours. Insert in fanout and mark as 3678 * bound to prevent others from getting the port 3679 * number. 3680 */ 3681 tcp->tcp_state = TCPS_BOUND; 3682 tcp->tcp_lport = htons(port); 3683 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 3684 3685 ASSERT(&tcp_bind_fanout[TCP_BIND_HASH( 3686 tcp->tcp_lport)] == tbf); 3687 tcp_bind_hash_insert(tbf, tcp, 1); 3688 3689 mutex_exit(&tbf->tf_lock); 3690 3691 /* 3692 * We don't want tcp_next_port_to_try to "inherit" 3693 * a port number supplied by the user in a bind. 3694 */ 3695 if (user_specified) 3696 return (port); 3697 3698 /* 3699 * This is the only place where tcp_next_port_to_try 3700 * is updated. After the update, it may or may not 3701 * be in the valid range. 3702 */ 3703 if (!tcp->tcp_anon_priv_bind) 3704 tcp_next_port_to_try = port + 1; 3705 return (port); 3706 } 3707 3708 if (tcp->tcp_anon_priv_bind) { 3709 port = tcp_get_next_priv_port(tcp); 3710 } else { 3711 if (count == 0 && user_specified) { 3712 /* 3713 * We may have to return an anonymous port. So 3714 * get one to start with. 3715 */ 3716 port = 3717 tcp_update_next_port(tcp_next_port_to_try, 3718 tcp, B_TRUE); 3719 user_specified = B_FALSE; 3720 } else { 3721 port = tcp_update_next_port(port + 1, tcp, 3722 B_FALSE); 3723 } 3724 } 3725 if (port == 0) 3726 break; 3727 3728 /* 3729 * Don't let this loop run forever in the case where 3730 * all of the anonymous ports are in use. 3731 */ 3732 } while (++count < loopmax); 3733 return (0); 3734 } 3735 3736 /* 3737 * We are dying for some reason. Try to do it gracefully. (May be called 3738 * as writer.) 3739 * 3740 * Return -1 if the structure was not cleaned up (if the cleanup had to be 3741 * done by a service procedure). 3742 * TBD - Should the return value distinguish between the tcp_t being 3743 * freed and it being reinitialized? 3744 */ 3745 static int 3746 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) 3747 { 3748 mblk_t *mp; 3749 queue_t *q; 3750 3751 TCP_CLD_STAT(tag); 3752 3753 #if TCP_TAG_CLEAN_DEATH 3754 tcp->tcp_cleandeathtag = tag; 3755 #endif 3756 3757 if (tcp->tcp_fused) 3758 tcp_unfuse(tcp); 3759 3760 if (tcp->tcp_linger_tid != 0 && 3761 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) { 3762 tcp_stop_lingering(tcp); 3763 } 3764 3765 ASSERT(tcp != NULL); 3766 ASSERT((tcp->tcp_family == AF_INET && 3767 tcp->tcp_ipversion == IPV4_VERSION) || 3768 (tcp->tcp_family == AF_INET6 && 3769 (tcp->tcp_ipversion == IPV4_VERSION || 3770 tcp->tcp_ipversion == IPV6_VERSION))); 3771 3772 if (TCP_IS_DETACHED(tcp)) { 3773 if (tcp->tcp_hard_binding) { 3774 /* 3775 * It's an eager that we are dealing with. We close the 3776 * eager but in case a conn_ind has already gone to the 3777 * listener, let tcp_accept_finish() send a discon_ind 3778 * to the listener and drop the last reference. If the 3779 * listener doesn't even know about the eager i.e. the 3780 * conn_ind hasn't gone up, blow away the eager and drop 3781 * the last reference as well. If the conn_ind has gone 3782 * up, state should be BOUND. tcp_accept_finish 3783 * will figure out that the connection has received a 3784 * RST and will send a DISCON_IND to the application.
3785 */ 3786 tcp_closei_local(tcp); 3787 if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) { 3788 CONN_DEC_REF(tcp->tcp_connp); 3789 } else { 3790 tcp->tcp_state = TCPS_BOUND; 3791 } 3792 } else { 3793 tcp_close_detached(tcp); 3794 } 3795 return (0); 3796 } 3797 3798 TCP_STAT(tcp_clean_death_nondetached); 3799 3800 /* 3801 * If T_ORDREL_IND has not been sent yet (done when service routine 3802 * is run) postpone cleaning up the endpoint until service routine 3803 * has sent up the T_ORDREL_IND. Avoid clearing out an existing 3804 * client_errno since tcp_close uses the client_errno field. 3805 */ 3806 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 3807 if (err != 0) 3808 tcp->tcp_client_errno = err; 3809 3810 tcp->tcp_deferred_clean_death = B_TRUE; 3811 return (-1); 3812 } 3813 3814 q = tcp->tcp_rq; 3815 3816 /* Trash all inbound data */ 3817 flushq(q, FLUSHALL); 3818 3819 /* 3820 * If we are at least part way open and there is error 3821 * (err==0 implies no error) 3822 * notify our client by a T_DISCON_IND. 3823 */ 3824 if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) { 3825 if (tcp->tcp_state >= TCPS_ESTABLISHED && 3826 !TCP_IS_SOCKET(tcp)) { 3827 /* 3828 * Send M_FLUSH according to TPI. Because sockets will 3829 * (and must) ignore FLUSHR we do that only for TPI 3830 * endpoints and sockets in STREAMS mode. 3831 */ 3832 (void) putnextctl1(q, M_FLUSH, FLUSHR); 3833 } 3834 if (tcp->tcp_debug) { 3835 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 3836 "tcp_clean_death: discon err %d", err); 3837 } 3838 mp = mi_tpi_discon_ind(NULL, err, 0); 3839 if (mp != NULL) { 3840 putnext(q, mp); 3841 } else { 3842 if (tcp->tcp_debug) { 3843 (void) strlog(TCP_MOD_ID, 0, 1, 3844 SL_ERROR|SL_TRACE, 3845 "tcp_clean_death, sending M_ERROR"); 3846 } 3847 (void) putnextctl1(q, M_ERROR, EPROTO); 3848 } 3849 if (tcp->tcp_state <= TCPS_SYN_RCVD) { 3850 /* SYN_SENT or SYN_RCVD */ 3851 BUMP_MIB(&tcp_mib, tcpAttemptFails); 3852 } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) { 3853 /* ESTABLISHED or CLOSE_WAIT */ 3854 BUMP_MIB(&tcp_mib, tcpEstabResets); 3855 } 3856 } 3857 3858 tcp_reinit(tcp); 3859 return (-1); 3860 } 3861 3862 /* 3863 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout 3864 * to expire, stop the wait and finish the close. 3865 */ 3866 static void 3867 tcp_stop_lingering(tcp_t *tcp) 3868 { 3869 clock_t delta = 0; 3870 3871 tcp->tcp_linger_tid = 0; 3872 if (tcp->tcp_state > TCPS_LISTEN) { 3873 tcp_acceptor_hash_remove(tcp); 3874 if (tcp->tcp_flow_stopped) { 3875 tcp_clrqfull(tcp); 3876 } 3877 3878 if (tcp->tcp_timer_tid != 0) { 3879 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 3880 tcp->tcp_timer_tid = 0; 3881 } 3882 /* 3883 * Need to cancel those timers which will not be used when 3884 * TCP is detached. This has to be done before the tcp_wq 3885 * is set to the global queue. 3886 */ 3887 tcp_timers_stop(tcp); 3888 3889 3890 tcp->tcp_detached = B_TRUE; 3891 tcp->tcp_rq = tcp_g_q; 3892 tcp->tcp_wq = WR(tcp_g_q); 3893 3894 if (tcp->tcp_state == TCPS_TIME_WAIT) { 3895 tcp_time_wait_append(tcp); 3896 TCP_DBGSTAT(tcp_detach_time_wait); 3897 goto finish; 3898 } 3899 3900 /* 3901 * If delta is zero the timer event wasn't executed and was 3902 * successfully canceled. In this case we need to restart it 3903 * with the minimal delta possible. 3904 */ 3905 if (delta >= 0) { 3906 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 3907 delta ? 
delta : 1); 3908 } 3909 } else { 3910 tcp_closei_local(tcp); 3911 CONN_DEC_REF(tcp->tcp_connp); 3912 } 3913 finish: 3914 /* Signal closing thread that it can complete close */ 3915 mutex_enter(&tcp->tcp_closelock); 3916 tcp->tcp_detached = B_TRUE; 3917 tcp->tcp_rq = tcp_g_q; 3918 tcp->tcp_wq = WR(tcp_g_q); 3919 tcp->tcp_closed = 1; 3920 cv_signal(&tcp->tcp_closecv); 3921 mutex_exit(&tcp->tcp_closelock); 3922 } 3923 3924 /* 3925 * Handle lingering timeouts. This function is called when the SO_LINGER timeout 3926 * expires. 3927 */ 3928 static void 3929 tcp_close_linger_timeout(void *arg) 3930 { 3931 conn_t *connp = (conn_t *)arg; 3932 tcp_t *tcp = connp->conn_tcp; 3933 3934 tcp->tcp_client_errno = ETIMEDOUT; 3935 tcp_stop_lingering(tcp); 3936 } 3937 3938 static int 3939 tcp_close(queue_t *q, int flags) 3940 { 3941 conn_t *connp = Q_TO_CONN(q); 3942 tcp_t *tcp = connp->conn_tcp; 3943 mblk_t *mp = &tcp->tcp_closemp; 3944 boolean_t conn_ioctl_cleanup_reqd = B_FALSE; 3945 3946 ASSERT(WR(q)->q_next == NULL); 3947 ASSERT(connp->conn_ref >= 2); 3948 ASSERT((connp->conn_flags & IPCL_TCPMOD) == 0); 3949 3950 /* 3951 * We are being closed as /dev/tcp or /dev/tcp6. 3952 * 3953 * Mark the conn as closing. ill_pending_mp_add will not 3954 * add any mp to the pending mp list, after this conn has 3955 * started closing. Same for sq_pending_mp_add 3956 */ 3957 mutex_enter(&connp->conn_lock); 3958 connp->conn_state_flags |= CONN_CLOSING; 3959 if (connp->conn_oper_pending_ill != NULL) 3960 conn_ioctl_cleanup_reqd = B_TRUE; 3961 CONN_INC_REF_LOCKED(connp); 3962 mutex_exit(&connp->conn_lock); 3963 tcp->tcp_closeflags = (uint8_t)flags; 3964 ASSERT(connp->conn_ref >= 3); 3965 3966 (*tcp_squeue_close_proc)(connp->conn_sqp, mp, 3967 tcp_close_output, connp, SQTAG_IP_TCP_CLOSE); 3968 3969 mutex_enter(&tcp->tcp_closelock); 3970 3971 while (!tcp->tcp_closed) 3972 cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock); 3973 mutex_exit(&tcp->tcp_closelock); 3974 /* 3975 * In the case of listener streams that have eagers in the q or q0 3976 * we wait for the eagers to drop their reference to us. tcp_rq and 3977 * tcp_wq of the eagers point to our queues. By waiting for the 3978 * refcnt to drop to 1, we are sure that the eagers have cleaned 3979 * up their queue pointers and also dropped their references to us. 3980 */ 3981 if (tcp->tcp_wait_for_eagers) { 3982 mutex_enter(&connp->conn_lock); 3983 while (connp->conn_ref != 1) { 3984 cv_wait(&connp->conn_cv, &connp->conn_lock); 3985 } 3986 mutex_exit(&connp->conn_lock); 3987 } 3988 /* 3989 * ioctl cleanup. The mp is queued in the 3990 * ill_pending_mp or in the sq_pending_mp. 3991 */ 3992 if (conn_ioctl_cleanup_reqd) 3993 conn_ioctl_cleanup(connp); 3994 3995 qprocsoff(q); 3996 inet_minor_free(ip_minor_arena, connp->conn_dev); 3997 3998 tcp->tcp_cpid = -1; 3999 4000 /* 4001 * Drop IP's reference on the conn. This is the last reference 4002 * on the connp if the state was less than established. If the 4003 * connection has gone into timewait state, then we will have 4004 * one ref for the TCP and one more ref (total of two) for the 4005 * classifier connected hash list (a timewait connections stays 4006 * in connected hash till closed). 4007 * 4008 * We can't assert the references because there might be other 4009 * transient reference places because of some walkers or queued 4010 * packets in squeue for the timewait state. 
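 *
 * As a rough guide only: just before the CONN_DEC_REF() below, a
 * connection that has moved to TIME_WAIT typically holds three
 * references (TCP, the connected hash entry, and the IP reference
 * being dropped here), while one that never reached ESTABLISHED
 * holds just the IP reference - plus whatever short-lived holds
 * walkers or queued squeue packets may add, which is why no
 * ASSERT is made here.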
4011 */ 4012 CONN_DEC_REF(connp); 4013 q->q_ptr = WR(q)->q_ptr = NULL; 4014 return (0); 4015 } 4016 4017 static int 4018 tcpclose_accept(queue_t *q) 4019 { 4020 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); 4021 4022 /* 4023 * We had opened an acceptor STREAM for sockfs which is 4024 * now being closed due to some error. 4025 */ 4026 qprocsoff(q); 4027 inet_minor_free(ip_minor_arena, (dev_t)q->q_ptr); 4028 q->q_ptr = WR(q)->q_ptr = NULL; 4029 return (0); 4030 } 4031 4032 4033 /* 4034 * Called by streams close routine via squeues when our client blows off her 4035 * descriptor, we take this to mean: "close the stream state NOW, close the tcp 4036 * connection politely" When SO_LINGER is set (with a non-zero linger time and 4037 * it is not a nonblocking socket) then this routine sleeps until the FIN is 4038 * acked. 4039 * 4040 * NOTE: tcp_close potentially returns error when lingering. 4041 * However, the stream head currently does not pass these errors 4042 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK 4043 * errors to the application (from tsleep()) and not errors 4044 * like ECONNRESET caused by receiving a reset packet. 4045 */ 4046 4047 /* ARGSUSED */ 4048 static void 4049 tcp_close_output(void *arg, mblk_t *mp, void *arg2) 4050 { 4051 char *msg; 4052 conn_t *connp = (conn_t *)arg; 4053 tcp_t *tcp = connp->conn_tcp; 4054 clock_t delta = 0; 4055 4056 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 4057 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 4058 4059 /* Cancel any pending timeout */ 4060 if (tcp->tcp_ordrelid != 0) { 4061 if (tcp->tcp_timeout) { 4062 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ordrelid); 4063 } 4064 tcp->tcp_ordrelid = 0; 4065 tcp->tcp_timeout = B_FALSE; 4066 } 4067 4068 mutex_enter(&tcp->tcp_eager_lock); 4069 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 4070 /* Cleanup for listener */ 4071 tcp_eager_cleanup(tcp, 0); 4072 tcp->tcp_wait_for_eagers = 1; 4073 } 4074 mutex_exit(&tcp->tcp_eager_lock); 4075 4076 connp->conn_mdt_ok = B_FALSE; 4077 tcp->tcp_mdt = B_FALSE; 4078 4079 msg = NULL; 4080 switch (tcp->tcp_state) { 4081 case TCPS_CLOSED: 4082 case TCPS_IDLE: 4083 case TCPS_BOUND: 4084 case TCPS_LISTEN: 4085 break; 4086 case TCPS_SYN_SENT: 4087 msg = "tcp_close, during connect"; 4088 break; 4089 case TCPS_SYN_RCVD: 4090 /* 4091 * Close during the connect 3-way handshake 4092 * but here there may or may not be pending data 4093 * already on queue. Process almost same as in 4094 * the ESTABLISHED state. 4095 */ 4096 /* FALLTHRU */ 4097 default: 4098 if (tcp->tcp_fused) 4099 tcp_unfuse(tcp); 4100 4101 /* 4102 * If SO_LINGER has set a zero linger time, abort the 4103 * connection with a reset. 4104 */ 4105 if (tcp->tcp_linger && tcp->tcp_lingertime == 0) { 4106 msg = "tcp_close, zero lingertime"; 4107 break; 4108 } 4109 4110 ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding); 4111 /* 4112 * Abort connection if there is unread data queued. 4113 */ 4114 if (tcp->tcp_rcv_list || tcp->tcp_reass_head) { 4115 msg = "tcp_close, unread data"; 4116 break; 4117 } 4118 /* 4119 * tcp_hard_bound is now cleared thus all packets go through 4120 * tcp_lookup. This fact is used by tcp_detach below. 4121 * 4122 * We have done a qwait() above which could have possibly 4123 * drained more messages in turn causing transition to a 4124 * different state. Check whether we have to do the rest 4125 * of the processing or not. 
*/ 4127 if (tcp->tcp_state <= TCPS_LISTEN) 4128 break; 4129 4130 /* 4131 * Transmit the FIN before detaching the tcp_t. 4132 * After tcp_detach returns this queue/perimeter 4133 * no longer owns the tcp_t thus others can modify it. 4134 */ 4135 (void) tcp_xmit_end(tcp); 4136 4137 /* 4138 * If lingering on close then wait until the fin is acked, 4139 * the SO_LINGER time passes, or a reset is sent/received. 4140 */ 4141 if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && 4142 !(tcp->tcp_fin_acked) && 4143 tcp->tcp_state >= TCPS_ESTABLISHED) { 4144 if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { 4145 tcp->tcp_client_errno = EWOULDBLOCK; 4146 } else if (tcp->tcp_client_errno == 0) { 4147 4148 ASSERT(tcp->tcp_linger_tid == 0); 4149 4150 tcp->tcp_linger_tid = TCP_TIMER(tcp, 4151 tcp_close_linger_timeout, 4152 tcp->tcp_lingertime * hz); 4153 4154 /* tcp_close_linger_timeout will finish close */ 4155 if (tcp->tcp_linger_tid == 0) 4156 tcp->tcp_client_errno = ENOSR; 4157 else 4158 return; 4159 } 4160 4161 /* 4162 * Check if we need to detach or just close 4163 * the instance. 4164 */ 4165 if (tcp->tcp_state <= TCPS_LISTEN) 4166 break; 4167 } 4168 4169 /* 4170 * Make sure that no other thread will access the tcp_rq of 4171 * this instance (through lookups etc.) as tcp_rq will go 4172 * away shortly. 4173 */ 4174 tcp_acceptor_hash_remove(tcp); 4175 4176 if (tcp->tcp_flow_stopped) { 4177 tcp_clrqfull(tcp); 4178 } 4179 4180 if (tcp->tcp_timer_tid != 0) { 4181 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 4182 tcp->tcp_timer_tid = 0; 4183 } 4184 /* 4185 * Need to cancel those timers which will not be used when 4186 * TCP is detached. This has to be done before the tcp_wq 4187 * is set to the global queue. 4188 */ 4189 tcp_timers_stop(tcp); 4190 4191 tcp->tcp_detached = B_TRUE; 4192 if (tcp->tcp_state == TCPS_TIME_WAIT) { 4193 tcp_time_wait_append(tcp); 4194 TCP_DBGSTAT(tcp_detach_time_wait); 4195 ASSERT(connp->conn_ref >= 3); 4196 goto finish; 4197 } 4198 4199 /* 4200 * If delta is zero the timer event wasn't executed and was 4201 * successfully canceled. In this case we need to restart it 4202 * with the minimal delta possible. 4203 */ 4204 if (delta >= 0) 4205 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, 4206 delta ? delta : 1); 4207 4208 ASSERT(connp->conn_ref >= 3); 4209 goto finish; 4210 } 4211 4212 /* Detach did not complete. Still need to remove q from stream. */ 4213 if (msg) { 4214 if (tcp->tcp_state == TCPS_ESTABLISHED || 4215 tcp->tcp_state == TCPS_CLOSE_WAIT) 4216 BUMP_MIB(&tcp_mib, tcpEstabResets); 4217 if (tcp->tcp_state == TCPS_SYN_SENT || 4218 tcp->tcp_state == TCPS_SYN_RCVD) 4219 BUMP_MIB(&tcp_mib, tcpAttemptFails); 4220 tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST); 4221 } 4222 4223 tcp_closei_local(tcp); 4224 CONN_DEC_REF(connp); 4225 ASSERT(connp->conn_ref >= 2); 4226 4227 finish: 4228 /* 4229 * Although packets are always processed on the correct 4230 * tcp's perimeter and access is serialized via squeues, 4231 * IP still needs a queue when sending packets in time_wait 4232 * state so use WR(tcp_g_q) till ip_output() can be 4233 * changed to deal with just connp. For read side, we 4234 * could have set tcp_rq to NULL but there are some cases 4235 * in tcp_rput_data() from early days of this code which 4236 * do a putnext without checking if tcp is closed. Those 4237 * need to be identified before both tcp_rq and tcp_wq 4238 * can be set to NULL and tcp_g_q can disappear forever.
*/ 4240 mutex_enter(&tcp->tcp_closelock); 4241 /* 4242 * Don't change the queues in the case of a listener that has 4243 * eagers in its q or q0. It could surprise the eagers. 4244 * Instead wait for the eagers outside the squeue. 4245 */ 4246 if (!tcp->tcp_wait_for_eagers) { 4247 tcp->tcp_detached = B_TRUE; 4248 tcp->tcp_rq = tcp_g_q; 4249 tcp->tcp_wq = WR(tcp_g_q); 4250 } 4251 4252 /* Signal tcp_close() to finish closing. */ 4253 tcp->tcp_closed = 1; 4254 cv_signal(&tcp->tcp_closecv); 4255 mutex_exit(&tcp->tcp_closelock); 4256 } 4257 4258 4259 /* 4260 * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp. 4261 * Some stream heads get upset if they see these later on as anything but NULL. 4262 */ 4263 static void 4264 tcp_close_mpp(mblk_t **mpp) 4265 { 4266 mblk_t *mp; 4267 4268 if ((mp = *mpp) != NULL) { 4269 do { 4270 mp->b_next = NULL; 4271 mp->b_prev = NULL; 4272 } while ((mp = mp->b_cont) != NULL); 4273 4274 mp = *mpp; 4275 *mpp = NULL; 4276 freemsg(mp); 4277 } 4278 } 4279 4280 /* Do detached close. */ 4281 static void 4282 tcp_close_detached(tcp_t *tcp) 4283 { 4284 if (tcp->tcp_fused) 4285 tcp_unfuse(tcp); 4286 4287 /* 4288 * Clustering code serializes TCP disconnect callbacks and 4289 * cluster tcp list walks by blocking a TCP disconnect callback 4290 * if a cluster tcp list walk is in progress. This ensures 4291 * accurate accounting of TCPs in the cluster code even though 4292 * the TCP list walk itself is not atomic. 4293 */ 4294 tcp_closei_local(tcp); 4295 CONN_DEC_REF(tcp->tcp_connp); 4296 } 4297 4298 /* 4299 * Stop all TCP timers, and free the timer mblks if requested. 4300 */ 4301 void 4302 tcp_timers_stop(tcp_t *tcp) 4303 { 4304 if (tcp->tcp_timer_tid != 0) { 4305 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); 4306 tcp->tcp_timer_tid = 0; 4307 } 4308 if (tcp->tcp_ka_tid != 0) { 4309 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); 4310 tcp->tcp_ka_tid = 0; 4311 } 4312 if (tcp->tcp_ack_tid != 0) { 4313 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 4314 tcp->tcp_ack_tid = 0; 4315 } 4316 if (tcp->tcp_push_tid != 0) { 4317 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 4318 tcp->tcp_push_tid = 0; 4319 } 4320 } 4321 4322 /* 4323 * The tcp_t is going away. Remove it from all lists and set it 4324 * to TCPS_CLOSED. The freeing up of memory is deferred until 4325 * tcp_inactive. This is needed since a thread in tcp_rput might have 4326 * done a CONN_INC_REF on this structure before it was removed from the 4327 * hashes. 4328 */ 4329 static void 4330 tcp_closei_local(tcp_t *tcp) 4331 { 4332 ire_t *ire; 4333 conn_t *connp = tcp->tcp_connp; 4334 4335 if (!TCP_IS_SOCKET(tcp)) 4336 tcp_acceptor_hash_remove(tcp); 4337 4338 UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs); 4339 tcp->tcp_ibsegs = 0; 4340 UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs); 4341 tcp->tcp_obsegs = 0; 4342 4343 /* 4344 * If we are an eager connection hanging off a listener that 4345 * hasn't formally accepted the connection yet, get off his 4346 * list and blow off any data that we have accumulated. 4347 */ 4348 if (tcp->tcp_listener != NULL) { 4349 tcp_t *listener = tcp->tcp_listener; 4350 mutex_enter(&listener->tcp_eager_lock); 4351 /* 4352 * tcp_eager_conn_ind == NULL means that the 4353 * conn_ind has already gone to the listener. At 4354 * this point, eager will be closed but we 4355 * leave it in the listener's eager list so that 4356 * if listener decides to close without doing 4357 * accept, we can clean this up.
In tcp_wput_accept 4358 * we take care of the case of accept on closed 4359 * eager. 4360 */ 4361 if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) { 4362 tcp_eager_unlink(tcp); 4363 mutex_exit(&listener->tcp_eager_lock); 4364 /* 4365 * We don't want to have any pointers to the 4366 * listener queue, after we have released our 4367 * reference on the listener 4368 */ 4369 tcp->tcp_rq = tcp_g_q; 4370 tcp->tcp_wq = WR(tcp_g_q); 4371 CONN_DEC_REF(listener->tcp_connp); 4372 } else { 4373 mutex_exit(&listener->tcp_eager_lock); 4374 } 4375 } 4376 4377 /* Stop all the timers */ 4378 tcp_timers_stop(tcp); 4379 4380 if (tcp->tcp_state == TCPS_LISTEN) { 4381 if (tcp->tcp_ip_addr_cache) { 4382 kmem_free((void *)tcp->tcp_ip_addr_cache, 4383 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 4384 tcp->tcp_ip_addr_cache = NULL; 4385 } 4386 } 4387 if (tcp->tcp_flow_stopped) 4388 tcp_clrqfull(tcp); 4389 4390 tcp_bind_hash_remove(tcp); 4391 /* 4392 * If the tcp_time_wait_collector (which runs outside the squeue) 4393 * is trying to remove this tcp from the time wait list, we will 4394 * block in tcp_time_wait_remove while trying to acquire the 4395 * tcp_time_wait_lock. The logic in tcp_time_wait_collector also 4396 * requires the ipcl_hash_remove to be ordered after the 4397 * tcp_time_wait_remove for the refcnt checks to work correctly. 4398 */ 4399 if (tcp->tcp_state == TCPS_TIME_WAIT) 4400 tcp_time_wait_remove(tcp, NULL); 4401 CL_INET_DISCONNECT(tcp); 4402 ipcl_hash_remove(connp); 4403 4404 /* 4405 * Delete the cached ire in conn_ire_cache and also mark 4406 * the conn as CONDEMNED 4407 */ 4408 mutex_enter(&connp->conn_lock); 4409 connp->conn_state_flags |= CONN_CONDEMNED; 4410 ire = connp->conn_ire_cache; 4411 connp->conn_ire_cache = NULL; 4412 mutex_exit(&connp->conn_lock); 4413 if (ire != NULL) 4414 IRE_REFRELE_NOTR(ire); 4415 4416 /* Need to cleanup any pending ioctls */ 4417 ASSERT(tcp->tcp_time_wait_next == NULL); 4418 ASSERT(tcp->tcp_time_wait_prev == NULL); 4419 ASSERT(tcp->tcp_time_wait_expire == 0); 4420 tcp->tcp_state = TCPS_CLOSED; 4421 4422 /* Release any SSL context */ 4423 if (tcp->tcp_kssl_ent != NULL) { 4424 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY); 4425 tcp->tcp_kssl_ent = NULL; 4426 } 4427 if (tcp->tcp_kssl_ctx != NULL) { 4428 kssl_release_ctx(tcp->tcp_kssl_ctx); 4429 tcp->tcp_kssl_ctx = NULL; 4430 } 4431 tcp->tcp_kssl_pending = B_FALSE; 4432 } 4433 4434 /* 4435 * tcp is dying (called from ipcl_conn_destroy and error cases). 4436 * Free the tcp_t in either case.
*/ 4438 void 4439 tcp_free(tcp_t *tcp) 4440 { 4441 mblk_t *mp; 4442 ip6_pkt_t *ipp; 4443 4444 ASSERT(tcp != NULL); 4445 ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL); 4446 4447 tcp->tcp_rq = NULL; 4448 tcp->tcp_wq = NULL; 4449 4450 tcp_close_mpp(&tcp->tcp_xmit_head); 4451 tcp_close_mpp(&tcp->tcp_reass_head); 4452 if (tcp->tcp_rcv_list != NULL) { 4453 /* Free b_next chain */ 4454 tcp_close_mpp(&tcp->tcp_rcv_list); 4455 } 4456 if ((mp = tcp->tcp_urp_mp) != NULL) { 4457 freemsg(mp); 4458 } 4459 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 4460 freemsg(mp); 4461 } 4462 4463 if (tcp->tcp_fused_sigurg_mp != NULL) { 4464 freeb(tcp->tcp_fused_sigurg_mp); 4465 tcp->tcp_fused_sigurg_mp = NULL; 4466 } 4467 4468 if (tcp->tcp_sack_info != NULL) { 4469 if (tcp->tcp_notsack_list != NULL) { 4470 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 4471 } 4472 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); 4473 } 4474 4475 if (tcp->tcp_hopopts != NULL) { 4476 mi_free(tcp->tcp_hopopts); 4477 tcp->tcp_hopopts = NULL; 4478 tcp->tcp_hopoptslen = 0; 4479 } 4480 ASSERT(tcp->tcp_hopoptslen == 0); 4481 if (tcp->tcp_dstopts != NULL) { 4482 mi_free(tcp->tcp_dstopts); 4483 tcp->tcp_dstopts = NULL; 4484 tcp->tcp_dstoptslen = 0; 4485 } 4486 ASSERT(tcp->tcp_dstoptslen == 0); 4487 if (tcp->tcp_rtdstopts != NULL) { 4488 mi_free(tcp->tcp_rtdstopts); 4489 tcp->tcp_rtdstopts = NULL; 4490 tcp->tcp_rtdstoptslen = 0; 4491 } 4492 ASSERT(tcp->tcp_rtdstoptslen == 0); 4493 if (tcp->tcp_rthdr != NULL) { 4494 mi_free(tcp->tcp_rthdr); 4495 tcp->tcp_rthdr = NULL; 4496 tcp->tcp_rthdrlen = 0; 4497 } 4498 ASSERT(tcp->tcp_rthdrlen == 0); 4499 4500 ipp = &tcp->tcp_sticky_ipp; 4501 if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | 4502 IPPF_RTHDR)) 4503 ip6_pkt_free(ipp); 4504 4505 /* 4506 * Free memory associated with the tcp/ip header template. 4507 */ 4508 4509 if (tcp->tcp_iphc != NULL) 4510 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 4511 4512 /* 4513 * The following is really blowing away a union. 4514 * Since it happens to have exactly two members of identical size, 4515 * the following code is enough. 4516 */ 4517 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 4518 4519 if (tcp->tcp_tracebuf != NULL) { 4520 kmem_free(tcp->tcp_tracebuf, sizeof (tcptrch_t)); 4521 tcp->tcp_tracebuf = NULL; 4522 } 4523 } 4524 4525 4526 /* 4527 * Put a connection confirmation message upstream built from the 4528 * address information within 'iphdr' and 'tcph'. Report our success or failure. 4529 */ 4530 static boolean_t 4531 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, 4532 mblk_t **defermp) 4533 { 4534 sin_t sin; 4535 sin6_t sin6; 4536 mblk_t *mp; 4537 char *optp = NULL; 4538 int optlen = 0; 4539 cred_t *cr; 4540 4541 if (defermp != NULL) 4542 *defermp = NULL; 4543 4544 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { 4545 /* 4546 * Return in T_CONN_CON results of option negotiation through 4547 * the T_CONN_REQ. Note: If there is a real end-to-end option 4548 * negotiation, then what is received from the remote end needs 4549 * to be taken into account but there is no such thing (yet?) 4550 * in our TCP/IP. 4551 * Note: We do not use mi_offset_param() here as 4552 * tcp_opts_conn_req contents do not directly come from 4553 * an application and are either generated in kernel or 4554 * from user input that was already verified.
4555 */ 4556 mp = tcp->tcp_conn.tcp_opts_conn_req; 4557 optp = (char *)(mp->b_rptr + 4558 ((struct T_conn_req *)mp->b_rptr)->OPT_offset); 4559 optlen = (int) 4560 ((struct T_conn_req *)mp->b_rptr)->OPT_length; 4561 } 4562 4563 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { 4564 ipha_t *ipha = (ipha_t *)iphdr; 4565 4566 /* packet is IPv4 */ 4567 if (tcp->tcp_family == AF_INET) { 4568 sin = sin_null; 4569 sin.sin_addr.s_addr = ipha->ipha_src; 4570 sin.sin_port = *(uint16_t *)tcph->th_lport; 4571 sin.sin_family = AF_INET; 4572 mp = mi_tpi_conn_con(NULL, (char *)&sin, 4573 (int)sizeof (sin_t), optp, optlen); 4574 } else { 4575 sin6 = sin6_null; 4576 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); 4577 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4578 sin6.sin6_family = AF_INET6; 4579 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 4580 (int)sizeof (sin6_t), optp, optlen); 4581 4582 } 4583 } else { 4584 ip6_t *ip6h = (ip6_t *)iphdr; 4585 4586 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); 4587 ASSERT(tcp->tcp_family == AF_INET6); 4588 sin6 = sin6_null; 4589 sin6.sin6_addr = ip6h->ip6_src; 4590 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4591 sin6.sin6_family = AF_INET6; 4592 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 4593 mp = mi_tpi_conn_con(NULL, (char *)&sin6, 4594 (int)sizeof (sin6_t), optp, optlen); 4595 } 4596 4597 if (!mp) 4598 return (B_FALSE); 4599 4600 if ((cr = DB_CRED(idmp)) != NULL) { 4601 mblk_setcred(mp, cr); 4602 DB_CPID(mp) = DB_CPID(idmp); 4603 } 4604 4605 if (defermp == NULL) 4606 putnext(tcp->tcp_rq, mp); 4607 else 4608 *defermp = mp; 4609 4610 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 4611 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 4612 return (B_TRUE); 4613 } 4614 4615 /* 4616 * Defense for the SYN attack - 4617 * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest 4618 * one that doesn't have the dontdrop bit set. 4619 * 2. Don't drop a SYN request before its first timeout. This gives every 4620 * request at least til the first timeout to complete its 3-way handshake. 4621 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many 4622 * requests currently on the queue that has timed out. This will be used 4623 * as an indicator of whether an attack is under way, so that appropriate 4624 * actions can be taken. (It's incremented in tcp_timer() and decremented 4625 * either when eager goes into ESTABLISHED, or gets freed up.) 4626 * 4. 
The current threshold is - # of timeout > q0len/4 => SYN alert on 4627 * # of timeout drops back to <= q0len/32 => SYN alert off 4628 */ 4629 static boolean_t 4630 tcp_drop_q0(tcp_t *tcp) 4631 { 4632 tcp_t *eager; 4633 4634 ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); 4635 ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); 4636 /* 4637 * New one is added after next_q0 so prev_q0 points to the oldest 4638 * Also do not drop any established connections that are deferred on 4639 * q0 due to q being full 4640 */ 4641 4642 eager = tcp->tcp_eager_prev_q0; 4643 while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) { 4644 eager = eager->tcp_eager_prev_q0; 4645 if (eager == tcp) { 4646 eager = tcp->tcp_eager_prev_q0; 4647 break; 4648 } 4649 } 4650 if (eager->tcp_syn_rcvd_timeout == 0) 4651 return (B_FALSE); 4652 4653 if (tcp->tcp_debug) { 4654 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 4655 "tcp_drop_q0: listen half-open queue (max=%d) overflow" 4656 " (%d pending) on %s, drop one", tcp_conn_req_max_q0, 4657 tcp->tcp_conn_req_cnt_q0, 4658 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 4659 } 4660 4661 BUMP_MIB(&tcp_mib, tcpHalfOpenDrop); 4662 4663 /* 4664 * need to do refhold here because the selected eager could 4665 * be removed by someone else if we release the eager lock. 4666 */ 4667 CONN_INC_REF(eager->tcp_connp); 4668 mutex_exit(&tcp->tcp_eager_lock); 4669 4670 /* Mark the IRE created for this SYN request temporary */ 4671 tcp_ip_ire_mark_advice(eager); 4672 (void) tcp_clean_death(eager, ETIMEDOUT, 5); 4673 CONN_DEC_REF(eager->tcp_connp); 4674 4675 mutex_enter(&tcp->tcp_eager_lock); 4676 return (B_TRUE); 4677 } 4678 4679 int 4680 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, 4681 tcph_t *tcph, uint_t ipvers, mblk_t *idmp) 4682 { 4683 tcp_t *ltcp = lconnp->conn_tcp; 4684 tcp_t *tcp = connp->conn_tcp; 4685 mblk_t *tpi_mp; 4686 ipha_t *ipha; 4687 ip6_t *ip6h; 4688 sin6_t sin6; 4689 in6_addr_t v6dst; 4690 int err; 4691 int ifindex = 0; 4692 cred_t *cr; 4693 4694 if (ipvers == IPV4_VERSION) { 4695 ipha = (ipha_t *)mp->b_rptr; 4696 4697 connp->conn_send = ip_output; 4698 connp->conn_recv = tcp_input; 4699 4700 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); 4701 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); 4702 4703 sin6 = sin6_null; 4704 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); 4705 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 4706 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4707 sin6.sin6_family = AF_INET6; 4708 sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst, 4709 lconnp->conn_zoneid); 4710 if (tcp->tcp_recvdstaddr) { 4711 sin6_t sin6d; 4712 4713 sin6d = sin6_null; 4714 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, 4715 &sin6d.sin6_addr); 4716 sin6d.sin6_port = *(uint16_t *)tcph->th_fport; 4717 sin6d.sin6_family = AF_INET; 4718 tpi_mp = mi_tpi_extconn_ind(NULL, 4719 (char *)&sin6d, sizeof (sin6_t), 4720 (char *)&tcp, 4721 (t_scalar_t)sizeof (intptr_t), 4722 (char *)&sin6d, sizeof (sin6_t), 4723 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4724 } else { 4725 tpi_mp = mi_tpi_conn_ind(NULL, 4726 (char *)&sin6, sizeof (sin6_t), 4727 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4728 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4729 } 4730 } else { 4731 ip6h = (ip6_t *)mp->b_rptr; 4732 4733 connp->conn_send = ip_output_v6; 4734 connp->conn_recv = tcp_input; 4735 4736 connp->conn_srcv6 = ip6h->ip6_dst; 4737 connp->conn_remv6 = ip6h->ip6_src; 4738 4739 /* db_cksumstuff is set at ip_fanout_tcp_v6 */ 4740 ifindex = (int)DB_CKSUMSTUFF(mp); 4741 DB_CKSUMSTUFF(mp) 
= 0; 4742 4743 sin6 = sin6_null; 4744 sin6.sin6_addr = ip6h->ip6_src; 4745 sin6.sin6_port = *(uint16_t *)tcph->th_lport; 4746 sin6.sin6_family = AF_INET6; 4747 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; 4748 sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, 4749 lconnp->conn_zoneid); 4750 4751 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 4752 /* Pass up the scope_id of remote addr */ 4753 sin6.sin6_scope_id = ifindex; 4754 } else { 4755 sin6.sin6_scope_id = 0; 4756 } 4757 if (tcp->tcp_recvdstaddr) { 4758 sin6_t sin6d; 4759 4760 sin6d = sin6_null; 4761 sin6.sin6_addr = ip6h->ip6_dst; 4762 sin6d.sin6_port = *(uint16_t *)tcph->th_fport; 4763 sin6d.sin6_family = AF_INET; 4764 tpi_mp = mi_tpi_extconn_ind(NULL, 4765 (char *)&sin6d, sizeof (sin6_t), 4766 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4767 (char *)&sin6d, sizeof (sin6_t), 4768 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4769 } else { 4770 tpi_mp = mi_tpi_conn_ind(NULL, 4771 (char *)&sin6, sizeof (sin6_t), 4772 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4773 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4774 } 4775 } 4776 4777 if (tpi_mp == NULL) 4778 return (ENOMEM); 4779 4780 connp->conn_fport = *(uint16_t *)tcph->th_lport; 4781 connp->conn_lport = *(uint16_t *)tcph->th_fport; 4782 connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER); 4783 connp->conn_fully_bound = B_FALSE; 4784 4785 if (tcp_trace) 4786 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP); 4787 4788 /* Inherit information from the "parent" */ 4789 tcp->tcp_ipversion = ltcp->tcp_ipversion; 4790 tcp->tcp_family = ltcp->tcp_family; 4791 tcp->tcp_wq = ltcp->tcp_wq; 4792 tcp->tcp_rq = ltcp->tcp_rq; 4793 tcp->tcp_mss = tcp_mss_def_ipv6; 4794 tcp->tcp_detached = B_TRUE; 4795 if ((err = tcp_init_values(tcp)) != 0) { 4796 freemsg(tpi_mp); 4797 return (err); 4798 } 4799 4800 if (ipvers == IPV4_VERSION) { 4801 if ((err = tcp_header_init_ipv4(tcp)) != 0) { 4802 freemsg(tpi_mp); 4803 return (err); 4804 } 4805 ASSERT(tcp->tcp_ipha != NULL); 4806 } else { 4807 /* ifindex must be already set */ 4808 ASSERT(ifindex != 0); 4809 4810 if (ltcp->tcp_bound_if != 0) { 4811 /* 4812 * Set newtcp's bound_if equal to 4813 * listener's value. If ifindex is 4814 * not the same as ltcp->tcp_bound_if, 4815 * it must be a packet for the ipmp group 4816 * of interfaces 4817 */ 4818 tcp->tcp_bound_if = ltcp->tcp_bound_if; 4819 } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { 4820 tcp->tcp_bound_if = ifindex; 4821 } 4822 4823 tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary; 4824 tcp->tcp_recvifindex = 0; 4825 tcp->tcp_recvhops = 0xffffffffU; 4826 ASSERT(tcp->tcp_ip6h != NULL); 4827 } 4828 4829 tcp->tcp_lport = ltcp->tcp_lport; 4830 4831 if (ltcp->tcp_ipversion == tcp->tcp_ipversion) { 4832 if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) { 4833 /* 4834 * Listener had options of some sort; eager inherits. 4835 * Free up the eager template and allocate one 4836 * of the right size. 
4837 */ 4838 if (tcp->tcp_hdr_grown) { 4839 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 4840 } else { 4841 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 4842 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 4843 } 4844 tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len, 4845 KM_NOSLEEP); 4846 if (tcp->tcp_iphc == NULL) { 4847 tcp->tcp_iphc_len = 0; 4848 freemsg(tpi_mp); 4849 return (ENOMEM); 4850 } 4851 tcp->tcp_iphc_len = ltcp->tcp_iphc_len; 4852 tcp->tcp_hdr_grown = B_TRUE; 4853 } 4854 tcp->tcp_hdr_len = ltcp->tcp_hdr_len; 4855 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; 4856 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 4857 tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops; 4858 tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf; 4859 4860 /* 4861 * Copy the IP+TCP header template from listener to eager 4862 */ 4863 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); 4864 if (tcp->tcp_ipversion == IPV6_VERSION) { 4865 if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt == 4866 IPPROTO_RAW) { 4867 tcp->tcp_ip6h = 4868 (ip6_t *)(tcp->tcp_iphc + 4869 sizeof (ip6i_t)); 4870 } else { 4871 tcp->tcp_ip6h = 4872 (ip6_t *)(tcp->tcp_iphc); 4873 } 4874 tcp->tcp_ipha = NULL; 4875 } else { 4876 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 4877 tcp->tcp_ip6h = NULL; 4878 } 4879 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + 4880 tcp->tcp_ip_hdr_len); 4881 } else { 4882 /* 4883 * only valid case when ipversion of listener and 4884 * eager differ is when listener is IPv6 and 4885 * eager is IPv4. 4886 * Eager header template has been initialized to the 4887 * maximum v4 header sizes, which includes space for 4888 * TCP and IP options. 4889 */ 4890 ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) && 4891 (tcp->tcp_ipversion == IPV4_VERSION)); 4892 ASSERT(tcp->tcp_iphc_len >= 4893 TCP_MAX_COMBINED_HEADER_LENGTH); 4894 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 4895 /* copy IP header fields individually */ 4896 tcp->tcp_ipha->ipha_ttl = 4897 ltcp->tcp_ip6h->ip6_hops; 4898 bcopy(ltcp->tcp_tcph->th_lport, 4899 tcp->tcp_tcph->th_lport, sizeof (ushort_t)); 4900 } 4901 4902 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); 4903 bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport, 4904 sizeof (in_port_t)); 4905 4906 if (ltcp->tcp_lport == 0) { 4907 tcp->tcp_lport = *(in_port_t *)tcph->th_fport; 4908 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, 4909 sizeof (in_port_t)); 4910 } 4911 4912 if (tcp->tcp_ipversion == IPV4_VERSION) { 4913 ASSERT(ipha != NULL); 4914 tcp->tcp_ipha->ipha_dst = ipha->ipha_src; 4915 tcp->tcp_ipha->ipha_src = ipha->ipha_dst; 4916 4917 /* Source routing option copyover (reverse it) */ 4918 if (tcp_rev_src_routes) 4919 tcp_opt_reverse(tcp, ipha); 4920 } else { 4921 ASSERT(ip6h != NULL); 4922 tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src; 4923 tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst; 4924 } 4925 4926 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 4927 /* 4928 * If the SYN contains a credential, it's a loopback packet; attach 4929 * the credential to the TPI message. 
4930 */ 4931 if ((cr = DB_CRED(idmp)) != NULL) { 4932 mblk_setcred(tpi_mp, cr); 4933 DB_CPID(tpi_mp) = DB_CPID(idmp); 4934 } 4935 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; 4936 4937 /* Inherit the listener's SSL protection state */ 4938 4939 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { 4940 kssl_hold_ent(tcp->tcp_kssl_ent); 4941 tcp->tcp_kssl_pending = B_TRUE; 4942 } 4943 4944 return (0); 4945 } 4946 4947 4948 int 4949 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, 4950 tcph_t *tcph, mblk_t *idmp) 4951 { 4952 tcp_t *ltcp = lconnp->conn_tcp; 4953 tcp_t *tcp = connp->conn_tcp; 4954 sin_t sin; 4955 mblk_t *tpi_mp = NULL; 4956 int err; 4957 cred_t *cr; 4958 4959 sin = sin_null; 4960 sin.sin_addr.s_addr = ipha->ipha_src; 4961 sin.sin_port = *(uint16_t *)tcph->th_lport; 4962 sin.sin_family = AF_INET; 4963 if (ltcp->tcp_recvdstaddr) { 4964 sin_t sind; 4965 4966 sind = sin_null; 4967 sind.sin_addr.s_addr = ipha->ipha_dst; 4968 sind.sin_port = *(uint16_t *)tcph->th_fport; 4969 sind.sin_family = AF_INET; 4970 tpi_mp = mi_tpi_extconn_ind(NULL, 4971 (char *)&sind, sizeof (sin_t), (char *)&tcp, 4972 (t_scalar_t)sizeof (intptr_t), (char *)&sind, 4973 sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4974 } else { 4975 tpi_mp = mi_tpi_conn_ind(NULL, 4976 (char *)&sin, sizeof (sin_t), 4977 (char *)&tcp, (t_scalar_t)sizeof (intptr_t), 4978 (t_scalar_t)ltcp->tcp_conn_req_seqnum); 4979 } 4980 4981 if (tpi_mp == NULL) { 4982 return (ENOMEM); 4983 } 4984 4985 connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER); 4986 connp->conn_send = ip_output; 4987 connp->conn_recv = tcp_input; 4988 connp->conn_fully_bound = B_FALSE; 4989 4990 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); 4991 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); 4992 connp->conn_fport = *(uint16_t *)tcph->th_lport; 4993 connp->conn_lport = *(uint16_t *)tcph->th_fport; 4994 4995 if (tcp_trace) { 4996 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP); 4997 } 4998 4999 /* Inherit information from the "parent" */ 5000 tcp->tcp_ipversion = ltcp->tcp_ipversion; 5001 tcp->tcp_family = ltcp->tcp_family; 5002 tcp->tcp_wq = ltcp->tcp_wq; 5003 tcp->tcp_rq = ltcp->tcp_rq; 5004 tcp->tcp_mss = tcp_mss_def_ipv4; 5005 tcp->tcp_detached = B_TRUE; 5006 if ((err = tcp_init_values(tcp)) != 0) { 5007 freemsg(tpi_mp); 5008 return (err); 5009 } 5010 5011 /* 5012 * Let's make sure that eager tcp template has enough space to 5013 * copy IPv4 listener's tcp template. Since the conn_t structure is 5014 * preserved and tcp_iphc_len is also preserved, an eager conn_t may 5015 * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or 5016 * more (in case of re-allocation of conn_t with tcp-IPv6 template with 5017 * extension headers or with ip6i_t struct). Note that bcopy() below 5018 * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_ 5019 * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener. 
5020 */ 5021 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 5022 ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH); 5023 5024 tcp->tcp_hdr_len = ltcp->tcp_hdr_len; 5025 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; 5026 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; 5027 tcp->tcp_ttl = ltcp->tcp_ttl; 5028 tcp->tcp_tos = ltcp->tcp_tos; 5029 5030 /* Copy the IP+TCP header template from listener to eager */ 5031 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); 5032 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 5033 tcp->tcp_ip6h = NULL; 5034 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + 5035 tcp->tcp_ip_hdr_len); 5036 5037 /* Initialize the IP addresses and Ports */ 5038 tcp->tcp_ipha->ipha_dst = ipha->ipha_src; 5039 tcp->tcp_ipha->ipha_src = ipha->ipha_dst; 5040 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); 5041 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t)); 5042 5043 /* Source routing option copyover (reverse it) */ 5044 if (tcp_rev_src_routes) 5045 tcp_opt_reverse(tcp, ipha); 5046 5047 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 5048 5049 /* 5050 * If the SYN contains a credential, it's a loopback packet; attach 5051 * the credential to the TPI message. 5052 */ 5053 if ((cr = DB_CRED(idmp)) != NULL) { 5054 mblk_setcred(tpi_mp, cr); 5055 DB_CPID(tpi_mp) = DB_CPID(idmp); 5056 } 5057 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; 5058 5059 /* Inherit the listener's SSL protection state */ 5060 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { 5061 kssl_hold_ent(tcp->tcp_kssl_ent); 5062 tcp->tcp_kssl_pending = B_TRUE; 5063 } 5064 5065 return (0); 5066 } 5067 5068 /* 5069 * sets up conn for ipsec. 5070 * if the first mblk is M_CTL it is consumed and mpp is updated. 5071 * in case of error mpp is freed. 
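 *
 * A sketch of the two input layouts the code below expects:
 *
 *	*mpp -> M_DATA (SYN, db_struioflag has STRUIO_POLICY set)
 * or
 *	*mpp -> M_CTL (ipsec_in_t) -> b_cont -> M_DATA (SYN, STRUIO_EAGER set)
 *
 * In the M_CTL case the control mblk is freed with freeb() on success and
 * *mpp is advanced to point at the M_DATA mblk.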
5072 */ 5073 conn_t * 5074 tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) 5075 { 5076 conn_t *connp = tcp->tcp_connp; 5077 conn_t *econnp; 5078 squeue_t *new_sqp; 5079 mblk_t *first_mp = *mpp; 5080 mblk_t *mp = *mpp; 5081 boolean_t mctl_present = B_FALSE; 5082 uint_t ipvers; 5083 5084 econnp = tcp_get_conn(sqp); 5085 if (econnp == NULL) { 5086 freemsg(first_mp); 5087 return (NULL); 5088 } 5089 if (DB_TYPE(mp) == M_CTL) { 5090 if (mp->b_cont == NULL || 5091 mp->b_cont->b_datap->db_type != M_DATA) { 5092 freemsg(first_mp); 5093 return (NULL); 5094 } 5095 mp = mp->b_cont; 5096 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) { 5097 freemsg(first_mp); 5098 return (NULL); 5099 } 5100 5101 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 5102 first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY; 5103 mctl_present = B_TRUE; 5104 } else { 5105 ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY); 5106 mp->b_datap->db_struioflag &= ~STRUIO_POLICY; 5107 } 5108 5109 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5110 DB_CKSUMSTART(mp) = 0; 5111 5112 ASSERT(OK_32PTR(mp->b_rptr)); 5113 ipvers = IPH_HDR_VERSION(mp->b_rptr); 5114 if (ipvers == IPV4_VERSION) { 5115 uint16_t *up; 5116 uint32_t ports; 5117 ipha_t *ipha; 5118 5119 ipha = (ipha_t *)mp->b_rptr; 5120 up = (uint16_t *)((uchar_t *)ipha + 5121 IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET); 5122 ports = *(uint32_t *)up; 5123 IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP, 5124 ipha->ipha_dst, ipha->ipha_src, ports); 5125 } else { 5126 uint16_t *up; 5127 uint32_t ports; 5128 uint16_t ip_hdr_len; 5129 uint8_t *nexthdrp; 5130 ip6_t *ip6h; 5131 tcph_t *tcph; 5132 5133 ip6h = (ip6_t *)mp->b_rptr; 5134 if (ip6h->ip6_nxt == IPPROTO_TCP) { 5135 ip_hdr_len = IPV6_HDR_LEN; 5136 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len, 5137 &nexthdrp) || *nexthdrp != IPPROTO_TCP) { 5138 CONN_DEC_REF(econnp); 5139 freemsg(first_mp); 5140 return (NULL); 5141 } 5142 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5143 up = (uint16_t *)tcph->th_lport; 5144 ports = *(uint32_t *)up; 5145 IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP, 5146 ip6h->ip6_dst, ip6h->ip6_src, ports); 5147 } 5148 5149 /* 5150 * The caller already ensured that there is a sqp present. 5151 */ 5152 econnp->conn_sqp = new_sqp; 5153 5154 if (connp->conn_policy != NULL) { 5155 ipsec_in_t *ii; 5156 ii = (ipsec_in_t *)(first_mp->b_rptr); 5157 ASSERT(ii->ipsec_in_policy == NULL); 5158 IPPH_REFHOLD(connp->conn_policy); 5159 ii->ipsec_in_policy = connp->conn_policy; 5160 5161 first_mp->b_datap->db_type = IPSEC_POLICY_SET; 5162 if (!ip_bind_ipsec_policy_set(econnp, first_mp)) { 5163 CONN_DEC_REF(econnp); 5164 freemsg(first_mp); 5165 return (NULL); 5166 } 5167 } 5168 5169 if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) { 5170 CONN_DEC_REF(econnp); 5171 freemsg(first_mp); 5172 return (NULL); 5173 } 5174 5175 /* 5176 * If we know we have some policy, pass the "IPSEC" 5177 * options size TCP uses this adjust the MSS. 5178 */ 5179 econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp); 5180 if (mctl_present) { 5181 freeb(first_mp); 5182 *mpp = mp; 5183 } 5184 5185 return (econnp); 5186 } 5187 5188 /* 5189 * tcp_get_conn/tcp_free_conn 5190 * 5191 * tcp_get_conn is used to get a clean tcp connection structure. 5192 * It tries to reuse the connections put on the freelist by the 5193 * time_wait_collector failing which it goes to kmem_cache. This 5194 * way has two benefits compared to just allocating from and 5195 * freeing to kmem_cache. 
5196 * 1) The time_wait_collector can free (which includes the cleanup) 5197 * outside the squeue. So when the interrupt comes, we have a clean 5198 * connection sitting in the freelist. Obviously, this buys us 5199 * performance. 5200 * 5201 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_conn_request 5202 * has multiple disadvantages - tying up the squeue during alloc, and the 5203 * fact that IPSec policy initialization has to happen here which 5204 * requires us sending an M_CTL and checking for it, i.e. real ugliness. 5205 * But allocating the conn/tcp in IP land is also not the best since 5206 * we can't check the 'q' and 'q0' which are protected by squeue and 5207 * blindly allocate memory which might have to be freed here if we are 5208 * not allowed to accept the connection. By using the freelist and 5209 * putting the conn/tcp back in the freelist, we don't pay a penalty for 5210 * allocating memory without checking 'q/q0' and freeing it if we can't 5211 * accept the connection. 5212 * 5213 * Care should be taken to put the conn back in the same squeue's freelist 5214 * from which it was allocated. Best results are obtained if the conn is 5215 * allocated from the listener's squeue and freed to the same. The time wait 5216 * collector will free up the freelist if the connection ends up sitting 5217 * there for too long. 5218 */ 5219 void * 5220 tcp_get_conn(void *arg) 5221 { 5222 tcp_t *tcp = NULL; 5223 conn_t *connp = NULL; 5224 squeue_t *sqp = (squeue_t *)arg; 5225 tcp_squeue_priv_t *tcp_time_wait; 5226 5227 tcp_time_wait = 5228 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); 5229 5230 mutex_enter(&tcp_time_wait->tcp_time_wait_lock); 5231 tcp = tcp_time_wait->tcp_free_list; 5232 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0)); 5233 if (tcp != NULL) { 5234 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; 5235 tcp_time_wait->tcp_free_list_cnt--; 5236 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 5237 tcp->tcp_time_wait_next = NULL; 5238 connp = tcp->tcp_connp; 5239 connp->conn_flags |= IPCL_REUSED; 5240 return ((void *)connp); 5241 } 5242 mutex_exit(&tcp_time_wait->tcp_time_wait_lock); 5243 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) 5244 return (NULL); 5245 return ((void *)connp); 5246 } 5247 5248 /* 5249 * Update the cached label for the given tcp_t. This should be called once per 5250 * connection, and before any packets are sent or tcp_process_options is 5251 * invoked. Returns B_FALSE if the correct label could not be constructed.
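 *
 * Note that in the IPv4 case the cached label length is rounded up to a
 * multiple of four bytes; e.g. a 10 byte label option is accounted for as
 * 12 bytes of IP option space ((10 + 3) & ~3 == 12).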
5252 */ 5253 static boolean_t 5254 tcp_update_label(tcp_t *tcp, const cred_t *cr) 5255 { 5256 conn_t *connp = tcp->tcp_connp; 5257 5258 if (tcp->tcp_ipversion == IPV4_VERSION) { 5259 uchar_t optbuf[IP_MAX_OPT_LENGTH]; 5260 int added; 5261 5262 if (tsol_compute_label(cr, tcp->tcp_remote, optbuf, 5263 connp->conn_mac_exempt) != 0) 5264 return (B_FALSE); 5265 5266 added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len); 5267 if (added == -1) 5268 return (B_FALSE); 5269 tcp->tcp_hdr_len += added; 5270 tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added); 5271 tcp->tcp_ip_hdr_len += added; 5272 if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) { 5273 tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3; 5274 added = tsol_prepend_option(optbuf, tcp->tcp_ipha, 5275 tcp->tcp_hdr_len); 5276 if (added == -1) 5277 return (B_FALSE); 5278 tcp->tcp_hdr_len += added; 5279 tcp->tcp_tcph = (tcph_t *) 5280 ((uchar_t *)tcp->tcp_tcph + added); 5281 tcp->tcp_ip_hdr_len += added; 5282 } 5283 } else { 5284 uchar_t optbuf[TSOL_MAX_IPV6_OPTION]; 5285 5286 if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf, 5287 connp->conn_mac_exempt) != 0) 5288 return (B_FALSE); 5289 if (tsol_update_sticky(&tcp->tcp_sticky_ipp, 5290 &tcp->tcp_label_len, optbuf) != 0) 5291 return (B_FALSE); 5292 if (tcp_build_hdrs(tcp->tcp_rq, tcp) != 0) 5293 return (B_FALSE); 5294 } 5295 5296 connp->conn_ulp_labeled = 1; 5297 5298 return (B_TRUE); 5299 } 5300 5301 /* BEGIN CSTYLED */ 5302 /* 5303 * 5304 * The sockfs ACCEPT path: 5305 * ======================= 5306 * 5307 * The eager is now established in its own perimeter as soon as SYN is 5308 * received in tcp_conn_request(). When sockfs receives conn_ind, it 5309 * completes the accept processing on the acceptor STREAM. The sending 5310 * of conn_ind part is common for both sockfs listener and a TLI/XTI 5311 * listener but a TLI/XTI listener completes the accept processing 5312 * on the listener perimeter. 5313 * 5314 * Common control flow for 3 way handshake: 5315 * ---------------------------------------- 5316 * 5317 * incoming SYN (listener perimeter) -> tcp_rput_data() 5318 * -> tcp_conn_request() 5319 * 5320 * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data() 5321 * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() 5322 * 5323 * Sockfs ACCEPT Path: 5324 * ------------------- 5325 * 5326 * open acceptor stream (ip_tcpopen allocates tcp_wput_accept() 5327 * as STREAM entry point) 5328 * 5329 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept() 5330 * 5331 * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager 5332 * association (we are not behind eager's squeue but sockfs is protecting us 5333 * and no one knows about this stream yet. The STREAMS entry point q->q_info 5334 * is changed to point at tcp_wput(). 5335 * 5336 * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to 5337 * listener (done on listener's perimeter). 5338 * 5339 * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish 5340 * accept. 5341 * 5342 * TLI/XTI client ACCEPT path: 5343 * --------------------------- 5344 * 5345 * soaccept() sends T_CONN_RES on the listener STREAM. 5346 * 5347 * tcp_accept() -> tcp_accept_swap() complete the processing and send 5348 * the bind_mp to eager perimeter to finish accept (tcp_rput_other()). 5349 * 5350 * Locks: 5351 * ====== 5352 * 5353 * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0 and 5354 * and listeners->tcp_eager_next_q. 
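 *
 * The usual pattern for walking the eager lists is therefore:
 *
 *	mutex_enter(&listener->tcp_eager_lock);
 *	<locate the eager of interest on tcp_eager_next_q/_q0>
 *	CONN_INC_REF(eager->tcp_connp);
 *	mutex_exit(&listener->tcp_eager_lock);
 *	<hand the eager off to its own squeue>
 *
 * as done, for instance, by tcp_eager_blowoff() and tcp_eager_cleanup()
 * further below.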
5355 * 5356 * Referencing: 5357 * ============ 5358 * 5359 * 1) We start out in tcp_conn_request by eager placing a ref on 5360 * listener and listener adding eager to listeners->tcp_eager_next_q0. 5361 * 5362 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before 5363 * doing so we place a ref on the eager. This ref is finally dropped at the 5364 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the 5365 * reference is dropped by the squeue framework. 5366 * 5367 * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish. 5368 * 5369 * The reference must be released by the same entity that added the reference. 5370 * In the above scheme, the eager is the entity that adds and releases the 5371 * references. Note that tcp_accept_finish executes in the squeue of the eager 5372 * (albeit after it is attached to the acceptor stream). Though 1) executes 5373 * in the listener's squeue, the eager is nascent at this point and the 5374 * reference can be considered to have been added on behalf of the eager. 5375 * 5376 * Eager getting a Reset or listener closing: 5377 * ========================================== 5378 * 5379 * Once the listener and eager are linked, the listener never does the unlink. 5380 * If the listener needs to close, tcp_eager_cleanup() is called which queues 5381 * a message on each eager's perimeter. The eager then does the unlink, clears 5382 * any pointers to the listener's queue and drops the reference to the 5383 * listener. The listener waits in tcp_close outside the squeue until its 5384 * refcount has dropped to 1. This ensures that the listener has waited for 5385 * all eagers to clear their association with the listener. 5386 * 5387 * Similarly, if the eager decides to go away, it can unlink itself and close. 5388 * When the T_CONN_RES comes down, we check if the eager has closed. Note that 5389 * the reference to the eager is still valid because of the extra ref we put 5390 * in tcp_send_conn_ind. 5391 * 5392 * The listener can always locate the eager under the protection 5393 * of the listener->tcp_eager_lock, and then do a refhold 5394 * on the eager during the accept processing. 5395 * 5396 * The acceptor stream accesses the eager in the accept processing 5397 * based on the ref placed on the eager before sending T_conn_ind. 5398 * The only entity that can negate this refhold is a listener close, 5399 * which is mutually exclusive with an active acceptor stream. 5400 * 5401 * Eager's reference on the listener 5402 * =================================== 5403 * 5404 * If the accept happens (even on a closed eager) the eager drops its 5405 * reference on the listener at the start of tcp_accept_finish. If the 5406 * eager is killed due to an incoming RST before the T_conn_ind is sent up, 5407 * the reference is dropped in tcp_closei_local. If the listener closes, 5408 * the reference is dropped in tcp_eager_kill. In all cases the reference 5409 * is dropped while executing in the eager's context (squeue). 5410 */ 5411 /* END CSTYLED */ 5412 5413 /* Process the SYN packet, mp, directed at the listener 'tcp' */ 5414 5415 /* 5416 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. 5417 * tcp_rput_data will not see any SYN packets.
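 *
 * Conceptually the squeue handoff looks like the one done by
 * tcp_conn_request_unbound() further below:
 *
 *	CONN_INC_REF(connp);
 *	squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp,
 *	    SQTAG_TCP_CONN_REQ_UNBOUND);
 *
 * with conn_recv pointing at tcp_conn_request for a listener, so the SYN
 * is always processed inside the listener's perimeter.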
5418 */ 5419 /* ARGSUSED */ 5420 void 5421 tcp_conn_request(void *arg, mblk_t *mp, void *arg2) 5422 { 5423 tcph_t *tcph; 5424 uint32_t seg_seq; 5425 tcp_t *eager; 5426 uint_t ipvers; 5427 ipha_t *ipha; 5428 ip6_t *ip6h; 5429 int err; 5430 conn_t *econnp = NULL; 5431 squeue_t *new_sqp; 5432 mblk_t *mp1; 5433 uint_t ip_hdr_len; 5434 conn_t *connp = (conn_t *)arg; 5435 tcp_t *tcp = connp->conn_tcp; 5436 ire_t *ire; 5437 cred_t *credp; 5438 5439 if (tcp->tcp_state != TCPS_LISTEN) 5440 goto error2; 5441 5442 ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0); 5443 5444 mutex_enter(&tcp->tcp_eager_lock); 5445 if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { 5446 mutex_exit(&tcp->tcp_eager_lock); 5447 TCP_STAT(tcp_listendrop); 5448 BUMP_MIB(&tcp_mib, tcpListenDrop); 5449 if (tcp->tcp_debug) { 5450 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 5451 "tcp_conn_request: listen backlog (max=%d) " 5452 "overflow (%d pending) on %s", 5453 tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, 5454 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 5455 } 5456 goto error2; 5457 } 5458 5459 if (tcp->tcp_conn_req_cnt_q0 >= 5460 tcp->tcp_conn_req_max + tcp_conn_req_max_q0) { 5461 /* 5462 * Q0 is full. Drop a pending half-open req from the queue 5463 * to make room for the new SYN req. Also mark the time we 5464 * drop a SYN. 5465 * 5466 * A more aggressive defense against SYN attack will 5467 * be to set the "tcp_syn_defense" flag now. 5468 */ 5469 TCP_STAT(tcp_listendropq0); 5470 tcp->tcp_last_rcv_lbolt = lbolt64; 5471 if (!tcp_drop_q0(tcp)) { 5472 mutex_exit(&tcp->tcp_eager_lock); 5473 BUMP_MIB(&tcp_mib, tcpListenDropQ0); 5474 if (tcp->tcp_debug) { 5475 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, 5476 "tcp_conn_request: listen half-open queue " 5477 "(max=%d) full (%d pending) on %s", 5478 tcp_conn_req_max_q0, 5479 tcp->tcp_conn_req_cnt_q0, 5480 tcp_display(tcp, NULL, 5481 DISP_PORT_ONLY)); 5482 } 5483 goto error2; 5484 } 5485 } 5486 mutex_exit(&tcp->tcp_eager_lock); 5487 5488 /* 5489 * IP adds STRUIO_EAGER and ensures that the received packet is 5490 * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6 5491 * link local address. If IPSec is enabled, db_struioflag has 5492 * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER); 5493 * otherwise an error case if neither of them is set. 5494 */ 5495 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 5496 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5497 DB_CKSUMSTART(mp) = 0; 5498 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 5499 econnp = (conn_t *)tcp_get_conn(arg2); 5500 if (econnp == NULL) 5501 goto error2; 5502 econnp->conn_sqp = new_sqp; 5503 } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) { 5504 /* 5505 * mp is updated in tcp_get_ipsec_conn(). 5506 */ 5507 econnp = tcp_get_ipsec_conn(tcp, arg2, &mp); 5508 if (econnp == NULL) { 5509 /* 5510 * mp freed by tcp_get_ipsec_conn. 
5511 */ 5512 return; 5513 } 5514 } else { 5515 goto error2; 5516 } 5517 5518 ASSERT(DB_TYPE(mp) == M_DATA); 5519 5520 ipvers = IPH_HDR_VERSION(mp->b_rptr); 5521 ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION); 5522 ASSERT(OK_32PTR(mp->b_rptr)); 5523 if (ipvers == IPV4_VERSION) { 5524 ipha = (ipha_t *)mp->b_rptr; 5525 ip_hdr_len = IPH_HDR_LENGTH(ipha); 5526 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5527 } else { 5528 ip6h = (ip6_t *)mp->b_rptr; 5529 ip_hdr_len = ip_hdr_length_v6(mp, ip6h); 5530 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 5531 } 5532 5533 if (tcp->tcp_family == AF_INET) { 5534 ASSERT(ipvers == IPV4_VERSION); 5535 err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp); 5536 } else { 5537 err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp); 5538 } 5539 5540 if (err) 5541 goto error3; 5542 5543 eager = econnp->conn_tcp; 5544 5545 /* Inherit various TCP parameters from the listener */ 5546 eager->tcp_naglim = tcp->tcp_naglim; 5547 eager->tcp_first_timer_threshold = 5548 tcp->tcp_first_timer_threshold; 5549 eager->tcp_second_timer_threshold = 5550 tcp->tcp_second_timer_threshold; 5551 5552 eager->tcp_first_ctimer_threshold = 5553 tcp->tcp_first_ctimer_threshold; 5554 eager->tcp_second_ctimer_threshold = 5555 tcp->tcp_second_ctimer_threshold; 5556 5557 /* 5558 * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics. 5559 * If it does not, the eager's receive window will be set to the 5560 * listener's receive window later in this function. 5561 */ 5562 eager->tcp_rwnd = 0; 5563 5564 /* 5565 * Inherit listener's tcp_init_cwnd. Need to do this before 5566 * calling tcp_process_options() where tcp_mss_set() is called 5567 * to set the initial cwnd. 5568 */ 5569 eager->tcp_init_cwnd = tcp->tcp_init_cwnd; 5570 5571 /* 5572 * Zones: tcp_adapt_ire() and tcp_send_data() both need the 5573 * zone id before the accept is completed in tcp_wput_accept(). 5574 */ 5575 econnp->conn_zoneid = connp->conn_zoneid; 5576 5577 /* Copy nexthop information from listener to eager */ 5578 if (connp->conn_nexthop_set) { 5579 econnp->conn_nexthop_set = connp->conn_nexthop_set; 5580 econnp->conn_nexthop_v4 = connp->conn_nexthop_v4; 5581 } 5582 5583 /* 5584 * TSOL: tsol_input_proc() needs the eager's cred before the 5585 * eager is accepted 5586 */ 5587 econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred; 5588 crhold(credp); 5589 5590 /* 5591 * If the caller has the process-wide flag set, then default to MAC 5592 * exempt mode. This allows read-down to unlabeled hosts. 
5593 */ 5594 if (getpflags(NET_MAC_AWARE, credp) != 0) 5595 econnp->conn_mac_exempt = B_TRUE; 5596 5597 if (is_system_labeled()) { 5598 cred_t *cr; 5599 5600 if (connp->conn_mlp_type != mlptSingle) { 5601 cr = econnp->conn_peercred = DB_CRED(mp); 5602 if (cr != NULL) 5603 crhold(cr); 5604 else 5605 cr = econnp->conn_cred; 5606 DTRACE_PROBE2(mlp_syn_accept, conn_t *, 5607 econnp, cred_t *, cr) 5608 } else { 5609 cr = econnp->conn_cred; 5610 DTRACE_PROBE2(syn_accept, conn_t *, 5611 econnp, cred_t *, cr) 5612 } 5613 5614 if (!tcp_update_label(eager, cr)) { 5615 DTRACE_PROBE3( 5616 tx__ip__log__error__connrequest__tcp, 5617 char *, "eager connp(1) label on SYN mp(2) failed", 5618 conn_t *, econnp, mblk_t *, mp); 5619 goto error3; 5620 } 5621 } 5622 5623 eager->tcp_hard_binding = B_TRUE; 5624 5625 tcp_bind_hash_insert(&tcp_bind_fanout[ 5626 TCP_BIND_HASH(eager->tcp_lport)], eager, 0); 5627 5628 CL_INET_CONNECT(eager); 5629 5630 /* 5631 * No need to check for multicast destination since ip will only pass 5632 * up multicasts to those that have expressed interest 5633 * TODO: what about rejecting broadcasts? 5634 * Also check that source is not a multicast or broadcast address. 5635 */ 5636 eager->tcp_state = TCPS_SYN_RCVD; 5637 5638 5639 /* 5640 * There should be no ire in the mp as we are being called after 5641 * receiving the SYN. 5642 */ 5643 ASSERT(tcp_ire_mp(mp) == NULL); 5644 5645 /* 5646 * Adapt our mss, ttl, ... according to information provided in IRE. 5647 */ 5648 5649 if (tcp_adapt_ire(eager, NULL) == 0) { 5650 /* Undo the bind_hash_insert */ 5651 tcp_bind_hash_remove(eager); 5652 goto error3; 5653 } 5654 5655 /* Process all TCP options. */ 5656 tcp_process_options(eager, tcph); 5657 5658 /* Is the other end ECN capable? */ 5659 if (tcp_ecn_permitted >= 1 && 5660 (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { 5661 eager->tcp_ecn_ok = B_TRUE; 5662 } 5663 5664 /* 5665 * listener->tcp_rq->q_hiwat should be the default window size or a 5666 * window size changed via SO_RCVBUF option. First round up the 5667 * eager's tcp_rwnd to the nearest MSS. Then find out the window 5668 * scale option value if needed. Call tcp_rwnd_set() to finish the 5669 * setting. 5670 * 5671 * Note if there is a rpipe metric associated with the remote host, 5672 * we should not inherit receive window size from listener. 5673 */ 5674 eager->tcp_rwnd = MSS_ROUNDUP( 5675 (eager->tcp_rwnd == 0 ? tcp->tcp_rq->q_hiwat : 5676 eager->tcp_rwnd), eager->tcp_mss); 5677 if (eager->tcp_snd_ws_ok) 5678 tcp_set_ws_value(eager); 5679 /* 5680 * Note that this is the only place tcp_rwnd_set() is called for 5681 * accepting a connection. We need to call it here instead of 5682 * after the 3-way handshake because we need to tell the other 5683 * side our rwnd in the SYN-ACK segment. 5684 */ 5685 (void) tcp_rwnd_set(eager, eager->tcp_rwnd); 5686 5687 /* 5688 * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ 5689 * via soaccept()->soinheritoptions() which essentially applies 5690 * all the listener options to the new STREAM. The options that we 5691 * need to take care of are: 5692 * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST, 5693 * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER, 5694 * SO_SNDBUF, SO_RCVBUF. 5695 * 5696 * SO_RCVBUF: tcp_rwnd_set() above takes care of it. 5697 * SO_SNDBUF: Set the tcp_xmit_hiwater for the eager. When 5698 * tcp_maxpsz_set() gets called later from 5699 * tcp_accept_finish(), the option takes effect. 
5700 * 5701 */ 5702 /* Set the TCP options */ 5703 eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater; 5704 eager->tcp_dgram_errind = tcp->tcp_dgram_errind; 5705 eager->tcp_oobinline = tcp->tcp_oobinline; 5706 eager->tcp_reuseaddr = tcp->tcp_reuseaddr; 5707 eager->tcp_broadcast = tcp->tcp_broadcast; 5708 eager->tcp_useloopback = tcp->tcp_useloopback; 5709 eager->tcp_dontroute = tcp->tcp_dontroute; 5710 eager->tcp_linger = tcp->tcp_linger; 5711 eager->tcp_lingertime = tcp->tcp_lingertime; 5712 if (tcp->tcp_ka_enabled) 5713 eager->tcp_ka_enabled = 1; 5714 5715 /* Set the IP options */ 5716 econnp->conn_broadcast = connp->conn_broadcast; 5717 econnp->conn_loopback = connp->conn_loopback; 5718 econnp->conn_dontroute = connp->conn_dontroute; 5719 econnp->conn_reuseaddr = connp->conn_reuseaddr; 5720 5721 /* Put a ref on the listener for the eager. */ 5722 CONN_INC_REF(connp); 5723 mutex_enter(&tcp->tcp_eager_lock); 5724 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; 5725 eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; 5726 tcp->tcp_eager_next_q0 = eager; 5727 eager->tcp_eager_prev_q0 = tcp; 5728 5729 /* Set tcp_listener before adding it to tcp_conn_fanout */ 5730 eager->tcp_listener = tcp; 5731 eager->tcp_saved_listener = tcp; 5732 5733 /* 5734 * Tag this detached tcp vector for later retrieval 5735 * by our listener client in tcp_accept(). 5736 */ 5737 eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum; 5738 tcp->tcp_conn_req_cnt_q0++; 5739 if (++tcp->tcp_conn_req_seqnum == -1) { 5740 /* 5741 * -1 is "special" and defined in TPI as something 5742 * that should never be used in T_CONN_IND 5743 */ 5744 ++tcp->tcp_conn_req_seqnum; 5745 } 5746 mutex_exit(&tcp->tcp_eager_lock); 5747 5748 if (tcp->tcp_syn_defense) { 5749 /* Don't drop the SYN that comes from a good IP source */ 5750 ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache); 5751 if (addr_cache != NULL && eager->tcp_remote == 5752 addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) { 5753 eager->tcp_dontdrop = B_TRUE; 5754 } 5755 } 5756 5757 /* 5758 * We need to insert the eager in its own perimeter but as soon 5759 * as we do that, we expose the eager to the classifier and 5760 * should not touch any field outside the eager's perimeter. 5761 * So do all the work necessary before inserting the eager 5762 * in its own perimeter. Be optimistic that ipcl_conn_insert() 5763 * will succeed but undo everything if it fails. 5764 */ 5765 seg_seq = ABE32_TO_U32(tcph->th_seq); 5766 eager->tcp_irs = seg_seq; 5767 eager->tcp_rack = seg_seq; 5768 eager->tcp_rnxt = seg_seq + 1; 5769 U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack); 5770 BUMP_MIB(&tcp_mib, tcpPassiveOpens); 5771 eager->tcp_state = TCPS_SYN_RCVD; 5772 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, 5773 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); 5774 if (mp1 == NULL) 5775 goto error1; 5776 DB_CPID(mp1) = tcp->tcp_cpid; 5777 5778 /* 5779 * We need to start the rto timer. In normal case, we start 5780 * the timer after sending the packet on the wire (or at 5781 * least believing that packet was sent by waiting for 5782 * CALL_IP_WPUT() to return). Since this is the first packet 5783 * being sent on the wire for the eager, our initial tcp_rto 5784 * is at least tcp_rexmit_interval_min which is a fairly 5785 * large value to allow the algorithm to adjust slowly to large 5786 * fluctuations of RTT during first few transmissions. 
5787 * 5788 * Starting the timer first and then sending the packet in this 5789 * case shouldn't make much difference since tcp_rexmit_interval_min 5790 * is of the order of several 100ms and starting the timer 5791 * first and then sending the packet will result in difference 5792 * of few micro seconds. 5793 * 5794 * Without this optimization, we are forced to hold the fanout 5795 * lock across the ipcl_bind_insert() and sending the packet 5796 * so that we don't race against an incoming packet (maybe RST) 5797 * for this eager. 5798 */ 5799 5800 TCP_RECORD_TRACE(eager, mp1, TCP_TRACE_SEND_PKT); 5801 TCP_TIMER_RESTART(eager, eager->tcp_rto); 5802 5803 5804 /* 5805 * Insert the eager in its own perimeter now. We are ready to deal 5806 * with any packets on eager. 5807 */ 5808 if (eager->tcp_ipversion == IPV4_VERSION) { 5809 if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) { 5810 goto error; 5811 } 5812 } else { 5813 if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) { 5814 goto error; 5815 } 5816 } 5817 5818 /* mark conn as fully-bound */ 5819 econnp->conn_fully_bound = B_TRUE; 5820 5821 /* Send the SYN-ACK */ 5822 tcp_send_data(eager, eager->tcp_wq, mp1); 5823 freemsg(mp); 5824 5825 return; 5826 error: 5827 (void) TCP_TIMER_CANCEL(eager, eager->tcp_timer_tid); 5828 freemsg(mp1); 5829 error1: 5830 /* Undo what we did above */ 5831 mutex_enter(&tcp->tcp_eager_lock); 5832 tcp_eager_unlink(eager); 5833 mutex_exit(&tcp->tcp_eager_lock); 5834 /* Drop eager's reference on the listener */ 5835 CONN_DEC_REF(connp); 5836 5837 /* 5838 * Delete the cached ire in conn_ire_cache and also mark 5839 * the conn as CONDEMNED 5840 */ 5841 mutex_enter(&econnp->conn_lock); 5842 econnp->conn_state_flags |= CONN_CONDEMNED; 5843 ire = econnp->conn_ire_cache; 5844 econnp->conn_ire_cache = NULL; 5845 mutex_exit(&econnp->conn_lock); 5846 if (ire != NULL) 5847 IRE_REFRELE_NOTR(ire); 5848 5849 /* 5850 * tcp_accept_comm inserts the eager to the bind_hash 5851 * we need to remove it from the hash if ipcl_conn_insert 5852 * fails. 5853 */ 5854 tcp_bind_hash_remove(eager); 5855 /* Drop the eager ref placed in tcp_open_detached */ 5856 CONN_DEC_REF(econnp); 5857 5858 /* 5859 * If a connection already exists, send the mp to that connections so 5860 * that it can be appropriately dealt with. 5861 */ 5862 if ((econnp = ipcl_classify(mp, connp->conn_zoneid)) != NULL) { 5863 if (!IPCL_IS_CONNECTED(econnp)) { 5864 /* 5865 * Something bad happened. ipcl_conn_insert() 5866 * failed because a connection already existed 5867 * in connected hash but we can't find it 5868 * anymore (someone blew it away). Just 5869 * free this message and hopefully remote 5870 * will retransmit at which time the SYN can be 5871 * treated as a new connection or dealth with 5872 * a TH_RST if a connection already exists. 5873 */ 5874 freemsg(mp); 5875 } else { 5876 squeue_fill(econnp->conn_sqp, mp, tcp_input, 5877 econnp, SQTAG_TCP_CONN_REQ); 5878 } 5879 } else { 5880 /* Nobody wants this packet */ 5881 freemsg(mp); 5882 } 5883 return; 5884 error2: 5885 freemsg(mp); 5886 return; 5887 error3: 5888 CONN_DEC_REF(econnp); 5889 freemsg(mp); 5890 } 5891 5892 /* 5893 * In an ideal case of vertical partition in NUMA architecture, its 5894 * beneficial to have the listener and all the incoming connections 5895 * tied to the same squeue. 
The other constraint is that incoming 5896 * connections should be tied to the squeue attached to interrupted 5897 * CPU for obvious locality reason so this leaves the listener to 5898 * be tied to the same squeue. Our only problem is that when listener 5899 * is binding, the CPU that will get interrupted by the NIC whose 5900 * IP address the listener is binding to is not even known. So 5901 * the code below allows us to change that binding at the time the 5902 * CPU is interrupted by virtue of incoming connection's squeue. 5903 * 5904 * This is usefull only in case of a listener bound to a specific IP 5905 * address. For other kind of listeners, they get bound the 5906 * very first time and there is no attempt to rebind them. 5907 */ 5908 void 5909 tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) 5910 { 5911 conn_t *connp = (conn_t *)arg; 5912 squeue_t *sqp = (squeue_t *)arg2; 5913 squeue_t *new_sqp; 5914 uint32_t conn_flags; 5915 5916 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 5917 new_sqp = (squeue_t *)DB_CKSUMSTART(mp); 5918 } else { 5919 goto done; 5920 } 5921 5922 if (connp->conn_fanout == NULL) 5923 goto done; 5924 5925 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) { 5926 mutex_enter(&connp->conn_fanout->connf_lock); 5927 mutex_enter(&connp->conn_lock); 5928 /* 5929 * No one from read or write side can access us now 5930 * except for already queued packets on this squeue. 5931 * But since we haven't changed the squeue yet, they 5932 * can't execute. If they are processed after we have 5933 * changed the squeue, they are sent back to the 5934 * correct squeue down below. 5935 */ 5936 if (connp->conn_sqp != new_sqp) { 5937 while (connp->conn_sqp != new_sqp) 5938 (void) casptr(&connp->conn_sqp, sqp, new_sqp); 5939 } 5940 5941 do { 5942 conn_flags = connp->conn_flags; 5943 conn_flags |= IPCL_FULLY_BOUND; 5944 (void) cas32(&connp->conn_flags, connp->conn_flags, 5945 conn_flags); 5946 } while (!(connp->conn_flags & IPCL_FULLY_BOUND)); 5947 5948 mutex_exit(&connp->conn_fanout->connf_lock); 5949 mutex_exit(&connp->conn_lock); 5950 } 5951 5952 done: 5953 if (connp->conn_sqp != sqp) { 5954 CONN_INC_REF(connp); 5955 squeue_fill(connp->conn_sqp, mp, 5956 connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND); 5957 } else { 5958 tcp_conn_request(connp, mp, sqp); 5959 } 5960 } 5961 5962 /* 5963 * Successful connect request processing begins when our client passes 5964 * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes 5965 * our T_OK_ACK reply message upstream. The control flow looks like this: 5966 * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_connect() -> IP 5967 * upstream <- tcp_rput() <- IP 5968 * After various error checks are completed, tcp_connect() lays 5969 * the target address and port into the composite header template, 5970 * preallocates the T_OK_ACK reply message, construct a full 12 byte bind 5971 * request followed by an IRE request, and passes the three mblk message 5972 * down to IP looking like this: 5973 * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client 5974 * Processing continues in tcp_rput() when we receive the following message: 5975 * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client 5976 * After consuming the first two mblks, tcp_rput() calls tcp_timer(), 5977 * to fire off the connection request, and then passes the T_OK_ACK mblk 5978 * upstream that we filled in below. There are, of course, numerous 5979 * error conditions along the way which truncate the processing described 5980 * above. 
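 *
 * For reference, a minimal user level TLI client that ends up driving
 * this path might look roughly like the sketch below (headers, error
 * handling and option negotiation omitted; the address and port are
 * only examples):
 *
 *	int fd = t_open("/dev/tcp", O_RDWR, NULL);
 *	struct t_call *call;
 *	struct sockaddr_in sin;
 *
 *	(void) t_bind(fd, NULL, NULL);
 *	call = (struct t_call *)t_alloc(fd, T_CALL, T_ADDR);
 *	(void) memset(&sin, 0, sizeof (sin));
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(80);
 *	sin.sin_addr.s_addr = inet_addr("192.0.2.5");
 *	(void) memcpy(call->addr.buf, &sin, sizeof (sin));
 *	call->addr.len = sizeof (sin);
 *	(void) t_connect(fd, call, NULL);
 *
 * The t_connect() above is what arrives here as the T_CONN_REQ handled
 * by tcp_connect() below.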
5981 */ 5982 static void 5983 tcp_connect(tcp_t *tcp, mblk_t *mp) 5984 { 5985 sin_t *sin; 5986 sin6_t *sin6; 5987 queue_t *q = tcp->tcp_wq; 5988 struct T_conn_req *tcr; 5989 ipaddr_t *dstaddrp; 5990 in_port_t dstport; 5991 uint_t srcid; 5992 5993 tcr = (struct T_conn_req *)mp->b_rptr; 5994 5995 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 5996 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { 5997 tcp_err_ack(tcp, mp, TPROTO, 0); 5998 return; 5999 } 6000 6001 /* 6002 * Determine packet type based on type of address passed in 6003 * the request should contain an IPv4 or IPv6 address. 6004 * Make sure that address family matches the type of 6005 * family of the the address passed down 6006 */ 6007 switch (tcr->DEST_length) { 6008 default: 6009 tcp_err_ack(tcp, mp, TBADADDR, 0); 6010 return; 6011 6012 case (sizeof (sin_t) - sizeof (sin->sin_zero)): { 6013 /* 6014 * XXX: The check for valid DEST_length was not there 6015 * in earlier releases and some buggy 6016 * TLI apps (e.g Sybase) got away with not feeding 6017 * in sin_zero part of address. 6018 * We allow that bug to keep those buggy apps humming. 6019 * Test suites require the check on DEST_length. 6020 * We construct a new mblk with valid DEST_length 6021 * free the original so the rest of the code does 6022 * not have to keep track of this special shorter 6023 * length address case. 6024 */ 6025 mblk_t *nmp; 6026 struct T_conn_req *ntcr; 6027 sin_t *nsin; 6028 6029 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + 6030 tcr->OPT_length, BPRI_HI); 6031 if (nmp == NULL) { 6032 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 6033 return; 6034 } 6035 ntcr = (struct T_conn_req *)nmp->b_rptr; 6036 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ 6037 ntcr->PRIM_type = T_CONN_REQ; 6038 ntcr->DEST_length = sizeof (sin_t); 6039 ntcr->DEST_offset = sizeof (struct T_conn_req); 6040 6041 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); 6042 *nsin = sin_null; 6043 /* Get pointer to shorter address to copy from original mp */ 6044 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 6045 tcr->DEST_length); /* extract DEST_length worth of sin_t */ 6046 if (sin == NULL || !OK_32PTR((char *)sin)) { 6047 freemsg(nmp); 6048 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6049 return; 6050 } 6051 nsin->sin_family = sin->sin_family; 6052 nsin->sin_port = sin->sin_port; 6053 nsin->sin_addr = sin->sin_addr; 6054 /* Note:nsin->sin_zero zero-fill with sin_null assign above */ 6055 nmp->b_wptr = (uchar_t *)&nsin[1]; 6056 if (tcr->OPT_length != 0) { 6057 ntcr->OPT_length = tcr->OPT_length; 6058 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; 6059 bcopy((uchar_t *)tcr + tcr->OPT_offset, 6060 (uchar_t *)ntcr + ntcr->OPT_offset, 6061 tcr->OPT_length); 6062 nmp->b_wptr += tcr->OPT_length; 6063 } 6064 freemsg(mp); /* original mp freed */ 6065 mp = nmp; /* re-initialize original variables */ 6066 tcr = ntcr; 6067 } 6068 /* FALLTHRU */ 6069 6070 case sizeof (sin_t): 6071 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, 6072 sizeof (sin_t)); 6073 if (sin == NULL || !OK_32PTR((char *)sin)) { 6074 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6075 return; 6076 } 6077 if (tcp->tcp_family != AF_INET || 6078 sin->sin_family != AF_INET) { 6079 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6080 return; 6081 } 6082 if (sin->sin_port == 0) { 6083 tcp_err_ack(tcp, mp, TBADADDR, 0); 6084 return; 6085 } 6086 if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) { 6087 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6088 return; 6089 } 6090 6091 break; 6092 6093 
case sizeof (sin6_t): 6094 sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset, 6095 sizeof (sin6_t)); 6096 if (sin6 == NULL || !OK_32PTR((char *)sin6)) { 6097 tcp_err_ack(tcp, mp, TSYSERR, EINVAL); 6098 return; 6099 } 6100 if (tcp->tcp_family != AF_INET6 || 6101 sin6->sin6_family != AF_INET6) { 6102 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT); 6103 return; 6104 } 6105 if (sin6->sin6_port == 0) { 6106 tcp_err_ack(tcp, mp, TBADADDR, 0); 6107 return; 6108 } 6109 break; 6110 } 6111 /* 6112 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we 6113 * should key on their sequence number and cut them loose. 6114 */ 6115 6116 /* 6117 * If options passed in, feed it for verification and handling 6118 */ 6119 if (tcr->OPT_length != 0) { 6120 mblk_t *ok_mp; 6121 mblk_t *discon_mp; 6122 mblk_t *conn_opts_mp; 6123 int t_error, sys_error, do_disconnect; 6124 6125 conn_opts_mp = NULL; 6126 6127 if (tcp_conprim_opt_process(tcp, mp, 6128 &do_disconnect, &t_error, &sys_error) < 0) { 6129 if (do_disconnect) { 6130 ASSERT(t_error == 0 && sys_error == 0); 6131 discon_mp = mi_tpi_discon_ind(NULL, 6132 ECONNREFUSED, 0); 6133 if (!discon_mp) { 6134 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 6135 TSYSERR, ENOMEM); 6136 return; 6137 } 6138 ok_mp = mi_tpi_ok_ack_alloc(mp); 6139 if (!ok_mp) { 6140 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6141 TSYSERR, ENOMEM); 6142 return; 6143 } 6144 qreply(q, ok_mp); 6145 qreply(q, discon_mp); /* no flush! */ 6146 } else { 6147 ASSERT(t_error != 0); 6148 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, 6149 sys_error); 6150 } 6151 return; 6152 } 6153 /* 6154 * Success in setting options, the mp option buffer represented 6155 * by OPT_length/offset has been potentially modified and 6156 * contains results of option processing. We copy it in 6157 * another mp to save it for potentially influencing returning 6158 * it in T_CONN_CONN. 6159 */ 6160 if (tcr->OPT_length != 0) { /* there are resulting options */ 6161 conn_opts_mp = copyb(mp); 6162 if (!conn_opts_mp) { 6163 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, 6164 TSYSERR, ENOMEM); 6165 return; 6166 } 6167 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); 6168 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; 6169 /* 6170 * Note: 6171 * These resulting option negotiation can include any 6172 * end-to-end negotiation options but there no such 6173 * thing (yet?) in our TCP/IP. 6174 */ 6175 } 6176 } 6177 6178 /* 6179 * If we're connecting to an IPv4-mapped IPv6 address, we need to 6180 * make sure that the template IP header in the tcp structure is an 6181 * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We 6182 * need to this before we call tcp_bindi() so that the port lookup 6183 * code will look for ports in the correct port space (IPv4 and 6184 * IPv6 have separate port spaces). 
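 *
 * For example, a connect from an AF_INET6 socket to the IPv4-mapped
 * destination ::ffff:192.0.2.5 is handled by the IPv4 code and must
 * compete for ports in the IPv4 port space, just as an AF_INET socket
 * connecting to 192.0.2.5 would.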
6185 */ 6186 if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION && 6187 IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 6188 int err = 0; 6189 6190 err = tcp_header_init_ipv4(tcp); 6191 if (err != 0) { 6192 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6193 goto connect_failed; 6194 } 6195 if (tcp->tcp_lport != 0) 6196 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; 6197 } 6198 6199 switch (tcp->tcp_state) { 6200 case TCPS_IDLE: 6201 /* 6202 * We support quick connect, refer to comments in 6203 * tcp_connect_*() 6204 */ 6205 /* FALLTHRU */ 6206 case TCPS_BOUND: 6207 case TCPS_LISTEN: 6208 if (tcp->tcp_family == AF_INET6) { 6209 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { 6210 tcp_connect_ipv6(tcp, mp, 6211 &sin6->sin6_addr, 6212 sin6->sin6_port, sin6->sin6_flowinfo, 6213 sin6->__sin6_src_id, sin6->sin6_scope_id); 6214 return; 6215 } 6216 /* 6217 * Destination adress is mapped IPv6 address. 6218 * Source bound address should be unspecified or 6219 * IPv6 mapped address as well. 6220 */ 6221 if (!IN6_IS_ADDR_UNSPECIFIED( 6222 &tcp->tcp_bound_source_v6) && 6223 !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) { 6224 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, 6225 EADDRNOTAVAIL); 6226 break; 6227 } 6228 dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); 6229 dstport = sin6->sin6_port; 6230 srcid = sin6->__sin6_src_id; 6231 } else { 6232 dstaddrp = &sin->sin_addr.s_addr; 6233 dstport = sin->sin_port; 6234 srcid = 0; 6235 } 6236 6237 tcp_connect_ipv4(tcp, mp, dstaddrp, dstport, srcid); 6238 return; 6239 default: 6240 mp = mi_tpi_err_ack_alloc(mp, TOUTSTATE, 0); 6241 break; 6242 } 6243 /* 6244 * Note: Code below is the "failure" case 6245 */ 6246 /* return error ack and blow away saved option results if any */ 6247 connect_failed: 6248 if (mp != NULL) 6249 putnext(tcp->tcp_rq, mp); 6250 else { 6251 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6252 TSYSERR, ENOMEM); 6253 } 6254 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6255 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6256 } 6257 6258 /* 6259 * Handle connect to IPv4 destinations, including connections for AF_INET6 6260 * sockets connecting to IPv4 mapped IPv6 destinations. 6261 */ 6262 static void 6263 tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, 6264 uint_t srcid) 6265 { 6266 tcph_t *tcph; 6267 mblk_t *mp1; 6268 ipaddr_t dstaddr = *dstaddrp; 6269 int32_t oldstate; 6270 uint16_t lport; 6271 6272 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 6273 6274 /* Check for attempt to connect to INADDR_ANY */ 6275 if (dstaddr == INADDR_ANY) { 6276 /* 6277 * SunOS 4.x and 4.3 BSD allow an application 6278 * to connect a TCP socket to INADDR_ANY. 6279 * When they do this, the kernel picks the 6280 * address of one interface and uses it 6281 * instead. The kernel usually ends up 6282 * picking the address of the loopback 6283 * interface. This is an undocumented feature. 6284 * However, we provide the same thing here 6285 * in order to have source and binary 6286 * compatibility with SunOS 4.x. 6287 * Update the T_CONN_REQ (sin/sin6) since it is used to 6288 * generate the T_CONN_CON. 
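 *
 * For example, an application that connects to 0.0.0.0 port 80 ends up
 * connected to 127.0.0.1 port 80, and the substituted loopback address
 * is what shows up in the T_CONN_CON generated for it.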
6289 */ 6290 dstaddr = htonl(INADDR_LOOPBACK); 6291 *dstaddrp = dstaddr; 6292 } 6293 6294 /* Handle __sin6_src_id if socket not bound to an IP address */ 6295 if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) { 6296 ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6, 6297 tcp->tcp_connp->conn_zoneid); 6298 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6, 6299 tcp->tcp_ipha->ipha_src); 6300 } 6301 6302 /* 6303 * Don't let an endpoint connect to itself. Note that 6304 * the test here does not catch the case where the 6305 * source IP addr was left unspecified by the user. In 6306 * this case, the source addr is set in tcp_adapt_ire() 6307 * using the reply to the T_BIND message that we send 6308 * down to IP here and the check is repeated in tcp_rput_other. 6309 */ 6310 if (dstaddr == tcp->tcp_ipha->ipha_src && 6311 dstport == tcp->tcp_lport) { 6312 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6313 goto failed; 6314 } 6315 6316 tcp->tcp_ipha->ipha_dst = dstaddr; 6317 IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6); 6318 6319 /* 6320 * Massage a source route if any putting the first hop 6321 * in iph_dst. Compute a starting value for the checksum which 6322 * takes into account that the original iph_dst should be 6323 * included in the checksum but that ip will include the 6324 * first hop in the source route in the tcp checksum. 6325 */ 6326 tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha); 6327 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 6328 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + 6329 (tcp->tcp_ipha->ipha_dst & 0xffff)); 6330 if ((int)tcp->tcp_sum < 0) 6331 tcp->tcp_sum--; 6332 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 6333 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 6334 (tcp->tcp_sum >> 16)); 6335 tcph = tcp->tcp_tcph; 6336 *(uint16_t *)tcph->th_fport = dstport; 6337 tcp->tcp_fport = dstport; 6338 6339 oldstate = tcp->tcp_state; 6340 /* 6341 * At this point the remote destination address and remote port fields 6342 * in the tcp-four-tuple have been filled in the tcp structure. Now we 6343 * have to see which state tcp was in so we can take apropriate action. 6344 */ 6345 if (oldstate == TCPS_IDLE) { 6346 /* 6347 * We support a quick connect capability here, allowing 6348 * clients to transition directly from IDLE to SYN_SENT 6349 * tcp_bindi will pick an unused port, insert the connection 6350 * in the bind hash and transition to BOUND state. 6351 */ 6352 lport = tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); 6353 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, 6354 B_FALSE, B_FALSE); 6355 if (lport == 0) { 6356 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0); 6357 goto failed; 6358 } 6359 } 6360 tcp->tcp_state = TCPS_SYN_SENT; 6361 6362 /* 6363 * TODO: allow data with connect requests 6364 * by unlinking M_DATA trailers here and 6365 * linking them in behind the T_OK_ACK mblk. 6366 * The tcp_rput() bind ack handler would then 6367 * feed them to tcp_wput_data() rather than call 6368 * tcp_timer(). 6369 */ 6370 mp = mi_tpi_ok_ack_alloc(mp); 6371 if (!mp) { 6372 tcp->tcp_state = oldstate; 6373 goto failed; 6374 } 6375 if (tcp->tcp_family == AF_INET) { 6376 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 6377 sizeof (ipa_conn_t)); 6378 } else { 6379 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 6380 sizeof (ipa6_conn_t)); 6381 } 6382 if (mp1) { 6383 /* Hang onto the T_OK_ACK for later. 
*/ 6384 linkb(mp1, mp); 6385 mblk_setcred(mp1, tcp->tcp_cred); 6386 if (tcp->tcp_family == AF_INET) 6387 mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp); 6388 else { 6389 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, 6390 &tcp->tcp_sticky_ipp); 6391 } 6392 BUMP_MIB(&tcp_mib, tcpActiveOpens); 6393 tcp->tcp_active_open = 1; 6394 /* 6395 * If the bind cannot complete immediately 6396 * IP will arrange to call tcp_rput_other 6397 * when the bind completes. 6398 */ 6399 if (mp1 != NULL) 6400 tcp_rput_other(tcp, mp1); 6401 return; 6402 } 6403 /* Error case */ 6404 tcp->tcp_state = oldstate; 6405 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6406 6407 failed: 6408 /* return error ack and blow away saved option results if any */ 6409 if (mp != NULL) 6410 putnext(tcp->tcp_rq, mp); 6411 else { 6412 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6413 TSYSERR, ENOMEM); 6414 } 6415 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6416 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6417 6418 } 6419 6420 /* 6421 * Handle connect to IPv6 destinations. 6422 */ 6423 static void 6424 tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, 6425 in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id) 6426 { 6427 tcph_t *tcph; 6428 mblk_t *mp1; 6429 ip6_rthdr_t *rth; 6430 int32_t oldstate; 6431 uint16_t lport; 6432 6433 ASSERT(tcp->tcp_family == AF_INET6); 6434 6435 /* 6436 * If we're here, it means that the destination address is a native 6437 * IPv6 address. Return an error if tcp_ipversion is not IPv6. A 6438 * reason why it might not be IPv6 is if the socket was bound to an 6439 * IPv4-mapped IPv6 address. 6440 */ 6441 if (tcp->tcp_ipversion != IPV6_VERSION) { 6442 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6443 goto failed; 6444 } 6445 6446 /* 6447 * Interpret a zero destination to mean loopback. 6448 * Update the T_CONN_REQ (sin/sin6) since it is used to 6449 * generate the T_CONN_CON. 6450 */ 6451 if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) { 6452 *dstaddrp = ipv6_loopback; 6453 } 6454 6455 /* Handle __sin6_src_id if socket not bound to an IP address */ 6456 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { 6457 ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src, 6458 tcp->tcp_connp->conn_zoneid); 6459 tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src; 6460 } 6461 6462 /* 6463 * Take care of the scope_id now and add ip6i_t 6464 * if ip6i_t is not already allocated through TCP 6465 * sticky options. At this point tcp_ip6h does not 6466 * have dst info, thus use dstaddrp. 6467 */ 6468 if (scope_id != 0 && 6469 IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { 6470 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 6471 ip6i_t *ip6i; 6472 6473 ipp->ipp_ifindex = scope_id; 6474 ip6i = (ip6i_t *)tcp->tcp_iphc; 6475 6476 if ((ipp->ipp_fields & IPPF_HAS_IP6I) && 6477 ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) { 6478 /* Already allocated */ 6479 ip6i->ip6i_flags |= IP6I_IFINDEX; 6480 ip6i->ip6i_ifindex = ipp->ipp_ifindex; 6481 ipp->ipp_fields |= IPPF_SCOPE_ID; 6482 } else { 6483 int reterr; 6484 6485 ipp->ipp_fields |= IPPF_SCOPE_ID; 6486 if (ipp->ipp_fields & IPPF_HAS_IP6I) 6487 ip2dbg(("tcp_connect_v6: SCOPE_ID set\n")); 6488 reterr = tcp_build_hdrs(tcp->tcp_rq, tcp); 6489 if (reterr != 0) 6490 goto failed; 6491 ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n")); 6492 } 6493 } 6494 6495 /* 6496 * Don't let an endpoint connect to itself. Note that 6497 * the test here does not catch the case where the 6498 * source IP addr was left unspecified by the user. 
In 6499 * this case, the source addr is set in tcp_adapt_ire() 6500 * using the reply to the T_BIND message that we send 6501 * down to IP here and the check is repeated in tcp_rput_other. 6502 */ 6503 if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) && 6504 (dstport == tcp->tcp_lport)) { 6505 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); 6506 goto failed; 6507 } 6508 6509 tcp->tcp_ip6h->ip6_dst = *dstaddrp; 6510 tcp->tcp_remote_v6 = *dstaddrp; 6511 tcp->tcp_ip6h->ip6_vcf = 6512 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | 6513 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); 6514 6515 6516 /* 6517 * Massage a routing header (if present) putting the first hop 6518 * in ip6_dst. Compute a starting value for the checksum which 6519 * takes into account that the original ip6_dst should be 6520 * included in the checksum but that ip will include the 6521 * first hop in the source route in the tcp checksum. 6522 */ 6523 rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph); 6524 if (rth != NULL) { 6525 6526 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth); 6527 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 6528 (tcp->tcp_sum >> 16)); 6529 } else { 6530 tcp->tcp_sum = 0; 6531 } 6532 6533 tcph = tcp->tcp_tcph; 6534 *(uint16_t *)tcph->th_fport = dstport; 6535 tcp->tcp_fport = dstport; 6536 6537 oldstate = tcp->tcp_state; 6538 /* 6539 * At this point the remote destination address and remote port fields 6540 * in the tcp-four-tuple have been filled in the tcp structure. Now we 6541 * have to see which state tcp was in so we can take apropriate action. 6542 */ 6543 if (oldstate == TCPS_IDLE) { 6544 /* 6545 * We support a quick connect capability here, allowing 6546 * clients to transition directly from IDLE to SYN_SENT 6547 * tcp_bindi will pick an unused port, insert the connection 6548 * in the bind hash and transition to BOUND state. 6549 */ 6550 lport = tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); 6551 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, 6552 B_FALSE, B_FALSE); 6553 if (lport == 0) { 6554 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0); 6555 goto failed; 6556 } 6557 } 6558 tcp->tcp_state = TCPS_SYN_SENT; 6559 /* 6560 * TODO: allow data with connect requests 6561 * by unlinking M_DATA trailers here and 6562 * linking them in behind the T_OK_ACK mblk. 6563 * The tcp_rput() bind ack handler would then 6564 * feed them to tcp_wput_data() rather than call 6565 * tcp_timer(). 6566 */ 6567 mp = mi_tpi_ok_ack_alloc(mp); 6568 if (!mp) { 6569 tcp->tcp_state = oldstate; 6570 goto failed; 6571 } 6572 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t)); 6573 if (mp1) { 6574 /* Hang onto the T_OK_ACK for later. 
*/ 6575 linkb(mp1, mp); 6576 mblk_setcred(mp1, tcp->tcp_cred); 6577 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, 6578 &tcp->tcp_sticky_ipp); 6579 BUMP_MIB(&tcp_mib, tcpActiveOpens); 6580 tcp->tcp_active_open = 1; 6581 /* ip_bind_v6() may return ACK or ERROR */ 6582 if (mp1 != NULL) 6583 tcp_rput_other(tcp, mp1); 6584 return; 6585 } 6586 /* Error case */ 6587 tcp->tcp_state = oldstate; 6588 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM); 6589 6590 failed: 6591 /* return error ack and blow away saved option results if any */ 6592 if (mp != NULL) 6593 putnext(tcp->tcp_rq, mp); 6594 else { 6595 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, 6596 TSYSERR, ENOMEM); 6597 } 6598 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 6599 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 6600 } 6601 6602 /* 6603 * We need a stream q for detached closing tcp connections 6604 * to use. Our client hereby indicates that this q is the 6605 * one to use. 6606 */ 6607 static void 6608 tcp_def_q_set(tcp_t *tcp, mblk_t *mp) 6609 { 6610 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 6611 queue_t *q = tcp->tcp_wq; 6612 6613 mp->b_datap->db_type = M_IOCACK; 6614 iocp->ioc_count = 0; 6615 mutex_enter(&tcp_g_q_lock); 6616 if (tcp_g_q != NULL) { 6617 mutex_exit(&tcp_g_q_lock); 6618 iocp->ioc_error = EALREADY; 6619 } else { 6620 mblk_t *mp1; 6621 6622 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0); 6623 if (mp1 == NULL) { 6624 mutex_exit(&tcp_g_q_lock); 6625 iocp->ioc_error = ENOMEM; 6626 } else { 6627 tcp_g_q = tcp->tcp_rq; 6628 mutex_exit(&tcp_g_q_lock); 6629 iocp->ioc_error = 0; 6630 iocp->ioc_rval = 0; 6631 /* 6632 * We are passing tcp_sticky_ipp as NULL 6633 * as it is not useful for tcp_default queue 6634 */ 6635 mp1 = ip_bind_v6(q, mp1, tcp->tcp_connp, NULL); 6636 if (mp1 != NULL) 6637 tcp_rput_other(tcp, mp1); 6638 } 6639 } 6640 qreply(q, mp); 6641 } 6642 6643 /* 6644 * Our client hereby directs us to reject the connection request 6645 * that tcp_conn_request() marked with 'seqnum'. Rejection consists 6646 * of sending the appropriate RST, not an ICMP error. 6647 */ 6648 static void 6649 tcp_disconnect(tcp_t *tcp, mblk_t *mp) 6650 { 6651 tcp_t *ltcp = NULL; 6652 t_scalar_t seqnum; 6653 conn_t *connp; 6654 6655 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 6656 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { 6657 tcp_err_ack(tcp, mp, TPROTO, 0); 6658 return; 6659 } 6660 6661 /* 6662 * Right now, upper modules pass down a T_DISCON_REQ to TCP, 6663 * when the stream is in BOUND state. Do not send a reset, 6664 * since the destination IP address is not valid, and it can 6665 * be the initialized value of all zeros (broadcast address). 6666 * 6667 * If TCP has sent down a bind request to IP and has not 6668 * received the reply, reject the request. Otherwise, TCP 6669 * will be confused. 6670 */ 6671 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) { 6672 if (tcp->tcp_debug) { 6673 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 6674 "tcp_disconnect: bad state, %d", tcp->tcp_state); 6675 } 6676 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 6677 return; 6678 } 6679 6680 seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number; 6681 6682 if (seqnum == -1 || tcp->tcp_conn_req_max == 0) { 6683 6684 /* 6685 * According to TPI, for non-listeners, ignore seqnum 6686 * and disconnect. 6687 * Following interpretation of -1 seqnum is historical 6688 * and implied TPI ? (TPI only states that for T_CONN_IND, 6689 * a valid seqnum should not be -1). 
6690 * 6691 * -1 means disconnect everything 6692 * regardless even on a listener. 6693 */ 6694 6695 int old_state = tcp->tcp_state; 6696 6697 /* 6698 * The connection can't be on the tcp_time_wait_head list 6699 * since it is not detached. 6700 */ 6701 ASSERT(tcp->tcp_time_wait_next == NULL); 6702 ASSERT(tcp->tcp_time_wait_prev == NULL); 6703 ASSERT(tcp->tcp_time_wait_expire == 0); 6704 ltcp = NULL; 6705 /* 6706 * If it used to be a listener, check to make sure no one else 6707 * has taken the port before switching back to LISTEN state. 6708 */ 6709 if (tcp->tcp_ipversion == IPV4_VERSION) { 6710 connp = ipcl_lookup_listener_v4(tcp->tcp_lport, 6711 tcp->tcp_ipha->ipha_src, 6712 tcp->tcp_connp->conn_zoneid); 6713 if (connp != NULL) 6714 ltcp = connp->conn_tcp; 6715 } else { 6716 /* Allow tcp_bound_if listeners? */ 6717 connp = ipcl_lookup_listener_v6(tcp->tcp_lport, 6718 &tcp->tcp_ip6h->ip6_src, 0, 6719 tcp->tcp_connp->conn_zoneid); 6720 if (connp != NULL) 6721 ltcp = connp->conn_tcp; 6722 } 6723 if (tcp->tcp_conn_req_max && ltcp == NULL) { 6724 tcp->tcp_state = TCPS_LISTEN; 6725 } else if (old_state > TCPS_BOUND) { 6726 tcp->tcp_conn_req_max = 0; 6727 tcp->tcp_state = TCPS_BOUND; 6728 } 6729 if (ltcp != NULL) 6730 CONN_DEC_REF(ltcp->tcp_connp); 6731 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) { 6732 BUMP_MIB(&tcp_mib, tcpAttemptFails); 6733 } else if (old_state == TCPS_ESTABLISHED || 6734 old_state == TCPS_CLOSE_WAIT) { 6735 BUMP_MIB(&tcp_mib, tcpEstabResets); 6736 } 6737 6738 if (tcp->tcp_fused) 6739 tcp_unfuse(tcp); 6740 6741 mutex_enter(&tcp->tcp_eager_lock); 6742 if ((tcp->tcp_conn_req_cnt_q0 != 0) || 6743 (tcp->tcp_conn_req_cnt_q != 0)) { 6744 tcp_eager_cleanup(tcp, 0); 6745 } 6746 mutex_exit(&tcp->tcp_eager_lock); 6747 6748 tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt, 6749 tcp->tcp_rnxt, TH_RST | TH_ACK); 6750 6751 tcp_reinit(tcp); 6752 6753 if (old_state >= TCPS_ESTABLISHED) { 6754 /* Send M_FLUSH according to TPI */ 6755 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 6756 } 6757 mp = mi_tpi_ok_ack_alloc(mp); 6758 if (mp) 6759 putnext(tcp->tcp_rq, mp); 6760 return; 6761 } else if (!tcp_eager_blowoff(tcp, seqnum)) { 6762 tcp_err_ack(tcp, mp, TBADSEQ, 0); 6763 return; 6764 } 6765 if (tcp->tcp_state >= TCPS_ESTABLISHED) { 6766 /* Send M_FLUSH according to TPI */ 6767 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 6768 } 6769 mp = mi_tpi_ok_ack_alloc(mp); 6770 if (mp) 6771 putnext(tcp->tcp_rq, mp); 6772 } 6773 6774 /* 6775 * Diagnostic routine used to return a string associated with the tcp state. 6776 * Note that if the caller does not supply a buffer, it will use an internal 6777 * static string. This means that if multiple threads call this function at 6778 * the same time, output can be corrupted... Note also that this function 6779 * does not check the size of the supplied buffer. The caller has to make 6780 * sure that it is big enough. 
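 *
 * With DISP_ADDR_AND_PORT an established IPv4 connection is rendered
 * along the lines of
 *
 *	[::ffff:10.0.0.1.32768, ::ffff:192.0.2.5.80] TCP_ESTABLISHED
 *
 * (IPv4 addresses appear in their IPv4-mapped IPv6 form), while
 * DISP_PORT_ONLY yields just "[32768, 80] TCP_ESTABLISHED".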
6781 */ 6782 static char * 6783 tcp_display(tcp_t *tcp, char *sup_buf, char format) 6784 { 6785 char buf1[30]; 6786 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; 6787 char *buf; 6788 char *cp; 6789 in6_addr_t local, remote; 6790 char local_addrbuf[INET6_ADDRSTRLEN]; 6791 char remote_addrbuf[INET6_ADDRSTRLEN]; 6792 6793 if (sup_buf != NULL) 6794 buf = sup_buf; 6795 else 6796 buf = priv_buf; 6797 6798 if (tcp == NULL) 6799 return ("NULL_TCP"); 6800 switch (tcp->tcp_state) { 6801 case TCPS_CLOSED: 6802 cp = "TCP_CLOSED"; 6803 break; 6804 case TCPS_IDLE: 6805 cp = "TCP_IDLE"; 6806 break; 6807 case TCPS_BOUND: 6808 cp = "TCP_BOUND"; 6809 break; 6810 case TCPS_LISTEN: 6811 cp = "TCP_LISTEN"; 6812 break; 6813 case TCPS_SYN_SENT: 6814 cp = "TCP_SYN_SENT"; 6815 break; 6816 case TCPS_SYN_RCVD: 6817 cp = "TCP_SYN_RCVD"; 6818 break; 6819 case TCPS_ESTABLISHED: 6820 cp = "TCP_ESTABLISHED"; 6821 break; 6822 case TCPS_CLOSE_WAIT: 6823 cp = "TCP_CLOSE_WAIT"; 6824 break; 6825 case TCPS_FIN_WAIT_1: 6826 cp = "TCP_FIN_WAIT_1"; 6827 break; 6828 case TCPS_CLOSING: 6829 cp = "TCP_CLOSING"; 6830 break; 6831 case TCPS_LAST_ACK: 6832 cp = "TCP_LAST_ACK"; 6833 break; 6834 case TCPS_FIN_WAIT_2: 6835 cp = "TCP_FIN_WAIT_2"; 6836 break; 6837 case TCPS_TIME_WAIT: 6838 cp = "TCP_TIME_WAIT"; 6839 break; 6840 default: 6841 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); 6842 cp = buf1; 6843 break; 6844 } 6845 switch (format) { 6846 case DISP_ADDR_AND_PORT: 6847 if (tcp->tcp_ipversion == IPV4_VERSION) { 6848 /* 6849 * Note that we use the remote address in the tcp_b 6850 * structure. This means that it will print out 6851 * the real destination address, not the next hop's 6852 * address if source routing is used. 6853 */ 6854 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local); 6855 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote); 6856 6857 } else { 6858 local = tcp->tcp_ip_src_v6; 6859 remote = tcp->tcp_remote_v6; 6860 } 6861 (void) inet_ntop(AF_INET6, &local, local_addrbuf, 6862 sizeof (local_addrbuf)); 6863 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, 6864 sizeof (remote_addrbuf)); 6865 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", 6866 local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, 6867 ntohs(tcp->tcp_fport), cp); 6868 break; 6869 case DISP_PORT_ONLY: 6870 default: 6871 (void) mi_sprintf(buf, "[%u, %u] %s", 6872 ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); 6873 break; 6874 } 6875 6876 return (buf); 6877 } 6878 6879 /* 6880 * Called via squeue to get on to eager's perimeter to send a 6881 * TH_RST. The listener wants the eager to disappear either 6882 * by means of tcp_eager_blowoff() or tcp_eager_cleanup() 6883 * being called. 6884 */ 6885 /* ARGSUSED */ 6886 void 6887 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) 6888 { 6889 conn_t *econnp = (conn_t *)arg; 6890 tcp_t *eager = econnp->conn_tcp; 6891 tcp_t *listener = eager->tcp_listener; 6892 6893 /* 6894 * We could be called because listener is closing. Since 6895 * the eager is using listener's queue's, its not safe. 6896 * Better use the default queue just to send the TH_RST 6897 * out. 
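 * (tcp_g_q is the default queue registered for detached closes via tcp_def_q_set() above, so it remains usable even while the listener's own queues are going away.)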
6898 */ 6899 eager->tcp_rq = tcp_g_q; 6900 eager->tcp_wq = WR(tcp_g_q); 6901 6902 if (eager->tcp_state > TCPS_LISTEN) { 6903 tcp_xmit_ctl("tcp_eager_kill, can't wait", 6904 eager, eager->tcp_snxt, 0, TH_RST); 6905 } 6906 6907 /* We are here because listener wants this eager gone */ 6908 if (listener != NULL) { 6909 mutex_enter(&listener->tcp_eager_lock); 6910 tcp_eager_unlink(eager); 6911 if (eager->tcp_conn.tcp_eager_conn_ind == NULL) { 6912 /* 6913 * The eager has sent a conn_ind up to the 6914 * listener but listener decides to close 6915 * instead. We need to drop the extra ref 6916 * placed on eager in tcp_rput_data() before 6917 * sending the conn_ind to listener. 6918 */ 6919 CONN_DEC_REF(econnp); 6920 } 6921 mutex_exit(&listener->tcp_eager_lock); 6922 CONN_DEC_REF(listener->tcp_connp); 6923 } 6924 6925 if (eager->tcp_state > TCPS_BOUND) 6926 tcp_close_detached(eager); 6927 } 6928 6929 /* 6930 * Reset any eager connection hanging off this listener marked 6931 * with 'seqnum' and then reclaim it's resources. 6932 */ 6933 static boolean_t 6934 tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) 6935 { 6936 tcp_t *eager; 6937 mblk_t *mp; 6938 6939 TCP_STAT(tcp_eager_blowoff_calls); 6940 eager = listener; 6941 mutex_enter(&listener->tcp_eager_lock); 6942 do { 6943 eager = eager->tcp_eager_next_q; 6944 if (eager == NULL) { 6945 mutex_exit(&listener->tcp_eager_lock); 6946 return (B_FALSE); 6947 } 6948 } while (eager->tcp_conn_req_seqnum != seqnum); 6949 CONN_INC_REF(eager->tcp_connp); 6950 mutex_exit(&listener->tcp_eager_lock); 6951 mp = &eager->tcp_closemp; 6952 squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, 6953 eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF); 6954 return (B_TRUE); 6955 } 6956 6957 /* 6958 * Reset any eager connection hanging off this listener 6959 * and then reclaim it's resources. 6960 */ 6961 static void 6962 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) 6963 { 6964 tcp_t *eager; 6965 mblk_t *mp; 6966 6967 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 6968 6969 if (!q0_only) { 6970 /* First cleanup q */ 6971 TCP_STAT(tcp_eager_blowoff_q); 6972 eager = listener->tcp_eager_next_q; 6973 while (eager != NULL) { 6974 CONN_INC_REF(eager->tcp_connp); 6975 mp = &eager->tcp_closemp; 6976 squeue_fill(eager->tcp_connp->conn_sqp, mp, 6977 tcp_eager_kill, eager->tcp_connp, 6978 SQTAG_TCP_EAGER_CLEANUP); 6979 eager = eager->tcp_eager_next_q; 6980 } 6981 } 6982 /* Then cleanup q0 */ 6983 TCP_STAT(tcp_eager_blowoff_q0); 6984 eager = listener->tcp_eager_next_q0; 6985 while (eager != listener) { 6986 CONN_INC_REF(eager->tcp_connp); 6987 mp = &eager->tcp_closemp; 6988 squeue_fill(eager->tcp_connp->conn_sqp, mp, 6989 tcp_eager_kill, eager->tcp_connp, 6990 SQTAG_TCP_EAGER_CLEANUP_Q0); 6991 eager = eager->tcp_eager_next_q0; 6992 } 6993 } 6994 6995 /* 6996 * If we are an eager connection hanging off a listener that hasn't 6997 * formally accepted the connection yet, get off his list and blow off 6998 * any data that we have accumulated. 
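 * The caller must hold listener->tcp_eager_lock. The eager is removed either from q0 (connections still completing the three-way handshake) or from the accept queue q, and the listener's corresponding tcp_conn_req_cnt_q0/tcp_conn_req_cnt_q counter is decremented.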
6999 */ 7000 static void 7001 tcp_eager_unlink(tcp_t *tcp) 7002 { 7003 tcp_t *listener = tcp->tcp_listener; 7004 7005 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); 7006 ASSERT(listener != NULL); 7007 if (tcp->tcp_eager_next_q0 != NULL) { 7008 ASSERT(tcp->tcp_eager_prev_q0 != NULL); 7009 7010 /* Remove the eager tcp from q0 */ 7011 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 7012 tcp->tcp_eager_prev_q0; 7013 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 7014 tcp->tcp_eager_next_q0; 7015 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 7016 listener->tcp_conn_req_cnt_q0--; 7017 7018 tcp->tcp_eager_next_q0 = NULL; 7019 tcp->tcp_eager_prev_q0 = NULL; 7020 7021 if (tcp->tcp_syn_rcvd_timeout != 0) { 7022 /* we have timed out before */ 7023 ASSERT(listener->tcp_syn_rcvd_timeout > 0); 7024 listener->tcp_syn_rcvd_timeout--; 7025 } 7026 } else { 7027 tcp_t **tcpp = &listener->tcp_eager_next_q; 7028 tcp_t *prev = NULL; 7029 7030 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { 7031 if (tcpp[0] == tcp) { 7032 if (listener->tcp_eager_last_q == tcp) { 7033 /* 7034 * If we are unlinking the last 7035 * element on the list, adjust 7036 * tail pointer. Set tail pointer 7037 * to nil when list is empty. 7038 */ 7039 ASSERT(tcp->tcp_eager_next_q == NULL); 7040 if (listener->tcp_eager_last_q == 7041 listener->tcp_eager_next_q) { 7042 listener->tcp_eager_last_q = 7043 NULL; 7044 } else { 7045 /* 7046 * We won't get here if there 7047 * is only one eager in the 7048 * list. 7049 */ 7050 ASSERT(prev != NULL); 7051 listener->tcp_eager_last_q = 7052 prev; 7053 } 7054 } 7055 tcpp[0] = tcp->tcp_eager_next_q; 7056 tcp->tcp_eager_next_q = NULL; 7057 tcp->tcp_eager_last_q = NULL; 7058 ASSERT(listener->tcp_conn_req_cnt_q > 0); 7059 listener->tcp_conn_req_cnt_q--; 7060 break; 7061 } 7062 prev = tcpp[0]; 7063 } 7064 } 7065 tcp->tcp_listener = NULL; 7066 } 7067 7068 /* Shorthand to generate and send TPI error acks to our client */ 7069 static void 7070 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) 7071 { 7072 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) 7073 putnext(tcp->tcp_rq, mp); 7074 } 7075 7076 /* Shorthand to generate and send TPI error acks to our client */ 7077 static void 7078 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, 7079 int t_error, int sys_error) 7080 { 7081 struct T_error_ack *teackp; 7082 7083 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), 7084 M_PCPROTO, T_ERROR_ACK)) != NULL) { 7085 teackp = (struct T_error_ack *)mp->b_rptr; 7086 teackp->ERROR_prim = primitive; 7087 teackp->TLI_error = t_error; 7088 teackp->UNIX_error = sys_error; 7089 putnext(tcp->tcp_rq, mp); 7090 } 7091 } 7092 7093 /* 7094 * Note: No locks are held when inspecting tcp_g_*epriv_ports 7095 * but instead the code relies on: 7096 * - the fact that the address of the array and its size never changes 7097 * - the atomic assignment of the elements of the array 7098 */ 7099 /* ARGSUSED */ 7100 static int 7101 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 7102 { 7103 int i; 7104 7105 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7106 if (tcp_g_epriv_ports[i] != 0) 7107 (void) mi_mpprintf(mp, "%d ", tcp_g_epriv_ports[i]); 7108 } 7109 return (0); 7110 } 7111 7112 /* 7113 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple 7114 * threads from changing it at the same time. 
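 * Readers such as tcp_extra_priv_ports_get() above stay lockless, relying on the fixed address/size of the array and the atomic assignment of its elements as noted there; the lock taken below only serializes writers.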
7115 */ 7116 /* ARGSUSED */ 7117 static int 7118 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 7119 cred_t *cr) 7120 { 7121 long new_value; 7122 int i; 7123 7124 /* 7125 * Fail the request if the new value does not lie within the 7126 * port number limits. 7127 */ 7128 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 7129 new_value <= 0 || new_value >= 65536) { 7130 return (EINVAL); 7131 } 7132 7133 mutex_enter(&tcp_epriv_port_lock); 7134 /* Check if the value is already in the list */ 7135 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7136 if (new_value == tcp_g_epriv_ports[i]) { 7137 mutex_exit(&tcp_epriv_port_lock); 7138 return (EEXIST); 7139 } 7140 } 7141 /* Find an empty slot */ 7142 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7143 if (tcp_g_epriv_ports[i] == 0) 7144 break; 7145 } 7146 if (i == tcp_g_num_epriv_ports) { 7147 mutex_exit(&tcp_epriv_port_lock); 7148 return (EOVERFLOW); 7149 } 7150 /* Set the new value */ 7151 tcp_g_epriv_ports[i] = (uint16_t)new_value; 7152 mutex_exit(&tcp_epriv_port_lock); 7153 return (0); 7154 } 7155 7156 /* 7157 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple 7158 * threads from changing it at the same time. 7159 */ 7160 /* ARGSUSED */ 7161 static int 7162 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 7163 cred_t *cr) 7164 { 7165 long new_value; 7166 int i; 7167 7168 /* 7169 * Fail the request if the new value does not lie within the 7170 * port number limits. 7171 */ 7172 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 || 7173 new_value >= 65536) { 7174 return (EINVAL); 7175 } 7176 7177 mutex_enter(&tcp_epriv_port_lock); 7178 /* Check that the value is already in the list */ 7179 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 7180 if (tcp_g_epriv_ports[i] == new_value) 7181 break; 7182 } 7183 if (i == tcp_g_num_epriv_ports) { 7184 mutex_exit(&tcp_epriv_port_lock); 7185 return (ESRCH); 7186 } 7187 /* Clear the value */ 7188 tcp_g_epriv_ports[i] = 0; 7189 mutex_exit(&tcp_epriv_port_lock); 7190 return (0); 7191 } 7192 7193 /* Return the TPI/TLI equivalent of our current tcp_state */ 7194 static int 7195 tcp_tpistate(tcp_t *tcp) 7196 { 7197 switch (tcp->tcp_state) { 7198 case TCPS_IDLE: 7199 return (TS_UNBND); 7200 case TCPS_LISTEN: 7201 /* 7202 * Return whether there are outstanding T_CONN_IND waiting 7203 * for the matching T_CONN_RES. Therefore don't count q0. 7204 */ 7205 if (tcp->tcp_conn_req_cnt_q > 0) 7206 return (TS_WRES_CIND); 7207 else 7208 return (TS_IDLE); 7209 case TCPS_BOUND: 7210 return (TS_IDLE); 7211 case TCPS_SYN_SENT: 7212 return (TS_WCON_CREQ); 7213 case TCPS_SYN_RCVD: 7214 /* 7215 * Note: assumption: this has to the active open SYN_RCVD. 7216 * The passive instance is detached in SYN_RCVD stage of 7217 * incoming connection processing so we cannot get request 7218 * for T_info_ack on it. 7219 */ 7220 return (TS_WACK_CRES); 7221 case TCPS_ESTABLISHED: 7222 return (TS_DATA_XFER); 7223 case TCPS_CLOSE_WAIT: 7224 return (TS_WREQ_ORDREL); 7225 case TCPS_FIN_WAIT_1: 7226 return (TS_WIND_ORDREL); 7227 case TCPS_FIN_WAIT_2: 7228 return (TS_WIND_ORDREL); 7229 7230 case TCPS_CLOSING: 7231 case TCPS_LAST_ACK: 7232 case TCPS_TIME_WAIT: 7233 case TCPS_CLOSED: 7234 /* 7235 * Following TS_WACK_DREQ7 is a rendition of "not 7236 * yet TS_IDLE" TPI state. 
There is no best match to any 7237 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we 7238 * choose a value chosen that will map to TLI/XTI level 7239 * state of TSTATECHNG (state is process of changing) which 7240 * captures what this dummy state represents. 7241 */ 7242 return (TS_WACK_DREQ7); 7243 default: 7244 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", 7245 tcp->tcp_state, tcp_display(tcp, NULL, 7246 DISP_PORT_ONLY)); 7247 return (TS_UNBND); 7248 } 7249 } 7250 7251 static void 7252 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) 7253 { 7254 if (tcp->tcp_family == AF_INET6) 7255 *tia = tcp_g_t_info_ack_v6; 7256 else 7257 *tia = tcp_g_t_info_ack; 7258 tia->CURRENT_state = tcp_tpistate(tcp); 7259 tia->OPT_size = tcp_max_optsize; 7260 if (tcp->tcp_mss == 0) { 7261 /* Not yet set - tcp_open does not set mss */ 7262 if (tcp->tcp_ipversion == IPV4_VERSION) 7263 tia->TIDU_size = tcp_mss_def_ipv4; 7264 else 7265 tia->TIDU_size = tcp_mss_def_ipv6; 7266 } else { 7267 tia->TIDU_size = tcp->tcp_mss; 7268 } 7269 /* TODO: Default ETSDU is 1. Is that correct for tcp? */ 7270 } 7271 7272 /* 7273 * This routine responds to T_CAPABILITY_REQ messages. It is called by 7274 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from 7275 * tcp_g_t_info_ack. The current state of the stream is copied from 7276 * tcp_state. 7277 */ 7278 static void 7279 tcp_capability_req(tcp_t *tcp, mblk_t *mp) 7280 { 7281 t_uscalar_t cap_bits1; 7282 struct T_capability_ack *tcap; 7283 7284 if (MBLKL(mp) < sizeof (struct T_capability_req)) { 7285 freemsg(mp); 7286 return; 7287 } 7288 7289 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; 7290 7291 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), 7292 mp->b_datap->db_type, T_CAPABILITY_ACK); 7293 if (mp == NULL) 7294 return; 7295 7296 tcap = (struct T_capability_ack *)mp->b_rptr; 7297 tcap->CAP_bits1 = 0; 7298 7299 if (cap_bits1 & TC1_INFO) { 7300 tcp_copy_info(&tcap->INFO_ack, tcp); 7301 tcap->CAP_bits1 |= TC1_INFO; 7302 } 7303 7304 if (cap_bits1 & TC1_ACCEPTOR_ID) { 7305 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; 7306 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; 7307 } 7308 7309 putnext(tcp->tcp_rq, mp); 7310 } 7311 7312 /* 7313 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. 7314 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. 7315 * The current state of the stream is copied from tcp_state. 
7316 */ 7317 static void 7318 tcp_info_req(tcp_t *tcp, mblk_t *mp) 7319 { 7320 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, 7321 T_INFO_ACK); 7322 if (!mp) { 7323 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 7324 return; 7325 } 7326 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); 7327 putnext(tcp->tcp_rq, mp); 7328 } 7329 7330 /* Respond to the TPI addr request */ 7331 static void 7332 tcp_addr_req(tcp_t *tcp, mblk_t *mp) 7333 { 7334 sin_t *sin; 7335 mblk_t *ackmp; 7336 struct T_addr_ack *taa; 7337 7338 /* Make it large enough for worst case */ 7339 ackmp = reallocb(mp, sizeof (struct T_addr_ack) + 7340 2 * sizeof (sin6_t), 1); 7341 if (ackmp == NULL) { 7342 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); 7343 return; 7344 } 7345 7346 if (tcp->tcp_ipversion == IPV6_VERSION) { 7347 tcp_addr_req_ipv6(tcp, ackmp); 7348 return; 7349 } 7350 taa = (struct T_addr_ack *)ackmp->b_rptr; 7351 7352 bzero(taa, sizeof (struct T_addr_ack)); 7353 ackmp->b_wptr = (uchar_t *)&taa[1]; 7354 7355 taa->PRIM_type = T_ADDR_ACK; 7356 ackmp->b_datap->db_type = M_PCPROTO; 7357 7358 /* 7359 * Note: Following code assumes 32 bit alignment of basic 7360 * data structures like sin_t and struct T_addr_ack. 7361 */ 7362 if (tcp->tcp_state >= TCPS_BOUND) { 7363 /* 7364 * Fill in local address 7365 */ 7366 taa->LOCADDR_length = sizeof (sin_t); 7367 taa->LOCADDR_offset = sizeof (*taa); 7368 7369 sin = (sin_t *)&taa[1]; 7370 7371 /* Fill zeroes and then intialize non-zero fields */ 7372 *sin = sin_null; 7373 7374 sin->sin_family = AF_INET; 7375 7376 sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; 7377 sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport; 7378 7379 ackmp->b_wptr = (uchar_t *)&sin[1]; 7380 7381 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 7382 /* 7383 * Fill in Remote address 7384 */ 7385 taa->REMADDR_length = sizeof (sin_t); 7386 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + 7387 taa->LOCADDR_length); 7388 7389 sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset); 7390 *sin = sin_null; 7391 sin->sin_family = AF_INET; 7392 sin->sin_addr.s_addr = tcp->tcp_remote; 7393 sin->sin_port = tcp->tcp_fport; 7394 7395 ackmp->b_wptr = (uchar_t *)&sin[1]; 7396 } 7397 } 7398 putnext(tcp->tcp_rq, ackmp); 7399 } 7400 7401 /* Assumes that tcp_addr_req gets enough space and alignment */ 7402 static void 7403 tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp) 7404 { 7405 sin6_t *sin6; 7406 struct T_addr_ack *taa; 7407 7408 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 7409 ASSERT(OK_32PTR(ackmp->b_rptr)); 7410 ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) + 7411 2 * sizeof (sin6_t)); 7412 7413 taa = (struct T_addr_ack *)ackmp->b_rptr; 7414 7415 bzero(taa, sizeof (struct T_addr_ack)); 7416 ackmp->b_wptr = (uchar_t *)&taa[1]; 7417 7418 taa->PRIM_type = T_ADDR_ACK; 7419 ackmp->b_datap->db_type = M_PCPROTO; 7420 7421 /* 7422 * Note: Following code assumes 32 bit alignment of basic 7423 * data structures like sin6_t and struct T_addr_ack. 
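 * Layout of the ack: the local sin6_t immediately follows the T_addr_ack (LOCADDR_offset == sizeof (struct T_addr_ack)); once the connection has reached SYN_RCVD or later, the remote sin6_t follows at REMADDR_offset, rounded up to a 32-bit boundary.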
7424 */ 7425 if (tcp->tcp_state >= TCPS_BOUND) { 7426 /* 7427 * Fill in local address 7428 */ 7429 taa->LOCADDR_length = sizeof (sin6_t); 7430 taa->LOCADDR_offset = sizeof (*taa); 7431 7432 sin6 = (sin6_t *)&taa[1]; 7433 *sin6 = sin6_null; 7434 7435 sin6->sin6_family = AF_INET6; 7436 sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; 7437 sin6->sin6_port = tcp->tcp_lport; 7438 7439 ackmp->b_wptr = (uchar_t *)&sin6[1]; 7440 7441 if (tcp->tcp_state >= TCPS_SYN_RCVD) { 7442 /* 7443 * Fill in Remote address 7444 */ 7445 taa->REMADDR_length = sizeof (sin6_t); 7446 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + 7447 taa->LOCADDR_length); 7448 7449 sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset); 7450 *sin6 = sin6_null; 7451 sin6->sin6_family = AF_INET6; 7452 sin6->sin6_flowinfo = 7453 tcp->tcp_ip6h->ip6_vcf & 7454 ~IPV6_VERS_AND_FLOW_MASK; 7455 sin6->sin6_addr = tcp->tcp_remote_v6; 7456 sin6->sin6_port = tcp->tcp_fport; 7457 7458 ackmp->b_wptr = (uchar_t *)&sin6[1]; 7459 } 7460 } 7461 putnext(tcp->tcp_rq, ackmp); 7462 } 7463 7464 /* 7465 * Handle reinitialization of a tcp structure. 7466 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE. 7467 */ 7468 static void 7469 tcp_reinit(tcp_t *tcp) 7470 { 7471 mblk_t *mp; 7472 int err; 7473 7474 TCP_STAT(tcp_reinit_calls); 7475 7476 /* tcp_reinit should never be called for detached tcp_t's */ 7477 ASSERT(tcp->tcp_listener == NULL); 7478 ASSERT((tcp->tcp_family == AF_INET && 7479 tcp->tcp_ipversion == IPV4_VERSION) || 7480 (tcp->tcp_family == AF_INET6 && 7481 (tcp->tcp_ipversion == IPV4_VERSION || 7482 tcp->tcp_ipversion == IPV6_VERSION))); 7483 7484 /* Cancel outstanding timers */ 7485 tcp_timers_stop(tcp); 7486 7487 /* 7488 * Reset everything in the state vector, after updating global 7489 * MIB data from instance counters. 7490 */ 7491 UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs); 7492 tcp->tcp_ibsegs = 0; 7493 UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs); 7494 tcp->tcp_obsegs = 0; 7495 7496 tcp_close_mpp(&tcp->tcp_xmit_head); 7497 if (tcp->tcp_snd_zcopy_aware) 7498 tcp_zcopy_notify(tcp); 7499 tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL; 7500 tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; 7501 if (tcp->tcp_flow_stopped && 7502 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 7503 tcp_clrqfull(tcp); 7504 } 7505 tcp_close_mpp(&tcp->tcp_reass_head); 7506 tcp->tcp_reass_tail = NULL; 7507 if (tcp->tcp_rcv_list != NULL) { 7508 /* Free b_next chain */ 7509 tcp_close_mpp(&tcp->tcp_rcv_list); 7510 tcp->tcp_rcv_last_head = NULL; 7511 tcp->tcp_rcv_last_tail = NULL; 7512 tcp->tcp_rcv_cnt = 0; 7513 } 7514 tcp->tcp_rcv_last_tail = NULL; 7515 7516 if ((mp = tcp->tcp_urp_mp) != NULL) { 7517 freemsg(mp); 7518 tcp->tcp_urp_mp = NULL; 7519 } 7520 if ((mp = tcp->tcp_urp_mark_mp) != NULL) { 7521 freemsg(mp); 7522 tcp->tcp_urp_mark_mp = NULL; 7523 } 7524 if (tcp->tcp_fused_sigurg_mp != NULL) { 7525 freeb(tcp->tcp_fused_sigurg_mp); 7526 tcp->tcp_fused_sigurg_mp = NULL; 7527 } 7528 7529 /* 7530 * Following is a union with two members which are 7531 * identical types and size so the following cleanup 7532 * is enough. 7533 */ 7534 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); 7535 7536 CL_INET_DISCONNECT(tcp); 7537 7538 /* 7539 * The connection can't be on the tcp_time_wait_head list 7540 * since it is not detached. 
7541 */ 7542 ASSERT(tcp->tcp_time_wait_next == NULL); 7543 ASSERT(tcp->tcp_time_wait_prev == NULL); 7544 ASSERT(tcp->tcp_time_wait_expire == 0); 7545 7546 if (tcp->tcp_kssl_pending) { 7547 tcp->tcp_kssl_pending = B_FALSE; 7548 7549 /* Don't reset if the initialized by bind. */ 7550 if (tcp->tcp_kssl_ent != NULL) { 7551 kssl_release_ent(tcp->tcp_kssl_ent, NULL, 7552 KSSL_NO_PROXY); 7553 } 7554 } 7555 if (tcp->tcp_kssl_ctx != NULL) { 7556 kssl_release_ctx(tcp->tcp_kssl_ctx); 7557 tcp->tcp_kssl_ctx = NULL; 7558 } 7559 7560 /* 7561 * Reset/preserve other values 7562 */ 7563 tcp_reinit_values(tcp); 7564 ipcl_hash_remove(tcp->tcp_connp); 7565 conn_delete_ire(tcp->tcp_connp, NULL); 7566 7567 if (tcp->tcp_conn_req_max != 0) { 7568 /* 7569 * This is the case when a TLI program uses the same 7570 * transport end point to accept a connection. This 7571 * makes the TCP both a listener and acceptor. When 7572 * this connection is closed, we need to set the state 7573 * back to TCPS_LISTEN. Make sure that the eager list 7574 * is reinitialized. 7575 * 7576 * Note that this stream is still bound to the four 7577 * tuples of the previous connection in IP. If a new 7578 * SYN with different foreign address comes in, IP will 7579 * not find it and will send it to the global queue. In 7580 * the global queue, TCP will do a tcp_lookup_listener() 7581 * to find this stream. This works because this stream 7582 * is only removed from connected hash. 7583 * 7584 */ 7585 tcp->tcp_state = TCPS_LISTEN; 7586 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; 7587 tcp->tcp_connp->conn_recv = tcp_conn_request; 7588 if (tcp->tcp_family == AF_INET6) { 7589 ASSERT(tcp->tcp_connp->conn_af_isv6); 7590 (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP, 7591 &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport); 7592 } else { 7593 ASSERT(!tcp->tcp_connp->conn_af_isv6); 7594 (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP, 7595 tcp->tcp_ipha->ipha_src, tcp->tcp_lport); 7596 } 7597 } else { 7598 tcp->tcp_state = TCPS_BOUND; 7599 } 7600 7601 /* 7602 * Initialize to default values 7603 * Can't fail since enough header template space already allocated 7604 * at open(). 7605 */ 7606 err = tcp_init_values(tcp); 7607 ASSERT(err == 0); 7608 /* Restore state in tcp_tcph */ 7609 bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN); 7610 if (tcp->tcp_ipversion == IPV4_VERSION) 7611 tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source; 7612 else 7613 tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6; 7614 /* 7615 * Copy of the src addr. in tcp_t is needed in tcp_t 7616 * since the lookup funcs can only lookup on tcp_t 7617 */ 7618 tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6; 7619 7620 ASSERT(tcp->tcp_ptpbhn != NULL); 7621 tcp->tcp_rq->q_hiwat = tcp_recv_hiwat; 7622 tcp->tcp_rwnd = tcp_recv_hiwat; 7623 tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ? 7624 tcp_mss_def_ipv6 : tcp_mss_def_ipv4; 7625 } 7626 7627 /* 7628 * Force values to zero that need be zero. 7629 * Do not touch values asociated with the BOUND or LISTEN state 7630 * since the connection will end up in that state after the reinit. 7631 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t 7632 * structure! 
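 * PRESERVE() marks fields deliberately carried across the reinit (hash links, queues, bound addresses, etc.); DONTCARE() marks fields whose old value is irrelevant because they are re-initialized before use (typically in tcp_init_values() or tcp_adapt_ire(), as noted on each line). Both macros expand to nothing, or to a self-assignment under lint.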
7633 */ 7634 static void 7635 tcp_reinit_values(tcp) 7636 tcp_t *tcp; 7637 { 7638 #ifndef lint 7639 #define DONTCARE(x) 7640 #define PRESERVE(x) 7641 #else 7642 #define DONTCARE(x) ((x) = (x)) 7643 #define PRESERVE(x) ((x) = (x)) 7644 #endif /* lint */ 7645 7646 PRESERVE(tcp->tcp_bind_hash); 7647 PRESERVE(tcp->tcp_ptpbhn); 7648 PRESERVE(tcp->tcp_acceptor_hash); 7649 PRESERVE(tcp->tcp_ptpahn); 7650 7651 /* Should be ASSERT NULL on these with new code! */ 7652 ASSERT(tcp->tcp_time_wait_next == NULL); 7653 ASSERT(tcp->tcp_time_wait_prev == NULL); 7654 ASSERT(tcp->tcp_time_wait_expire == 0); 7655 PRESERVE(tcp->tcp_state); 7656 PRESERVE(tcp->tcp_rq); 7657 PRESERVE(tcp->tcp_wq); 7658 7659 ASSERT(tcp->tcp_xmit_head == NULL); 7660 ASSERT(tcp->tcp_xmit_last == NULL); 7661 ASSERT(tcp->tcp_unsent == 0); 7662 ASSERT(tcp->tcp_xmit_tail == NULL); 7663 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 7664 7665 tcp->tcp_snxt = 0; /* Displayed in mib */ 7666 tcp->tcp_suna = 0; /* Displayed in mib */ 7667 tcp->tcp_swnd = 0; 7668 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */ 7669 7670 ASSERT(tcp->tcp_ibsegs == 0); 7671 ASSERT(tcp->tcp_obsegs == 0); 7672 7673 if (tcp->tcp_iphc != NULL) { 7674 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 7675 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 7676 } 7677 7678 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */ 7679 DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */ 7680 DONTCARE(tcp->tcp_ipha); 7681 DONTCARE(tcp->tcp_ip6h); 7682 DONTCARE(tcp->tcp_ip_hdr_len); 7683 DONTCARE(tcp->tcp_tcph); 7684 DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */ 7685 tcp->tcp_valid_bits = 0; 7686 7687 DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */ 7688 DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */ 7689 DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */ 7690 tcp->tcp_last_rcv_lbolt = 0; 7691 7692 tcp->tcp_init_cwnd = 0; 7693 7694 tcp->tcp_urp_last_valid = 0; 7695 tcp->tcp_hard_binding = 0; 7696 tcp->tcp_hard_bound = 0; 7697 PRESERVE(tcp->tcp_cred); 7698 PRESERVE(tcp->tcp_cpid); 7699 PRESERVE(tcp->tcp_exclbind); 7700 7701 tcp->tcp_fin_acked = 0; 7702 tcp->tcp_fin_rcvd = 0; 7703 tcp->tcp_fin_sent = 0; 7704 tcp->tcp_ordrel_done = 0; 7705 7706 tcp->tcp_debug = 0; 7707 tcp->tcp_dontroute = 0; 7708 tcp->tcp_broadcast = 0; 7709 7710 tcp->tcp_useloopback = 0; 7711 tcp->tcp_reuseaddr = 0; 7712 tcp->tcp_oobinline = 0; 7713 tcp->tcp_dgram_errind = 0; 7714 7715 tcp->tcp_detached = 0; 7716 tcp->tcp_bind_pending = 0; 7717 tcp->tcp_unbind_pending = 0; 7718 tcp->tcp_deferred_clean_death = 0; 7719 7720 tcp->tcp_snd_ws_ok = B_FALSE; 7721 tcp->tcp_snd_ts_ok = B_FALSE; 7722 tcp->tcp_linger = 0; 7723 tcp->tcp_ka_enabled = 0; 7724 tcp->tcp_zero_win_probe = 0; 7725 7726 tcp->tcp_loopback = 0; 7727 tcp->tcp_localnet = 0; 7728 tcp->tcp_syn_defense = 0; 7729 tcp->tcp_set_timer = 0; 7730 7731 tcp->tcp_active_open = 0; 7732 ASSERT(tcp->tcp_timeout == B_FALSE); 7733 tcp->tcp_rexmit = B_FALSE; 7734 tcp->tcp_xmit_zc_clean = B_FALSE; 7735 7736 tcp->tcp_snd_sack_ok = B_FALSE; 7737 PRESERVE(tcp->tcp_recvdstaddr); 7738 tcp->tcp_hwcksum = B_FALSE; 7739 7740 tcp->tcp_ire_ill_check_done = B_FALSE; 7741 DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */ 7742 7743 tcp->tcp_mdt = B_FALSE; 7744 tcp->tcp_mdt_hdr_head = 0; 7745 tcp->tcp_mdt_hdr_tail = 0; 7746 7747 tcp->tcp_conn_def_q0 = 0; 7748 tcp->tcp_ip_forward_progress = B_FALSE; 7749 tcp->tcp_anon_priv_bind = 0; 7750 tcp->tcp_ecn_ok = B_FALSE; 7751 7752 tcp->tcp_cwr = B_FALSE; 7753 
tcp->tcp_ecn_echo_on = B_FALSE; 7754 7755 if (tcp->tcp_sack_info != NULL) { 7756 if (tcp->tcp_notsack_list != NULL) { 7757 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 7758 } 7759 kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info); 7760 tcp->tcp_sack_info = NULL; 7761 } 7762 7763 tcp->tcp_rcv_ws = 0; 7764 tcp->tcp_snd_ws = 0; 7765 tcp->tcp_ts_recent = 0; 7766 tcp->tcp_rnxt = 0; /* Displayed in mib */ 7767 DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */ 7768 tcp->tcp_if_mtu = 0; 7769 7770 ASSERT(tcp->tcp_reass_head == NULL); 7771 ASSERT(tcp->tcp_reass_tail == NULL); 7772 7773 tcp->tcp_cwnd_cnt = 0; 7774 7775 ASSERT(tcp->tcp_rcv_list == NULL); 7776 ASSERT(tcp->tcp_rcv_last_head == NULL); 7777 ASSERT(tcp->tcp_rcv_last_tail == NULL); 7778 ASSERT(tcp->tcp_rcv_cnt == 0); 7779 7780 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */ 7781 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */ 7782 tcp->tcp_csuna = 0; 7783 7784 tcp->tcp_rto = 0; /* Displayed in MIB */ 7785 DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */ 7786 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */ 7787 tcp->tcp_rtt_update = 0; 7788 7789 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 7790 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */ 7791 7792 tcp->tcp_rack = 0; /* Displayed in mib */ 7793 tcp->tcp_rack_cnt = 0; 7794 tcp->tcp_rack_cur_max = 0; 7795 tcp->tcp_rack_abs_max = 0; 7796 7797 tcp->tcp_max_swnd = 0; 7798 7799 ASSERT(tcp->tcp_listener == NULL); 7800 7801 DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */ 7802 7803 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */ 7804 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */ 7805 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */ 7806 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */ 7807 7808 ASSERT(tcp->tcp_conn_req_cnt_q == 0); 7809 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0); 7810 PRESERVE(tcp->tcp_conn_req_max); 7811 PRESERVE(tcp->tcp_conn_req_seqnum); 7812 7813 DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */ 7814 DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */ 7815 DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */ 7816 DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */ 7817 DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */ 7818 7819 tcp->tcp_lingertime = 0; 7820 7821 DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */ 7822 ASSERT(tcp->tcp_urp_mp == NULL); 7823 ASSERT(tcp->tcp_urp_mark_mp == NULL); 7824 ASSERT(tcp->tcp_fused_sigurg_mp == NULL); 7825 7826 ASSERT(tcp->tcp_eager_next_q == NULL); 7827 ASSERT(tcp->tcp_eager_last_q == NULL); 7828 ASSERT((tcp->tcp_eager_next_q0 == NULL && 7829 tcp->tcp_eager_prev_q0 == NULL) || 7830 tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0); 7831 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); 7832 7833 tcp->tcp_client_errno = 0; 7834 7835 DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */ 7836 7837 tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */ 7838 7839 PRESERVE(tcp->tcp_bound_source_v6); 7840 tcp->tcp_last_sent_len = 0; 7841 tcp->tcp_dupack_cnt = 0; 7842 7843 tcp->tcp_fport = 0; /* Displayed in MIB */ 7844 PRESERVE(tcp->tcp_lport); 7845 7846 PRESERVE(tcp->tcp_acceptor_lockp); 7847 7848 ASSERT(tcp->tcp_ordrelid == 0); 7849 PRESERVE(tcp->tcp_acceptor_id); 7850 DONTCARE(tcp->tcp_ipsec_overhead); 7851 7852 /* 7853 * If tcp_tracing flag is ON (i.e. 
We have a trace buffer 7854 * in tcp structure and now tracing), Re-initialize all 7855 * members of tcp_traceinfo. 7856 */ 7857 if (tcp->tcp_tracebuf != NULL) { 7858 bzero(tcp->tcp_tracebuf, sizeof (tcptrch_t)); 7859 } 7860 7861 PRESERVE(tcp->tcp_family); 7862 if (tcp->tcp_family == AF_INET6) { 7863 tcp->tcp_ipversion = IPV6_VERSION; 7864 tcp->tcp_mss = tcp_mss_def_ipv6; 7865 } else { 7866 tcp->tcp_ipversion = IPV4_VERSION; 7867 tcp->tcp_mss = tcp_mss_def_ipv4; 7868 } 7869 7870 tcp->tcp_bound_if = 0; 7871 tcp->tcp_ipv6_recvancillary = 0; 7872 tcp->tcp_recvifindex = 0; 7873 tcp->tcp_recvhops = 0; 7874 tcp->tcp_closed = 0; 7875 tcp->tcp_cleandeathtag = 0; 7876 if (tcp->tcp_hopopts != NULL) { 7877 mi_free(tcp->tcp_hopopts); 7878 tcp->tcp_hopopts = NULL; 7879 tcp->tcp_hopoptslen = 0; 7880 } 7881 ASSERT(tcp->tcp_hopoptslen == 0); 7882 if (tcp->tcp_dstopts != NULL) { 7883 mi_free(tcp->tcp_dstopts); 7884 tcp->tcp_dstopts = NULL; 7885 tcp->tcp_dstoptslen = 0; 7886 } 7887 ASSERT(tcp->tcp_dstoptslen == 0); 7888 if (tcp->tcp_rtdstopts != NULL) { 7889 mi_free(tcp->tcp_rtdstopts); 7890 tcp->tcp_rtdstopts = NULL; 7891 tcp->tcp_rtdstoptslen = 0; 7892 } 7893 ASSERT(tcp->tcp_rtdstoptslen == 0); 7894 if (tcp->tcp_rthdr != NULL) { 7895 mi_free(tcp->tcp_rthdr); 7896 tcp->tcp_rthdr = NULL; 7897 tcp->tcp_rthdrlen = 0; 7898 } 7899 ASSERT(tcp->tcp_rthdrlen == 0); 7900 PRESERVE(tcp->tcp_drop_opt_ack_cnt); 7901 7902 /* Reset fusion-related fields */ 7903 tcp->tcp_fused = B_FALSE; 7904 tcp->tcp_unfusable = B_FALSE; 7905 tcp->tcp_fused_sigurg = B_FALSE; 7906 tcp->tcp_direct_sockfs = B_FALSE; 7907 tcp->tcp_fuse_syncstr_stopped = B_FALSE; 7908 tcp->tcp_fuse_syncstr_plugged = B_FALSE; 7909 tcp->tcp_loopback_peer = NULL; 7910 tcp->tcp_fuse_rcv_hiwater = 0; 7911 tcp->tcp_fuse_rcv_unread_hiwater = 0; 7912 tcp->tcp_fuse_rcv_unread_cnt = 0; 7913 7914 tcp->tcp_in_ack_unsent = 0; 7915 tcp->tcp_cork = B_FALSE; 7916 7917 PRESERVE(tcp->tcp_squeue_bytes); 7918 7919 ASSERT(tcp->tcp_kssl_ctx == NULL); 7920 ASSERT(!tcp->tcp_kssl_pending); 7921 PRESERVE(tcp->tcp_kssl_ent); 7922 7923 #undef DONTCARE 7924 #undef PRESERVE 7925 } 7926 7927 /* 7928 * Allocate necessary resources and initialize state vector. 7929 * Guaranteed not to fail so that when an error is returned, 7930 * the caller doesn't need to do any additional cleanup. 7931 */ 7932 int 7933 tcp_init(tcp_t *tcp, queue_t *q) 7934 { 7935 int err; 7936 7937 tcp->tcp_rq = q; 7938 tcp->tcp_wq = WR(q); 7939 tcp->tcp_state = TCPS_IDLE; 7940 if ((err = tcp_init_values(tcp)) != 0) 7941 tcp_timers_stop(tcp); 7942 return (err); 7943 } 7944 7945 static int 7946 tcp_init_values(tcp_t *tcp) 7947 { 7948 int err; 7949 7950 ASSERT((tcp->tcp_family == AF_INET && 7951 tcp->tcp_ipversion == IPV4_VERSION) || 7952 (tcp->tcp_family == AF_INET6 && 7953 (tcp->tcp_ipversion == IPV4_VERSION || 7954 tcp->tcp_ipversion == IPV6_VERSION))); 7955 7956 /* 7957 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO 7958 * will be close to tcp_rexmit_interval_initial. By doing this, we 7959 * allow the algorithm to adjust slowly to large fluctuations of RTT 7960 * during first few transmissions of a connection as seen in slow 7961 * links. 
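 * With the values set just below, the standard RTO calculation works out to roughly tcp_rexmit_interval_initial: tcp_rtt_sa = 4 * interval and tcp_rtt_sd = interval / 2, so (sa >> 3) + sd = interval, with only the (sa >> 5) = interval / 8 term added on top of the configured extra and grace periods.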
7962 */ 7963 tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2; 7964 tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1; 7965 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 7966 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + 7967 tcp_conn_grace_period; 7968 if (tcp->tcp_rto < tcp_rexmit_interval_min) 7969 tcp->tcp_rto = tcp_rexmit_interval_min; 7970 tcp->tcp_timer_backoff = 0; 7971 tcp->tcp_ms_we_have_waited = 0; 7972 tcp->tcp_last_recv_time = lbolt; 7973 tcp->tcp_cwnd_max = tcp_cwnd_max_; 7974 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; 7975 tcp->tcp_snd_burst = TCP_CWND_INFINITE; 7976 7977 tcp->tcp_maxpsz = tcp_maxpsz_multiplier; 7978 7979 tcp->tcp_first_timer_threshold = tcp_ip_notify_interval; 7980 tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval; 7981 tcp->tcp_second_timer_threshold = tcp_ip_abort_interval; 7982 /* 7983 * Fix it to tcp_ip_abort_linterval later if it turns out to be a 7984 * passive open. 7985 */ 7986 tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval; 7987 7988 tcp->tcp_naglim = tcp_naglim_def; 7989 7990 /* NOTE: ISS is now set in tcp_adapt_ire(). */ 7991 7992 tcp->tcp_mdt_hdr_head = 0; 7993 tcp->tcp_mdt_hdr_tail = 0; 7994 7995 /* Reset fusion-related fields */ 7996 tcp->tcp_fused = B_FALSE; 7997 tcp->tcp_unfusable = B_FALSE; 7998 tcp->tcp_fused_sigurg = B_FALSE; 7999 tcp->tcp_direct_sockfs = B_FALSE; 8000 tcp->tcp_fuse_syncstr_stopped = B_FALSE; 8001 tcp->tcp_fuse_syncstr_plugged = B_FALSE; 8002 tcp->tcp_loopback_peer = NULL; 8003 tcp->tcp_fuse_rcv_hiwater = 0; 8004 tcp->tcp_fuse_rcv_unread_hiwater = 0; 8005 tcp->tcp_fuse_rcv_unread_cnt = 0; 8006 8007 /* Initialize the header template */ 8008 if (tcp->tcp_ipversion == IPV4_VERSION) { 8009 err = tcp_header_init_ipv4(tcp); 8010 } else { 8011 err = tcp_header_init_ipv6(tcp); 8012 } 8013 if (err) 8014 return (err); 8015 8016 /* 8017 * Init the window scale to the max so tcp_rwnd_set() won't pare 8018 * down tcp_rwnd. tcp_adapt_ire() will set the right value later. 8019 */ 8020 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; 8021 tcp->tcp_xmit_lowater = tcp_xmit_lowat; 8022 tcp->tcp_xmit_hiwater = tcp_xmit_hiwat; 8023 8024 tcp->tcp_cork = B_FALSE; 8025 /* 8026 * Init the tcp_debug option. This value determines whether TCP 8027 * calls strlog() to print out debug messages. Doing this 8028 * initialization here means that this value is not inherited thru 8029 * tcp_reinit(). 8030 */ 8031 tcp->tcp_debug = tcp_dbg; 8032 8033 tcp->tcp_ka_interval = tcp_keepalive_interval; 8034 tcp->tcp_ka_abort_thres = tcp_keepalive_abort_interval; 8035 8036 return (0); 8037 } 8038 8039 /* 8040 * Initialize the IPv4 header. Loses any record of any IP options. 8041 */ 8042 static int 8043 tcp_header_init_ipv4(tcp_t *tcp) 8044 { 8045 tcph_t *tcph; 8046 uint32_t sum; 8047 conn_t *connp; 8048 8049 /* 8050 * This is a simple initialization. If there's 8051 * already a template, it should never be too small, 8052 * so reuse it. Otherwise, allocate space for the new one. 
8053 */ 8054 if (tcp->tcp_iphc == NULL) { 8055 ASSERT(tcp->tcp_iphc_len == 0); 8056 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 8057 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); 8058 if (tcp->tcp_iphc == NULL) { 8059 tcp->tcp_iphc_len = 0; 8060 return (ENOMEM); 8061 } 8062 } 8063 8064 /* options are gone; may need a new label */ 8065 connp = tcp->tcp_connp; 8066 connp->conn_mlp_type = mlptSingle; 8067 connp->conn_ulp_labeled = !is_system_labeled(); 8068 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 8069 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; 8070 tcp->tcp_ip6h = NULL; 8071 tcp->tcp_ipversion = IPV4_VERSION; 8072 tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t); 8073 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 8074 tcp->tcp_ip_hdr_len = sizeof (ipha_t); 8075 tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t)); 8076 tcp->tcp_ipha->ipha_version_and_hdr_length 8077 = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS; 8078 tcp->tcp_ipha->ipha_ident = 0; 8079 8080 tcp->tcp_ttl = (uchar_t)tcp_ipv4_ttl; 8081 tcp->tcp_tos = 0; 8082 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; 8083 tcp->tcp_ipha->ipha_ttl = (uchar_t)tcp_ipv4_ttl; 8084 tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP; 8085 8086 tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t)); 8087 tcp->tcp_tcph = tcph; 8088 tcph->th_offset_and_rsrvd[0] = (5 << 4); 8089 /* 8090 * IP wants our header length in the checksum field to 8091 * allow it to perform a single pseudo-header+checksum 8092 * calculation on behalf of TCP. 8093 * Include the adjustment for a source route once IP_OPTIONS is set. 8094 */ 8095 sum = sizeof (tcph_t) + tcp->tcp_sum; 8096 sum = (sum >> 16) + (sum & 0xFFFF); 8097 U16_TO_ABE16(sum, tcph->th_sum); 8098 return (0); 8099 } 8100 8101 /* 8102 * Initialize the IPv6 header. Loses any record of any IPv6 extension headers. 8103 */ 8104 static int 8105 tcp_header_init_ipv6(tcp_t *tcp) 8106 { 8107 tcph_t *tcph; 8108 uint32_t sum; 8109 conn_t *connp; 8110 8111 /* 8112 * This is a simple initialization. If there's 8113 * already a template, it should never be too small, 8114 * so reuse it. Otherwise, allocate space for the new one. 8115 * Ensure that there is enough space to "downgrade" the tcp_t 8116 * to an IPv4 tcp_t. This requires having space for a full load 8117 * of IPv4 options, as well as a full load of TCP options 8118 * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space 8119 * than a v6 header and a TCP header with a full load of TCP options 8120 * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes). 8121 * We want to avoid reallocation in the "downgraded" case when 8122 * processing outbound IPv4 options. 
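 * Concretely: IPV6_HDR_LEN (40) plus TCP_MAX_HDR_LENGTH (60) is 100 bytes, comfortably under the 120-byte TCP_MAX_COMBINED_HEADER_LENGTH template allocated below.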
8123 */ 8124 if (tcp->tcp_iphc == NULL) { 8125 ASSERT(tcp->tcp_iphc_len == 0); 8126 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; 8127 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); 8128 if (tcp->tcp_iphc == NULL) { 8129 tcp->tcp_iphc_len = 0; 8130 return (ENOMEM); 8131 } 8132 } 8133 8134 /* options are gone; may need a new label */ 8135 connp = tcp->tcp_connp; 8136 connp->conn_mlp_type = mlptSingle; 8137 connp->conn_ulp_labeled = !is_system_labeled(); 8138 8139 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); 8140 tcp->tcp_ipversion = IPV6_VERSION; 8141 tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t); 8142 tcp->tcp_tcp_hdr_len = sizeof (tcph_t); 8143 tcp->tcp_ip_hdr_len = IPV6_HDR_LEN; 8144 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; 8145 tcp->tcp_ipha = NULL; 8146 8147 /* Initialize the header template */ 8148 8149 tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; 8150 tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t)); 8151 tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP; 8152 tcp->tcp_ip6h->ip6_hops = (uint8_t)tcp_ipv6_hoplimit; 8153 8154 tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN); 8155 tcp->tcp_tcph = tcph; 8156 tcph->th_offset_and_rsrvd[0] = (5 << 4); 8157 /* 8158 * IP wants our header length in the checksum field to 8159 * allow it to perform a single psuedo-header+checksum 8160 * calculation on behalf of TCP. 8161 * Include the adjustment for a source route when IPV6_RTHDR is set. 8162 */ 8163 sum = sizeof (tcph_t) + tcp->tcp_sum; 8164 sum = (sum >> 16) + (sum & 0xFFFF); 8165 U16_TO_ABE16(sum, tcph->th_sum); 8166 return (0); 8167 } 8168 8169 /* At minimum we need 4 bytes in the TCP header for the lookup */ 8170 #define ICMP_MIN_TCP_HDR 12 8171 8172 /* 8173 * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages 8174 * passed up by IP. The message is always received on the correct tcp_t. 8175 * Assumes that IP has pulled up everything up to and including the ICMP header. 8176 */ 8177 void 8178 tcp_icmp_error(tcp_t *tcp, mblk_t *mp) 8179 { 8180 icmph_t *icmph; 8181 ipha_t *ipha; 8182 int iph_hdr_length; 8183 tcph_t *tcph; 8184 boolean_t ipsec_mctl = B_FALSE; 8185 boolean_t secure; 8186 mblk_t *first_mp = mp; 8187 uint32_t new_mss; 8188 uint32_t ratio; 8189 size_t mp_size = MBLKL(mp); 8190 uint32_t seg_ack; 8191 uint32_t seg_seq; 8192 8193 /* Assume IP provides aligned packets - otherwise toss */ 8194 if (!OK_32PTR(mp->b_rptr)) { 8195 freemsg(mp); 8196 return; 8197 } 8198 8199 /* 8200 * Since ICMP errors are normal data marked with M_CTL when sent 8201 * to TCP or UDP, we have to look for a IPSEC_IN value to identify 8202 * packets starting with an ipsec_info_t, see ipsec_info.h. 8203 */ 8204 if ((mp_size == sizeof (ipsec_info_t)) && 8205 (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) { 8206 ASSERT(mp->b_cont != NULL); 8207 mp = mp->b_cont; 8208 /* IP should have done this */ 8209 ASSERT(OK_32PTR(mp->b_rptr)); 8210 mp_size = MBLKL(mp); 8211 ipsec_mctl = B_TRUE; 8212 } 8213 8214 /* 8215 * Verify that we have a complete outer IP header. If not, drop it. 8216 */ 8217 if (mp_size < sizeof (ipha_t)) { 8218 noticmpv4: 8219 freemsg(first_mp); 8220 return; 8221 } 8222 8223 ipha = (ipha_t *)mp->b_rptr; 8224 /* 8225 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent 8226 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. 
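 * For IPv4 the code below then walks the outer IP header, the ICMP header, the embedded (inner) IP header and finally the embedded TCP header, dropping the message whenever any of these is incomplete or carries an unexpected protocol.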
8227 */ 8228 switch (IPH_HDR_VERSION(ipha)) { 8229 case IPV6_VERSION: 8230 tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl); 8231 return; 8232 case IPV4_VERSION: 8233 break; 8234 default: 8235 goto noticmpv4; 8236 } 8237 8238 /* Skip past the outer IP and ICMP headers */ 8239 iph_hdr_length = IPH_HDR_LENGTH(ipha); 8240 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; 8241 /* 8242 * If we don't have the correct outer IP header length or if the ULP 8243 * is not IPPROTO_ICMP or if we don't have a complete inner IP header 8244 * send it upstream. 8245 */ 8246 if (iph_hdr_length < sizeof (ipha_t) || 8247 ipha->ipha_protocol != IPPROTO_ICMP || 8248 (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { 8249 goto noticmpv4; 8250 } 8251 ipha = (ipha_t *)&icmph[1]; 8252 8253 /* Skip past the inner IP and find the ULP header */ 8254 iph_hdr_length = IPH_HDR_LENGTH(ipha); 8255 tcph = (tcph_t *)((char *)ipha + iph_hdr_length); 8256 /* 8257 * If we don't have the correct inner IP header length or if the ULP 8258 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR 8259 * bytes of TCP header, drop it. 8260 */ 8261 if (iph_hdr_length < sizeof (ipha_t) || 8262 ipha->ipha_protocol != IPPROTO_TCP || 8263 (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) { 8264 goto noticmpv4; 8265 } 8266 8267 if (TCP_IS_DETACHED_NONEAGER(tcp)) { 8268 if (ipsec_mctl) { 8269 secure = ipsec_in_is_secure(first_mp); 8270 } else { 8271 secure = B_FALSE; 8272 } 8273 if (secure) { 8274 /* 8275 * If we are willing to accept this in clear 8276 * we don't have to verify policy. 8277 */ 8278 if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) { 8279 if (!tcp_check_policy(tcp, first_mp, 8280 ipha, NULL, secure, ipsec_mctl)) { 8281 /* 8282 * tcp_check_policy called 8283 * ip_drop_packet() on failure. 8284 */ 8285 return; 8286 } 8287 } 8288 } 8289 } else if (ipsec_mctl) { 8290 /* 8291 * This is a hard_bound connection. IP has already 8292 * verified policy. We don't have to do it again. 8293 */ 8294 freeb(first_mp); 8295 first_mp = mp; 8296 ipsec_mctl = B_FALSE; 8297 } 8298 8299 seg_ack = ABE32_TO_U32(tcph->th_ack); 8300 seg_seq = ABE32_TO_U32(tcph->th_seq); 8301 /* 8302 * TCP SHOULD check that the TCP sequence number contained in 8303 * payload of the ICMP error message is within the range 8304 * SND.UNA <= SEG.SEQ < SND.NXT. and also SEG.ACK <= RECV.NXT 8305 */ 8306 if (SEQ_LT(seg_seq, tcp->tcp_suna) || 8307 SEQ_GEQ(seg_seq, tcp->tcp_snxt) || 8308 SEQ_GT(seg_ack, tcp->tcp_rnxt)) { 8309 /* 8310 * If the ICMP message is bogus, should we kill the 8311 * connection, or should we just drop the bogus ICMP 8312 * message? It would probably make more sense to just 8313 * drop the message so that if this one managed to get 8314 * in, the real connection should not suffer. 8315 */ 8316 goto noticmpv4; 8317 } 8318 8319 switch (icmph->icmph_type) { 8320 case ICMP_DEST_UNREACHABLE: 8321 switch (icmph->icmph_code) { 8322 case ICMP_FRAGMENTATION_NEEDED: 8323 /* 8324 * Reduce the MSS based on the new MTU. This will 8325 * eliminate any fragmentation locally. 8326 * N.B. There may well be some funny side-effects on 8327 * the local send policy and the remote receive policy. 8328 * Pending further research, we provide 8329 * tcp_ignore_path_mtu just in case this proves 8330 * disastrous somewhere. 8331 * 8332 * After updating the MSS, retransmit part of the 8333 * dropped segment using the new mss by calling 8334 * tcp_wput_data(). Need to adjust all those 8335 * params to make sure tcp_wput_data() work properly. 
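 * Specifically, the block below rescales tcp_cwnd to keep the same number of segments (the old cwnd/mss ratio times the new mss), resets tcp_rexmit_max (to tcp_fss or tcp_snxt), rewinds tcp_rexmit_nxt to tcp_suna, sets tcp_rexmit, clears tcp_dupack_cnt, limits tcp_snd_burst to the slow-start burst and then calls tcp_ss_rexmit().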
8336 */ 8337 if (tcp_ignore_path_mtu) 8338 break; 8339 8340 /* 8341 * Decrease the MSS by time stamp options 8342 * IP options and IPSEC options. tcp_hdr_len 8343 * includes time stamp option and IP option 8344 * length. 8345 */ 8346 8347 new_mss = ntohs(icmph->icmph_du_mtu) - 8348 tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead; 8349 8350 /* 8351 * Only update the MSS if the new one is 8352 * smaller than the previous one. This is 8353 * to avoid problems when getting multiple 8354 * ICMP errors for the same MTU. 8355 */ 8356 if (new_mss >= tcp->tcp_mss) 8357 break; 8358 8359 /* 8360 * Stop doing PMTU if new_mss is less than 68 8361 * or less than tcp_mss_min. 8362 * The value 68 comes from rfc 1191. 8363 */ 8364 if (new_mss < MAX(68, tcp_mss_min)) 8365 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 8366 0; 8367 8368 ratio = tcp->tcp_cwnd / tcp->tcp_mss; 8369 ASSERT(ratio >= 1); 8370 tcp_mss_set(tcp, new_mss); 8371 8372 /* 8373 * Make sure we have something to 8374 * send. 8375 */ 8376 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && 8377 (tcp->tcp_xmit_head != NULL)) { 8378 /* 8379 * Shrink tcp_cwnd in 8380 * proportion to the old MSS/new MSS. 8381 */ 8382 tcp->tcp_cwnd = ratio * tcp->tcp_mss; 8383 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 8384 (tcp->tcp_unsent == 0)) { 8385 tcp->tcp_rexmit_max = tcp->tcp_fss; 8386 } else { 8387 tcp->tcp_rexmit_max = tcp->tcp_snxt; 8388 } 8389 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 8390 tcp->tcp_rexmit = B_TRUE; 8391 tcp->tcp_dupack_cnt = 0; 8392 tcp->tcp_snd_burst = TCP_CWND_SS; 8393 tcp_ss_rexmit(tcp); 8394 } 8395 break; 8396 case ICMP_PORT_UNREACHABLE: 8397 case ICMP_PROTOCOL_UNREACHABLE: 8398 switch (tcp->tcp_state) { 8399 case TCPS_SYN_SENT: 8400 case TCPS_SYN_RCVD: 8401 /* 8402 * ICMP can snipe away incipient 8403 * TCP connections as long as 8404 * seq number is same as initial 8405 * send seq number. 8406 */ 8407 if (seg_seq == tcp->tcp_iss) { 8408 (void) tcp_clean_death(tcp, 8409 ECONNREFUSED, 6); 8410 } 8411 break; 8412 } 8413 break; 8414 case ICMP_HOST_UNREACHABLE: 8415 case ICMP_NET_UNREACHABLE: 8416 /* Record the error in case we finally time out. */ 8417 if (icmph->icmph_code == ICMP_HOST_UNREACHABLE) 8418 tcp->tcp_client_errno = EHOSTUNREACH; 8419 else 8420 tcp->tcp_client_errno = ENETUNREACH; 8421 if (tcp->tcp_state == TCPS_SYN_RCVD) { 8422 if (tcp->tcp_listener != NULL && 8423 tcp->tcp_listener->tcp_syn_defense) { 8424 /* 8425 * Ditch the half-open connection if we 8426 * suspect a SYN attack is under way. 8427 */ 8428 tcp_ip_ire_mark_advice(tcp); 8429 (void) tcp_clean_death(tcp, 8430 tcp->tcp_client_errno, 7); 8431 } 8432 } 8433 break; 8434 default: 8435 break; 8436 } 8437 break; 8438 case ICMP_SOURCE_QUENCH: { 8439 /* 8440 * use a global boolean to control 8441 * whether TCP should respond to ICMP_SOURCE_QUENCH. 8442 * The default is false. 8443 */ 8444 if (tcp_icmp_source_quench) { 8445 /* 8446 * Reduce the sending rate as if we got a 8447 * retransmit timeout 8448 */ 8449 uint32_t npkt; 8450 8451 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / 8452 tcp->tcp_mss; 8453 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss; 8454 tcp->tcp_cwnd = tcp->tcp_mss; 8455 tcp->tcp_cwnd_cnt = 0; 8456 } 8457 break; 8458 } 8459 } 8460 freemsg(first_mp); 8461 } 8462 8463 /* 8464 * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6 8465 * error messages passed up by IP. 8466 * Assumes that IP has pulled up all the extension headers as well 8467 * as the ICMPv6 header. 
8468 */ 8469 static void 8470 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl) 8471 { 8472 icmp6_t *icmp6; 8473 ip6_t *ip6h; 8474 uint16_t iph_hdr_length; 8475 tcpha_t *tcpha; 8476 uint8_t *nexthdrp; 8477 uint32_t new_mss; 8478 uint32_t ratio; 8479 boolean_t secure; 8480 mblk_t *first_mp = mp; 8481 size_t mp_size; 8482 uint32_t seg_ack; 8483 uint32_t seg_seq; 8484 8485 /* 8486 * The caller has determined if this is an IPSEC_IN packet and 8487 * set ipsec_mctl appropriately (see tcp_icmp_error). 8488 */ 8489 if (ipsec_mctl) 8490 mp = mp->b_cont; 8491 8492 mp_size = MBLKL(mp); 8493 8494 /* 8495 * Verify that we have a complete IP header. If not, send it upstream. 8496 */ 8497 if (mp_size < sizeof (ip6_t)) { 8498 noticmpv6: 8499 freemsg(first_mp); 8500 return; 8501 } 8502 8503 /* 8504 * Verify this is an ICMPV6 packet, else send it upstream. 8505 */ 8506 ip6h = (ip6_t *)mp->b_rptr; 8507 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 8508 iph_hdr_length = IPV6_HDR_LEN; 8509 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, 8510 &nexthdrp) || 8511 *nexthdrp != IPPROTO_ICMPV6) { 8512 goto noticmpv6; 8513 } 8514 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; 8515 ip6h = (ip6_t *)&icmp6[1]; 8516 /* 8517 * Verify if we have a complete ICMP and inner IP header. 8518 */ 8519 if ((uchar_t *)&ip6h[1] > mp->b_wptr) 8520 goto noticmpv6; 8521 8522 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) 8523 goto noticmpv6; 8524 tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length); 8525 /* 8526 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't 8527 * have at least ICMP_MIN_TCP_HDR bytes of TCP header drop the 8528 * packet. 8529 */ 8530 if ((*nexthdrp != IPPROTO_TCP) || 8531 ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) { 8532 goto noticmpv6; 8533 } 8534 8535 /* 8536 * ICMP errors come on the right queue or come on 8537 * listener/global queue for detached connections and 8538 * get switched to the right queue. If it comes on the 8539 * right queue, policy check has already been done by IP 8540 * and thus free the first_mp without verifying the policy. 8541 * If it has come for a non-hard bound connection, we need 8542 * to verify policy as IP may not have done it. 8543 */ 8544 if (!tcp->tcp_hard_bound) { 8545 if (ipsec_mctl) { 8546 secure = ipsec_in_is_secure(first_mp); 8547 } else { 8548 secure = B_FALSE; 8549 } 8550 if (secure) { 8551 /* 8552 * If we are willing to accept this in clear 8553 * we don't have to verify policy. 8554 */ 8555 if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) { 8556 if (!tcp_check_policy(tcp, first_mp, 8557 NULL, ip6h, secure, ipsec_mctl)) { 8558 /* 8559 * tcp_check_policy called 8560 * ip_drop_packet() on failure. 8561 */ 8562 return; 8563 } 8564 } 8565 } 8566 } else if (ipsec_mctl) { 8567 /* 8568 * This is a hard_bound connection. IP has already 8569 * verified policy. We don't have to do it again. 8570 */ 8571 freeb(first_mp); 8572 first_mp = mp; 8573 ipsec_mctl = B_FALSE; 8574 } 8575 8576 seg_ack = ntohl(tcpha->tha_ack); 8577 seg_seq = ntohl(tcpha->tha_seq); 8578 /* 8579 * TCP SHOULD check that the TCP sequence number contained in 8580 * payload of the ICMP error message is within the range 8581 * SND.UNA <= SEG.SEQ < SND.NXT. and also SEG.ACK <= RECV.NXT 8582 */ 8583 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt) || 8584 SEQ_GT(seg_ack, tcp->tcp_rnxt)) { 8585 /* 8586 * If the ICMP message is bogus, should we kill the 8587 * connection, or should we just drop the bogus ICMP 8588 * message? 
It would probably make more sense to just 8589 * drop the message so that if this one managed to get 8590 * in, the real connection should not suffer. 8591 */ 8592 goto noticmpv6; 8593 } 8594 8595 switch (icmp6->icmp6_type) { 8596 case ICMP6_PACKET_TOO_BIG: 8597 /* 8598 * Reduce the MSS based on the new MTU. This will 8599 * eliminate any fragmentation locally. 8600 * N.B. There may well be some funny side-effects on 8601 * the local send policy and the remote receive policy. 8602 * Pending further research, we provide 8603 * tcp_ignore_path_mtu just in case this proves 8604 * disastrous somewhere. 8605 * 8606 * After updating the MSS, retransmit part of the 8607 * dropped segment using the new mss by calling 8608 * tcp_wput_data(). Need to adjust all those 8609 * params to make sure tcp_wput_data() work properly. 8610 */ 8611 if (tcp_ignore_path_mtu) 8612 break; 8613 8614 /* 8615 * Decrease the MSS by time stamp options 8616 * IP options and IPSEC options. tcp_hdr_len 8617 * includes time stamp option and IP option 8618 * length. 8619 */ 8620 new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len - 8621 tcp->tcp_ipsec_overhead; 8622 8623 /* 8624 * Only update the MSS if the new one is 8625 * smaller than the previous one. This is 8626 * to avoid problems when getting multiple 8627 * ICMP errors for the same MTU. 8628 */ 8629 if (new_mss >= tcp->tcp_mss) 8630 break; 8631 8632 ratio = tcp->tcp_cwnd / tcp->tcp_mss; 8633 ASSERT(ratio >= 1); 8634 tcp_mss_set(tcp, new_mss); 8635 8636 /* 8637 * Make sure we have something to 8638 * send. 8639 */ 8640 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && 8641 (tcp->tcp_xmit_head != NULL)) { 8642 /* 8643 * Shrink tcp_cwnd in 8644 * proportion to the old MSS/new MSS. 8645 */ 8646 tcp->tcp_cwnd = ratio * tcp->tcp_mss; 8647 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 8648 (tcp->tcp_unsent == 0)) { 8649 tcp->tcp_rexmit_max = tcp->tcp_fss; 8650 } else { 8651 tcp->tcp_rexmit_max = tcp->tcp_snxt; 8652 } 8653 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 8654 tcp->tcp_rexmit = B_TRUE; 8655 tcp->tcp_dupack_cnt = 0; 8656 tcp->tcp_snd_burst = TCP_CWND_SS; 8657 tcp_ss_rexmit(tcp); 8658 } 8659 break; 8660 8661 case ICMP6_DST_UNREACH: 8662 switch (icmp6->icmp6_code) { 8663 case ICMP6_DST_UNREACH_NOPORT: 8664 if (((tcp->tcp_state == TCPS_SYN_SENT) || 8665 (tcp->tcp_state == TCPS_SYN_RCVD)) && 8666 (seg_seq == tcp->tcp_iss)) { 8667 (void) tcp_clean_death(tcp, 8668 ECONNREFUSED, 8); 8669 } 8670 break; 8671 8672 case ICMP6_DST_UNREACH_ADMIN: 8673 case ICMP6_DST_UNREACH_NOROUTE: 8674 case ICMP6_DST_UNREACH_BEYONDSCOPE: 8675 case ICMP6_DST_UNREACH_ADDR: 8676 /* Record the error in case we finally time out. */ 8677 tcp->tcp_client_errno = EHOSTUNREACH; 8678 if (((tcp->tcp_state == TCPS_SYN_SENT) || 8679 (tcp->tcp_state == TCPS_SYN_RCVD)) && 8680 (seg_seq == tcp->tcp_iss)) { 8681 if (tcp->tcp_listener != NULL && 8682 tcp->tcp_listener->tcp_syn_defense) { 8683 /* 8684 * Ditch the half-open connection if we 8685 * suspect a SYN attack is under way. 
8686 */
8687 tcp_ip_ire_mark_advice(tcp);
8688 (void) tcp_clean_death(tcp,
8689 tcp->tcp_client_errno, 9);
8690 }
8691 }
8692
8693
8694 break;
8695 default:
8696 break;
8697 }
8698 break;
8699
8700 case ICMP6_PARAM_PROB:
8701 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
8702 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
8703 (uchar_t *)ip6h + icmp6->icmp6_pptr ==
8704 (uchar_t *)nexthdrp) {
8705 if (tcp->tcp_state == TCPS_SYN_SENT ||
8706 tcp->tcp_state == TCPS_SYN_RCVD) {
8707 (void) tcp_clean_death(tcp,
8708 ECONNREFUSED, 10);
8709 }
8710 break;
8711 }
8712 break;
8713
8714 case ICMP6_TIME_EXCEEDED:
8715 default:
8716 break;
8717 }
8718 freemsg(first_mp);
8719 }
8720
8721 /*
8722 * IP recognizes seven kinds of bind requests:
8723 *
8724 * - A zero-length address binds only to the protocol number.
8725 *
8726 * - A 4-byte address is treated as a request to
8727 * validate that the address is a valid local IPv4
8728 * address, appropriate for an application to bind to.
8729 * IP does the verification, but does not make any note
8730 * of the address at this time.
8731 *
8732 * - A 16-byte address is treated as a request
8733 * to validate a local IPv6 address, as in the 4-byte
8734 * address case above.
8735 *
8736 * - A 16-byte sockaddr_in to validate the local IPv4 address and also
8737 * use it for the inbound fanout of packets.
8738 *
8739 * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also
8740 * use it for the inbound fanout of packets.
8741 *
8742 * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout
8743 * information consisting of local and remote addresses
8744 * and ports. In this case, the addresses are both
8745 * validated as appropriate for this operation, and, if
8746 * so, the information is retained for use in the
8747 * inbound fanout.
8748 *
8749 * - A 36-byte address (ipa6_conn_t) containing complete IPv6
8750 * fanout information, like the 12-byte case above.
8751 *
8752 * IP will also fill in the IRE request mblk with information
8753 * regarding our peer. In all cases, we notify IP of our protocol
8754 * type by appending a single protocol byte to the bind request.
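 *
 * As an illustrative sketch (not a verbatim excerpt of any caller), the
 * fully-specified IPv4 case built by tcp_ip_bind_mp() below with
 * addr_length == sizeof (ipa_conn_t) produces an M_PROTO mblk laid out as
 *
 *	struct T_bind_req	(PRIM_type, ADDR_offset, ADDR_length = 12)
 *	ipa_conn_t		(ac_laddr, ac_faddr, ac_fport, ac_lport)
 *	one trailing byte	(IPPROTO_TCP)
 *
 * with a second mblk of type IRE_DB_REQ_TYPE chained on b_cont so that IP
 * can hand back the IRE information for our peer.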
8755 */ 8756 static mblk_t * 8757 tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, t_scalar_t addr_length) 8758 { 8759 char *cp; 8760 mblk_t *mp; 8761 struct T_bind_req *tbr; 8762 ipa_conn_t *ac; 8763 ipa6_conn_t *ac6; 8764 sin_t *sin; 8765 sin6_t *sin6; 8766 8767 ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ); 8768 ASSERT((tcp->tcp_family == AF_INET && 8769 tcp->tcp_ipversion == IPV4_VERSION) || 8770 (tcp->tcp_family == AF_INET6 && 8771 (tcp->tcp_ipversion == IPV4_VERSION || 8772 tcp->tcp_ipversion == IPV6_VERSION))); 8773 8774 mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI); 8775 if (!mp) 8776 return (mp); 8777 mp->b_datap->db_type = M_PROTO; 8778 tbr = (struct T_bind_req *)mp->b_rptr; 8779 tbr->PRIM_type = bind_prim; 8780 tbr->ADDR_offset = sizeof (*tbr); 8781 tbr->CONIND_number = 0; 8782 tbr->ADDR_length = addr_length; 8783 cp = (char *)&tbr[1]; 8784 switch (addr_length) { 8785 case sizeof (ipa_conn_t): 8786 ASSERT(tcp->tcp_family == AF_INET); 8787 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 8788 8789 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); 8790 if (mp->b_cont == NULL) { 8791 freemsg(mp); 8792 return (NULL); 8793 } 8794 mp->b_cont->b_wptr += sizeof (ire_t); 8795 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; 8796 8797 /* cp known to be 32 bit aligned */ 8798 ac = (ipa_conn_t *)cp; 8799 ac->ac_laddr = tcp->tcp_ipha->ipha_src; 8800 ac->ac_faddr = tcp->tcp_remote; 8801 ac->ac_fport = tcp->tcp_fport; 8802 ac->ac_lport = tcp->tcp_lport; 8803 tcp->tcp_hard_binding = 1; 8804 break; 8805 8806 case sizeof (ipa6_conn_t): 8807 ASSERT(tcp->tcp_family == AF_INET6); 8808 8809 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); 8810 if (mp->b_cont == NULL) { 8811 freemsg(mp); 8812 return (NULL); 8813 } 8814 mp->b_cont->b_wptr += sizeof (ire_t); 8815 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE; 8816 8817 /* cp known to be 32 bit aligned */ 8818 ac6 = (ipa6_conn_t *)cp; 8819 if (tcp->tcp_ipversion == IPV4_VERSION) { 8820 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 8821 &ac6->ac6_laddr); 8822 } else { 8823 ac6->ac6_laddr = tcp->tcp_ip6h->ip6_src; 8824 } 8825 ac6->ac6_faddr = tcp->tcp_remote_v6; 8826 ac6->ac6_fport = tcp->tcp_fport; 8827 ac6->ac6_lport = tcp->tcp_lport; 8828 tcp->tcp_hard_binding = 1; 8829 break; 8830 8831 case sizeof (sin_t): 8832 /* 8833 * NOTE: IPV6_ADDR_LEN also has same size. 8834 * Use family to discriminate. 8835 */ 8836 if (tcp->tcp_family == AF_INET) { 8837 sin = (sin_t *)cp; 8838 8839 *sin = sin_null; 8840 sin->sin_family = AF_INET; 8841 sin->sin_addr.s_addr = tcp->tcp_bound_source; 8842 sin->sin_port = tcp->tcp_lport; 8843 break; 8844 } else { 8845 *(in6_addr_t *)cp = tcp->tcp_bound_source_v6; 8846 } 8847 break; 8848 8849 case sizeof (sin6_t): 8850 ASSERT(tcp->tcp_family == AF_INET6); 8851 sin6 = (sin6_t *)cp; 8852 8853 *sin6 = sin6_null; 8854 sin6->sin6_family = AF_INET6; 8855 sin6->sin6_addr = tcp->tcp_bound_source_v6; 8856 sin6->sin6_port = tcp->tcp_lport; 8857 break; 8858 8859 case IP_ADDR_LEN: 8860 ASSERT(tcp->tcp_ipversion == IPV4_VERSION); 8861 *(uint32_t *)cp = tcp->tcp_ipha->ipha_src; 8862 break; 8863 8864 } 8865 /* Add protocol number to end */ 8866 cp[addr_length] = (char)IPPROTO_TCP; 8867 mp->b_wptr = (uchar_t *)&cp[addr_length + 1]; 8868 return (mp); 8869 } 8870 8871 /* 8872 * Notify IP that we are having trouble with this connection. IP should 8873 * blow the IRE away and start over. 
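 *
 * Concretely (see the body below), this is done by sending an IP_IOCTL
 * M_IOCTL message downstream whose payload is an ipid_t requesting an
 * IP_IOC_IRE_DELETE_NO_REPLY of the IRE_CACHE entry for ipha_dst (or for
 * the first source-route hop when source routing is in use).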
8874 */ 8875 static void 8876 tcp_ip_notify(tcp_t *tcp) 8877 { 8878 struct iocblk *iocp; 8879 ipid_t *ipid; 8880 mblk_t *mp; 8881 8882 /* IPv6 has NUD thus notification to delete the IRE is not needed */ 8883 if (tcp->tcp_ipversion == IPV6_VERSION) 8884 return; 8885 8886 mp = mkiocb(IP_IOCTL); 8887 if (mp == NULL) 8888 return; 8889 8890 iocp = (struct iocblk *)mp->b_rptr; 8891 iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst); 8892 8893 mp->b_cont = allocb(iocp->ioc_count, BPRI_HI); 8894 if (!mp->b_cont) { 8895 freeb(mp); 8896 return; 8897 } 8898 8899 ipid = (ipid_t *)mp->b_cont->b_rptr; 8900 mp->b_cont->b_wptr += iocp->ioc_count; 8901 bzero(ipid, sizeof (*ipid)); 8902 ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY; 8903 ipid->ipid_ire_type = IRE_CACHE; 8904 ipid->ipid_addr_offset = sizeof (ipid_t); 8905 ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst); 8906 /* 8907 * Note: in the case of source routing we want to blow away the 8908 * route to the first source route hop. 8909 */ 8910 bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1], 8911 sizeof (tcp->tcp_ipha->ipha_dst)); 8912 8913 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 8914 } 8915 8916 /* Unlink and return any mblk that looks like it contains an ire */ 8917 static mblk_t * 8918 tcp_ire_mp(mblk_t *mp) 8919 { 8920 mblk_t *prev_mp; 8921 8922 for (;;) { 8923 prev_mp = mp; 8924 mp = mp->b_cont; 8925 if (mp == NULL) 8926 break; 8927 switch (DB_TYPE(mp)) { 8928 case IRE_DB_TYPE: 8929 case IRE_DB_REQ_TYPE: 8930 if (prev_mp != NULL) 8931 prev_mp->b_cont = mp->b_cont; 8932 mp->b_cont = NULL; 8933 return (mp); 8934 default: 8935 break; 8936 } 8937 } 8938 return (mp); 8939 } 8940 8941 /* 8942 * Timer callback routine for keepalive probe. We do a fake resend of 8943 * last ACKed byte. Then set a timer using RTO. When the timer expires, 8944 * check to see if we have heard anything from the other end for the last 8945 * RTO period. If we have, set the timer to expire for another 8946 * tcp_keepalive_intrvl and check again. If we have not, set a timer using 8947 * RTO << 1 and check again when it expires. Keep exponentially increasing 8948 * the timeout if we have not heard from the other side. If for more than 8949 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything, 8950 * kill the connection unless the keepalive abort threshold is 0. In 8951 * that case, we will probe "forever." 8952 */ 8953 static void 8954 tcp_keepalive_killer(void *arg) 8955 { 8956 mblk_t *mp; 8957 conn_t *connp = (conn_t *)arg; 8958 tcp_t *tcp = connp->conn_tcp; 8959 int32_t firetime; 8960 int32_t idletime; 8961 int32_t ka_intrvl; 8962 8963 tcp->tcp_ka_tid = 0; 8964 8965 if (tcp->tcp_fused) 8966 return; 8967 8968 BUMP_MIB(&tcp_mib, tcpTimKeepalive); 8969 ka_intrvl = tcp->tcp_ka_interval; 8970 8971 /* 8972 * Keepalive probe should only be sent if the application has not 8973 * done a close on the connection. 8974 */ 8975 if (tcp->tcp_state > TCPS_CLOSE_WAIT) { 8976 return; 8977 } 8978 /* Timer fired too early, restart it. */ 8979 if (tcp->tcp_state < TCPS_ESTABLISHED) { 8980 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 8981 MSEC_TO_TICK(ka_intrvl)); 8982 return; 8983 } 8984 8985 idletime = TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time); 8986 /* 8987 * If we have not heard from the other side for a long 8988 * time, kill the connection unless the keepalive abort 8989 * threshold is 0. In that case, we will probe "forever." 
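 * For example, with a two hour tcp_ka_interval and an eight minute
 * tcp_ka_abort_thres, the connection is torn down with ETIMEDOUT (via
 * tcp_clean_death() below) once a little more than 2 hours and 8 minutes
 * pass without hearing anything from the peer.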
8990 */ 8991 if (tcp->tcp_ka_abort_thres != 0 && 8992 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { 8993 BUMP_MIB(&tcp_mib, tcpTimKeepaliveDrop); 8994 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? 8995 tcp->tcp_client_errno : ETIMEDOUT, 11); 8996 return; 8997 } 8998 8999 if (tcp->tcp_snxt == tcp->tcp_suna && 9000 idletime >= ka_intrvl) { 9001 /* Fake resend of last ACKed byte. */ 9002 mblk_t *mp1 = allocb(1, BPRI_LO); 9003 9004 if (mp1 != NULL) { 9005 *mp1->b_wptr++ = '\0'; 9006 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL, 9007 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE); 9008 freeb(mp1); 9009 /* 9010 * if allocation failed, fall through to start the 9011 * timer back. 9012 */ 9013 if (mp != NULL) { 9014 TCP_RECORD_TRACE(tcp, mp, 9015 TCP_TRACE_SEND_PKT); 9016 tcp_send_data(tcp, tcp->tcp_wq, mp); 9017 BUMP_MIB(&tcp_mib, tcpTimKeepaliveProbe); 9018 if (tcp->tcp_ka_last_intrvl != 0) { 9019 /* 9020 * We should probe again at least 9021 * in ka_intrvl, but not more than 9022 * tcp_rexmit_interval_max. 9023 */ 9024 firetime = MIN(ka_intrvl - 1, 9025 tcp->tcp_ka_last_intrvl << 1); 9026 if (firetime > tcp_rexmit_interval_max) 9027 firetime = 9028 tcp_rexmit_interval_max; 9029 } else { 9030 firetime = tcp->tcp_rto; 9031 } 9032 tcp->tcp_ka_tid = TCP_TIMER(tcp, 9033 tcp_keepalive_killer, 9034 MSEC_TO_TICK(firetime)); 9035 tcp->tcp_ka_last_intrvl = firetime; 9036 return; 9037 } 9038 } 9039 } else { 9040 tcp->tcp_ka_last_intrvl = 0; 9041 } 9042 9043 /* firetime can be negative if (mp1 == NULL || mp == NULL) */ 9044 if ((firetime = ka_intrvl - idletime) < 0) { 9045 firetime = ka_intrvl; 9046 } 9047 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 9048 MSEC_TO_TICK(firetime)); 9049 } 9050 9051 int 9052 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) 9053 { 9054 queue_t *q = tcp->tcp_rq; 9055 int32_t mss = tcp->tcp_mss; 9056 int maxpsz; 9057 9058 if (TCP_IS_DETACHED(tcp)) 9059 return (mss); 9060 9061 if (tcp->tcp_fused) { 9062 maxpsz = tcp_fuse_maxpsz_set(tcp); 9063 mss = INFPSZ; 9064 } else if (tcp->tcp_mdt || tcp->tcp_maxpsz == 0) { 9065 /* 9066 * Set the sd_qn_maxpsz according to the socket send buffer 9067 * size, and sd_maxblk to INFPSZ (-1). This will essentially 9068 * instruct the stream head to copyin user data into contiguous 9069 * kernel-allocated buffers without breaking it up into smaller 9070 * chunks. We round up the buffer size to the nearest SMSS. 9071 */ 9072 maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss); 9073 if (tcp->tcp_kssl_ctx == NULL) 9074 mss = INFPSZ; 9075 else 9076 mss = SSL3_MAX_RECORD_LEN; 9077 } else { 9078 /* 9079 * Set sd_qn_maxpsz to approx half the (receivers) buffer 9080 * (and a multiple of the mss). This instructs the stream 9081 * head to break down larger than SMSS writes into SMSS- 9082 * size mblks, up to tcp_maxpsz_multiplier mblks at a time. 9083 */ 9084 maxpsz = tcp->tcp_maxpsz * mss; 9085 if (maxpsz > tcp->tcp_xmit_hiwater/2) { 9086 maxpsz = tcp->tcp_xmit_hiwater/2; 9087 /* Round up to nearest mss */ 9088 maxpsz = MSS_ROUNDUP(maxpsz, mss); 9089 } 9090 } 9091 (void) setmaxps(q, maxpsz); 9092 tcp->tcp_wq->q_maxpsz = maxpsz; 9093 9094 if (set_maxblk) 9095 (void) mi_set_sth_maxblk(q, mss); 9096 9097 return (mss); 9098 } 9099 9100 /* 9101 * Extract option values from a tcp header. We put any found values into the 9102 * tcpopt struct and return a bitmask saying which options were found. 
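 *
 * A minimal usage sketch (illustrative only, not copied from any caller):
 *
 *	tcp_opt_t	tcpopt;
 *	uint32_t	mss;
 *	int		options;
 *
 *	tcpopt.tcp = NULL;	(or a tcp_t, if SACK blocks should be recorded)
 *	options = tcp_parse_options(tcph, &tcpopt);
 *	if (options & TCP_OPT_MSS_PRESENT)
 *		mss = tcpopt.tcp_opt_mss;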
9103 */ 9104 static int 9105 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) 9106 { 9107 uchar_t *endp; 9108 int len; 9109 uint32_t mss; 9110 uchar_t *up = (uchar_t *)tcph; 9111 int found = 0; 9112 int32_t sack_len; 9113 tcp_seq sack_begin, sack_end; 9114 tcp_t *tcp; 9115 9116 endp = up + TCP_HDR_LENGTH(tcph); 9117 up += TCP_MIN_HEADER_LENGTH; 9118 while (up < endp) { 9119 len = endp - up; 9120 switch (*up) { 9121 case TCPOPT_EOL: 9122 break; 9123 9124 case TCPOPT_NOP: 9125 up++; 9126 continue; 9127 9128 case TCPOPT_MAXSEG: 9129 if (len < TCPOPT_MAXSEG_LEN || 9130 up[1] != TCPOPT_MAXSEG_LEN) 9131 break; 9132 9133 mss = BE16_TO_U16(up+2); 9134 /* Caller must handle tcp_mss_min and tcp_mss_max_* */ 9135 tcpopt->tcp_opt_mss = mss; 9136 found |= TCP_OPT_MSS_PRESENT; 9137 9138 up += TCPOPT_MAXSEG_LEN; 9139 continue; 9140 9141 case TCPOPT_WSCALE: 9142 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) 9143 break; 9144 9145 if (up[2] > TCP_MAX_WINSHIFT) 9146 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; 9147 else 9148 tcpopt->tcp_opt_wscale = up[2]; 9149 found |= TCP_OPT_WSCALE_PRESENT; 9150 9151 up += TCPOPT_WS_LEN; 9152 continue; 9153 9154 case TCPOPT_SACK_PERMITTED: 9155 if (len < TCPOPT_SACK_OK_LEN || 9156 up[1] != TCPOPT_SACK_OK_LEN) 9157 break; 9158 found |= TCP_OPT_SACK_OK_PRESENT; 9159 up += TCPOPT_SACK_OK_LEN; 9160 continue; 9161 9162 case TCPOPT_SACK: 9163 if (len <= 2 || up[1] <= 2 || len < up[1]) 9164 break; 9165 9166 /* If TCP is not interested in SACK blks... */ 9167 if ((tcp = tcpopt->tcp) == NULL) { 9168 up += up[1]; 9169 continue; 9170 } 9171 sack_len = up[1] - TCPOPT_HEADER_LEN; 9172 up += TCPOPT_HEADER_LEN; 9173 9174 /* 9175 * If the list is empty, allocate one and assume 9176 * nothing is sack'ed. 9177 */ 9178 ASSERT(tcp->tcp_sack_info != NULL); 9179 if (tcp->tcp_notsack_list == NULL) { 9180 tcp_notsack_update(&(tcp->tcp_notsack_list), 9181 tcp->tcp_suna, tcp->tcp_snxt, 9182 &(tcp->tcp_num_notsack_blk), 9183 &(tcp->tcp_cnt_notsack_list)); 9184 9185 /* 9186 * Make sure tcp_notsack_list is not NULL. 9187 * This happens when kmem_alloc(KM_NOSLEEP) 9188 * returns NULL. 9189 */ 9190 if (tcp->tcp_notsack_list == NULL) { 9191 up += sack_len; 9192 continue; 9193 } 9194 tcp->tcp_fack = tcp->tcp_suna; 9195 } 9196 9197 while (sack_len > 0) { 9198 if (up + 8 > endp) { 9199 up = endp; 9200 break; 9201 } 9202 sack_begin = BE32_TO_U32(up); 9203 up += 4; 9204 sack_end = BE32_TO_U32(up); 9205 up += 4; 9206 sack_len -= 8; 9207 /* 9208 * Bounds checking. Make sure the SACK 9209 * info is within tcp_suna and tcp_snxt. 9210 * If this SACK blk is out of bound, ignore 9211 * it but continue to parse the following 9212 * blks. 
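 * For example, with tcp_suna at 1000 and tcp_snxt at 5000, a reported
 * block [2000, 3000) is inserted into the notsack list, while a block
 * such as [6000, 7000) (or any block whose end does not follow its
 * begin) is silently skipped.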
9213 */ 9214 if (SEQ_LEQ(sack_end, sack_begin) || 9215 SEQ_LT(sack_begin, tcp->tcp_suna) || 9216 SEQ_GT(sack_end, tcp->tcp_snxt)) { 9217 continue; 9218 } 9219 tcp_notsack_insert(&(tcp->tcp_notsack_list), 9220 sack_begin, sack_end, 9221 &(tcp->tcp_num_notsack_blk), 9222 &(tcp->tcp_cnt_notsack_list)); 9223 if (SEQ_GT(sack_end, tcp->tcp_fack)) { 9224 tcp->tcp_fack = sack_end; 9225 } 9226 } 9227 found |= TCP_OPT_SACK_PRESENT; 9228 continue; 9229 9230 case TCPOPT_TSTAMP: 9231 if (len < TCPOPT_TSTAMP_LEN || 9232 up[1] != TCPOPT_TSTAMP_LEN) 9233 break; 9234 9235 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); 9236 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); 9237 9238 found |= TCP_OPT_TSTAMP_PRESENT; 9239 9240 up += TCPOPT_TSTAMP_LEN; 9241 continue; 9242 9243 default: 9244 if (len <= 1 || len < (int)up[1] || up[1] == 0) 9245 break; 9246 up += up[1]; 9247 continue; 9248 } 9249 break; 9250 } 9251 return (found); 9252 } 9253 9254 /* 9255 * Set the mss associated with a particular tcp based on its current value, 9256 * and a new one passed in. Observe minimums and maximums, and reset 9257 * other state variables that we want to view as multiples of mss. 9258 * 9259 * This function is called in various places mainly because 9260 * 1) Various stuffs, tcp_mss, tcp_cwnd, ... need to be adjusted when the 9261 * other side's SYN/SYN-ACK packet arrives. 9262 * 2) PMTUd may get us a new MSS. 9263 * 3) If the other side stops sending us timestamp option, we need to 9264 * increase the MSS size to use the extra bytes available. 9265 */ 9266 static void 9267 tcp_mss_set(tcp_t *tcp, uint32_t mss) 9268 { 9269 uint32_t mss_max; 9270 9271 if (tcp->tcp_ipversion == IPV4_VERSION) 9272 mss_max = tcp_mss_max_ipv4; 9273 else 9274 mss_max = tcp_mss_max_ipv6; 9275 9276 if (mss < tcp_mss_min) 9277 mss = tcp_mss_min; 9278 if (mss > mss_max) 9279 mss = mss_max; 9280 /* 9281 * Unless naglim has been set by our client to 9282 * a non-mss value, force naglim to track mss. 9283 * This can help to aggregate small writes. 9284 */ 9285 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) 9286 tcp->tcp_naglim = mss; 9287 /* 9288 * TCP should be able to buffer at least 4 MSS data for obvious 9289 * performance reason. 9290 */ 9291 if ((mss << 2) > tcp->tcp_xmit_hiwater) 9292 tcp->tcp_xmit_hiwater = mss << 2; 9293 9294 /* 9295 * Check if we need to apply the tcp_init_cwnd here. If 9296 * it is set and the MSS gets bigger (should not happen 9297 * normally), we need to adjust the resulting tcp_cwnd properly. 9298 * The new tcp_cwnd should not get bigger. 9299 */ 9300 if (tcp->tcp_init_cwnd == 0) { 9301 tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss, 9302 MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))); 9303 } else { 9304 if (tcp->tcp_mss < mss) { 9305 tcp->tcp_cwnd = MAX(1, 9306 (tcp->tcp_init_cwnd * tcp->tcp_mss / mss)) * mss; 9307 } else { 9308 tcp->tcp_cwnd = tcp->tcp_init_cwnd * mss; 9309 } 9310 } 9311 tcp->tcp_mss = mss; 9312 tcp->tcp_cwnd_cnt = 0; 9313 (void) tcp_maxpsz_set(tcp, B_TRUE); 9314 } 9315 9316 static int 9317 tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) 9318 { 9319 tcp_t *tcp = NULL; 9320 conn_t *connp; 9321 int err; 9322 dev_t conn_dev; 9323 zoneid_t zoneid = getzoneid(); 9324 9325 /* 9326 * Special case for install: miniroot needs to be able to access files 9327 * via NFS as though it were always in the global zone. 
9328 */ 9329 if (credp == kcred && nfs_global_client_only != 0) 9330 zoneid = GLOBAL_ZONEID; 9331 9332 if (q->q_ptr != NULL) 9333 return (0); 9334 9335 if (sflag == MODOPEN) { 9336 /* 9337 * This is a special case. The purpose of a modopen 9338 * is to allow just the T_SVR4_OPTMGMT_REQ to pass 9339 * through for MIB browsers. Everything else is failed. 9340 */ 9341 connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt)); 9342 9343 if (connp == NULL) 9344 return (ENOMEM); 9345 9346 connp->conn_flags |= IPCL_TCPMOD; 9347 connp->conn_cred = credp; 9348 connp->conn_zoneid = zoneid; 9349 q->q_ptr = WR(q)->q_ptr = connp; 9350 crhold(credp); 9351 q->q_qinfo = &tcp_mod_rinit; 9352 WR(q)->q_qinfo = &tcp_mod_winit; 9353 qprocson(q); 9354 return (0); 9355 } 9356 9357 if ((conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) 9358 return (EBUSY); 9359 9360 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); 9361 9362 if (flag & SO_ACCEPTOR) { 9363 q->q_qinfo = &tcp_acceptor_rinit; 9364 q->q_ptr = (void *)conn_dev; 9365 WR(q)->q_qinfo = &tcp_acceptor_winit; 9366 WR(q)->q_ptr = (void *)conn_dev; 9367 qprocson(q); 9368 return (0); 9369 } 9370 9371 connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt)); 9372 if (connp == NULL) { 9373 inet_minor_free(ip_minor_arena, conn_dev); 9374 q->q_ptr = NULL; 9375 return (ENOSR); 9376 } 9377 connp->conn_sqp = IP_SQUEUE_GET(lbolt); 9378 tcp = connp->conn_tcp; 9379 9380 q->q_ptr = WR(q)->q_ptr = connp; 9381 if (getmajor(*devp) == TCP6_MAJ) { 9382 connp->conn_flags |= (IPCL_TCP6|IPCL_ISV6); 9383 connp->conn_send = ip_output_v6; 9384 connp->conn_af_isv6 = B_TRUE; 9385 connp->conn_pkt_isv6 = B_TRUE; 9386 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; 9387 tcp->tcp_ipversion = IPV6_VERSION; 9388 tcp->tcp_family = AF_INET6; 9389 tcp->tcp_mss = tcp_mss_def_ipv6; 9390 } else { 9391 connp->conn_flags |= IPCL_TCP4; 9392 connp->conn_send = ip_output; 9393 connp->conn_af_isv6 = B_FALSE; 9394 connp->conn_pkt_isv6 = B_FALSE; 9395 tcp->tcp_ipversion = IPV4_VERSION; 9396 tcp->tcp_family = AF_INET; 9397 tcp->tcp_mss = tcp_mss_def_ipv4; 9398 } 9399 9400 /* 9401 * TCP keeps a copy of cred for cache locality reasons but 9402 * we put a reference only once. If connp->conn_cred 9403 * becomes invalid, tcp_cred should also be set to NULL. 9404 */ 9405 tcp->tcp_cred = connp->conn_cred = credp; 9406 crhold(connp->conn_cred); 9407 tcp->tcp_cpid = curproc->p_pid; 9408 connp->conn_zoneid = zoneid; 9409 connp->conn_mlp_type = mlptSingle; 9410 connp->conn_ulp_labeled = !is_system_labeled(); 9411 9412 /* 9413 * If the caller has the process-wide flag set, then default to MAC 9414 * exempt mode. This allows read-down to unlabeled hosts. 9415 */ 9416 if (getpflags(NET_MAC_AWARE, credp) != 0) 9417 connp->conn_mac_exempt = B_TRUE; 9418 9419 connp->conn_dev = conn_dev; 9420 9421 ASSERT(q->q_qinfo == &tcp_rinit); 9422 ASSERT(WR(q)->q_qinfo == &tcp_winit); 9423 9424 if (flag & SO_SOCKSTR) { 9425 /* 9426 * No need to insert a socket in tcp acceptor hash. 9427 * If it was a socket acceptor stream, we dealt with 9428 * it above. A socket listener can never accept a 9429 * connection and doesn't need acceptor_id. 
9430 */ 9431 connp->conn_flags |= IPCL_SOCKET; 9432 tcp->tcp_issocket = 1; 9433 WR(q)->q_qinfo = &tcp_sock_winit; 9434 } else { 9435 #ifdef _ILP32 9436 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); 9437 #else 9438 tcp->tcp_acceptor_id = conn_dev; 9439 #endif /* _ILP32 */ 9440 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 9441 } 9442 9443 if (tcp_trace) 9444 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_SLEEP); 9445 9446 err = tcp_init(tcp, q); 9447 if (err != 0) { 9448 inet_minor_free(ip_minor_arena, connp->conn_dev); 9449 tcp_acceptor_hash_remove(tcp); 9450 CONN_DEC_REF(connp); 9451 q->q_ptr = WR(q)->q_ptr = NULL; 9452 return (err); 9453 } 9454 9455 RD(q)->q_hiwat = tcp_recv_hiwat; 9456 tcp->tcp_rwnd = tcp_recv_hiwat; 9457 9458 /* Non-zero default values */ 9459 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; 9460 /* 9461 * Put the ref for TCP. Ref for IP was already put 9462 * by ipcl_conn_create. Also Make the conn_t globally 9463 * visible to walkers 9464 */ 9465 mutex_enter(&connp->conn_lock); 9466 CONN_INC_REF_LOCKED(connp); 9467 ASSERT(connp->conn_ref == 2); 9468 connp->conn_state_flags &= ~CONN_INCIPIENT; 9469 mutex_exit(&connp->conn_lock); 9470 9471 qprocson(q); 9472 return (0); 9473 } 9474 9475 /* 9476 * Some TCP options can be "set" by requesting them in the option 9477 * buffer. This is needed for XTI feature test though we do not 9478 * allow it in general. We interpret that this mechanism is more 9479 * applicable to OSI protocols and need not be allowed in general. 9480 * This routine filters out options for which it is not allowed (most) 9481 * and lets through those (few) for which it is. [ The XTI interface 9482 * test suite specifics will imply that any XTI_GENERIC level XTI_* if 9483 * ever implemented will have to be allowed here ]. 9484 */ 9485 static boolean_t 9486 tcp_allow_connopt_set(int level, int name) 9487 { 9488 9489 switch (level) { 9490 case IPPROTO_TCP: 9491 switch (name) { 9492 case TCP_NODELAY: 9493 return (B_TRUE); 9494 default: 9495 return (B_FALSE); 9496 } 9497 /*NOTREACHED*/ 9498 default: 9499 return (B_FALSE); 9500 } 9501 /*NOTREACHED*/ 9502 } 9503 9504 /* 9505 * This routine gets default values of certain options whose default 9506 * values are maintained by protocol specific code 9507 */ 9508 /* ARGSUSED */ 9509 int 9510 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) 9511 { 9512 int32_t *i1 = (int32_t *)ptr; 9513 9514 switch (level) { 9515 case IPPROTO_TCP: 9516 switch (name) { 9517 case TCP_NOTIFY_THRESHOLD: 9518 *i1 = tcp_ip_notify_interval; 9519 break; 9520 case TCP_ABORT_THRESHOLD: 9521 *i1 = tcp_ip_abort_interval; 9522 break; 9523 case TCP_CONN_NOTIFY_THRESHOLD: 9524 *i1 = tcp_ip_notify_cinterval; 9525 break; 9526 case TCP_CONN_ABORT_THRESHOLD: 9527 *i1 = tcp_ip_abort_cinterval; 9528 break; 9529 default: 9530 return (-1); 9531 } 9532 break; 9533 case IPPROTO_IP: 9534 switch (name) { 9535 case IP_TTL: 9536 *i1 = tcp_ipv4_ttl; 9537 break; 9538 default: 9539 return (-1); 9540 } 9541 break; 9542 case IPPROTO_IPV6: 9543 switch (name) { 9544 case IPV6_UNICAST_HOPS: 9545 *i1 = tcp_ipv6_hoplimit; 9546 break; 9547 default: 9548 return (-1); 9549 } 9550 break; 9551 default: 9552 return (-1); 9553 } 9554 return (sizeof (int)); 9555 } 9556 9557 9558 /* 9559 * TCP routine to get the values of options. 
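 *
 * Returns the number of bytes of option value placed at *ptr, 0 when a
 * variable-length option is not currently set, -1 for options that are
 * not handled at this level, and a negative errno (e.g. -EINVAL) for
 * options that are handled entirely by IP.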
9560 */ 9561 int 9562 tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr) 9563 { 9564 int *i1 = (int *)ptr; 9565 conn_t *connp = Q_TO_CONN(q); 9566 tcp_t *tcp = connp->conn_tcp; 9567 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 9568 9569 switch (level) { 9570 case SOL_SOCKET: 9571 switch (name) { 9572 case SO_LINGER: { 9573 struct linger *lgr = (struct linger *)ptr; 9574 9575 lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0; 9576 lgr->l_linger = tcp->tcp_lingertime; 9577 } 9578 return (sizeof (struct linger)); 9579 case SO_DEBUG: 9580 *i1 = tcp->tcp_debug ? SO_DEBUG : 0; 9581 break; 9582 case SO_KEEPALIVE: 9583 *i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0; 9584 break; 9585 case SO_DONTROUTE: 9586 *i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0; 9587 break; 9588 case SO_USELOOPBACK: 9589 *i1 = tcp->tcp_useloopback ? SO_USELOOPBACK : 0; 9590 break; 9591 case SO_BROADCAST: 9592 *i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0; 9593 break; 9594 case SO_REUSEADDR: 9595 *i1 = tcp->tcp_reuseaddr ? SO_REUSEADDR : 0; 9596 break; 9597 case SO_OOBINLINE: 9598 *i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0; 9599 break; 9600 case SO_DGRAM_ERRIND: 9601 *i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0; 9602 break; 9603 case SO_TYPE: 9604 *i1 = SOCK_STREAM; 9605 break; 9606 case SO_SNDBUF: 9607 *i1 = tcp->tcp_xmit_hiwater; 9608 break; 9609 case SO_RCVBUF: 9610 *i1 = RD(q)->q_hiwat; 9611 break; 9612 case SO_SND_COPYAVOID: 9613 *i1 = tcp->tcp_snd_zcopy_on ? 9614 SO_SND_COPYAVOID : 0; 9615 break; 9616 case SO_ALLZONES: 9617 *i1 = connp->conn_allzones ? 1 : 0; 9618 break; 9619 case SO_ANON_MLP: 9620 *i1 = connp->conn_anon_mlp; 9621 break; 9622 case SO_MAC_EXEMPT: 9623 *i1 = connp->conn_mac_exempt; 9624 break; 9625 case SO_EXCLBIND: 9626 *i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0; 9627 break; 9628 default: 9629 return (-1); 9630 } 9631 break; 9632 case IPPROTO_TCP: 9633 switch (name) { 9634 case TCP_NODELAY: 9635 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; 9636 break; 9637 case TCP_MAXSEG: 9638 *i1 = tcp->tcp_mss; 9639 break; 9640 case TCP_NOTIFY_THRESHOLD: 9641 *i1 = (int)tcp->tcp_first_timer_threshold; 9642 break; 9643 case TCP_ABORT_THRESHOLD: 9644 *i1 = tcp->tcp_second_timer_threshold; 9645 break; 9646 case TCP_CONN_NOTIFY_THRESHOLD: 9647 *i1 = tcp->tcp_first_ctimer_threshold; 9648 break; 9649 case TCP_CONN_ABORT_THRESHOLD: 9650 *i1 = tcp->tcp_second_ctimer_threshold; 9651 break; 9652 case TCP_RECVDSTADDR: 9653 *i1 = tcp->tcp_recvdstaddr; 9654 break; 9655 case TCP_ANONPRIVBIND: 9656 *i1 = tcp->tcp_anon_priv_bind; 9657 break; 9658 case TCP_EXCLBIND: 9659 *i1 = tcp->tcp_exclbind ? TCP_EXCLBIND : 0; 9660 break; 9661 case TCP_INIT_CWND: 9662 *i1 = tcp->tcp_init_cwnd; 9663 break; 9664 case TCP_KEEPALIVE_THRESHOLD: 9665 *i1 = tcp->tcp_ka_interval; 9666 break; 9667 case TCP_KEEPALIVE_ABORT_THRESHOLD: 9668 *i1 = tcp->tcp_ka_abort_thres; 9669 break; 9670 case TCP_CORK: 9671 *i1 = tcp->tcp_cork; 9672 break; 9673 default: 9674 return (-1); 9675 } 9676 break; 9677 case IPPROTO_IP: 9678 if (tcp->tcp_family != AF_INET) 9679 return (-1); 9680 switch (name) { 9681 case IP_OPTIONS: 9682 case T_IP_OPTIONS: { 9683 /* 9684 * This is compatible with BSD in that in only return 9685 * the reverse source route with the final destination 9686 * as the last entry. The first 4 bytes of the option 9687 * will contain the final destination. 
9688 */ 9689 int opt_len; 9690 9691 opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha; 9692 opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH; 9693 ASSERT(opt_len >= 0); 9694 /* Caller ensures enough space */ 9695 if (opt_len > 0) { 9696 /* 9697 * TODO: Do we have to handle getsockopt on an 9698 * initiator as well? 9699 */ 9700 return (ip_opt_get_user(tcp->tcp_ipha, ptr)); 9701 } 9702 return (0); 9703 } 9704 case IP_TOS: 9705 case T_IP_TOS: 9706 *i1 = (int)tcp->tcp_ipha->ipha_type_of_service; 9707 break; 9708 case IP_TTL: 9709 *i1 = (int)tcp->tcp_ipha->ipha_ttl; 9710 break; 9711 case IP_NEXTHOP: 9712 /* Handled at IP level */ 9713 return (-EINVAL); 9714 default: 9715 return (-1); 9716 } 9717 break; 9718 case IPPROTO_IPV6: 9719 /* 9720 * IPPROTO_IPV6 options are only supported for sockets 9721 * that are using IPv6 on the wire. 9722 */ 9723 if (tcp->tcp_ipversion != IPV6_VERSION) { 9724 return (-1); 9725 } 9726 switch (name) { 9727 case IPV6_UNICAST_HOPS: 9728 *i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops; 9729 break; /* goto sizeof (int) option return */ 9730 case IPV6_BOUND_IF: 9731 /* Zero if not set */ 9732 *i1 = tcp->tcp_bound_if; 9733 break; /* goto sizeof (int) option return */ 9734 case IPV6_RECVPKTINFO: 9735 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) 9736 *i1 = 1; 9737 else 9738 *i1 = 0; 9739 break; /* goto sizeof (int) option return */ 9740 case IPV6_RECVTCLASS: 9741 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS) 9742 *i1 = 1; 9743 else 9744 *i1 = 0; 9745 break; /* goto sizeof (int) option return */ 9746 case IPV6_RECVHOPLIMIT: 9747 if (tcp->tcp_ipv6_recvancillary & 9748 TCP_IPV6_RECVHOPLIMIT) 9749 *i1 = 1; 9750 else 9751 *i1 = 0; 9752 break; /* goto sizeof (int) option return */ 9753 case IPV6_RECVHOPOPTS: 9754 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) 9755 *i1 = 1; 9756 else 9757 *i1 = 0; 9758 break; /* goto sizeof (int) option return */ 9759 case IPV6_RECVDSTOPTS: 9760 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS) 9761 *i1 = 1; 9762 else 9763 *i1 = 0; 9764 break; /* goto sizeof (int) option return */ 9765 case _OLD_IPV6_RECVDSTOPTS: 9766 if (tcp->tcp_ipv6_recvancillary & 9767 TCP_OLD_IPV6_RECVDSTOPTS) 9768 *i1 = 1; 9769 else 9770 *i1 = 0; 9771 break; /* goto sizeof (int) option return */ 9772 case IPV6_RECVRTHDR: 9773 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) 9774 *i1 = 1; 9775 else 9776 *i1 = 0; 9777 break; /* goto sizeof (int) option return */ 9778 case IPV6_RECVRTHDRDSTOPTS: 9779 if (tcp->tcp_ipv6_recvancillary & 9780 TCP_IPV6_RECVRTDSTOPTS) 9781 *i1 = 1; 9782 else 9783 *i1 = 0; 9784 break; /* goto sizeof (int) option return */ 9785 case IPV6_PKTINFO: { 9786 /* XXX assumes that caller has room for max size! 
*/ 9787 struct in6_pktinfo *pkti; 9788 9789 pkti = (struct in6_pktinfo *)ptr; 9790 if (ipp->ipp_fields & IPPF_IFINDEX) 9791 pkti->ipi6_ifindex = ipp->ipp_ifindex; 9792 else 9793 pkti->ipi6_ifindex = 0; 9794 if (ipp->ipp_fields & IPPF_ADDR) 9795 pkti->ipi6_addr = ipp->ipp_addr; 9796 else 9797 pkti->ipi6_addr = ipv6_all_zeros; 9798 return (sizeof (struct in6_pktinfo)); 9799 } 9800 case IPV6_TCLASS: 9801 if (ipp->ipp_fields & IPPF_TCLASS) 9802 *i1 = ipp->ipp_tclass; 9803 else 9804 *i1 = IPV6_FLOW_TCLASS( 9805 IPV6_DEFAULT_VERS_AND_FLOW); 9806 break; /* goto sizeof (int) option return */ 9807 case IPV6_NEXTHOP: { 9808 sin6_t *sin6 = (sin6_t *)ptr; 9809 9810 if (!(ipp->ipp_fields & IPPF_NEXTHOP)) 9811 return (0); 9812 *sin6 = sin6_null; 9813 sin6->sin6_family = AF_INET6; 9814 sin6->sin6_addr = ipp->ipp_nexthop; 9815 return (sizeof (sin6_t)); 9816 } 9817 case IPV6_HOPOPTS: 9818 if (!(ipp->ipp_fields & IPPF_HOPOPTS)) 9819 return (0); 9820 if (ipp->ipp_hopoptslen <= tcp->tcp_label_len) 9821 return (0); 9822 bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len, 9823 ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len); 9824 if (tcp->tcp_label_len > 0) { 9825 ptr[0] = ((char *)ipp->ipp_hopopts)[0]; 9826 ptr[1] = (ipp->ipp_hopoptslen - 9827 tcp->tcp_label_len + 7) / 8 - 1; 9828 } 9829 return (ipp->ipp_hopoptslen - tcp->tcp_label_len); 9830 case IPV6_RTHDRDSTOPTS: 9831 if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) 9832 return (0); 9833 bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); 9834 return (ipp->ipp_rtdstoptslen); 9835 case IPV6_RTHDR: 9836 if (!(ipp->ipp_fields & IPPF_RTHDR)) 9837 return (0); 9838 bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); 9839 return (ipp->ipp_rthdrlen); 9840 case IPV6_DSTOPTS: 9841 if (!(ipp->ipp_fields & IPPF_DSTOPTS)) 9842 return (0); 9843 bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); 9844 return (ipp->ipp_dstoptslen); 9845 case IPV6_SRC_PREFERENCES: 9846 return (ip6_get_src_preferences(connp, 9847 (uint32_t *)ptr)); 9848 case IPV6_PATHMTU: { 9849 struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr; 9850 9851 if (tcp->tcp_state < TCPS_ESTABLISHED) 9852 return (-1); 9853 9854 return (ip_fill_mtuinfo(&connp->conn_remv6, 9855 connp->conn_fport, mtuinfo)); 9856 } 9857 default: 9858 return (-1); 9859 } 9860 break; 9861 default: 9862 return (-1); 9863 } 9864 return (sizeof (int)); 9865 } 9866 9867 /* 9868 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. 9869 * Parameters are assumed to be verified by the caller. 9870 */ 9871 /* ARGSUSED */ 9872 int 9873 tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, 9874 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, 9875 void *thisdg_attrs, cred_t *cr, mblk_t *mblk) 9876 { 9877 conn_t *connp = Q_TO_CONN(q); 9878 tcp_t *tcp = connp->conn_tcp; 9879 int *i1 = (int *)invalp; 9880 boolean_t onoff = (*i1 == 0) ? 0 : 1; 9881 boolean_t checkonly; 9882 int reterr; 9883 9884 switch (optset_context) { 9885 case SETFN_OPTCOM_CHECKONLY: 9886 checkonly = B_TRUE; 9887 /* 9888 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ 9889 * inlen != 0 implies value supplied and 9890 * we have to "pretend" to set it. 9891 * inlen == 0 implies that there is no 9892 * value part in T_CHECK request and just validation 9893 * done elsewhere should be enough, we just return here. 
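 * For example (roughly speaking), an ordinary setsockopt(3SOCKET) of
 * TCP_NODELAY arrives here as SETFN_OPTCOM_NEGOTIATE with
 * inlen == sizeof (int), whereas an XTI T_CHECK style request for the
 * same option arrives as SETFN_OPTCOM_CHECKONLY and only has its value
 * validated.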
9894 */ 9895 if (inlen == 0) { 9896 *outlenp = 0; 9897 return (0); 9898 } 9899 break; 9900 case SETFN_OPTCOM_NEGOTIATE: 9901 checkonly = B_FALSE; 9902 break; 9903 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ 9904 case SETFN_CONN_NEGOTIATE: 9905 checkonly = B_FALSE; 9906 /* 9907 * Negotiating local and "association-related" options 9908 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) 9909 * primitives is allowed by XTI, but we choose 9910 * to not implement this style negotiation for Internet 9911 * protocols (We interpret it is a must for OSI world but 9912 * optional for Internet protocols) for all options. 9913 * [ Will do only for the few options that enable test 9914 * suites that our XTI implementation of this feature 9915 * works for transports that do allow it ] 9916 */ 9917 if (!tcp_allow_connopt_set(level, name)) { 9918 *outlenp = 0; 9919 return (EINVAL); 9920 } 9921 break; 9922 default: 9923 /* 9924 * We should never get here 9925 */ 9926 *outlenp = 0; 9927 return (EINVAL); 9928 } 9929 9930 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || 9931 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); 9932 9933 /* 9934 * For TCP, we should have no ancillary data sent down 9935 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs 9936 * has to be zero. 9937 */ 9938 ASSERT(thisdg_attrs == NULL); 9939 9940 /* 9941 * For fixed length options, no sanity check 9942 * of passed in length is done. It is assumed *_optcom_req() 9943 * routines do the right thing. 9944 */ 9945 9946 switch (level) { 9947 case SOL_SOCKET: 9948 switch (name) { 9949 case SO_LINGER: { 9950 struct linger *lgr = (struct linger *)invalp; 9951 9952 if (!checkonly) { 9953 if (lgr->l_onoff) { 9954 tcp->tcp_linger = 1; 9955 tcp->tcp_lingertime = lgr->l_linger; 9956 } else { 9957 tcp->tcp_linger = 0; 9958 tcp->tcp_lingertime = 0; 9959 } 9960 /* struct copy */ 9961 *(struct linger *)outvalp = *lgr; 9962 } else { 9963 if (!lgr->l_onoff) { 9964 ((struct linger *)outvalp)->l_onoff = 0; 9965 ((struct linger *)outvalp)->l_linger = 0; 9966 } else { 9967 /* struct copy */ 9968 *(struct linger *)outvalp = *lgr; 9969 } 9970 } 9971 *outlenp = sizeof (struct linger); 9972 return (0); 9973 } 9974 case SO_DEBUG: 9975 if (!checkonly) 9976 tcp->tcp_debug = onoff; 9977 break; 9978 case SO_KEEPALIVE: 9979 if (checkonly) { 9980 /* T_CHECK case */ 9981 break; 9982 } 9983 9984 if (!onoff) { 9985 if (tcp->tcp_ka_enabled) { 9986 if (tcp->tcp_ka_tid != 0) { 9987 (void) TCP_TIMER_CANCEL(tcp, 9988 tcp->tcp_ka_tid); 9989 tcp->tcp_ka_tid = 0; 9990 } 9991 tcp->tcp_ka_enabled = 0; 9992 } 9993 break; 9994 } 9995 if (!tcp->tcp_ka_enabled) { 9996 /* Crank up the keepalive timer */ 9997 tcp->tcp_ka_last_intrvl = 0; 9998 tcp->tcp_ka_tid = TCP_TIMER(tcp, 9999 tcp_keepalive_killer, 10000 MSEC_TO_TICK(tcp->tcp_ka_interval)); 10001 tcp->tcp_ka_enabled = 1; 10002 } 10003 break; 10004 case SO_DONTROUTE: 10005 /* 10006 * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are 10007 * only of interest to IP. We track them here only so 10008 * that we can report their current value. 
10009 */ 10010 if (!checkonly) { 10011 tcp->tcp_dontroute = onoff; 10012 tcp->tcp_connp->conn_dontroute = onoff; 10013 } 10014 break; 10015 case SO_USELOOPBACK: 10016 if (!checkonly) { 10017 tcp->tcp_useloopback = onoff; 10018 tcp->tcp_connp->conn_loopback = onoff; 10019 } 10020 break; 10021 case SO_BROADCAST: 10022 if (!checkonly) { 10023 tcp->tcp_broadcast = onoff; 10024 tcp->tcp_connp->conn_broadcast = onoff; 10025 } 10026 break; 10027 case SO_REUSEADDR: 10028 if (!checkonly) { 10029 tcp->tcp_reuseaddr = onoff; 10030 tcp->tcp_connp->conn_reuseaddr = onoff; 10031 } 10032 break; 10033 case SO_OOBINLINE: 10034 if (!checkonly) 10035 tcp->tcp_oobinline = onoff; 10036 break; 10037 case SO_DGRAM_ERRIND: 10038 if (!checkonly) 10039 tcp->tcp_dgram_errind = onoff; 10040 break; 10041 case SO_SNDBUF: { 10042 tcp_t *peer_tcp; 10043 10044 if (*i1 > tcp_max_buf) { 10045 *outlenp = 0; 10046 return (ENOBUFS); 10047 } 10048 if (checkonly) 10049 break; 10050 10051 tcp->tcp_xmit_hiwater = *i1; 10052 if (tcp_snd_lowat_fraction != 0) 10053 tcp->tcp_xmit_lowater = 10054 tcp->tcp_xmit_hiwater / 10055 tcp_snd_lowat_fraction; 10056 (void) tcp_maxpsz_set(tcp, B_TRUE); 10057 /* 10058 * If we are flow-controlled, recheck the condition. 10059 * There are apps that increase SO_SNDBUF size when 10060 * flow-controlled (EWOULDBLOCK), and expect the flow 10061 * control condition to be lifted right away. 10062 * 10063 * For the fused tcp loopback case, in order to avoid 10064 * a race with the peer's tcp_fuse_rrw() we need to 10065 * hold its fuse_lock while accessing tcp_flow_stopped. 10066 */ 10067 peer_tcp = tcp->tcp_loopback_peer; 10068 ASSERT(!tcp->tcp_fused || peer_tcp != NULL); 10069 if (tcp->tcp_fused) 10070 mutex_enter(&peer_tcp->tcp_fuse_lock); 10071 10072 if (tcp->tcp_flow_stopped && 10073 TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) { 10074 tcp_clrqfull(tcp); 10075 } 10076 if (tcp->tcp_fused) 10077 mutex_exit(&peer_tcp->tcp_fuse_lock); 10078 break; 10079 } 10080 case SO_RCVBUF: 10081 if (*i1 > tcp_max_buf) { 10082 *outlenp = 0; 10083 return (ENOBUFS); 10084 } 10085 /* Silently ignore zero */ 10086 if (!checkonly && *i1 != 0) { 10087 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); 10088 (void) tcp_rwnd_set(tcp, *i1); 10089 } 10090 /* 10091 * XXX should we return the rwnd here 10092 * and tcp_opt_get ? 10093 */ 10094 break; 10095 case SO_SND_COPYAVOID: 10096 if (!checkonly) { 10097 /* we only allow enable at most once for now */ 10098 if (tcp->tcp_loopback || 10099 (!tcp->tcp_snd_zcopy_aware && 10100 (onoff != 1 || !tcp_zcopy_check(tcp)))) { 10101 *outlenp = 0; 10102 return (EOPNOTSUPP); 10103 } 10104 tcp->tcp_snd_zcopy_aware = 1; 10105 } 10106 break; 10107 case SO_ALLZONES: 10108 /* Handled at the IP level */ 10109 return (-EINVAL); 10110 case SO_ANON_MLP: 10111 if (!checkonly) { 10112 mutex_enter(&connp->conn_lock); 10113 connp->conn_anon_mlp = onoff; 10114 mutex_exit(&connp->conn_lock); 10115 } 10116 break; 10117 case SO_MAC_EXEMPT: 10118 if (secpolicy_net_mac_aware(cr) != 0 || 10119 IPCL_IS_BOUND(connp)) 10120 return (EACCES); 10121 if (!checkonly) { 10122 mutex_enter(&connp->conn_lock); 10123 connp->conn_mac_exempt = onoff; 10124 mutex_exit(&connp->conn_lock); 10125 } 10126 break; 10127 case SO_EXCLBIND: 10128 if (!checkonly) 10129 tcp->tcp_exclbind = onoff; 10130 break; 10131 default: 10132 *outlenp = 0; 10133 return (EINVAL); 10134 } 10135 break; 10136 case IPPROTO_TCP: 10137 switch (name) { 10138 case TCP_NODELAY: 10139 if (!checkonly) 10140 tcp->tcp_naglim = *i1 ? 
1 : tcp->tcp_mss; 10141 break; 10142 case TCP_NOTIFY_THRESHOLD: 10143 if (!checkonly) 10144 tcp->tcp_first_timer_threshold = *i1; 10145 break; 10146 case TCP_ABORT_THRESHOLD: 10147 if (!checkonly) 10148 tcp->tcp_second_timer_threshold = *i1; 10149 break; 10150 case TCP_CONN_NOTIFY_THRESHOLD: 10151 if (!checkonly) 10152 tcp->tcp_first_ctimer_threshold = *i1; 10153 break; 10154 case TCP_CONN_ABORT_THRESHOLD: 10155 if (!checkonly) 10156 tcp->tcp_second_ctimer_threshold = *i1; 10157 break; 10158 case TCP_RECVDSTADDR: 10159 if (tcp->tcp_state > TCPS_LISTEN) 10160 return (EOPNOTSUPP); 10161 if (!checkonly) 10162 tcp->tcp_recvdstaddr = onoff; 10163 break; 10164 case TCP_ANONPRIVBIND: 10165 if ((reterr = secpolicy_net_privaddr(cr, 0)) != 0) { 10166 *outlenp = 0; 10167 return (reterr); 10168 } 10169 if (!checkonly) { 10170 tcp->tcp_anon_priv_bind = onoff; 10171 } 10172 break; 10173 case TCP_EXCLBIND: 10174 if (!checkonly) 10175 tcp->tcp_exclbind = onoff; 10176 break; /* goto sizeof (int) option return */ 10177 case TCP_INIT_CWND: { 10178 uint32_t init_cwnd = *((uint32_t *)invalp); 10179 10180 if (checkonly) 10181 break; 10182 10183 /* 10184 * Only allow socket with network configuration 10185 * privilege to set the initial cwnd to be larger 10186 * than allowed by RFC 3390. 10187 */ 10188 if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { 10189 tcp->tcp_init_cwnd = init_cwnd; 10190 break; 10191 } 10192 if ((reterr = secpolicy_net_config(cr, B_TRUE)) != 0) { 10193 *outlenp = 0; 10194 return (reterr); 10195 } 10196 if (init_cwnd > TCP_MAX_INIT_CWND) { 10197 *outlenp = 0; 10198 return (EINVAL); 10199 } 10200 tcp->tcp_init_cwnd = init_cwnd; 10201 break; 10202 } 10203 case TCP_KEEPALIVE_THRESHOLD: 10204 if (checkonly) 10205 break; 10206 10207 if (*i1 < tcp_keepalive_interval_low || 10208 *i1 > tcp_keepalive_interval_high) { 10209 *outlenp = 0; 10210 return (EINVAL); 10211 } 10212 if (*i1 != tcp->tcp_ka_interval) { 10213 tcp->tcp_ka_interval = *i1; 10214 /* 10215 * Check if we need to restart the 10216 * keepalive timer. 10217 */ 10218 if (tcp->tcp_ka_tid != 0) { 10219 ASSERT(tcp->tcp_ka_enabled); 10220 (void) TCP_TIMER_CANCEL(tcp, 10221 tcp->tcp_ka_tid); 10222 tcp->tcp_ka_last_intrvl = 0; 10223 tcp->tcp_ka_tid = TCP_TIMER(tcp, 10224 tcp_keepalive_killer, 10225 MSEC_TO_TICK(tcp->tcp_ka_interval)); 10226 } 10227 } 10228 break; 10229 case TCP_KEEPALIVE_ABORT_THRESHOLD: 10230 if (!checkonly) { 10231 if (*i1 < tcp_keepalive_abort_interval_low || 10232 *i1 > tcp_keepalive_abort_interval_high) { 10233 *outlenp = 0; 10234 return (EINVAL); 10235 } 10236 tcp->tcp_ka_abort_thres = *i1; 10237 } 10238 break; 10239 case TCP_CORK: 10240 if (!checkonly) { 10241 /* 10242 * if tcp->tcp_cork was set and is now 10243 * being unset, we have to make sure that 10244 * the remaining data gets sent out. 
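 * (A typical application sequence is: enable TCP_CORK, write a small
 * header, write the body, then clear TCP_CORK; it is the clearing of
 * the option that finally pushes out the coalesced sub-MSS data.)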
Also 10245 * unset tcp->tcp_cork so that tcp_wput_data() 10246 * can send data even if it is less than mss 10247 */ 10248 if (tcp->tcp_cork && onoff == 0 && 10249 tcp->tcp_unsent > 0) { 10250 tcp->tcp_cork = B_FALSE; 10251 tcp_wput_data(tcp, NULL, B_FALSE); 10252 } 10253 tcp->tcp_cork = onoff; 10254 } 10255 break; 10256 default: 10257 *outlenp = 0; 10258 return (EINVAL); 10259 } 10260 break; 10261 case IPPROTO_IP: 10262 if (tcp->tcp_family != AF_INET) { 10263 *outlenp = 0; 10264 return (ENOPROTOOPT); 10265 } 10266 switch (name) { 10267 case IP_OPTIONS: 10268 case T_IP_OPTIONS: 10269 reterr = tcp_opt_set_header(tcp, checkonly, 10270 invalp, inlen); 10271 if (reterr) { 10272 *outlenp = 0; 10273 return (reterr); 10274 } 10275 /* OK return - copy input buffer into output buffer */ 10276 if (invalp != outvalp) { 10277 /* don't trust bcopy for identical src/dst */ 10278 bcopy(invalp, outvalp, inlen); 10279 } 10280 *outlenp = inlen; 10281 return (0); 10282 case IP_TOS: 10283 case T_IP_TOS: 10284 if (!checkonly) { 10285 tcp->tcp_ipha->ipha_type_of_service = 10286 (uchar_t)*i1; 10287 tcp->tcp_tos = (uchar_t)*i1; 10288 } 10289 break; 10290 case IP_TTL: 10291 if (!checkonly) { 10292 tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1; 10293 tcp->tcp_ttl = (uchar_t)*i1; 10294 } 10295 break; 10296 case IP_BOUND_IF: 10297 case IP_NEXTHOP: 10298 /* Handled at the IP level */ 10299 return (-EINVAL); 10300 case IP_SEC_OPT: 10301 /* 10302 * We should not allow policy setting after 10303 * we start listening for connections. 10304 */ 10305 if (tcp->tcp_state == TCPS_LISTEN) { 10306 return (EINVAL); 10307 } else { 10308 /* Handled at the IP level */ 10309 return (-EINVAL); 10310 } 10311 default: 10312 *outlenp = 0; 10313 return (EINVAL); 10314 } 10315 break; 10316 case IPPROTO_IPV6: { 10317 ip6_pkt_t *ipp; 10318 10319 /* 10320 * IPPROTO_IPV6 options are only supported for sockets 10321 * that are using IPv6 on the wire. 10322 */ 10323 if (tcp->tcp_ipversion != IPV6_VERSION) { 10324 *outlenp = 0; 10325 return (ENOPROTOOPT); 10326 } 10327 /* 10328 * Only sticky options; no ancillary data 10329 */ 10330 ASSERT(thisdg_attrs == NULL); 10331 ipp = &tcp->tcp_sticky_ipp; 10332 10333 switch (name) { 10334 case IPV6_UNICAST_HOPS: 10335 /* -1 means use default */ 10336 if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { 10337 *outlenp = 0; 10338 return (EINVAL); 10339 } 10340 if (!checkonly) { 10341 if (*i1 == -1) { 10342 tcp->tcp_ip6h->ip6_hops = 10343 ipp->ipp_unicast_hops = 10344 (uint8_t)tcp_ipv6_hoplimit; 10345 ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; 10346 /* Pass modified value to IP. 
*/ 10347 *i1 = tcp->tcp_ip6h->ip6_hops; 10348 } else { 10349 tcp->tcp_ip6h->ip6_hops = 10350 ipp->ipp_unicast_hops = 10351 (uint8_t)*i1; 10352 ipp->ipp_fields |= IPPF_UNICAST_HOPS; 10353 } 10354 reterr = tcp_build_hdrs(q, tcp); 10355 if (reterr != 0) 10356 return (reterr); 10357 } 10358 break; 10359 case IPV6_BOUND_IF: 10360 if (!checkonly) { 10361 int error = 0; 10362 10363 tcp->tcp_bound_if = *i1; 10364 error = ip_opt_set_ill(tcp->tcp_connp, *i1, 10365 B_TRUE, checkonly, level, name, mblk); 10366 if (error != 0) { 10367 *outlenp = 0; 10368 return (error); 10369 } 10370 } 10371 break; 10372 /* 10373 * Set boolean switches for ancillary data delivery 10374 */ 10375 case IPV6_RECVPKTINFO: 10376 if (!checkonly) { 10377 if (onoff) 10378 tcp->tcp_ipv6_recvancillary |= 10379 TCP_IPV6_RECVPKTINFO; 10380 else 10381 tcp->tcp_ipv6_recvancillary &= 10382 ~TCP_IPV6_RECVPKTINFO; 10383 /* Force it to be sent up with the next msg */ 10384 tcp->tcp_recvifindex = 0; 10385 } 10386 break; 10387 case IPV6_RECVTCLASS: 10388 if (!checkonly) { 10389 if (onoff) 10390 tcp->tcp_ipv6_recvancillary |= 10391 TCP_IPV6_RECVTCLASS; 10392 else 10393 tcp->tcp_ipv6_recvancillary &= 10394 ~TCP_IPV6_RECVTCLASS; 10395 } 10396 break; 10397 case IPV6_RECVHOPLIMIT: 10398 if (!checkonly) { 10399 if (onoff) 10400 tcp->tcp_ipv6_recvancillary |= 10401 TCP_IPV6_RECVHOPLIMIT; 10402 else 10403 tcp->tcp_ipv6_recvancillary &= 10404 ~TCP_IPV6_RECVHOPLIMIT; 10405 /* Force it to be sent up with the next msg */ 10406 tcp->tcp_recvhops = 0xffffffffU; 10407 } 10408 break; 10409 case IPV6_RECVHOPOPTS: 10410 if (!checkonly) { 10411 if (onoff) 10412 tcp->tcp_ipv6_recvancillary |= 10413 TCP_IPV6_RECVHOPOPTS; 10414 else 10415 tcp->tcp_ipv6_recvancillary &= 10416 ~TCP_IPV6_RECVHOPOPTS; 10417 } 10418 break; 10419 case IPV6_RECVDSTOPTS: 10420 if (!checkonly) { 10421 if (onoff) 10422 tcp->tcp_ipv6_recvancillary |= 10423 TCP_IPV6_RECVDSTOPTS; 10424 else 10425 tcp->tcp_ipv6_recvancillary &= 10426 ~TCP_IPV6_RECVDSTOPTS; 10427 } 10428 break; 10429 case _OLD_IPV6_RECVDSTOPTS: 10430 if (!checkonly) { 10431 if (onoff) 10432 tcp->tcp_ipv6_recvancillary |= 10433 TCP_OLD_IPV6_RECVDSTOPTS; 10434 else 10435 tcp->tcp_ipv6_recvancillary &= 10436 ~TCP_OLD_IPV6_RECVDSTOPTS; 10437 } 10438 break; 10439 case IPV6_RECVRTHDR: 10440 if (!checkonly) { 10441 if (onoff) 10442 tcp->tcp_ipv6_recvancillary |= 10443 TCP_IPV6_RECVRTHDR; 10444 else 10445 tcp->tcp_ipv6_recvancillary &= 10446 ~TCP_IPV6_RECVRTHDR; 10447 } 10448 break; 10449 case IPV6_RECVRTHDRDSTOPTS: 10450 if (!checkonly) { 10451 if (onoff) 10452 tcp->tcp_ipv6_recvancillary |= 10453 TCP_IPV6_RECVRTDSTOPTS; 10454 else 10455 tcp->tcp_ipv6_recvancillary &= 10456 ~TCP_IPV6_RECVRTDSTOPTS; 10457 } 10458 break; 10459 case IPV6_PKTINFO: 10460 if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) 10461 return (EINVAL); 10462 if (checkonly) 10463 break; 10464 10465 if (inlen == 0) { 10466 ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); 10467 } else { 10468 struct in6_pktinfo *pkti; 10469 10470 pkti = (struct in6_pktinfo *)invalp; 10471 /* 10472 * RFC 3542 states that ipi6_addr must be 10473 * the unspecified address when setting the 10474 * IPV6_PKTINFO sticky socket option on a 10475 * TCP socket. 10476 */ 10477 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) 10478 return (EINVAL); 10479 /* 10480 * ip6_set_pktinfo() validates the source 10481 * address and interface index. 
10482 */ 10483 reterr = ip6_set_pktinfo(cr, tcp->tcp_connp, 10484 pkti, mblk); 10485 if (reterr != 0) 10486 return (reterr); 10487 ipp->ipp_ifindex = pkti->ipi6_ifindex; 10488 ipp->ipp_addr = pkti->ipi6_addr; 10489 if (ipp->ipp_ifindex != 0) 10490 ipp->ipp_fields |= IPPF_IFINDEX; 10491 else 10492 ipp->ipp_fields &= ~IPPF_IFINDEX; 10493 if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) 10494 ipp->ipp_fields |= IPPF_ADDR; 10495 else 10496 ipp->ipp_fields &= ~IPPF_ADDR; 10497 } 10498 reterr = tcp_build_hdrs(q, tcp); 10499 if (reterr != 0) 10500 return (reterr); 10501 break; 10502 case IPV6_TCLASS: 10503 if (inlen != 0 && inlen != sizeof (int)) 10504 return (EINVAL); 10505 if (checkonly) 10506 break; 10507 10508 if (inlen == 0) { 10509 ipp->ipp_fields &= ~IPPF_TCLASS; 10510 } else { 10511 if (*i1 > 255 || *i1 < -1) 10512 return (EINVAL); 10513 if (*i1 == -1) { 10514 ipp->ipp_tclass = 0; 10515 *i1 = 0; 10516 } else { 10517 ipp->ipp_tclass = *i1; 10518 } 10519 ipp->ipp_fields |= IPPF_TCLASS; 10520 } 10521 reterr = tcp_build_hdrs(q, tcp); 10522 if (reterr != 0) 10523 return (reterr); 10524 break; 10525 case IPV6_NEXTHOP: 10526 /* 10527 * IP will verify that the nexthop is reachable 10528 * and fail for sticky options. 10529 */ 10530 if (inlen != 0 && inlen != sizeof (sin6_t)) 10531 return (EINVAL); 10532 if (checkonly) 10533 break; 10534 10535 if (inlen == 0) { 10536 ipp->ipp_fields &= ~IPPF_NEXTHOP; 10537 } else { 10538 sin6_t *sin6 = (sin6_t *)invalp; 10539 10540 if (sin6->sin6_family != AF_INET6) 10541 return (EAFNOSUPPORT); 10542 if (IN6_IS_ADDR_V4MAPPED( 10543 &sin6->sin6_addr)) 10544 return (EADDRNOTAVAIL); 10545 ipp->ipp_nexthop = sin6->sin6_addr; 10546 if (!IN6_IS_ADDR_UNSPECIFIED( 10547 &ipp->ipp_nexthop)) 10548 ipp->ipp_fields |= IPPF_NEXTHOP; 10549 else 10550 ipp->ipp_fields &= ~IPPF_NEXTHOP; 10551 } 10552 reterr = tcp_build_hdrs(q, tcp); 10553 if (reterr != 0) 10554 return (reterr); 10555 break; 10556 case IPV6_HOPOPTS: { 10557 ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; 10558 10559 /* 10560 * Sanity checks - minimum size, size a multiple of 10561 * eight bytes, and matching size passed in. 10562 */ 10563 if (inlen != 0 && 10564 inlen != (8 * (hopts->ip6h_len + 1))) 10565 return (EINVAL); 10566 10567 if (checkonly) 10568 break; 10569 10570 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10571 (uchar_t **)&ipp->ipp_hopopts, 10572 &ipp->ipp_hopoptslen, tcp->tcp_label_len); 10573 if (reterr != 0) 10574 return (reterr); 10575 if (ipp->ipp_hopoptslen == 0) 10576 ipp->ipp_fields &= ~IPPF_HOPOPTS; 10577 else 10578 ipp->ipp_fields |= IPPF_HOPOPTS; 10579 reterr = tcp_build_hdrs(q, tcp); 10580 if (reterr != 0) 10581 return (reterr); 10582 break; 10583 } 10584 case IPV6_RTHDRDSTOPTS: { 10585 ip6_dest_t *dopts = (ip6_dest_t *)invalp; 10586 10587 /* 10588 * Sanity checks - minimum size, size a multiple of 10589 * eight bytes, and matching size passed in. 
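 * For example, an ip6_dest_t whose ip6d_len field is 1 describes an
 * option block of 8 * (1 + 1) = 16 bytes, so the caller must pass
 * inlen == 16; an inlen of 0 clears the sticky option instead.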
10590 */ 10591 if (inlen != 0 && 10592 inlen != (8 * (dopts->ip6d_len + 1))) 10593 return (EINVAL); 10594 10595 if (checkonly) 10596 break; 10597 10598 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10599 (uchar_t **)&ipp->ipp_rtdstopts, 10600 &ipp->ipp_rtdstoptslen, 0); 10601 if (reterr != 0) 10602 return (reterr); 10603 if (ipp->ipp_rtdstoptslen == 0) 10604 ipp->ipp_fields &= ~IPPF_RTDSTOPTS; 10605 else 10606 ipp->ipp_fields |= IPPF_RTDSTOPTS; 10607 reterr = tcp_build_hdrs(q, tcp); 10608 if (reterr != 0) 10609 return (reterr); 10610 break; 10611 } 10612 case IPV6_DSTOPTS: { 10613 ip6_dest_t *dopts = (ip6_dest_t *)invalp; 10614 10615 /* 10616 * Sanity checks - minimum size, size a multiple of 10617 * eight bytes, and matching size passed in. 10618 */ 10619 if (inlen != 0 && 10620 inlen != (8 * (dopts->ip6d_len + 1))) 10621 return (EINVAL); 10622 10623 if (checkonly) 10624 break; 10625 10626 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10627 (uchar_t **)&ipp->ipp_dstopts, 10628 &ipp->ipp_dstoptslen, 0); 10629 if (reterr != 0) 10630 return (reterr); 10631 if (ipp->ipp_dstoptslen == 0) 10632 ipp->ipp_fields &= ~IPPF_DSTOPTS; 10633 else 10634 ipp->ipp_fields |= IPPF_DSTOPTS; 10635 reterr = tcp_build_hdrs(q, tcp); 10636 if (reterr != 0) 10637 return (reterr); 10638 break; 10639 } 10640 case IPV6_RTHDR: { 10641 ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; 10642 10643 /* 10644 * Sanity checks - minimum size, size a multiple of 10645 * eight bytes, and matching size passed in. 10646 */ 10647 if (inlen != 0 && 10648 inlen != (8 * (rt->ip6r_len + 1))) 10649 return (EINVAL); 10650 10651 if (checkonly) 10652 break; 10653 10654 reterr = optcom_pkt_set(invalp, inlen, B_TRUE, 10655 (uchar_t **)&ipp->ipp_rthdr, 10656 &ipp->ipp_rthdrlen, 0); 10657 if (reterr != 0) 10658 return (reterr); 10659 if (ipp->ipp_rthdrlen == 0) 10660 ipp->ipp_fields &= ~IPPF_RTHDR; 10661 else 10662 ipp->ipp_fields |= IPPF_RTHDR; 10663 reterr = tcp_build_hdrs(q, tcp); 10664 if (reterr != 0) 10665 return (reterr); 10666 break; 10667 } 10668 case IPV6_V6ONLY: 10669 if (!checkonly) 10670 tcp->tcp_connp->conn_ipv6_v6only = onoff; 10671 break; 10672 case IPV6_USE_MIN_MTU: 10673 if (inlen != sizeof (int)) 10674 return (EINVAL); 10675 10676 if (*i1 < -1 || *i1 > 1) 10677 return (EINVAL); 10678 10679 if (checkonly) 10680 break; 10681 10682 ipp->ipp_fields |= IPPF_USE_MIN_MTU; 10683 ipp->ipp_use_min_mtu = *i1; 10684 break; 10685 case IPV6_BOUND_PIF: 10686 /* Handled at the IP level */ 10687 return (-EINVAL); 10688 case IPV6_SEC_OPT: 10689 /* 10690 * We should not allow policy setting after 10691 * we start listening for connections. 
10692 */ 10693 if (tcp->tcp_state == TCPS_LISTEN) { 10694 return (EINVAL); 10695 } else { 10696 /* Handled at the IP level */ 10697 return (-EINVAL); 10698 } 10699 case IPV6_SRC_PREFERENCES: 10700 if (inlen != sizeof (uint32_t)) 10701 return (EINVAL); 10702 reterr = ip6_set_src_preferences(tcp->tcp_connp, 10703 *(uint32_t *)invalp); 10704 if (reterr != 0) { 10705 *outlenp = 0; 10706 return (reterr); 10707 } 10708 break; 10709 default: 10710 *outlenp = 0; 10711 return (EINVAL); 10712 } 10713 break; 10714 } /* end IPPROTO_IPV6 */ 10715 default: 10716 *outlenp = 0; 10717 return (EINVAL); 10718 } 10719 /* 10720 * Common case of OK return with outval same as inval 10721 */ 10722 if (invalp != outvalp) { 10723 /* don't trust bcopy for identical src/dst */ 10724 (void) bcopy(invalp, outvalp, inlen); 10725 } 10726 *outlenp = inlen; 10727 return (0); 10728 } 10729 10730 /* 10731 * Update tcp_sticky_hdrs based on tcp_sticky_ipp. 10732 * The headers include ip6i_t (if needed), ip6_t, any sticky extension 10733 * headers, and the maximum size tcp header (to avoid reallocation 10734 * on the fly for additional tcp options). 10735 * Returns failure if can't allocate memory. 10736 */ 10737 static int 10738 tcp_build_hdrs(queue_t *q, tcp_t *tcp) 10739 { 10740 char *hdrs; 10741 uint_t hdrs_len; 10742 ip6i_t *ip6i; 10743 char buf[TCP_MAX_HDR_LENGTH]; 10744 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; 10745 in6_addr_t src, dst; 10746 10747 /* 10748 * save the existing tcp header and source/dest IP addresses 10749 */ 10750 bcopy(tcp->tcp_tcph, buf, tcp->tcp_tcp_hdr_len); 10751 src = tcp->tcp_ip6h->ip6_src; 10752 dst = tcp->tcp_ip6h->ip6_dst; 10753 hdrs_len = ip_total_hdrs_len_v6(ipp) + TCP_MAX_HDR_LENGTH; 10754 ASSERT(hdrs_len != 0); 10755 if (hdrs_len > tcp->tcp_iphc_len) { 10756 /* Need to reallocate */ 10757 hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP); 10758 if (hdrs == NULL) 10759 return (ENOMEM); 10760 if (tcp->tcp_iphc != NULL) { 10761 if (tcp->tcp_hdr_grown) { 10762 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); 10763 } else { 10764 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); 10765 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); 10766 } 10767 tcp->tcp_iphc_len = 0; 10768 } 10769 ASSERT(tcp->tcp_iphc_len == 0); 10770 tcp->tcp_iphc = hdrs; 10771 tcp->tcp_iphc_len = hdrs_len; 10772 tcp->tcp_hdr_grown = B_TRUE; 10773 } 10774 ip_build_hdrs_v6((uchar_t *)tcp->tcp_iphc, 10775 hdrs_len - TCP_MAX_HDR_LENGTH, ipp, IPPROTO_TCP); 10776 10777 /* Set header fields not in ipp */ 10778 if (ipp->ipp_fields & IPPF_HAS_IP6I) { 10779 ip6i = (ip6i_t *)tcp->tcp_iphc; 10780 tcp->tcp_ip6h = (ip6_t *)&ip6i[1]; 10781 } else { 10782 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; 10783 } 10784 /* 10785 * tcp->tcp_ip_hdr_len will include ip6i_t if there is one. 10786 * 10787 * tcp->tcp_tcp_hdr_len doesn't change here. 10788 */ 10789 tcp->tcp_ip_hdr_len = hdrs_len - TCP_MAX_HDR_LENGTH; 10790 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + tcp->tcp_ip_hdr_len); 10791 tcp->tcp_hdr_len = tcp->tcp_ip_hdr_len + tcp->tcp_tcp_hdr_len; 10792 10793 bcopy(buf, tcp->tcp_tcph, tcp->tcp_tcp_hdr_len); 10794 10795 tcp->tcp_ip6h->ip6_src = src; 10796 tcp->tcp_ip6h->ip6_dst = dst; 10797 10798 /* 10799 * If the hop limit was not set by ip_build_hdrs_v6(), set it to 10800 * the default value for TCP. 
10801 */ 10802 if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS)) 10803 tcp->tcp_ip6h->ip6_hops = tcp_ipv6_hoplimit; 10804 10805 /* 10806 * If we're setting extension headers after a connection 10807 * has been established, and if we have a routing header 10808 * among the extension headers, call ip_massage_options_v6 to 10809 * manipulate the routing header/ip6_dst set the checksum 10810 * difference in the tcp header template. 10811 * (This happens in tcp_connect_ipv6 if the routing header 10812 * is set prior to the connect.) 10813 * Set the tcp_sum to zero first in case we've cleared a 10814 * routing header or don't have one at all. 10815 */ 10816 tcp->tcp_sum = 0; 10817 if ((tcp->tcp_state >= TCPS_SYN_SENT) && 10818 (tcp->tcp_ipp_fields & IPPF_RTHDR)) { 10819 ip6_rthdr_t *rth = ip_find_rthdr_v6(tcp->tcp_ip6h, 10820 (uint8_t *)tcp->tcp_tcph); 10821 if (rth != NULL) { 10822 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, 10823 rth); 10824 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + 10825 (tcp->tcp_sum >> 16)); 10826 } 10827 } 10828 10829 /* Try to get everything in a single mblk */ 10830 (void) mi_set_sth_wroff(RD(q), hdrs_len + tcp_wroff_xtra); 10831 return (0); 10832 } 10833 10834 /* 10835 * Transfer any source route option from ipha to buf/dst in reversed form. 10836 */ 10837 static int 10838 tcp_opt_rev_src_route(ipha_t *ipha, char *buf, uchar_t *dst) 10839 { 10840 ipoptp_t opts; 10841 uchar_t *opt; 10842 uint8_t optval; 10843 uint8_t optlen; 10844 uint32_t len = 0; 10845 10846 for (optval = ipoptp_first(&opts, ipha); 10847 optval != IPOPT_EOL; 10848 optval = ipoptp_next(&opts)) { 10849 opt = opts.ipoptp_cur; 10850 optlen = opts.ipoptp_len; 10851 switch (optval) { 10852 int off1, off2; 10853 case IPOPT_SSRR: 10854 case IPOPT_LSRR: 10855 10856 /* Reverse source route */ 10857 /* 10858 * First entry should be the next to last one in the 10859 * current source route (the last entry is our 10860 * address.) 10861 * The last entry should be the final destination. 10862 */ 10863 buf[IPOPT_OPTVAL] = (uint8_t)optval; 10864 buf[IPOPT_OLEN] = (uint8_t)optlen; 10865 off1 = IPOPT_MINOFF_SR - 1; 10866 off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; 10867 if (off2 < 0) { 10868 /* No entries in source route */ 10869 break; 10870 } 10871 bcopy(opt + off2, dst, IP_ADDR_LEN); 10872 /* 10873 * Note: use src since ipha has not had its src 10874 * and dst reversed (it is in the state it was 10875 * received. 10876 */ 10877 bcopy(&ipha->ipha_src, buf + off2, 10878 IP_ADDR_LEN); 10879 off2 -= IP_ADDR_LEN; 10880 10881 while (off2 > 0) { 10882 bcopy(opt + off2, buf + off1, 10883 IP_ADDR_LEN); 10884 off1 += IP_ADDR_LEN; 10885 off2 -= IP_ADDR_LEN; 10886 } 10887 buf[IPOPT_OFFSET] = IPOPT_MINOFF_SR; 10888 buf += optlen; 10889 len += optlen; 10890 break; 10891 } 10892 } 10893 done: 10894 /* Pad the resulting options */ 10895 while (len & 0x3) { 10896 *buf++ = IPOPT_EOL; 10897 len++; 10898 } 10899 return (len); 10900 } 10901 10902 10903 /* 10904 * Extract and revert a source route from ipha (if any) 10905 * and then update the relevant fields in both tcp_t and the standard header. 
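 *
 * Illustration only (hypothetical addresses, not taken from the code): if
 * the segment arrived over the source route  src -> G1 -> G2 -> G3 -> us,
 * the received option records G1, G2 and G3, and tcp_opt_rev_src_route()
 * builds the reply so that its IP destination is G3 (the most recently
 * recorded gateway) while its option lists G2, G1 and finally src as the
 * ultimate destination, i.e. the original route traversed in reverse.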
10906 */ 10907 static void 10908 tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha) 10909 { 10910 char buf[TCP_MAX_HDR_LENGTH]; 10911 uint_t tcph_len; 10912 int len; 10913 10914 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); 10915 len = IPH_HDR_LENGTH(ipha); 10916 if (len == IP_SIMPLE_HDR_LENGTH) 10917 /* Nothing to do */ 10918 return; 10919 if (len > IP_SIMPLE_HDR_LENGTH + TCP_MAX_IP_OPTIONS_LENGTH || 10920 (len & 0x3)) 10921 return; 10922 10923 tcph_len = tcp->tcp_tcp_hdr_len; 10924 bcopy(tcp->tcp_tcph, buf, tcph_len); 10925 tcp->tcp_sum = (tcp->tcp_ipha->ipha_dst >> 16) + 10926 (tcp->tcp_ipha->ipha_dst & 0xffff); 10927 len = tcp_opt_rev_src_route(ipha, (char *)tcp->tcp_ipha + 10928 IP_SIMPLE_HDR_LENGTH, (uchar_t *)&tcp->tcp_ipha->ipha_dst); 10929 len += IP_SIMPLE_HDR_LENGTH; 10930 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + 10931 (tcp->tcp_ipha->ipha_dst & 0xffff)); 10932 if ((int)tcp->tcp_sum < 0) 10933 tcp->tcp_sum--; 10934 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); 10935 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16)); 10936 tcp->tcp_tcph = (tcph_t *)((char *)tcp->tcp_ipha + len); 10937 bcopy(buf, tcp->tcp_tcph, tcph_len); 10938 tcp->tcp_ip_hdr_len = len; 10939 tcp->tcp_ipha->ipha_version_and_hdr_length = 10940 (IP_VERSION << 4) | (len >> 2); 10941 len += tcph_len; 10942 tcp->tcp_hdr_len = len; 10943 } 10944 10945 /* 10946 * Copy the standard header into its new location, 10947 * lay in the new options and then update the relevant 10948 * fields in both tcp_t and the standard header. 10949 */ 10950 static int 10951 tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) 10952 { 10953 uint_t tcph_len; 10954 uint8_t *ip_optp; 10955 tcph_t *new_tcph; 10956 10957 if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) 10958 return (EINVAL); 10959 10960 if (len > IP_MAX_OPT_LENGTH - tcp->tcp_label_len) 10961 return (EINVAL); 10962 10963 if (checkonly) { 10964 /* 10965 * do not really set, just pretend to - T_CHECK 10966 */ 10967 return (0); 10968 } 10969 10970 ip_optp = (uint8_t *)tcp->tcp_ipha + IP_SIMPLE_HDR_LENGTH; 10971 if (tcp->tcp_label_len > 0) { 10972 int padlen; 10973 uint8_t opt; 10974 10975 /* convert list termination to no-ops */ 10976 padlen = tcp->tcp_label_len - ip_optp[IPOPT_OLEN]; 10977 ip_optp += ip_optp[IPOPT_OLEN]; 10978 opt = len > 0 ? IPOPT_NOP : IPOPT_EOL; 10979 while (--padlen >= 0) 10980 *ip_optp++ = opt; 10981 } 10982 tcph_len = tcp->tcp_tcp_hdr_len; 10983 new_tcph = (tcph_t *)(ip_optp + len); 10984 ovbcopy(tcp->tcp_tcph, new_tcph, tcph_len); 10985 tcp->tcp_tcph = new_tcph; 10986 bcopy(ptr, ip_optp, len); 10987 10988 len += IP_SIMPLE_HDR_LENGTH + tcp->tcp_label_len; 10989 10990 tcp->tcp_ip_hdr_len = len; 10991 tcp->tcp_ipha->ipha_version_and_hdr_length = 10992 (IP_VERSION << 4) | (len >> 2); 10993 tcp->tcp_hdr_len = len + tcph_len; 10994 if (!TCP_IS_DETACHED(tcp)) { 10995 /* Always allocate room for all options. */ 10996 (void) mi_set_sth_wroff(tcp->tcp_rq, 10997 TCP_MAX_COMBINED_HEADER_LENGTH + tcp_wroff_xtra); 10998 } 10999 return (0); 11000 } 11001 11002 /* Get callback routine passed to nd_load by tcp_param_register */ 11003 /* ARGSUSED */ 11004 static int 11005 tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 11006 { 11007 tcpparam_t *tcppa = (tcpparam_t *)cp; 11008 11009 (void) mi_mpprintf(mp, "%u", tcppa->tcp_param_val); 11010 return (0); 11011 } 11012 11013 /* 11014 * Walk through the param array specified registering each element with the 11015 * named dispatch handler. 
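 *
 * Usage note (assuming the standard Solaris ndd(1M) interface): once a
 * parameter is registered here it is visible by name through ndd, e.g.
 * "ndd -get /dev/tcp tcp_extra_priv_ports" reads a value registered
 * below, and "ndd -set /dev/tcp <name> <value>" invokes the matching
 * set routine; the names are exactly the tcp_param_name strings (or the
 * literal names) passed to nd_load().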
11016 */ 11017 static boolean_t 11018 tcp_param_register(tcpparam_t *tcppa, int cnt) 11019 { 11020 for (; cnt-- > 0; tcppa++) { 11021 if (tcppa->tcp_param_name && tcppa->tcp_param_name[0]) { 11022 if (!nd_load(&tcp_g_nd, tcppa->tcp_param_name, 11023 tcp_param_get, tcp_param_set, 11024 (caddr_t)tcppa)) { 11025 nd_free(&tcp_g_nd); 11026 return (B_FALSE); 11027 } 11028 } 11029 } 11030 if (!nd_load(&tcp_g_nd, tcp_wroff_xtra_param.tcp_param_name, 11031 tcp_param_get, tcp_param_set_aligned, 11032 (caddr_t)&tcp_wroff_xtra_param)) { 11033 nd_free(&tcp_g_nd); 11034 return (B_FALSE); 11035 } 11036 if (!nd_load(&tcp_g_nd, tcp_mdt_head_param.tcp_param_name, 11037 tcp_param_get, tcp_param_set_aligned, 11038 (caddr_t)&tcp_mdt_head_param)) { 11039 nd_free(&tcp_g_nd); 11040 return (B_FALSE); 11041 } 11042 if (!nd_load(&tcp_g_nd, tcp_mdt_tail_param.tcp_param_name, 11043 tcp_param_get, tcp_param_set_aligned, 11044 (caddr_t)&tcp_mdt_tail_param)) { 11045 nd_free(&tcp_g_nd); 11046 return (B_FALSE); 11047 } 11048 if (!nd_load(&tcp_g_nd, tcp_mdt_max_pbufs_param.tcp_param_name, 11049 tcp_param_get, tcp_param_set, 11050 (caddr_t)&tcp_mdt_max_pbufs_param)) { 11051 nd_free(&tcp_g_nd); 11052 return (B_FALSE); 11053 } 11054 if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports", 11055 tcp_extra_priv_ports_get, NULL, NULL)) { 11056 nd_free(&tcp_g_nd); 11057 return (B_FALSE); 11058 } 11059 if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports_add", 11060 NULL, tcp_extra_priv_ports_add, NULL)) { 11061 nd_free(&tcp_g_nd); 11062 return (B_FALSE); 11063 } 11064 if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports_del", 11065 NULL, tcp_extra_priv_ports_del, NULL)) { 11066 nd_free(&tcp_g_nd); 11067 return (B_FALSE); 11068 } 11069 if (!nd_load(&tcp_g_nd, "tcp_status", tcp_status_report, NULL, 11070 NULL)) { 11071 nd_free(&tcp_g_nd); 11072 return (B_FALSE); 11073 } 11074 if (!nd_load(&tcp_g_nd, "tcp_bind_hash", tcp_bind_hash_report, 11075 NULL, NULL)) { 11076 nd_free(&tcp_g_nd); 11077 return (B_FALSE); 11078 } 11079 if (!nd_load(&tcp_g_nd, "tcp_listen_hash", tcp_listen_hash_report, 11080 NULL, NULL)) { 11081 nd_free(&tcp_g_nd); 11082 return (B_FALSE); 11083 } 11084 if (!nd_load(&tcp_g_nd, "tcp_conn_hash", tcp_conn_hash_report, 11085 NULL, NULL)) { 11086 nd_free(&tcp_g_nd); 11087 return (B_FALSE); 11088 } 11089 if (!nd_load(&tcp_g_nd, "tcp_acceptor_hash", tcp_acceptor_hash_report, 11090 NULL, NULL)) { 11091 nd_free(&tcp_g_nd); 11092 return (B_FALSE); 11093 } 11094 if (!nd_load(&tcp_g_nd, "tcp_host_param", tcp_host_param_report, 11095 tcp_host_param_set, NULL)) { 11096 nd_free(&tcp_g_nd); 11097 return (B_FALSE); 11098 } 11099 if (!nd_load(&tcp_g_nd, "tcp_host_param_ipv6", tcp_host_param_report, 11100 tcp_host_param_set_ipv6, NULL)) { 11101 nd_free(&tcp_g_nd); 11102 return (B_FALSE); 11103 } 11104 if (!nd_load(&tcp_g_nd, "tcp_1948_phrase", NULL, tcp_1948_phrase_set, 11105 NULL)) { 11106 nd_free(&tcp_g_nd); 11107 return (B_FALSE); 11108 } 11109 if (!nd_load(&tcp_g_nd, "tcp_reserved_port_list", 11110 tcp_reserved_port_list, NULL, NULL)) { 11111 nd_free(&tcp_g_nd); 11112 return (B_FALSE); 11113 } 11114 /* 11115 * Dummy ndd variables - only to convey obsolescence information 11116 * through printing of their name (no get or set routines) 11117 * XXX Remove in future releases ? 
11118 */ 11119 if (!nd_load(&tcp_g_nd, 11120 "tcp_close_wait_interval(obsoleted - " 11121 "use tcp_time_wait_interval)", NULL, NULL, NULL)) { 11122 nd_free(&tcp_g_nd); 11123 return (B_FALSE); 11124 } 11125 return (B_TRUE); 11126 } 11127 11128 /* ndd set routine for tcp_wroff_xtra, tcp_mdt_hdr_{head,tail}_min. */ 11129 /* ARGSUSED */ 11130 static int 11131 tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 11132 cred_t *cr) 11133 { 11134 long new_value; 11135 tcpparam_t *tcppa = (tcpparam_t *)cp; 11136 11137 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11138 new_value < tcppa->tcp_param_min || 11139 new_value > tcppa->tcp_param_max) { 11140 return (EINVAL); 11141 } 11142 /* 11143 * Need to make sure new_value is a multiple of 4. If it is not, 11144 * round it up. For future 64 bit requirement, we actually make it 11145 * a multiple of 8. 11146 */ 11147 if (new_value & 0x7) { 11148 new_value = (new_value & ~0x7) + 0x8; 11149 } 11150 tcppa->tcp_param_val = new_value; 11151 return (0); 11152 } 11153 11154 /* Set callback routine passed to nd_load by tcp_param_register */ 11155 /* ARGSUSED */ 11156 static int 11157 tcp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 11158 { 11159 long new_value; 11160 tcpparam_t *tcppa = (tcpparam_t *)cp; 11161 11162 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || 11163 new_value < tcppa->tcp_param_min || 11164 new_value > tcppa->tcp_param_max) { 11165 return (EINVAL); 11166 } 11167 tcppa->tcp_param_val = new_value; 11168 return (0); 11169 } 11170 11171 /* 11172 * Add a new piece to the tcp reassembly queue. If the gap at the beginning 11173 * is filled, return as much as we can. The message passed in may be 11174 * multi-part, chained using b_cont. "start" is the starting sequence 11175 * number for this piece. 11176 */ 11177 static mblk_t * 11178 tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) 11179 { 11180 uint32_t end; 11181 mblk_t *mp1; 11182 mblk_t *mp2; 11183 mblk_t *next_mp; 11184 uint32_t u1; 11185 11186 /* Walk through all the new pieces. */ 11187 do { 11188 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 11189 (uintptr_t)INT_MAX); 11190 end = start + (int)(mp->b_wptr - mp->b_rptr); 11191 next_mp = mp->b_cont; 11192 if (start == end) { 11193 /* Empty. Blast it. */ 11194 freeb(mp); 11195 continue; 11196 } 11197 mp->b_cont = NULL; 11198 TCP_REASS_SET_SEQ(mp, start); 11199 TCP_REASS_SET_END(mp, end); 11200 mp1 = tcp->tcp_reass_tail; 11201 if (!mp1) { 11202 tcp->tcp_reass_tail = mp; 11203 tcp->tcp_reass_head = mp; 11204 BUMP_MIB(&tcp_mib, tcpInDataUnorderSegs); 11205 UPDATE_MIB(&tcp_mib, 11206 tcpInDataUnorderBytes, end - start); 11207 continue; 11208 } 11209 /* New stuff completely beyond tail? */ 11210 if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { 11211 /* Link it on end. */ 11212 mp1->b_cont = mp; 11213 tcp->tcp_reass_tail = mp; 11214 BUMP_MIB(&tcp_mib, tcpInDataUnorderSegs); 11215 UPDATE_MIB(&tcp_mib, 11216 tcpInDataUnorderBytes, end - start); 11217 continue; 11218 } 11219 mp1 = tcp->tcp_reass_head; 11220 u1 = TCP_REASS_SEQ(mp1); 11221 /* New stuff at the front? */ 11222 if (SEQ_LT(start, u1)) { 11223 /* Yes... Check for overlap. */ 11224 mp->b_cont = mp1; 11225 tcp->tcp_reass_head = mp; 11226 tcp_reass_elim_overlap(tcp, mp); 11227 continue; 11228 } 11229 /* 11230 * The new piece fits somewhere between the head and tail. 11231 * We find our slot, where mp1 precedes us and mp2 trails. 
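 *
 * Worked example (sequence numbers invented for illustration): with
 * [100,200) and [300,400) already queued, a new piece [250,350) is not
 * in front of the head, so the loop below stops at mp2 = [300,400),
 * the first block whose starting seq is >= 250, leaving mp1 = [100,200).
 * The piece is linked in between; tcp_reass_elim_overlap() then trims
 * its tail back to 300, and the second call finds no overlap with mp1.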
11232 */ 11233 for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { 11234 u1 = TCP_REASS_SEQ(mp2); 11235 if (SEQ_LEQ(start, u1)) 11236 break; 11237 } 11238 /* Link ourselves in */ 11239 mp->b_cont = mp2; 11240 mp1->b_cont = mp; 11241 11242 /* Trim overlap with following mblk(s) first */ 11243 tcp_reass_elim_overlap(tcp, mp); 11244 11245 /* Trim overlap with preceding mblk */ 11246 tcp_reass_elim_overlap(tcp, mp1); 11247 11248 } while (start = end, mp = next_mp); 11249 mp1 = tcp->tcp_reass_head; 11250 /* Anything ready to go? */ 11251 if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) 11252 return (NULL); 11253 /* Eat what we can off the queue */ 11254 for (;;) { 11255 mp = mp1->b_cont; 11256 end = TCP_REASS_END(mp1); 11257 TCP_REASS_SET_SEQ(mp1, 0); 11258 TCP_REASS_SET_END(mp1, 0); 11259 if (!mp) { 11260 tcp->tcp_reass_tail = NULL; 11261 break; 11262 } 11263 if (end != TCP_REASS_SEQ(mp)) { 11264 mp1->b_cont = NULL; 11265 break; 11266 } 11267 mp1 = mp; 11268 } 11269 mp1 = tcp->tcp_reass_head; 11270 tcp->tcp_reass_head = mp; 11271 return (mp1); 11272 } 11273 11274 /* Eliminate any overlap that mp may have over later mblks */ 11275 static void 11276 tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) 11277 { 11278 uint32_t end; 11279 mblk_t *mp1; 11280 uint32_t u1; 11281 11282 end = TCP_REASS_END(mp); 11283 while ((mp1 = mp->b_cont) != NULL) { 11284 u1 = TCP_REASS_SEQ(mp1); 11285 if (!SEQ_GT(end, u1)) 11286 break; 11287 if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { 11288 mp->b_wptr -= end - u1; 11289 TCP_REASS_SET_END(mp, u1); 11290 BUMP_MIB(&tcp_mib, tcpInDataPartDupSegs); 11291 UPDATE_MIB(&tcp_mib, tcpInDataPartDupBytes, end - u1); 11292 break; 11293 } 11294 mp->b_cont = mp1->b_cont; 11295 TCP_REASS_SET_SEQ(mp1, 0); 11296 TCP_REASS_SET_END(mp1, 0); 11297 freeb(mp1); 11298 BUMP_MIB(&tcp_mib, tcpInDataDupSegs); 11299 UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, end - u1); 11300 } 11301 if (!mp1) 11302 tcp->tcp_reass_tail = mp; 11303 } 11304 11305 /* 11306 * Send up all messages queued on tcp_rcv_list. 11307 */ 11308 static uint_t 11309 tcp_rcv_drain(queue_t *q, tcp_t *tcp) 11310 { 11311 mblk_t *mp; 11312 uint_t ret = 0; 11313 uint_t thwin; 11314 #ifdef DEBUG 11315 uint_t cnt = 0; 11316 #endif 11317 /* Can't drain on an eager connection */ 11318 if (tcp->tcp_listener != NULL) 11319 return (ret); 11320 11321 /* 11322 * Handle two cases here: we are currently fused or we were 11323 * previously fused and have some urgent data to be delivered 11324 * upstream. The latter happens because we either ran out of 11325 * memory or were detached and therefore sending the SIGURG was 11326 * deferred until this point. In either case we pass control 11327 * over to tcp_fuse_rcv_drain() since it may need to complete 11328 * some work. 11329 */ 11330 if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { 11331 ASSERT(tcp->tcp_fused_sigurg_mp != NULL); 11332 if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : 11333 &tcp->tcp_fused_sigurg_mp)) 11334 return (ret); 11335 } 11336 11337 while ((mp = tcp->tcp_rcv_list) != NULL) { 11338 tcp->tcp_rcv_list = mp->b_next; 11339 mp->b_next = NULL; 11340 #ifdef DEBUG 11341 cnt += msgdsize(mp); 11342 #endif 11343 /* Does this need SSL processing first? 
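 * (tcp_kssl_ctx is only expected to be non-NULL when this connection has
 * been handed to a kernel SSL (kssl) proxy instance; such M_DATA is
 * diverted to tcp_kssl_input() below instead of being passed straight
 * upstream with putnext().)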
*/ 11344 if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { 11345 tcp_kssl_input(tcp, mp); 11346 continue; 11347 } 11348 putnext(q, mp); 11349 } 11350 ASSERT(cnt == tcp->tcp_rcv_cnt); 11351 tcp->tcp_rcv_last_head = NULL; 11352 tcp->tcp_rcv_last_tail = NULL; 11353 tcp->tcp_rcv_cnt = 0; 11354 11355 /* Learn the latest rwnd information that we sent to the other side. */ 11356 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 11357 << tcp->tcp_rcv_ws; 11358 /* This is peer's calculated send window (our receive window). */ 11359 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 11360 /* 11361 * Increase the receive window to max. But we need to do receiver 11362 * SWS avoidance. This means that we need to check that the increase 11363 * of the receive window is at least 1 MSS. 11364 */ 11365 if (canputnext(q) && (q->q_hiwat - thwin >= tcp->tcp_mss)) { 11366 /* 11367 * If the window that the other side knows is less than max 11368 * deferred ACK segments, send an update immediately. 11369 */ 11370 if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { 11371 BUMP_MIB(&tcp_mib, tcpOutWinUpdate); 11372 ret = TH_ACK_NEEDED; 11373 } 11374 tcp->tcp_rwnd = q->q_hiwat; 11375 } 11376 /* No need for the push timer now. */ 11377 if (tcp->tcp_push_tid != 0) { 11378 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 11379 tcp->tcp_push_tid = 0; 11380 } 11381 return (ret); 11382 } 11383 11384 /* 11385 * Queue data on tcp_rcv_list which is a b_next chain. 11386 * tcp_rcv_last_head/tail is the last element of this chain. 11387 * Each element of the chain is a b_cont chain. 11388 * 11389 * M_DATA messages are added to the current element. 11390 * Other messages are added as new (b_next) elements. 11391 */ 11392 void 11393 tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) 11394 { 11395 ASSERT(seg_len == msgdsize(mp)); 11396 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); 11397 11398 if (tcp->tcp_rcv_list == NULL) { 11399 ASSERT(tcp->tcp_rcv_last_head == NULL); 11400 tcp->tcp_rcv_list = mp; 11401 tcp->tcp_rcv_last_head = mp; 11402 } else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) { 11403 tcp->tcp_rcv_last_tail->b_cont = mp; 11404 } else { 11405 tcp->tcp_rcv_last_head->b_next = mp; 11406 tcp->tcp_rcv_last_head = mp; 11407 } 11408 11409 while (mp->b_cont) 11410 mp = mp->b_cont; 11411 11412 tcp->tcp_rcv_last_tail = mp; 11413 tcp->tcp_rcv_cnt += seg_len; 11414 tcp->tcp_rwnd -= seg_len; 11415 } 11416 11417 /* 11418 * DEFAULT TCP ENTRY POINT via squeue on READ side. 11419 * 11420 * This is the default entry function into TCP on the read side. TCP is 11421 * always entered via squeue, i.e. using squeues for mutual exclusion. 11422 * When the classifier does a lookup to find the tcp, it also puts a reference 11423 * on the associated conn structure so the tcp is guaranteed to exist 11424 * when we come here. We still need to check the state because it might 11425 * well have been closed. The squeue processing function, i.e. squeue_enter, 11426 * squeue_enter_nodrain, or squeue_drain, is responsible for doing the 11427 * CONN_DEC_REF. 11428 * 11429 * Apart from the default entry point, IP also sends packets directly to 11430 * tcp_rput_data for the AF_INET fast path and to tcp_conn_request for incoming 11431 * connections.
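 *
 * A minimal sketch of the hand-off, with the argument order abbreviated
 * (see the IP fanout code for the actual call sites and tags):
 *
 *	CONN_INC_REF(connp);	(reference taken by the classifier)
 *	squeue_enter(connp->conn_sqp, mp, tcp_input, connp, tag);
 *
 * tcp_input() then runs inside the connection's squeue, and the squeue
 * framework drops that reference once the function returns.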
11432 */ 11433 void 11434 tcp_input(void *arg, mblk_t *mp, void *arg2) 11435 { 11436 conn_t *connp = (conn_t *)arg; 11437 tcp_t *tcp = (tcp_t *)connp->conn_tcp; 11438 11439 /* arg2 is the sqp */ 11440 ASSERT(arg2 != NULL); 11441 ASSERT(mp != NULL); 11442 11443 /* 11444 * Don't accept any input on a closed tcp as this TCP logically does 11445 * not exist on the system. Don't proceed further with this TCP. 11446 * For eg. this packet could trigger another close of this tcp 11447 * which would be disastrous for tcp_refcnt. tcp_close_detached / 11448 * tcp_clean_death / tcp_closei_local must be called at most once 11449 * on a TCP. In this case we need to refeed the packet into the 11450 * classifier and figure out where the packet should go. Need to 11451 * preserve the recv_ill somehow. Until we figure that out, for 11452 * now just drop the packet if we can't classify the packet. 11453 */ 11454 if (tcp->tcp_state == TCPS_CLOSED || 11455 tcp->tcp_state == TCPS_BOUND) { 11456 conn_t *new_connp; 11457 11458 new_connp = ipcl_classify(mp, connp->conn_zoneid); 11459 if (new_connp != NULL) { 11460 tcp_reinput(new_connp, mp, arg2); 11461 return; 11462 } 11463 /* We failed to classify. For now just drop the packet */ 11464 freemsg(mp); 11465 return; 11466 } 11467 11468 if (DB_TYPE(mp) == M_DATA) 11469 tcp_rput_data(connp, mp, arg2); 11470 else 11471 tcp_rput_common(tcp, mp); 11472 } 11473 11474 /* 11475 * The read side put procedure. 11476 * The packets passed up by ip are assume to be aligned according to 11477 * OK_32PTR and the IP+TCP headers fitting in the first mblk. 11478 */ 11479 static void 11480 tcp_rput_common(tcp_t *tcp, mblk_t *mp) 11481 { 11482 /* 11483 * tcp_rput_data() does not expect M_CTL except for the case 11484 * where tcp_ipv6_recvancillary is set and we get a IN_PKTINFO 11485 * type. Need to make sure that any other M_CTLs don't make 11486 * it to tcp_rput_data since it is not expecting any and doesn't 11487 * check for it. 11488 */ 11489 if (DB_TYPE(mp) == M_CTL) { 11490 switch (*(uint32_t *)(mp->b_rptr)) { 11491 case TCP_IOC_ABORT_CONN: 11492 /* 11493 * Handle connection abort request. 11494 */ 11495 tcp_ioctl_abort_handler(tcp, mp); 11496 return; 11497 case IPSEC_IN: 11498 /* 11499 * Only secure icmp arrive in TCP and they 11500 * don't go through data path. 11501 */ 11502 tcp_icmp_error(tcp, mp); 11503 return; 11504 case IN_PKTINFO: 11505 /* 11506 * Handle IPV6_RECVPKTINFO socket option on AF_INET6 11507 * sockets that are receiving IPv4 traffic. tcp 11508 */ 11509 ASSERT(tcp->tcp_family == AF_INET6); 11510 ASSERT(tcp->tcp_ipv6_recvancillary & 11511 TCP_IPV6_RECVPKTINFO); 11512 tcp_rput_data(tcp->tcp_connp, mp, 11513 tcp->tcp_connp->conn_sqp); 11514 return; 11515 case MDT_IOC_INFO_UPDATE: 11516 /* 11517 * Handle Multidata information update; the 11518 * following routine will free the message. 11519 */ 11520 if (tcp->tcp_connp->conn_mdt_ok) { 11521 tcp_mdt_update(tcp, 11522 &((ip_mdt_info_t *)mp->b_rptr)->mdt_capab, 11523 B_FALSE); 11524 } 11525 freemsg(mp); 11526 return; 11527 default: 11528 break; 11529 } 11530 } 11531 11532 /* No point processing the message if tcp is already closed */ 11533 if (TCP_IS_DETACHED_NONEAGER(tcp)) { 11534 freemsg(mp); 11535 return; 11536 } 11537 11538 tcp_rput_other(tcp, mp); 11539 } 11540 11541 11542 /* The minimum of smoothed mean deviation in RTO calculation. */ 11543 #define TCP_SD_MIN 400 11544 11545 /* 11546 * Set RTO for this connection. The formula is from Jacobson and Karels' 11547 * "Congestion Avoidance and Control" in SIGCOMM '88. 
The variable names 11548 * are the same as those in Appendix A.2 of that paper. 11549 * 11550 * m = new measurement 11551 * sa = smoothed RTT average (8 * average estimates). 11552 * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). 11553 */ 11554 static void 11555 tcp_set_rto(tcp_t *tcp, clock_t rtt) 11556 { 11557 long m = TICK_TO_MSEC(rtt); 11558 clock_t sa = tcp->tcp_rtt_sa; 11559 clock_t sv = tcp->tcp_rtt_sd; 11560 clock_t rto; 11561 11562 BUMP_MIB(&tcp_mib, tcpRttUpdate); 11563 tcp->tcp_rtt_update++; 11564 11565 /* tcp_rtt_sa is not 0 means this is a new sample. */ 11566 if (sa != 0) { 11567 /* 11568 * Update average estimator: 11569 * new rtt = 7/8 old rtt + 1/8 Error 11570 */ 11571 11572 /* m is now Error in estimate. */ 11573 m -= sa >> 3; 11574 if ((sa += m) <= 0) { 11575 /* 11576 * Don't allow the smoothed average to be negative. 11577 * We use 0 to denote reinitialization of the 11578 * variables. 11579 */ 11580 sa = 1; 11581 } 11582 11583 /* 11584 * Update deviation estimator: 11585 * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev) 11586 */ 11587 if (m < 0) 11588 m = -m; 11589 m -= sv >> 2; 11590 sv += m; 11591 } else { 11592 /* 11593 * This follows BSD's implementation. So the reinitialized 11594 * RTO is 3 * m. We cannot go less than 2 because if the 11595 * link is bandwidth dominated, doubling the window size 11596 * during slow start means doubling the RTT. We want to be 11597 * more conservative when we reinitialize our estimates. 3 11598 * is just a convenient number. 11599 */ 11600 sa = m << 3; 11601 sv = m << 1; 11602 } 11603 if (sv < TCP_SD_MIN) { 11604 /* 11605 * We do not know that if sa captures the delay ACK 11606 * effect as in a long train of segments, a receiver 11607 * does not delay its ACKs. So set the minimum of sv 11608 * to be TCP_SD_MIN, which is default to 400 ms, twice 11609 * of BSD DATO. That means the minimum of mean 11610 * deviation is 100 ms. 11611 * 11612 */ 11613 sv = TCP_SD_MIN; 11614 } 11615 tcp->tcp_rtt_sa = sa; 11616 tcp->tcp_rtt_sd = sv; 11617 /* 11618 * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv) 11619 * 11620 * Add tcp_rexmit_interval extra in case of extreme environment 11621 * where the algorithm fails to work. The default value of 11622 * tcp_rexmit_interval_extra should be 0. 11623 * 11624 * As we use a finer grained clock than BSD and update 11625 * RTO for every ACKs, add in another .25 of RTT to the 11626 * deviation of RTO to accomodate burstiness of 1/4 of 11627 * window size. 11628 */ 11629 rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5); 11630 11631 if (rto > tcp_rexmit_interval_max) { 11632 tcp->tcp_rto = tcp_rexmit_interval_max; 11633 } else if (rto < tcp_rexmit_interval_min) { 11634 tcp->tcp_rto = tcp_rexmit_interval_min; 11635 } else { 11636 tcp->tcp_rto = rto; 11637 } 11638 11639 /* Now, we can reset tcp_timer_backoff to use the new RTO... */ 11640 tcp->tcp_timer_backoff = 0; 11641 } 11642 11643 /* 11644 * tcp_get_seg_mp() is called to get the pointer to a segment in the 11645 * send queue which starts at the given seq. no. 11646 * 11647 * Parameters: 11648 * tcp_t *tcp: the tcp instance pointer. 11649 * uint32_t seq: the starting seq. no of the requested segment. 11650 * int32_t *off: after the execution, *off will be the offset to 11651 * the returned mblk which points to the requested seq no. 11652 * It is the caller's responsibility to send in a non-null off. 11653 * 11654 * Return: 11655 * A mblk_t pointer pointing to the requested segment in send queue. 
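 *
 * Worked example (numbers invented): with tcp_suna = 1000 and an xmit
 * list holding two mblks of 500 and 700 bytes, a request for seq 1600
 * returns the second mblk with *off = 100, i.e. the requested sequence
 * number lies 100 bytes into that mblk.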
11656 */ 11657 static mblk_t * 11658 tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) 11659 { 11660 int32_t cnt; 11661 mblk_t *mp; 11662 11663 /* Defensive coding. Make sure we don't send incorrect data. */ 11664 if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt)) 11665 return (NULL); 11666 11667 cnt = seq - tcp->tcp_suna; 11668 mp = tcp->tcp_xmit_head; 11669 while (cnt > 0 && mp != NULL) { 11670 cnt -= mp->b_wptr - mp->b_rptr; 11671 if (cnt < 0) { 11672 cnt += mp->b_wptr - mp->b_rptr; 11673 break; 11674 } 11675 mp = mp->b_cont; 11676 } 11677 ASSERT(mp != NULL); 11678 *off = cnt; 11679 return (mp); 11680 } 11681 11682 /* 11683 * This function handles all retransmissions if SACK is enabled for this 11684 * connection. First it calculates how many segments can be retransmitted 11685 * based on tcp_pipe. Then it goes thru the notsack list to find eligible 11686 * segments. A segment is eligible if sack_cnt for that segment is greater 11687 * than or equal tcp_dupack_fast_retransmit. After it has retransmitted 11688 * all eligible segments, it checks to see if TCP can send some new segments 11689 * (fast recovery). If it can, set the appropriate flag for tcp_rput_data(). 11690 * 11691 * Parameters: 11692 * tcp_t *tcp: the tcp structure of the connection. 11693 * uint_t *flags: in return, appropriate value will be set for 11694 * tcp_rput_data(). 11695 */ 11696 static void 11697 tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) 11698 { 11699 notsack_blk_t *notsack_blk; 11700 int32_t usable_swnd; 11701 int32_t mss; 11702 uint32_t seg_len; 11703 mblk_t *xmit_mp; 11704 11705 ASSERT(tcp->tcp_sack_info != NULL); 11706 ASSERT(tcp->tcp_notsack_list != NULL); 11707 ASSERT(tcp->tcp_rexmit == B_FALSE); 11708 11709 /* Defensive coding in case there is a bug... */ 11710 if (tcp->tcp_notsack_list == NULL) { 11711 return; 11712 } 11713 notsack_blk = tcp->tcp_notsack_list; 11714 mss = tcp->tcp_mss; 11715 11716 /* 11717 * Limit the num of outstanding data in the network to be 11718 * tcp_cwnd_ssthresh, which is half of the original congestion wnd. 11719 */ 11720 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 11721 11722 /* At least retransmit 1 MSS of data. */ 11723 if (usable_swnd <= 0) { 11724 usable_swnd = mss; 11725 } 11726 11727 /* Make sure no new RTT samples will be taken. */ 11728 tcp->tcp_csuna = tcp->tcp_snxt; 11729 11730 notsack_blk = tcp->tcp_notsack_list; 11731 while (usable_swnd > 0) { 11732 mblk_t *snxt_mp, *tmp_mp; 11733 tcp_seq begin = tcp->tcp_sack_snxt; 11734 tcp_seq end; 11735 int32_t off; 11736 11737 for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { 11738 if (SEQ_GT(notsack_blk->end, begin) && 11739 (notsack_blk->sack_cnt >= 11740 tcp_dupack_fast_retransmit)) { 11741 end = notsack_blk->end; 11742 if (SEQ_LT(begin, notsack_blk->begin)) { 11743 begin = notsack_blk->begin; 11744 } 11745 break; 11746 } 11747 } 11748 /* 11749 * All holes are filled. Manipulate tcp_cwnd to send more 11750 * if we can. Note that after the SACK recovery, tcp_cwnd is 11751 * set to tcp_cwnd_ssthresh. 
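 *
 * Numeric illustration (values invented, assuming there is unsent
 * data): with tcp_cwnd_ssthresh at 20 * MSS and tcp_pipe at 16 * MSS,
 * usable_swnd works out to 4 segments, so tcp_cwnd becomes the amount
 * currently outstanding plus 4 * MSS and TH_XMIT_NEEDED is flagged so
 * that tcp_rput_data() can push out up to four new segments.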
11752 */ 11753 if (notsack_blk == NULL) { 11754 usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; 11755 if (usable_swnd <= 0 || tcp->tcp_unsent == 0) { 11756 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; 11757 ASSERT(tcp->tcp_cwnd > 0); 11758 return; 11759 } else { 11760 usable_swnd = usable_swnd / mss; 11761 tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + 11762 MAX(usable_swnd * mss, mss); 11763 *flags |= TH_XMIT_NEEDED; 11764 return; 11765 } 11766 } 11767 11768 /* 11769 * Note that we may send more than usable_swnd allows here 11770 * because of round off, but no more than 1 MSS of data. 11771 */ 11772 seg_len = end - begin; 11773 if (seg_len > mss) 11774 seg_len = mss; 11775 snxt_mp = tcp_get_seg_mp(tcp, begin, &off); 11776 ASSERT(snxt_mp != NULL); 11777 /* This should not happen. Defensive coding again... */ 11778 if (snxt_mp == NULL) { 11779 return; 11780 } 11781 11782 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, 11783 &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); 11784 if (xmit_mp == NULL) 11785 return; 11786 11787 usable_swnd -= seg_len; 11788 tcp->tcp_pipe += seg_len; 11789 tcp->tcp_sack_snxt = begin + seg_len; 11790 TCP_RECORD_TRACE(tcp, xmit_mp, TCP_TRACE_SEND_PKT); 11791 tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); 11792 11793 /* 11794 * Update the send timestamp to avoid false retransmission. 11795 */ 11796 snxt_mp->b_prev = (mblk_t *)lbolt; 11797 11798 BUMP_MIB(&tcp_mib, tcpRetransSegs); 11799 UPDATE_MIB(&tcp_mib, tcpRetransBytes, seg_len); 11800 BUMP_MIB(&tcp_mib, tcpOutSackRetransSegs); 11801 /* 11802 * Update tcp_rexmit_max to extend this SACK recovery phase. 11803 * This happens when new data sent during fast recovery is 11804 * also lost. If TCP retransmits those new data, it needs 11805 * to extend SACK recover phase to avoid starting another 11806 * fast retransmit/recovery unnecessarily. 11807 */ 11808 if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { 11809 tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; 11810 } 11811 } 11812 } 11813 11814 /* 11815 * This function handles policy checking at TCP level for non-hard_bound/ 11816 * detached connections. 11817 */ 11818 static boolean_t 11819 tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, 11820 boolean_t secure, boolean_t mctl_present) 11821 { 11822 ipsec_latch_t *ipl = NULL; 11823 ipsec_action_t *act = NULL; 11824 mblk_t *data_mp; 11825 ipsec_in_t *ii; 11826 const char *reason; 11827 kstat_named_t *counter; 11828 11829 ASSERT(mctl_present || !secure); 11830 11831 ASSERT((ipha == NULL && ip6h != NULL) || 11832 (ip6h == NULL && ipha != NULL)); 11833 11834 /* 11835 * We don't necessarily have an ipsec_in_act action to verify 11836 * policy because of assymetrical policy where we have only 11837 * outbound policy and no inbound policy (possible with global 11838 * policy). 11839 */ 11840 if (!secure) { 11841 if (act == NULL || act->ipa_act.ipa_type == IPSEC_ACT_BYPASS || 11842 act->ipa_act.ipa_type == IPSEC_ACT_CLEAR) 11843 return (B_TRUE); 11844 ipsec_log_policy_failure(tcp->tcp_wq, IPSEC_POLICY_MISMATCH, 11845 "tcp_check_policy", ipha, ip6h, secure); 11846 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 11847 &ipdrops_tcp_clear, &tcp_dropper); 11848 return (B_FALSE); 11849 } 11850 11851 /* 11852 * We have a secure packet. 
11853 */ 11854 if (act == NULL) { 11855 ipsec_log_policy_failure(tcp->tcp_wq, 11856 IPSEC_POLICY_NOT_NEEDED, "tcp_check_policy", ipha, ip6h, 11857 secure); 11858 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 11859 &ipdrops_tcp_secure, &tcp_dropper); 11860 return (B_FALSE); 11861 } 11862 11863 /* 11864 * XXX This whole routine is currently incorrect. ipl should 11865 * be set to the latch pointer, but is currently not set, so 11866 * we initialize it to NULL to avoid picking up random garbage. 11867 */ 11868 if (ipl == NULL) 11869 return (B_TRUE); 11870 11871 data_mp = first_mp->b_cont; 11872 11873 ii = (ipsec_in_t *)first_mp->b_rptr; 11874 11875 if (ipsec_check_ipsecin_latch(ii, data_mp, ipl, ipha, ip6h, &reason, 11876 &counter)) { 11877 BUMP_MIB(&ip_mib, ipsecInSucceeded); 11878 return (B_TRUE); 11879 } 11880 (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, 11881 "tcp inbound policy mismatch: %s, packet dropped\n", 11882 reason); 11883 BUMP_MIB(&ip_mib, ipsecInFailed); 11884 11885 ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, &tcp_dropper); 11886 return (B_FALSE); 11887 } 11888 11889 /* 11890 * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start 11891 * retransmission after a timeout. 11892 * 11893 * To limit the number of duplicate segments, we limit the number of segment 11894 * to be sent in one time to tcp_snd_burst, the burst variable. 11895 */ 11896 static void 11897 tcp_ss_rexmit(tcp_t *tcp) 11898 { 11899 uint32_t snxt; 11900 uint32_t smax; 11901 int32_t win; 11902 int32_t mss; 11903 int32_t off; 11904 int32_t burst = tcp->tcp_snd_burst; 11905 mblk_t *snxt_mp; 11906 11907 /* 11908 * Note that tcp_rexmit can be set even though TCP has retransmitted 11909 * all unack'ed segments. 11910 */ 11911 if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) { 11912 smax = tcp->tcp_rexmit_max; 11913 snxt = tcp->tcp_rexmit_nxt; 11914 if (SEQ_LT(snxt, tcp->tcp_suna)) { 11915 snxt = tcp->tcp_suna; 11916 } 11917 win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd); 11918 win -= snxt - tcp->tcp_suna; 11919 mss = tcp->tcp_mss; 11920 snxt_mp = tcp_get_seg_mp(tcp, snxt, &off); 11921 11922 while (SEQ_LT(snxt, smax) && (win > 0) && 11923 (burst > 0) && (snxt_mp != NULL)) { 11924 mblk_t *xmit_mp; 11925 mblk_t *old_snxt_mp = snxt_mp; 11926 uint32_t cnt = mss; 11927 11928 if (win < cnt) { 11929 cnt = win; 11930 } 11931 if (SEQ_GT(snxt + cnt, smax)) { 11932 cnt = smax - snxt; 11933 } 11934 xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off, 11935 &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE); 11936 if (xmit_mp == NULL) 11937 return; 11938 11939 tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); 11940 11941 snxt += cnt; 11942 win -= cnt; 11943 /* 11944 * Update the send timestamp to avoid false 11945 * retransmission. 11946 */ 11947 old_snxt_mp->b_prev = (mblk_t *)lbolt; 11948 BUMP_MIB(&tcp_mib, tcpRetransSegs); 11949 UPDATE_MIB(&tcp_mib, tcpRetransBytes, cnt); 11950 11951 tcp->tcp_rexmit_nxt = snxt; 11952 burst--; 11953 } 11954 /* 11955 * If we have transmitted all we have at the time 11956 * we started the retranmission, we can leave 11957 * the rest of the job to tcp_wput_data(). But we 11958 * need to check the send window first. If the 11959 * win is not 0, go on with tcp_wput_data(). 11960 */ 11961 if (SEQ_LT(snxt, smax) || win == 0) { 11962 return; 11963 } 11964 } 11965 /* Only call tcp_wput_data() if there is data to be sent. */ 11966 if (tcp->tcp_unsent) { 11967 tcp_wput_data(tcp, NULL, B_FALSE); 11968 } 11969 } 11970 11971 /* 11972 * Process all TCP option in SYN segment. 
Note that this function should 11973 * be called after tcp_adapt_ire() is called so that the necessary info 11974 * from IRE is already set in the tcp structure. 11975 * 11976 * This function sets up the correct tcp_mss value according to the 11977 * MSS option value and our header size. It also sets up the window scale 11978 * and timestamp values, and initialize SACK info blocks. But it does not 11979 * change receive window size after setting the tcp_mss value. The caller 11980 * should do the appropriate change. 11981 */ 11982 void 11983 tcp_process_options(tcp_t *tcp, tcph_t *tcph) 11984 { 11985 int options; 11986 tcp_opt_t tcpopt; 11987 uint32_t mss_max; 11988 char *tmp_tcph; 11989 11990 tcpopt.tcp = NULL; 11991 options = tcp_parse_options(tcph, &tcpopt); 11992 11993 /* 11994 * Process MSS option. Note that MSS option value does not account 11995 * for IP or TCP options. This means that it is equal to MTU - minimum 11996 * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for 11997 * IPv6. 11998 */ 11999 if (!(options & TCP_OPT_MSS_PRESENT)) { 12000 if (tcp->tcp_ipversion == IPV4_VERSION) 12001 tcpopt.tcp_opt_mss = tcp_mss_def_ipv4; 12002 else 12003 tcpopt.tcp_opt_mss = tcp_mss_def_ipv6; 12004 } else { 12005 if (tcp->tcp_ipversion == IPV4_VERSION) 12006 mss_max = tcp_mss_max_ipv4; 12007 else 12008 mss_max = tcp_mss_max_ipv6; 12009 if (tcpopt.tcp_opt_mss < tcp_mss_min) 12010 tcpopt.tcp_opt_mss = tcp_mss_min; 12011 else if (tcpopt.tcp_opt_mss > mss_max) 12012 tcpopt.tcp_opt_mss = mss_max; 12013 } 12014 12015 /* Process Window Scale option. */ 12016 if (options & TCP_OPT_WSCALE_PRESENT) { 12017 tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; 12018 tcp->tcp_snd_ws_ok = B_TRUE; 12019 } else { 12020 tcp->tcp_snd_ws = B_FALSE; 12021 tcp->tcp_snd_ws_ok = B_FALSE; 12022 tcp->tcp_rcv_ws = B_FALSE; 12023 } 12024 12025 /* Process Timestamp option. */ 12026 if ((options & TCP_OPT_TSTAMP_PRESENT) && 12027 (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { 12028 tmp_tcph = (char *)tcp->tcp_tcph; 12029 12030 tcp->tcp_snd_ts_ok = B_TRUE; 12031 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 12032 tcp->tcp_last_rcv_lbolt = lbolt64; 12033 ASSERT(OK_32PTR(tmp_tcph)); 12034 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 12035 12036 /* Fill in our template header with basic timestamp option. */ 12037 tmp_tcph += tcp->tcp_tcp_hdr_len; 12038 tmp_tcph[0] = TCPOPT_NOP; 12039 tmp_tcph[1] = TCPOPT_NOP; 12040 tmp_tcph[2] = TCPOPT_TSTAMP; 12041 tmp_tcph[3] = TCPOPT_TSTAMP_LEN; 12042 tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN; 12043 tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN; 12044 tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4); 12045 } else { 12046 tcp->tcp_snd_ts_ok = B_FALSE; 12047 } 12048 12049 /* 12050 * Process SACK options. If SACK is enabled for this connection, 12051 * then allocate the SACK info structure. Note the following ways 12052 * when tcp_snd_sack_ok is set to true. 12053 * 12054 * For active connection: in tcp_adapt_ire() called in 12055 * tcp_rput_other(), or in tcp_rput_other() when tcp_sack_permitted 12056 * is checked. 12057 * 12058 * For passive connection: in tcp_adapt_ire() called in 12059 * tcp_accept_comm(). 12060 * 12061 * That's the reason why the extra TCP_IS_DETACHED() check is there. 12062 * That check makes sure that if we did not send a SACK OK option, 12063 * we will not enable SACK for this connection even though the other 12064 * side sends us SACK OK option. For active connection, the SACK 12065 * info structure has already been allocated. 
So we need to free 12066 * it if SACK is disabled. 12067 */ 12068 if ((options & TCP_OPT_SACK_OK_PRESENT) && 12069 (tcp->tcp_snd_sack_ok || 12070 (tcp_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) { 12071 /* This should be true only in the passive case. */ 12072 if (tcp->tcp_sack_info == NULL) { 12073 ASSERT(TCP_IS_DETACHED(tcp)); 12074 tcp->tcp_sack_info = 12075 kmem_cache_alloc(tcp_sack_info_cache, KM_NOSLEEP); 12076 } 12077 if (tcp->tcp_sack_info == NULL) { 12078 tcp->tcp_snd_sack_ok = B_FALSE; 12079 } else { 12080 tcp->tcp_snd_sack_ok = B_TRUE; 12081 if (tcp->tcp_snd_ts_ok) { 12082 tcp->tcp_max_sack_blk = 3; 12083 } else { 12084 tcp->tcp_max_sack_blk = 4; 12085 } 12086 } 12087 } else { 12088 /* 12089 * Resetting tcp_snd_sack_ok to B_FALSE so that 12090 * no SACK info will be used for this 12091 * connection. This assumes that SACK usage 12092 * permission is negotiated. This may need 12093 * to be changed once this is clarified. 12094 */ 12095 if (tcp->tcp_sack_info != NULL) { 12096 ASSERT(tcp->tcp_notsack_list == NULL); 12097 kmem_cache_free(tcp_sack_info_cache, 12098 tcp->tcp_sack_info); 12099 tcp->tcp_sack_info = NULL; 12100 } 12101 tcp->tcp_snd_sack_ok = B_FALSE; 12102 } 12103 12104 /* 12105 * Now we know the exact TCP/IP header length, subtract 12106 * that from tcp_mss to get our side's MSS. 12107 */ 12108 tcp->tcp_mss -= tcp->tcp_hdr_len; 12109 /* 12110 * Here we assume that the other side's header size will be equal to 12111 * our header size. We calculate the real MSS accordingly. Need to 12112 * take into additional stuffs IPsec puts in. 12113 * 12114 * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) 12115 */ 12116 tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len + tcp->tcp_ipsec_overhead - 12117 ((tcp->tcp_ipversion == IPV4_VERSION ? 12118 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); 12119 12120 /* 12121 * Set MSS to the smaller one of both ends of the connection. 12122 * We should not have called tcp_mss_set() before, but our 12123 * side of the MSS should have been set to a proper value 12124 * by tcp_adapt_ire(). tcp_mss_set() will also set up the 12125 * STREAM head parameters properly. 12126 * 12127 * If we have a larger-than-16-bit window but the other side 12128 * didn't want to do window scale, tcp_rwnd_set() will take 12129 * care of that. 12130 */ 12131 tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); 12132 } 12133 12134 /* 12135 * Sends the T_CONN_IND to the listener. The caller calls this 12136 * functions via squeue to get inside the listener's perimeter 12137 * once the 3 way hand shake is done a T_CONN_IND needs to be 12138 * sent. As an optimization, the caller can call this directly 12139 * if listener's perimeter is same as eager's. 12140 */ 12141 /* ARGSUSED */ 12142 void 12143 tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) 12144 { 12145 conn_t *lconnp = (conn_t *)arg; 12146 tcp_t *listener = lconnp->conn_tcp; 12147 tcp_t *tcp; 12148 struct T_conn_ind *conn_ind; 12149 ipaddr_t *addr_cache; 12150 boolean_t need_send_conn_ind = B_FALSE; 12151 12152 /* retrieve the eager */ 12153 conn_ind = (struct T_conn_ind *)mp->b_rptr; 12154 ASSERT(conn_ind->OPT_offset != 0 && 12155 conn_ind->OPT_length == sizeof (intptr_t)); 12156 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 12157 conn_ind->OPT_length); 12158 12159 /* 12160 * TLI/XTI applications will get confused by 12161 * sending eager as an option since it violates 12162 * the option semantics. So remove the eager as 12163 * option since TLI/XTI app doesn't need it anyway. 
12164 */ 12165 if (!TCP_IS_SOCKET(listener)) { 12166 conn_ind->OPT_length = 0; 12167 conn_ind->OPT_offset = 0; 12168 } 12169 if (listener->tcp_state == TCPS_CLOSED || 12170 TCP_IS_DETACHED(listener)) { 12171 /* 12172 * If the listener has closed, it would have caused a 12173 * cleanup/blowoff to happen for the eager. We 12174 * just need to return. 12175 */ 12176 freemsg(mp); 12177 return; 12178 } 12179 12180 12181 /* 12182 * If the conn_req_q is full, defer passing up the 12183 * T_CONN_IND until space is available after t_accept() 12184 * processing 12185 */ 12186 mutex_enter(&listener->tcp_eager_lock); 12187 if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) { 12188 tcp_t *tail; 12189 12190 /* 12191 * The eager already has an extra ref put in tcp_rput_data 12192 * so that it stays till accept comes back even though it 12193 * might get into TCPS_CLOSED as a result of a TH_RST etc. 12194 */ 12195 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 12196 listener->tcp_conn_req_cnt_q0--; 12197 listener->tcp_conn_req_cnt_q++; 12198 12199 /* Move from SYN_RCVD to ESTABLISHED list */ 12200 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 12201 tcp->tcp_eager_prev_q0; 12202 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 12203 tcp->tcp_eager_next_q0; 12204 tcp->tcp_eager_prev_q0 = NULL; 12205 tcp->tcp_eager_next_q0 = NULL; 12206 12207 /* 12208 * Insert at end of the queue because sockfs 12209 * sends down T_CONN_RES in chronological 12210 * order. Leaving the older conn indications 12211 * at the front of the queue helps reduce search 12212 * time. 12213 */ 12214 tail = listener->tcp_eager_last_q; 12215 if (tail != NULL) 12216 tail->tcp_eager_next_q = tcp; 12217 else 12218 listener->tcp_eager_next_q = tcp; 12219 listener->tcp_eager_last_q = tcp; 12220 tcp->tcp_eager_next_q = NULL; 12221 /* 12222 * Delay sending up the T_conn_ind until we are 12223 * done with the eager. Once we have sent up 12224 * the T_conn_ind, the accept can potentially complete 12225 * any time and release the refhold we have on the eager. 12226 */ 12227 need_send_conn_ind = B_TRUE; 12228 } else { 12229 /* 12230 * Defer connection on q0 and set deferred 12231 * connection bit to true 12232 */ 12233 tcp->tcp_conn_def_q0 = B_TRUE; 12234 12235 /* take tcp out of q0 ... */ 12236 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 12237 tcp->tcp_eager_next_q0; 12238 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 12239 tcp->tcp_eager_prev_q0; 12240 12241 /* ... and place it at the end of q0 */ 12242 tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; 12243 tcp->tcp_eager_next_q0 = listener; 12244 listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; 12245 listener->tcp_eager_prev_q0 = tcp; 12246 tcp->tcp_conn.tcp_eager_conn_ind = mp; 12247 } 12248 12249 /* we have timed out before */ 12250 if (tcp->tcp_syn_rcvd_timeout != 0) { 12251 tcp->tcp_syn_rcvd_timeout = 0; 12252 listener->tcp_syn_rcvd_timeout--; 12253 if (listener->tcp_syn_defense && 12254 listener->tcp_syn_rcvd_timeout <= 12255 (tcp_conn_req_max_q0 >> 5) && 12256 10*MINUTES < TICK_TO_MSEC(lbolt64 - 12257 listener->tcp_last_rcv_lbolt)) { 12258 /* 12259 * Turn off the defense mode if we 12260 * believe the SYN attack is over.
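 *
 * For a sense of scale (using 1024, the usual default for
 * tcp_conn_req_max_q0, which is tunable): the defense is
 * dropped once no more than 1024 >> 5 = 32 timed-out
 * embryonic connections remain and more than 10 minutes
 * have elapsed according to tcp_last_rcv_lbolt.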
12261 */ 12262 listener->tcp_syn_defense = B_FALSE; 12263 if (listener->tcp_ip_addr_cache) { 12264 kmem_free((void *)listener->tcp_ip_addr_cache, 12265 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); 12266 listener->tcp_ip_addr_cache = NULL; 12267 } 12268 } 12269 } 12270 addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); 12271 if (addr_cache != NULL) { 12272 /* 12273 * We have finished a 3-way handshake with this 12274 * remote host. This proves the IP addr is good. 12275 * Cache it! 12276 */ 12277 addr_cache[IP_ADDR_CACHE_HASH( 12278 tcp->tcp_remote)] = tcp->tcp_remote; 12279 } 12280 mutex_exit(&listener->tcp_eager_lock); 12281 if (need_send_conn_ind) 12282 putnext(listener->tcp_rq, mp); 12283 } 12284 12285 mblk_t * 12286 tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, 12287 uint_t *ifindexp, ip6_pkt_t *ippp) 12288 { 12289 in_pktinfo_t *pinfo; 12290 ip6_t *ip6h; 12291 uchar_t *rptr; 12292 mblk_t *first_mp = mp; 12293 boolean_t mctl_present = B_FALSE; 12294 uint_t ifindex = 0; 12295 ip6_pkt_t ipp; 12296 uint_t ipvers; 12297 uint_t ip_hdr_len; 12298 12299 rptr = mp->b_rptr; 12300 ASSERT(OK_32PTR(rptr)); 12301 ASSERT(tcp != NULL); 12302 ipp.ipp_fields = 0; 12303 12304 switch DB_TYPE(mp) { 12305 case M_CTL: 12306 mp = mp->b_cont; 12307 if (mp == NULL) { 12308 freemsg(first_mp); 12309 return (NULL); 12310 } 12311 if (DB_TYPE(mp) != M_DATA) { 12312 freemsg(first_mp); 12313 return (NULL); 12314 } 12315 mctl_present = B_TRUE; 12316 break; 12317 case M_DATA: 12318 break; 12319 default: 12320 cmn_err(CE_NOTE, "tcp_find_pktinfo: unknown db_type"); 12321 freemsg(mp); 12322 return (NULL); 12323 } 12324 ipvers = IPH_HDR_VERSION(rptr); 12325 if (ipvers == IPV4_VERSION) { 12326 if (tcp == NULL) { 12327 ip_hdr_len = IPH_HDR_LENGTH(rptr); 12328 goto done; 12329 } 12330 12331 ipp.ipp_fields |= IPPF_HOPLIMIT; 12332 ipp.ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl; 12333 12334 /* 12335 * If we have IN_PKTINFO in an M_CTL and tcp_ipv6_recvancillary 12336 * has TCP_IPV6_RECVPKTINFO set, pass I/F index along in ipp. 
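 *
 * (For reference, an application asks for this ancillary data with the
 * standard RFC 3542 option, e.g.
 *
 *	int on = 1;
 *	(void) setsockopt(fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
 *	    &on, sizeof (on));
 *
 * which is what is expected to set TCP_IPV6_RECVPKTINFO in
 * tcp_ipv6_recvancillary for this connection.)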
12337 */ 12338 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) && 12339 mctl_present) { 12340 pinfo = (in_pktinfo_t *)first_mp->b_rptr; 12341 if ((MBLKL(first_mp) == sizeof (in_pktinfo_t)) && 12342 (pinfo->in_pkt_ulp_type == IN_PKTINFO) && 12343 (pinfo->in_pkt_flags & IPF_RECVIF)) { 12344 ipp.ipp_fields |= IPPF_IFINDEX; 12345 ipp.ipp_ifindex = pinfo->in_pkt_ifindex; 12346 ifindex = pinfo->in_pkt_ifindex; 12347 } 12348 freeb(first_mp); 12349 mctl_present = B_FALSE; 12350 } 12351 ip_hdr_len = IPH_HDR_LENGTH(rptr); 12352 } else { 12353 ip6h = (ip6_t *)rptr; 12354 12355 ASSERT(ipvers == IPV6_VERSION); 12356 ipp.ipp_fields = IPPF_HOPLIMIT | IPPF_TCLASS; 12357 ipp.ipp_tclass = (ip6h->ip6_flow & 0x0FF00000) >> 20; 12358 ipp.ipp_hoplimit = ip6h->ip6_hops; 12359 12360 if (ip6h->ip6_nxt != IPPROTO_TCP) { 12361 uint8_t nexthdrp; 12362 12363 /* Look for ifindex information */ 12364 if (ip6h->ip6_nxt == IPPROTO_RAW) { 12365 ip6i_t *ip6i = (ip6i_t *)ip6h; 12366 if ((uchar_t *)&ip6i[1] > mp->b_wptr) { 12367 BUMP_MIB(&ip_mib, tcpInErrs); 12368 freemsg(first_mp); 12369 return (NULL); 12370 } 12371 12372 if (ip6i->ip6i_flags & IP6I_IFINDEX) { 12373 ASSERT(ip6i->ip6i_ifindex != 0); 12374 ipp.ipp_fields |= IPPF_IFINDEX; 12375 ipp.ipp_ifindex = ip6i->ip6i_ifindex; 12376 ifindex = ip6i->ip6i_ifindex; 12377 } 12378 rptr = (uchar_t *)&ip6i[1]; 12379 mp->b_rptr = rptr; 12380 if (rptr == mp->b_wptr) { 12381 mblk_t *mp1; 12382 mp1 = mp->b_cont; 12383 freeb(mp); 12384 mp = mp1; 12385 rptr = mp->b_rptr; 12386 } 12387 if (MBLKL(mp) < IPV6_HDR_LEN + 12388 sizeof (tcph_t)) { 12389 BUMP_MIB(&ip_mib, tcpInErrs); 12390 freemsg(first_mp); 12391 return (NULL); 12392 } 12393 ip6h = (ip6_t *)rptr; 12394 } 12395 12396 /* 12397 * Find any potentially interesting extension headers 12398 * as well as the length of the IPv6 + extension 12399 * headers. 12400 */ 12401 ip_hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp); 12402 /* Verify if this is a TCP packet */ 12403 if (nexthdrp != IPPROTO_TCP) { 12404 BUMP_MIB(&ip_mib, tcpInErrs); 12405 freemsg(first_mp); 12406 return (NULL); 12407 } 12408 } else { 12409 ip_hdr_len = IPV6_HDR_LEN; 12410 } 12411 } 12412 12413 done: 12414 if (ipversp != NULL) 12415 *ipversp = ipvers; 12416 if (ip_hdr_lenp != NULL) 12417 *ip_hdr_lenp = ip_hdr_len; 12418 if (ippp != NULL) 12419 *ippp = ipp; 12420 if (ifindexp != NULL) 12421 *ifindexp = ifindex; 12422 if (mctl_present) { 12423 freeb(first_mp); 12424 } 12425 return (mp); 12426 } 12427 12428 /* 12429 * Handle M_DATA messages from IP. Its called directly from IP via 12430 * squeue for AF_INET type sockets fast path. No M_CTL are expected 12431 * in this path. 12432 * 12433 * For everything else (including AF_INET6 sockets with 'tcp_ipversion' 12434 * v4 and v6), we are called through tcp_input() and a M_CTL can 12435 * be present for options but tcp_find_pktinfo() deals with it. We 12436 * only expect M_DATA packets after tcp_find_pktinfo() is done. 12437 * 12438 * The first argument is always the connp/tcp to which the mp belongs. 12439 * There are no exceptions to this rule. The caller has already put 12440 * a reference on this connp/tcp and once tcp_rput_data() returns, 12441 * the squeue will do the refrele. 12442 * 12443 * The TH_SYN for the listener directly go to tcp_conn_request via 12444 * squeue. 
12445 * 12446 * sqp: NULL = recursive, sqp != NULL means called from squeue 12447 */ 12448 void 12449 tcp_rput_data(void *arg, mblk_t *mp, void *arg2) 12450 { 12451 int32_t bytes_acked; 12452 int32_t gap; 12453 mblk_t *mp1; 12454 uint_t flags; 12455 uint32_t new_swnd = 0; 12456 uchar_t *iphdr; 12457 uchar_t *rptr; 12458 int32_t rgap; 12459 uint32_t seg_ack; 12460 int seg_len; 12461 uint_t ip_hdr_len; 12462 uint32_t seg_seq; 12463 tcph_t *tcph; 12464 int urp; 12465 tcp_opt_t tcpopt; 12466 uint_t ipvers; 12467 ip6_pkt_t ipp; 12468 boolean_t ofo_seg = B_FALSE; /* Out of order segment */ 12469 uint32_t cwnd; 12470 uint32_t add; 12471 int npkt; 12472 int mss; 12473 conn_t *connp = (conn_t *)arg; 12474 squeue_t *sqp = (squeue_t *)arg2; 12475 tcp_t *tcp = connp->conn_tcp; 12476 12477 /* 12478 * RST from fused tcp loopback peer should trigger an unfuse. 12479 */ 12480 if (tcp->tcp_fused) { 12481 TCP_STAT(tcp_fusion_aborted); 12482 tcp_unfuse(tcp); 12483 } 12484 12485 iphdr = mp->b_rptr; 12486 rptr = mp->b_rptr; 12487 ASSERT(OK_32PTR(rptr)); 12488 12489 /* 12490 * An AF_INET socket is not capable of receiving any pktinfo. Do inline 12491 * processing here. For rest call tcp_find_pktinfo to fill up the 12492 * necessary information. 12493 */ 12494 if (IPCL_IS_TCP4(connp)) { 12495 ipvers = IPV4_VERSION; 12496 ip_hdr_len = IPH_HDR_LENGTH(rptr); 12497 } else { 12498 mp = tcp_find_pktinfo(tcp, mp, &ipvers, &ip_hdr_len, 12499 NULL, &ipp); 12500 if (mp == NULL) { 12501 TCP_STAT(tcp_rput_v6_error); 12502 return; 12503 } 12504 iphdr = mp->b_rptr; 12505 rptr = mp->b_rptr; 12506 } 12507 ASSERT(DB_TYPE(mp) == M_DATA); 12508 12509 tcph = (tcph_t *)&rptr[ip_hdr_len]; 12510 seg_seq = ABE32_TO_U32(tcph->th_seq); 12511 seg_ack = ABE32_TO_U32(tcph->th_ack); 12512 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 12513 seg_len = (int)(mp->b_wptr - rptr) - 12514 (ip_hdr_len + TCP_HDR_LENGTH(tcph)); 12515 if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { 12516 do { 12517 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 12518 (uintptr_t)INT_MAX); 12519 seg_len += (int)(mp1->b_wptr - mp1->b_rptr); 12520 } while ((mp1 = mp1->b_cont) != NULL && 12521 mp1->b_datap->db_type == M_DATA); 12522 } 12523 12524 if (tcp->tcp_state == TCPS_TIME_WAIT) { 12525 tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, 12526 seg_len, tcph); 12527 return; 12528 } 12529 12530 if (sqp != NULL) { 12531 /* 12532 * This is the correct place to update tcp_last_recv_time. Note 12533 * that it is also updated for tcp structure that belongs to 12534 * global and listener queues which do not really need updating. 12535 * But that should not cause any harm. And it is updated for 12536 * all kinds of incoming segments, not only for data segments. 12537 */ 12538 tcp->tcp_last_recv_time = lbolt; 12539 } 12540 12541 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 12542 12543 BUMP_LOCAL(tcp->tcp_ibsegs); 12544 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT); 12545 12546 if ((flags & TH_URG) && sqp != NULL) { 12547 /* 12548 * TCP can't handle urgent pointers that arrive before 12549 * the connection has been accept()ed since it can't 12550 * buffer OOB data. Discard segment if this happens. 12551 * 12552 * Nor can it reassemble urgent pointers, so discard 12553 * if it's not the next segment expected. 12554 * 12555 * Otherwise, collapse chain into one mblk (discard if 12556 * that fails). This makes sure the headers, retransmitted 12557 * data, and new data all are in the same mblk. 
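 *
 * (pullupmsg(mp, -1) below is the STREAMS way of asking for the entire
 * b_cont chain to be concatenated into one data block; if that
 * allocation fails the segment is simply dropped.)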
12558 */ 12559 ASSERT(mp != NULL); 12560 if (tcp->tcp_listener || !pullupmsg(mp, -1)) { 12561 freemsg(mp); 12562 return; 12563 } 12564 /* Update pointers into message */ 12565 iphdr = rptr = mp->b_rptr; 12566 tcph = (tcph_t *)&rptr[ip_hdr_len]; 12567 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { 12568 /* 12569 * Since we can't handle any data with this urgent 12570 * pointer that is out of sequence, we expunge 12571 * the data. This allows us to still register 12572 * the urgent mark and generate the M_PCSIG, 12573 * which we can do. 12574 */ 12575 mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); 12576 seg_len = 0; 12577 } 12578 } 12579 12580 switch (tcp->tcp_state) { 12581 case TCPS_SYN_SENT: 12582 if (flags & TH_ACK) { 12583 /* 12584 * Note that our stack cannot send data before a 12585 * connection is established, therefore the 12586 * following check is valid. Otherwise, it has 12587 * to be changed. 12588 */ 12589 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || 12590 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 12591 freemsg(mp); 12592 if (flags & TH_RST) 12593 return; 12594 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", 12595 tcp, seg_ack, 0, TH_RST); 12596 return; 12597 } 12598 ASSERT(tcp->tcp_suna + 1 == seg_ack); 12599 } 12600 if (flags & TH_RST) { 12601 freemsg(mp); 12602 if (flags & TH_ACK) 12603 (void) tcp_clean_death(tcp, 12604 ECONNREFUSED, 13); 12605 return; 12606 } 12607 if (!(flags & TH_SYN)) { 12608 freemsg(mp); 12609 return; 12610 } 12611 12612 /* Process all TCP options. */ 12613 tcp_process_options(tcp, tcph); 12614 /* 12615 * The following changes our rwnd to be a multiple of the 12616 * MIN(peer MSS, our MSS) for performance reason. 12617 */ 12618 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat, 12619 tcp->tcp_mss)); 12620 12621 /* Is the other end ECN capable? */ 12622 if (tcp->tcp_ecn_ok) { 12623 if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { 12624 tcp->tcp_ecn_ok = B_FALSE; 12625 } 12626 } 12627 /* 12628 * Clear ECN flags because it may interfere with later 12629 * processing. 12630 */ 12631 flags &= ~(TH_ECE|TH_CWR); 12632 12633 tcp->tcp_irs = seg_seq; 12634 tcp->tcp_rack = seg_seq; 12635 tcp->tcp_rnxt = seg_seq + 1; 12636 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 12637 if (!TCP_IS_DETACHED(tcp)) { 12638 /* Allocate room for SACK options if needed. */ 12639 if (tcp->tcp_snd_sack_ok) { 12640 (void) mi_set_sth_wroff(tcp->tcp_rq, 12641 tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + 12642 (tcp->tcp_loopback ? 0 : tcp_wroff_xtra)); 12643 } else { 12644 (void) mi_set_sth_wroff(tcp->tcp_rq, 12645 tcp->tcp_hdr_len + 12646 (tcp->tcp_loopback ? 0 : tcp_wroff_xtra)); 12647 } 12648 } 12649 if (flags & TH_ACK) { 12650 /* 12651 * If we can't get the confirmation upstream, pretend 12652 * we didn't even see this one. 12653 * 12654 * XXX: how can we pretend we didn't see it if we 12655 * have updated rnxt et. al. 12656 * 12657 * For loopback we defer sending up the T_CONN_CON 12658 * until after some checks below. 12659 */ 12660 mp1 = NULL; 12661 if (!tcp_conn_con(tcp, iphdr, tcph, mp, 12662 tcp->tcp_loopback ? &mp1 : NULL)) { 12663 freemsg(mp); 12664 return; 12665 } 12666 /* SYN was acked - making progress */ 12667 if (tcp->tcp_ipversion == IPV6_VERSION) 12668 tcp->tcp_ip_forward_progress = B_TRUE; 12669 12670 /* One for the SYN */ 12671 tcp->tcp_suna = tcp->tcp_iss + 1; 12672 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 12673 tcp->tcp_state = TCPS_ESTABLISHED; 12674 12675 /* 12676 * If SYN was retransmitted, need to reset all 12677 * retransmission info. 
This is because this 12678 * segment will be treated as a dup ACK. 12679 */ 12680 if (tcp->tcp_rexmit) { 12681 tcp->tcp_rexmit = B_FALSE; 12682 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 12683 tcp->tcp_rexmit_max = tcp->tcp_snxt; 12684 tcp->tcp_snd_burst = tcp->tcp_localnet ? 12685 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 12686 tcp->tcp_ms_we_have_waited = 0; 12687 12688 /* 12689 * Set tcp_cwnd back to 1 MSS, per 12690 * recommendation from 12691 * draft-floyd-incr-init-win-01.txt, 12692 * Increasing TCP's Initial Window. 12693 */ 12694 tcp->tcp_cwnd = tcp->tcp_mss; 12695 } 12696 12697 tcp->tcp_swl1 = seg_seq; 12698 tcp->tcp_swl2 = seg_ack; 12699 12700 new_swnd = BE16_TO_U16(tcph->th_win); 12701 tcp->tcp_swnd = new_swnd; 12702 if (new_swnd > tcp->tcp_max_swnd) 12703 tcp->tcp_max_swnd = new_swnd; 12704 12705 /* 12706 * Always send the three-way handshake ack immediately 12707 * in order to make the connection complete as soon as 12708 * possible on the accepting host. 12709 */ 12710 flags |= TH_ACK_NEEDED; 12711 12712 /* 12713 * Special case for loopback. At this point we have 12714 * received SYN-ACK from the remote endpoint. In 12715 * order to ensure that both endpoints reach the 12716 * fused state prior to any data exchange, the final 12717 * ACK needs to be sent before we indicate T_CONN_CON 12718 * to the module upstream. 12719 */ 12720 if (tcp->tcp_loopback) { 12721 mblk_t *ack_mp; 12722 12723 ASSERT(!tcp->tcp_unfusable); 12724 ASSERT(mp1 != NULL); 12725 /* 12726 * For loopback, we always get a pure SYN-ACK 12727 * and only need to send back the final ACK 12728 * with no data (this is because the other 12729 * tcp is ours and we don't do T/TCP). This 12730 * final ACK triggers the passive side to 12731 * perform fusion in ESTABLISHED state. 12732 */ 12733 if ((ack_mp = tcp_ack_mp(tcp)) != NULL) { 12734 if (tcp->tcp_ack_tid != 0) { 12735 (void) TCP_TIMER_CANCEL(tcp, 12736 tcp->tcp_ack_tid); 12737 tcp->tcp_ack_tid = 0; 12738 } 12739 TCP_RECORD_TRACE(tcp, ack_mp, 12740 TCP_TRACE_SEND_PKT); 12741 tcp_send_data(tcp, tcp->tcp_wq, ack_mp); 12742 BUMP_LOCAL(tcp->tcp_obsegs); 12743 BUMP_MIB(&tcp_mib, tcpOutAck); 12744 12745 /* Send up T_CONN_CON */ 12746 putnext(tcp->tcp_rq, mp1); 12747 12748 freemsg(mp); 12749 return; 12750 } 12751 /* 12752 * Forget fusion; we need to handle more 12753 * complex cases below. Send the deferred 12754 * T_CONN_CON message upstream and proceed 12755 * as usual. Mark this tcp as not capable 12756 * of fusion. 12757 */ 12758 TCP_STAT(tcp_fusion_unfusable); 12759 tcp->tcp_unfusable = B_TRUE; 12760 putnext(tcp->tcp_rq, mp1); 12761 } 12762 12763 /* 12764 * Check to see if there is data to be sent. If 12765 * yes, set the transmit flag. Then check to see 12766 * if received data processing needs to be done. 12767 * If not, go straight to xmit_check. This short 12768 * cut is OK as we don't support T/TCP. 
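 *
 * The ACK acceptability test applied at the top of this SYN_SENT case
 * is the RFC 793 rule ISS < SEG.ACK <= SND.NXT, written with the
 * wraparound-safe sequence macros.  A minimal sketch (illustrative
 * only; EX_SEQ_GT and EX_SEQ_LEQ stand in for SEQ_GT and SEQ_LEQ):
 *
 *    #define EX_SEQ_GT(a, b)     ((int32_t)((a) - (b)) > 0)
 *    #define EX_SEQ_LEQ(a, b)    ((int32_t)((a) - (b)) <= 0)
 *
 *    static boolean_t
 *    ex_syn_sent_ack_ok(uint32_t seg_ack, uint32_t iss, uint32_t snxt)
 *    {
 *        if (EX_SEQ_LEQ(seg_ack, iss) || EX_SEQ_GT(seg_ack, snxt))
 *            return (B_FALSE);
 *        return (B_TRUE);
 *    }
 *
 * An unacceptable ACK that is not itself a RST is answered with a RST
 * carrying the offending ack value, as done above.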
12769 */ 12770 if (tcp->tcp_unsent) 12771 flags |= TH_XMIT_NEEDED; 12772 12773 if (seg_len == 0 && !(flags & TH_URG)) { 12774 freemsg(mp); 12775 goto xmit_check; 12776 } 12777 12778 flags &= ~TH_SYN; 12779 seg_seq++; 12780 break; 12781 } 12782 tcp->tcp_state = TCPS_SYN_RCVD; 12783 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, 12784 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 12785 if (mp1) { 12786 DB_CPID(mp1) = tcp->tcp_cpid; 12787 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 12788 tcp_send_data(tcp, tcp->tcp_wq, mp1); 12789 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 12790 } 12791 freemsg(mp); 12792 return; 12793 case TCPS_SYN_RCVD: 12794 if (flags & TH_ACK) { 12795 /* 12796 * In this state, a SYN|ACK packet is either bogus 12797 * because the other side must be ACKing our SYN which 12798 * indicates it has seen the ACK for their SYN and 12799 * shouldn't retransmit it or we're crossing SYNs 12800 * on active open. 12801 */ 12802 if ((flags & TH_SYN) && !tcp->tcp_active_open) { 12803 freemsg(mp); 12804 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn", 12805 tcp, seg_ack, 0, TH_RST); 12806 return; 12807 } 12808 /* 12809 * NOTE: RFC 793 pg. 72 says this should be 12810 * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt 12811 * but that would mean we have an ack that ignored 12812 * our SYN. 12813 */ 12814 if (SEQ_LEQ(seg_ack, tcp->tcp_suna) || 12815 SEQ_GT(seg_ack, tcp->tcp_snxt)) { 12816 freemsg(mp); 12817 tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", 12818 tcp, seg_ack, 0, TH_RST); 12819 return; 12820 } 12821 } 12822 break; 12823 case TCPS_LISTEN: 12824 /* 12825 * Only a TLI listener can come through this path when a 12826 * acceptor is going back to be a listener and a packet 12827 * for the acceptor hits the classifier. For a socket 12828 * listener, this can never happen because a listener 12829 * can never accept connection on itself and hence a 12830 * socket acceptor can not go back to being a listener. 12831 */ 12832 ASSERT(!TCP_IS_SOCKET(tcp)); 12833 /*FALLTHRU*/ 12834 case TCPS_CLOSED: 12835 case TCPS_BOUND: { 12836 conn_t *new_connp; 12837 12838 new_connp = ipcl_classify(mp, connp->conn_zoneid); 12839 if (new_connp != NULL) { 12840 tcp_reinput(new_connp, mp, connp->conn_sqp); 12841 return; 12842 } 12843 /* We failed to classify. For now just drop the packet */ 12844 freemsg(mp); 12845 return; 12846 } 12847 case TCPS_IDLE: 12848 /* 12849 * Handle the case where the tcp_clean_death() has happened 12850 * on a connection (application hasn't closed yet) but a packet 12851 * was already queued on squeue before tcp_clean_death() 12852 * was processed. Calling tcp_clean_death() twice on same 12853 * connection can result in weird behaviour. 12854 */ 12855 freemsg(mp); 12856 return; 12857 default: 12858 break; 12859 } 12860 12861 /* 12862 * Already on the correct queue/perimeter. 12863 * If this is a detached connection and not an eager 12864 * connection hanging off a listener then new data 12865 * (past the FIN) will cause a reset. 12866 * We do a special check here where it 12867 * is out of the main line, rather than check 12868 * if we are detached every time we see new 12869 * data down below. 12870 */ 12871 if (TCP_IS_DETACHED_NONEAGER(tcp) && 12872 (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) { 12873 BUMP_MIB(&tcp_mib, tcpInClosed); 12874 TCP_RECORD_TRACE(tcp, 12875 mp, TCP_TRACE_RECV_PKT); 12876 12877 freemsg(mp); 12878 /* 12879 * This could be an SSL closure alert. We're detached so just 12880 * acknowledge it this last time. 
12881 */ 12882 if (tcp->tcp_kssl_ctx != NULL) { 12883 kssl_release_ctx(tcp->tcp_kssl_ctx); 12884 tcp->tcp_kssl_ctx = NULL; 12885 12886 tcp->tcp_rnxt += seg_len; 12887 U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); 12888 flags |= TH_ACK_NEEDED; 12889 goto ack_check; 12890 } 12891 12892 tcp_xmit_ctl("new data when detached", tcp, 12893 tcp->tcp_snxt, 0, TH_RST); 12894 (void) tcp_clean_death(tcp, EPROTO, 12); 12895 return; 12896 } 12897 12898 mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); 12899 urp = BE16_TO_U16(tcph->th_urp) - TCP_OLD_URP_INTERPRETATION; 12900 new_swnd = BE16_TO_U16(tcph->th_win) << 12901 ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); 12902 mss = tcp->tcp_mss; 12903 12904 if (tcp->tcp_snd_ts_ok) { 12905 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 12906 /* 12907 * This segment is not acceptable. 12908 * Drop it and send back an ACK. 12909 */ 12910 freemsg(mp); 12911 flags |= TH_ACK_NEEDED; 12912 goto ack_check; 12913 } 12914 } else if (tcp->tcp_snd_sack_ok) { 12915 ASSERT(tcp->tcp_sack_info != NULL); 12916 tcpopt.tcp = tcp; 12917 /* 12918 * SACK info in already updated in tcp_parse_options. Ignore 12919 * all other TCP options... 12920 */ 12921 (void) tcp_parse_options(tcph, &tcpopt); 12922 } 12923 try_again:; 12924 gap = seg_seq - tcp->tcp_rnxt; 12925 rgap = tcp->tcp_rwnd - (gap + seg_len); 12926 /* 12927 * gap is the amount of sequence space between what we expect to see 12928 * and what we got for seg_seq. A positive value for gap means 12929 * something got lost. A negative value means we got some old stuff. 12930 */ 12931 if (gap < 0) { 12932 /* Old stuff present. Is the SYN in there? */ 12933 if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && 12934 (seg_len != 0)) { 12935 flags &= ~TH_SYN; 12936 seg_seq++; 12937 urp--; 12938 /* Recompute the gaps after noting the SYN. */ 12939 goto try_again; 12940 } 12941 BUMP_MIB(&tcp_mib, tcpInDataDupSegs); 12942 UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, 12943 (seg_len > -gap ? -gap : seg_len)); 12944 /* Remove the old stuff from seg_len. */ 12945 seg_len += gap; 12946 /* 12947 * Anything left? 12948 * Make sure to check for unack'd FIN when rest of data 12949 * has been previously ack'd. 12950 */ 12951 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 12952 /* 12953 * Resets are only valid if they lie within our offered 12954 * window. If the RST bit is set, we just ignore this 12955 * segment. 12956 */ 12957 if (flags & TH_RST) { 12958 freemsg(mp); 12959 return; 12960 } 12961 12962 /* 12963 * The arriving of dup data packets indicate that we 12964 * may have postponed an ack for too long, or the other 12965 * side's RTT estimate is out of shape. Start acking 12966 * more often. 12967 */ 12968 if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) && 12969 tcp->tcp_rack_cnt >= 1 && 12970 tcp->tcp_rack_abs_max > 2) { 12971 tcp->tcp_rack_abs_max--; 12972 } 12973 tcp->tcp_rack_cur_max = 1; 12974 12975 /* 12976 * This segment is "unacceptable". None of its 12977 * sequence space lies within our advertized window. 12978 * 12979 * Adjust seg_len to the original value for tracing. 
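 *
 * For reference, the gap and rgap computed at try_again classify the
 * segment against the receive window; the arithmetic is just the
 * wraparound-safe signed differences (illustrative restatement):
 *
 *    gap  = (int32_t)(seg_seq - tcp_rnxt);
 *    rgap = (int32_t)(tcp_rwnd - (gap + seg_len));
 *
 * gap > 0 means a hole precedes this segment (something was lost or
 * reordered), gap < 0 means the front of the segment duplicates data
 * already received and gets trimmed, and rgap < 0 means the tail of the
 * segment lies beyond the advertised window and gets trimmed as well.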
12980 */ 12981 seg_len -= gap; 12982 if (tcp->tcp_debug) { 12983 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 12984 "tcp_rput: unacceptable, gap %d, rgap %d, " 12985 "flags 0x%x, seg_seq %u, seg_ack %u, " 12986 "seg_len %d, rnxt %u, snxt %u, %s", 12987 gap, rgap, flags, seg_seq, seg_ack, 12988 seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, 12989 tcp_display(tcp, NULL, 12990 DISP_ADDR_AND_PORT)); 12991 } 12992 12993 /* 12994 * Arrange to send an ACK in response to the 12995 * unacceptable segment per RFC 793 page 69. There 12996 * is only one small difference between ours and the 12997 * acceptability test in the RFC - we accept ACK-only 12998 * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK 12999 * will be generated. 13000 * 13001 * Note that we have to ACK an ACK-only packet at least 13002 * for stacks that send 0-length keep-alives with 13003 * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, 13004 * section 4.2.3.6. As long as we don't ever generate 13005 * an unacceptable packet in response to an incoming 13006 * packet that is unacceptable, it should not cause 13007 * "ACK wars". 13008 */ 13009 flags |= TH_ACK_NEEDED; 13010 13011 /* 13012 * Continue processing this segment in order to use the 13013 * ACK information it contains, but skip all other 13014 * sequence-number processing. Processing the ACK 13015 * information is necessary in order to 13016 * re-synchronize connections that may have lost 13017 * synchronization. 13018 * 13019 * We clear seg_len and flag fields related to 13020 * sequence number processing as they are not 13021 * to be trusted for an unacceptable segment. 13022 */ 13023 seg_len = 0; 13024 flags &= ~(TH_SYN | TH_FIN | TH_URG); 13025 goto process_ack; 13026 } 13027 13028 /* Fix seg_seq, and chew the gap off the front. */ 13029 seg_seq = tcp->tcp_rnxt; 13030 urp += gap; 13031 do { 13032 mblk_t *mp2; 13033 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 13034 (uintptr_t)UINT_MAX); 13035 gap += (uint_t)(mp->b_wptr - mp->b_rptr); 13036 if (gap > 0) { 13037 mp->b_rptr = mp->b_wptr - gap; 13038 break; 13039 } 13040 mp2 = mp; 13041 mp = mp->b_cont; 13042 freeb(mp2); 13043 } while (gap < 0); 13044 /* 13045 * If the urgent data has already been acknowledged, we 13046 * should ignore TH_URG below 13047 */ 13048 if (urp < 0) 13049 flags &= ~TH_URG; 13050 } 13051 /* 13052 * rgap is the amount of stuff received out of window. A negative 13053 * value is the amount out of window. 13054 */ 13055 if (rgap < 0) { 13056 mblk_t *mp2; 13057 13058 if (tcp->tcp_rwnd == 0) { 13059 BUMP_MIB(&tcp_mib, tcpInWinProbe); 13060 } else { 13061 BUMP_MIB(&tcp_mib, tcpInDataPastWinSegs); 13062 UPDATE_MIB(&tcp_mib, tcpInDataPastWinBytes, -rgap); 13063 } 13064 13065 /* 13066 * seg_len does not include the FIN, so if more than 13067 * just the FIN is out of window, we act like we don't 13068 * see it. (If just the FIN is out of window, rgap 13069 * will be zero and we will go ahead and acknowledge 13070 * the FIN.) 13071 */ 13072 flags &= ~TH_FIN; 13073 13074 /* Fix seg_len and make sure there is something left. */ 13075 seg_len += rgap; 13076 if (seg_len <= 0) { 13077 /* 13078 * Resets are only valid if they lie within our offered 13079 * window. If the RST bit is set, we just ignore this 13080 * segment. 13081 */ 13082 if (flags & TH_RST) { 13083 freemsg(mp); 13084 return; 13085 } 13086 13087 /* Per RFC 793, we need to send back an ACK. */ 13088 flags |= TH_ACK_NEEDED; 13089 13090 /* 13091 * Send SIGURG as soon as possible i.e. 
even 13092 * if the TH_URG was delivered in a window probe 13093 * packet (which will be unacceptable). 13094 * 13095 * We generate a signal if none has been generated 13096 * for this connection or if this is a new urgent 13097 * byte. Also send a zero-length "unmarked" message 13098 * to inform SIOCATMARK that this is not the mark. 13099 * 13100 * tcp_urp_last_valid is cleared when the T_exdata_ind 13101 * is sent up. This plus the check for old data 13102 * (gap >= 0) handles the wraparound of the sequence 13103 * number space without having to always track the 13104 * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks 13105 * this max in its rcv_up variable). 13106 * 13107 * This prevents duplicate SIGURGS due to a "late" 13108 * zero-window probe when the T_EXDATA_IND has already 13109 * been sent up. 13110 */ 13111 if ((flags & TH_URG) && 13112 (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, 13113 tcp->tcp_urp_last))) { 13114 mp1 = allocb(0, BPRI_MED); 13115 if (mp1 == NULL) { 13116 freemsg(mp); 13117 return; 13118 } 13119 if (!TCP_IS_DETACHED(tcp) && 13120 !putnextctl1(tcp->tcp_rq, M_PCSIG, 13121 SIGURG)) { 13122 /* Try again on the rexmit. */ 13123 freemsg(mp1); 13124 freemsg(mp); 13125 return; 13126 } 13127 /* 13128 * If the next byte would be the mark 13129 * then mark with MARKNEXT else mark 13130 * with NOTMARKNEXT. 13131 */ 13132 if (gap == 0 && urp == 0) 13133 mp1->b_flag |= MSGMARKNEXT; 13134 else 13135 mp1->b_flag |= MSGNOTMARKNEXT; 13136 freemsg(tcp->tcp_urp_mark_mp); 13137 tcp->tcp_urp_mark_mp = mp1; 13138 flags |= TH_SEND_URP_MARK; 13139 tcp->tcp_urp_last_valid = B_TRUE; 13140 tcp->tcp_urp_last = urp + seg_seq; 13141 } 13142 /* 13143 * If this is a zero window probe, continue to 13144 * process the ACK part. But we need to set seg_len 13145 * to 0 to avoid data processing. Otherwise just 13146 * drop the segment and send back an ACK. 13147 */ 13148 if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { 13149 flags &= ~(TH_SYN | TH_URG); 13150 seg_len = 0; 13151 goto process_ack; 13152 } else { 13153 freemsg(mp); 13154 goto ack_check; 13155 } 13156 } 13157 /* Pitch out of window stuff off the end. */ 13158 rgap = seg_len; 13159 mp2 = mp; 13160 do { 13161 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 13162 (uintptr_t)INT_MAX); 13163 rgap -= (int)(mp2->b_wptr - mp2->b_rptr); 13164 if (rgap < 0) { 13165 mp2->b_wptr += rgap; 13166 if ((mp1 = mp2->b_cont) != NULL) { 13167 mp2->b_cont = NULL; 13168 freemsg(mp1); 13169 } 13170 break; 13171 } 13172 } while ((mp2 = mp2->b_cont) != NULL); 13173 } 13174 ok:; 13175 /* 13176 * TCP should check ECN info for segments inside the window only. 13177 * Therefore the check should be done here. 13178 */ 13179 if (tcp->tcp_ecn_ok) { 13180 if (flags & TH_CWR) { 13181 tcp->tcp_ecn_echo_on = B_FALSE; 13182 } 13183 /* 13184 * Note that both ECN_CE and CWR can be set in the 13185 * same segment. In this case, we once again turn 13186 * on ECN_ECHO. 13187 */ 13188 if (tcp->tcp_ipversion == IPV4_VERSION) { 13189 uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; 13190 13191 if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { 13192 tcp->tcp_ecn_echo_on = B_TRUE; 13193 } 13194 } else { 13195 uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf; 13196 13197 if ((vcf & htonl(IPH_ECN_CE << 20)) == 13198 htonl(IPH_ECN_CE << 20)) { 13199 tcp->tcp_ecn_echo_on = B_TRUE; 13200 } 13201 } 13202 } 13203 13204 /* 13205 * Check whether we can update tcp_ts_recent. This test is 13206 * NOT the one in RFC 1323 3.4. 
It is from Braden, 1993, "TCP 13207 * Extensions for High Performance: An Update", Internet Draft. 13208 */ 13209 if (tcp->tcp_snd_ts_ok && 13210 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 13211 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 13212 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 13213 tcp->tcp_last_rcv_lbolt = lbolt64; 13214 } 13215 13216 if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { 13217 /* 13218 * FIN in an out of order segment. We record this in 13219 * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. 13220 * Clear the FIN so that any check on FIN flag will fail. 13221 * Remember that FIN also counts in the sequence number 13222 * space. So we need to ack out of order FIN only segments. 13223 */ 13224 if (flags & TH_FIN) { 13225 tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; 13226 tcp->tcp_ofo_fin_seq = seg_seq + seg_len; 13227 flags &= ~TH_FIN; 13228 flags |= TH_ACK_NEEDED; 13229 } 13230 if (seg_len > 0) { 13231 /* Fill in the SACK blk list. */ 13232 if (tcp->tcp_snd_sack_ok) { 13233 ASSERT(tcp->tcp_sack_info != NULL); 13234 tcp_sack_insert(tcp->tcp_sack_list, 13235 seg_seq, seg_seq + seg_len, 13236 &(tcp->tcp_num_sack_blk)); 13237 } 13238 13239 /* 13240 * Attempt reassembly and see if we have something 13241 * ready to go. 13242 */ 13243 mp = tcp_reass(tcp, mp, seg_seq); 13244 /* Always ack out of order packets */ 13245 flags |= TH_ACK_NEEDED | TH_PUSH; 13246 if (mp) { 13247 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 13248 (uintptr_t)INT_MAX); 13249 seg_len = mp->b_cont ? msgdsize(mp) : 13250 (int)(mp->b_wptr - mp->b_rptr); 13251 seg_seq = tcp->tcp_rnxt; 13252 /* 13253 * A gap is filled and the seq num and len 13254 * of the gap match that of a previously 13255 * received FIN, put the FIN flag back in. 13256 */ 13257 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 13258 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 13259 flags |= TH_FIN; 13260 tcp->tcp_valid_bits &= 13261 ~TCP_OFO_FIN_VALID; 13262 } 13263 } else { 13264 /* 13265 * Keep going even with NULL mp. 13266 * There may be a useful ACK or something else 13267 * we don't want to miss. 13268 * 13269 * But TCP should not perform fast retransmit 13270 * because of the ack number. TCP uses 13271 * seg_len == 0 to determine if it is a pure 13272 * ACK. And this is not a pure ACK. 13273 */ 13274 seg_len = 0; 13275 ofo_seg = B_TRUE; 13276 } 13277 } 13278 } else if (seg_len > 0) { 13279 BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); 13280 UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, seg_len); 13281 /* 13282 * If an out of order FIN was received before, and the seq 13283 * num and len of the new segment match that of the FIN, 13284 * put the FIN flag back in. 
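 *
 * A minimal sketch of this deferred-FIN bookkeeping (illustrative only,
 * using plain variables for what the code keeps in tcp_valid_bits and
 * tcp_ofo_fin_seq).  On the out-of-order path a FIN is recorded and
 * stripped:
 *
 *    ofo_fin_valid = B_TRUE;
 *    ofo_fin_seq = seg_seq + seg_len;
 *    flags &= ~TH_FIN;
 *    flags |= TH_ACK_NEEDED;
 *
 * and once in-order data (fresh or produced by reassembly) ends exactly
 * at the recorded sequence number, the flag is restored:
 *
 *    if (ofo_fin_valid && seg_seq + seg_len == ofo_fin_seq) {
 *        flags |= TH_FIN;
 *        ofo_fin_valid = B_FALSE;
 *    }
 *
 * The FIN consumes the sequence slot just past the data, which is why
 * even an out-of-order segment carrying nothing but a FIN still has to
 * be acked.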
13285 */ 13286 if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && 13287 seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { 13288 flags |= TH_FIN; 13289 tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; 13290 } 13291 } 13292 if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { 13293 if (flags & TH_RST) { 13294 freemsg(mp); 13295 switch (tcp->tcp_state) { 13296 case TCPS_SYN_RCVD: 13297 (void) tcp_clean_death(tcp, ECONNREFUSED, 14); 13298 break; 13299 case TCPS_ESTABLISHED: 13300 case TCPS_FIN_WAIT_1: 13301 case TCPS_FIN_WAIT_2: 13302 case TCPS_CLOSE_WAIT: 13303 (void) tcp_clean_death(tcp, ECONNRESET, 15); 13304 break; 13305 case TCPS_CLOSING: 13306 case TCPS_LAST_ACK: 13307 (void) tcp_clean_death(tcp, 0, 16); 13308 break; 13309 default: 13310 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 13311 (void) tcp_clean_death(tcp, ENXIO, 17); 13312 break; 13313 } 13314 return; 13315 } 13316 if (flags & TH_SYN) { 13317 /* 13318 * See RFC 793, Page 71 13319 * 13320 * The seq number must be in the window as it should 13321 * be "fixed" above. If it is outside window, it should 13322 * be already rejected. Note that we allow seg_seq to be 13323 * rnxt + rwnd because we want to accept 0 window probe. 13324 */ 13325 ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && 13326 SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); 13327 freemsg(mp); 13328 /* 13329 * If the ACK flag is not set, just use our snxt as the 13330 * seq number of the RST segment. 13331 */ 13332 if (!(flags & TH_ACK)) { 13333 seg_ack = tcp->tcp_snxt; 13334 } 13335 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 13336 TH_RST|TH_ACK); 13337 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 13338 (void) tcp_clean_death(tcp, ECONNRESET, 18); 13339 return; 13340 } 13341 /* 13342 * urp could be -1 when the urp field in the packet is 0 13343 * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent 13344 * byte was at seg_seq - 1, in which case we ignore the urgent flag. 13345 */ 13346 if (flags & TH_URG && urp >= 0) { 13347 if (!tcp->tcp_urp_last_valid || 13348 SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { 13349 /* 13350 * If we haven't generated the signal yet for this 13351 * urgent pointer value, do it now. Also, send up a 13352 * zero-length M_DATA indicating whether or not this is 13353 * the mark. The latter is not needed when a 13354 * T_EXDATA_IND is sent up. However, if there are 13355 * allocation failures this code relies on the sender 13356 * retransmitting and the socket code for determining 13357 * the mark should not block waiting for the peer to 13358 * transmit. Thus, for simplicity we always send up the 13359 * mark indication. 13360 */ 13361 mp1 = allocb(0, BPRI_MED); 13362 if (mp1 == NULL) { 13363 freemsg(mp); 13364 return; 13365 } 13366 if (!TCP_IS_DETACHED(tcp) && 13367 !putnextctl1(tcp->tcp_rq, M_PCSIG, SIGURG)) { 13368 /* Try again on the rexmit. */ 13369 freemsg(mp1); 13370 freemsg(mp); 13371 return; 13372 } 13373 /* 13374 * Mark with NOTMARKNEXT for now. 13375 * The code below will change this to MARKNEXT 13376 * if we are at the mark. 13377 * 13378 * If there are allocation failures (e.g. in dupmsg 13379 * below) the next time tcp_rput_data sees the urgent 13380 * segment it will send up the MSG*MARKNEXT message. 
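 *
 * The guard used above against raising SIGURG twice for the same
 * urgent byte is worth spelling out; a minimal sketch (illustrative
 * only, with plain parameters standing in for the tcp_urp_last state):
 *
 *    static boolean_t
 *    ex_new_urgent_byte(int32_t urp_off, uint32_t seg_seq,
 *        boolean_t urp_last_valid, uint32_t urp_last)
 *    {
 *        uint32_t urg_seq = seg_seq + urp_off;
 *
 *        if (!urp_last_valid)
 *            return (B_TRUE);
 *        return ((int32_t)(urg_seq - urp_last) > 0 ? B_TRUE : B_FALSE);
 *    }
 *
 * seg_seq + urp is the absolute sequence number of the urgent byte;
 * tcp_urp_last remembers the last one signalled and is invalidated once
 * the T_EXDATA_IND goes upstream, so a retransmission or a late
 * zero-window probe covering an already-signalled urgent byte does not
 * raise SIGURG again.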
13381 */ 13382 mp1->b_flag |= MSGNOTMARKNEXT; 13383 freemsg(tcp->tcp_urp_mark_mp); 13384 tcp->tcp_urp_mark_mp = mp1; 13385 flags |= TH_SEND_URP_MARK; 13386 #ifdef DEBUG 13387 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13388 "tcp_rput: sent M_PCSIG 2 seq %x urp %x " 13389 "last %x, %s", 13390 seg_seq, urp, tcp->tcp_urp_last, 13391 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 13392 #endif /* DEBUG */ 13393 tcp->tcp_urp_last_valid = B_TRUE; 13394 tcp->tcp_urp_last = urp + seg_seq; 13395 } else if (tcp->tcp_urp_mark_mp != NULL) { 13396 /* 13397 * An allocation failure prevented the previous 13398 * tcp_rput_data from sending up the allocated 13399 * MSG*MARKNEXT message - send it up this time 13400 * around. 13401 */ 13402 flags |= TH_SEND_URP_MARK; 13403 } 13404 13405 /* 13406 * If the urgent byte is in this segment, make sure that it is 13407 * all by itself. This makes it much easier to deal with the 13408 * possibility of an allocation failure on the T_exdata_ind. 13409 * Note that seg_len is the number of bytes in the segment, and 13410 * urp is the offset into the segment of the urgent byte. 13411 * urp < seg_len means that the urgent byte is in this segment. 13412 */ 13413 if (urp < seg_len) { 13414 if (seg_len != 1) { 13415 uint32_t tmp_rnxt; 13416 /* 13417 * Break it up and feed it back in. 13418 * Re-attach the IP header. 13419 */ 13420 mp->b_rptr = iphdr; 13421 if (urp > 0) { 13422 /* 13423 * There is stuff before the urgent 13424 * byte. 13425 */ 13426 mp1 = dupmsg(mp); 13427 if (!mp1) { 13428 /* 13429 * Trim from urgent byte on. 13430 * The rest will come back. 13431 */ 13432 (void) adjmsg(mp, 13433 urp - seg_len); 13434 tcp_rput_data(connp, 13435 mp, NULL); 13436 return; 13437 } 13438 (void) adjmsg(mp1, urp - seg_len); 13439 /* Feed this piece back in. */ 13440 tmp_rnxt = tcp->tcp_rnxt; 13441 tcp_rput_data(connp, mp1, NULL); 13442 /* 13443 * If the data passed back in was not 13444 * processed (ie: bad ACK) sending 13445 * the remainder back in will cause a 13446 * loop. In this case, drop the 13447 * packet and let the sender try 13448 * sending a good packet. 13449 */ 13450 if (tmp_rnxt == tcp->tcp_rnxt) { 13451 freemsg(mp); 13452 return; 13453 } 13454 } 13455 if (urp != seg_len - 1) { 13456 uint32_t tmp_rnxt; 13457 /* 13458 * There is stuff after the urgent 13459 * byte. 13460 */ 13461 mp1 = dupmsg(mp); 13462 if (!mp1) { 13463 /* 13464 * Trim everything beyond the 13465 * urgent byte. The rest will 13466 * come back. 13467 */ 13468 (void) adjmsg(mp, 13469 urp + 1 - seg_len); 13470 tcp_rput_data(connp, 13471 mp, NULL); 13472 return; 13473 } 13474 (void) adjmsg(mp1, urp + 1 - seg_len); 13475 tmp_rnxt = tcp->tcp_rnxt; 13476 tcp_rput_data(connp, mp1, NULL); 13477 /* 13478 * If the data passed back in was not 13479 * processed (ie: bad ACK) sending 13480 * the remainder back in will cause a 13481 * loop. In this case, drop the 13482 * packet and let the sender try 13483 * sending a good packet. 13484 */ 13485 if (tmp_rnxt == tcp->tcp_rnxt) { 13486 freemsg(mp); 13487 return; 13488 } 13489 } 13490 tcp_rput_data(connp, mp, NULL); 13491 return; 13492 } 13493 /* 13494 * This segment contains only the urgent byte. We 13495 * have to allocate the T_exdata_ind, if we can. 13496 */ 13497 if (!tcp->tcp_urp_mp) { 13498 struct T_exdata_ind *tei; 13499 mp1 = allocb(sizeof (struct T_exdata_ind), 13500 BPRI_MED); 13501 if (!mp1) { 13502 /* 13503 * Sigh... It'll be back. 13504 * Generate any MSG*MARK message now. 
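 *
 * The carving above relies on adjmsg(9F) semantics: a positive byte
 * count trims from the head of a message, a negative count trims from
 * the tail.  With seg_len payload bytes and the urgent byte at offset
 * urp, the pieces fed back through tcp_rput_data() are, roughly
 * (illustrative sketch; the IP header is re-attached first, as above):
 *
 *    piece 1, bytes 0 .. urp-1:    mp1 = dupmsg(mp);
 *                                  adjmsg(mp1, urp - seg_len);
 *    piece 2, bytes 0 .. urp:      mp1 = dupmsg(mp);
 *                                  adjmsg(mp1, urp + 1 - seg_len);
 *    remainder:                    mp itself, fed back whole; the bytes
 *                                  consumed by the earlier pieces show
 *                                  up as old data (gap < 0) and are
 *                                  trimmed again.
 *
 * The tmp_rnxt comparisons guard the recursion: if a piece fed back did
 * not advance tcp_rnxt (a bad ACK, say), feeding in the rest would just
 * loop, so the segment is dropped and the sender is left to retransmit
 * a good packet.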
13505 */ 13506 freemsg(mp); 13507 seg_len = 0; 13508 if (flags & TH_SEND_URP_MARK) { 13509 13510 13511 ASSERT(tcp->tcp_urp_mark_mp); 13512 tcp->tcp_urp_mark_mp->b_flag &= 13513 ~MSGNOTMARKNEXT; 13514 tcp->tcp_urp_mark_mp->b_flag |= 13515 MSGMARKNEXT; 13516 } 13517 goto ack_check; 13518 } 13519 mp1->b_datap->db_type = M_PROTO; 13520 tei = (struct T_exdata_ind *)mp1->b_rptr; 13521 tei->PRIM_type = T_EXDATA_IND; 13522 tei->MORE_flag = 0; 13523 mp1->b_wptr = (uchar_t *)&tei[1]; 13524 tcp->tcp_urp_mp = mp1; 13525 #ifdef DEBUG 13526 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13527 "tcp_rput: allocated exdata_ind %s", 13528 tcp_display(tcp, NULL, 13529 DISP_PORT_ONLY)); 13530 #endif /* DEBUG */ 13531 /* 13532 * There is no need to send a separate MSG*MARK 13533 * message since the T_EXDATA_IND will be sent 13534 * now. 13535 */ 13536 flags &= ~TH_SEND_URP_MARK; 13537 freemsg(tcp->tcp_urp_mark_mp); 13538 tcp->tcp_urp_mark_mp = NULL; 13539 } 13540 /* 13541 * Now we are all set. On the next putnext upstream, 13542 * tcp_urp_mp will be non-NULL and will get prepended 13543 * to what has to be this piece containing the urgent 13544 * byte. If for any reason we abort this segment below, 13545 * if it comes back, we will have this ready, or it 13546 * will get blown off in close. 13547 */ 13548 } else if (urp == seg_len) { 13549 /* 13550 * The urgent byte is the next byte after this sequence 13551 * number. If there is data it is marked with 13552 * MSGMARKNEXT and any tcp_urp_mark_mp is discarded 13553 * since it is not needed. Otherwise, if the code 13554 * above just allocated a zero-length tcp_urp_mark_mp 13555 * message, that message is tagged with MSGMARKNEXT. 13556 * Sending up these MSGMARKNEXT messages makes 13557 * SIOCATMARK work correctly even though 13558 * the T_EXDATA_IND will not be sent up until the 13559 * urgent byte arrives. 13560 */ 13561 if (seg_len != 0) { 13562 flags |= TH_MARKNEXT_NEEDED; 13563 freemsg(tcp->tcp_urp_mark_mp); 13564 tcp->tcp_urp_mark_mp = NULL; 13565 flags &= ~TH_SEND_URP_MARK; 13566 } else if (tcp->tcp_urp_mark_mp != NULL) { 13567 flags |= TH_SEND_URP_MARK; 13568 tcp->tcp_urp_mark_mp->b_flag &= 13569 ~MSGNOTMARKNEXT; 13570 tcp->tcp_urp_mark_mp->b_flag |= MSGMARKNEXT; 13571 } 13572 #ifdef DEBUG 13573 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13574 "tcp_rput: AT MARK, len %d, flags 0x%x, %s", 13575 seg_len, flags, 13576 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 13577 #endif /* DEBUG */ 13578 } else { 13579 /* Data left until we hit mark */ 13580 #ifdef DEBUG 13581 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 13582 "tcp_rput: URP %d bytes left, %s", 13583 urp - seg_len, tcp_display(tcp, NULL, 13584 DISP_PORT_ONLY)); 13585 #endif /* DEBUG */ 13586 } 13587 } 13588 13589 process_ack: 13590 if (!(flags & TH_ACK)) { 13591 freemsg(mp); 13592 goto xmit_check; 13593 } 13594 } 13595 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 13596 13597 if (tcp->tcp_ipversion == IPV6_VERSION && bytes_acked > 0) 13598 tcp->tcp_ip_forward_progress = B_TRUE; 13599 if (tcp->tcp_state == TCPS_SYN_RCVD) { 13600 if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) && 13601 ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) { 13602 /* 3-way handshake complete - pass up the T_CONN_IND */ 13603 tcp_t *listener = tcp->tcp_listener; 13604 mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind; 13605 13606 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 13607 /* 13608 * We are here means eager is fine but it can 13609 * get a TH_RST at any point between now and till 13610 * accept completes and disappear. 
We need to 13611 * ensure that reference to eager is valid after 13612 * we get out of eager's perimeter. So we do 13613 * an extra refhold. 13614 */ 13615 CONN_INC_REF(connp); 13616 13617 /* 13618 * The listener also exists because of the refhold 13619 * done in tcp_conn_request. Its possible that it 13620 * might have closed. We will check that once we 13621 * get inside listeners context. 13622 */ 13623 CONN_INC_REF(listener->tcp_connp); 13624 if (listener->tcp_connp->conn_sqp == 13625 connp->conn_sqp) { 13626 tcp_send_conn_ind(listener->tcp_connp, mp, 13627 listener->tcp_connp->conn_sqp); 13628 CONN_DEC_REF(listener->tcp_connp); 13629 } else if (!tcp->tcp_loopback) { 13630 squeue_fill(listener->tcp_connp->conn_sqp, mp, 13631 tcp_send_conn_ind, 13632 listener->tcp_connp, SQTAG_TCP_CONN_IND); 13633 } else { 13634 squeue_enter(listener->tcp_connp->conn_sqp, mp, 13635 tcp_send_conn_ind, listener->tcp_connp, 13636 SQTAG_TCP_CONN_IND); 13637 } 13638 } 13639 13640 if (tcp->tcp_active_open) { 13641 /* 13642 * We are seeing the final ack in the three way 13643 * hand shake of a active open'ed connection 13644 * so we must send up a T_CONN_CON 13645 */ 13646 if (!tcp_conn_con(tcp, iphdr, tcph, mp, NULL)) { 13647 freemsg(mp); 13648 return; 13649 } 13650 /* 13651 * Don't fuse the loopback endpoints for 13652 * simultaneous active opens. 13653 */ 13654 if (tcp->tcp_loopback) { 13655 TCP_STAT(tcp_fusion_unfusable); 13656 tcp->tcp_unfusable = B_TRUE; 13657 } 13658 } 13659 13660 tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ 13661 bytes_acked--; 13662 /* SYN was acked - making progress */ 13663 if (tcp->tcp_ipversion == IPV6_VERSION) 13664 tcp->tcp_ip_forward_progress = B_TRUE; 13665 13666 /* 13667 * If SYN was retransmitted, need to reset all 13668 * retransmission info as this segment will be 13669 * treated as a dup ACK. 13670 */ 13671 if (tcp->tcp_rexmit) { 13672 tcp->tcp_rexmit = B_FALSE; 13673 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 13674 tcp->tcp_rexmit_max = tcp->tcp_snxt; 13675 tcp->tcp_snd_burst = tcp->tcp_localnet ? 13676 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 13677 tcp->tcp_ms_we_have_waited = 0; 13678 tcp->tcp_cwnd = mss; 13679 } 13680 13681 /* 13682 * We set the send window to zero here. 13683 * This is needed if there is data to be 13684 * processed already on the queue. 13685 * Later (at swnd_update label), the 13686 * "new_swnd > tcp_swnd" condition is satisfied 13687 * the XMIT_NEEDED flag is set in the current 13688 * (SYN_RCVD) state. This ensures tcp_wput_data() is 13689 * called if there is already data on queue in 13690 * this state. 13691 */ 13692 tcp->tcp_swnd = 0; 13693 13694 if (new_swnd > tcp->tcp_max_swnd) 13695 tcp->tcp_max_swnd = new_swnd; 13696 tcp->tcp_swl1 = seg_seq; 13697 tcp->tcp_swl2 = seg_ack; 13698 tcp->tcp_state = TCPS_ESTABLISHED; 13699 tcp->tcp_valid_bits &= ~TCP_ISS_VALID; 13700 13701 /* Fuse when both sides are in ESTABLISHED state */ 13702 if (tcp->tcp_loopback && do_tcp_fusion) 13703 tcp_fuse(tcp, iphdr, tcph); 13704 13705 } 13706 /* This code follows 4.4BSD-Lite2 mostly. */ 13707 if (bytes_acked < 0) 13708 goto est; 13709 13710 /* 13711 * If TCP is ECN capable and the congestion experience bit is 13712 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be 13713 * done once per window (or more loosely, per RTT). 
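 *
 * A minimal sketch of that reduction, mirroring the code below
 * (illustrative only; npkt is half the data currently in flight,
 * counted in segments):
 *
 *    if (cwr && SEQ_GT(seg_ack, cwr_snd_max))
 *        cwr = B_FALSE;
 *    if (ecn_ok && (flags & TH_ECE) && !cwr) {
 *        npkt = ((snxt - suna) >> 1) / mss;
 *        ssthresh = MAX(npkt, 2) * mss;
 *        cwnd = npkt * mss;
 *        cwr = B_TRUE;
 *        cwr_snd_max = snxt;
 *    }
 *
 * tcp_cwr stays set until everything that was in flight at the time of
 * the cut (up to cwr_snd_max) has been acked, which is what limits the
 * reduction to once per window; when npkt works out to zero, the
 * retransmit timer is restarted to clock out the next segment, as the
 * ECN spec requires.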
13714 */ 13715 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) 13716 tcp->tcp_cwr = B_FALSE; 13717 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { 13718 if (!tcp->tcp_cwr) { 13719 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss; 13720 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; 13721 tcp->tcp_cwnd = npkt * mss; 13722 /* 13723 * If the cwnd is 0, use the timer to clock out 13724 * new segments. This is required by the ECN spec. 13725 */ 13726 if (npkt == 0) { 13727 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 13728 /* 13729 * This makes sure that when the ACK comes 13730 * back, we will increase tcp_cwnd by 1 MSS. 13731 */ 13732 tcp->tcp_cwnd_cnt = 0; 13733 } 13734 tcp->tcp_cwr = B_TRUE; 13735 /* 13736 * This marks the end of the current window of in 13737 * flight data. That is why we don't use 13738 * tcp_suna + tcp_swnd. Only data in flight can 13739 * provide ECN info. 13740 */ 13741 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 13742 tcp->tcp_ecn_cwr_sent = B_FALSE; 13743 } 13744 } 13745 13746 mp1 = tcp->tcp_xmit_head; 13747 if (bytes_acked == 0) { 13748 if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { 13749 int dupack_cnt; 13750 13751 BUMP_MIB(&tcp_mib, tcpInDupAck); 13752 /* 13753 * Fast retransmit. When we have seen exactly three 13754 * identical ACKs while we have unacked data 13755 * outstanding we take it as a hint that our peer 13756 * dropped something. 13757 * 13758 * If TCP is retransmitting, don't do fast retransmit. 13759 */ 13760 if (mp1 && tcp->tcp_suna != tcp->tcp_snxt && 13761 ! tcp->tcp_rexmit) { 13762 /* Do Limited Transmit */ 13763 if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < 13764 tcp_dupack_fast_retransmit) { 13765 /* 13766 * RFC 3042 13767 * 13768 * What we need to do is temporarily 13769 * increase tcp_cwnd so that new 13770 * data can be sent if it is allowed 13771 * by the receive window (tcp_rwnd). 13772 * tcp_wput_data() will take care of 13773 * the rest. 13774 * 13775 * If the connection is SACK capable, 13776 * only do limited xmit when there 13777 * is SACK info. 13778 * 13779 * Note how tcp_cwnd is incremented. 13780 * The first dup ACK will increase 13781 * it by 1 MSS. The second dup ACK 13782 * will increase it by 2 MSS. This 13783 * means that only 1 new segment will 13784 * be sent for each dup ACK. 13785 */ 13786 if (tcp->tcp_unsent > 0 && 13787 (!tcp->tcp_snd_sack_ok || 13788 (tcp->tcp_snd_sack_ok && 13789 tcp->tcp_notsack_list != NULL))) { 13790 tcp->tcp_cwnd += mss << 13791 (tcp->tcp_dupack_cnt - 1); 13792 flags |= TH_LIMIT_XMIT; 13793 } 13794 } else if (dupack_cnt == 13795 tcp_dupack_fast_retransmit) { 13796 13797 /* 13798 * If we have reduced tcp_ssthresh 13799 * because of ECN, do not reduce it again 13800 * unless it is already one window of data 13801 * away. After one window of data, tcp_cwr 13802 * should then be cleared. Note that 13803 * for non ECN capable connection, tcp_cwr 13804 * should always be false. 13805 * 13806 * Adjust cwnd since the duplicate 13807 * ack indicates that a packet was 13808 * dropped (due to congestion.) 13809 */ 13810 if (!tcp->tcp_cwr) { 13811 npkt = ((tcp->tcp_snxt - 13812 tcp->tcp_suna) >> 1) / mss; 13813 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * 13814 mss; 13815 tcp->tcp_cwnd = (npkt + 13816 tcp->tcp_dupack_cnt) * mss; 13817 } 13818 if (tcp->tcp_ecn_ok) { 13819 tcp->tcp_cwr = B_TRUE; 13820 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 13821 tcp->tcp_ecn_cwr_sent = B_FALSE; 13822 } 13823 13824 /* 13825 * We do Hoe's algorithm. 
Refer to her 13826 * paper "Improving the Start-up Behavior 13827 * of a Congestion Control Scheme for TCP," 13828 * appeared in SIGCOMM'96. 13829 * 13830 * Save highest seq no we have sent so far. 13831 * Be careful about the invisible FIN byte. 13832 */ 13833 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 13834 (tcp->tcp_unsent == 0)) { 13835 tcp->tcp_rexmit_max = tcp->tcp_fss; 13836 } else { 13837 tcp->tcp_rexmit_max = tcp->tcp_snxt; 13838 } 13839 13840 /* 13841 * Do not allow bursty traffic during. 13842 * fast recovery. Refer to Fall and Floyd's 13843 * paper "Simulation-based Comparisons of 13844 * Tahoe, Reno and SACK TCP" (in CCR?) 13845 * This is a best current practise. 13846 */ 13847 tcp->tcp_snd_burst = TCP_CWND_SS; 13848 13849 /* 13850 * For SACK: 13851 * Calculate tcp_pipe, which is the 13852 * estimated number of bytes in 13853 * network. 13854 * 13855 * tcp_fack is the highest sack'ed seq num 13856 * TCP has received. 13857 * 13858 * tcp_pipe is explained in the above quoted 13859 * Fall and Floyd's paper. tcp_fack is 13860 * explained in Mathis and Mahdavi's 13861 * "Forward Acknowledgment: Refining TCP 13862 * Congestion Control" in SIGCOMM '96. 13863 */ 13864 if (tcp->tcp_snd_sack_ok) { 13865 ASSERT(tcp->tcp_sack_info != NULL); 13866 if (tcp->tcp_notsack_list != NULL) { 13867 tcp->tcp_pipe = tcp->tcp_snxt - 13868 tcp->tcp_fack; 13869 tcp->tcp_sack_snxt = seg_ack; 13870 flags |= TH_NEED_SACK_REXMIT; 13871 } else { 13872 /* 13873 * Always initialize tcp_pipe 13874 * even though we don't have 13875 * any SACK info. If later 13876 * we get SACK info and 13877 * tcp_pipe is not initialized, 13878 * funny things will happen. 13879 */ 13880 tcp->tcp_pipe = 13881 tcp->tcp_cwnd_ssthresh; 13882 } 13883 } else { 13884 flags |= TH_REXMIT_NEEDED; 13885 } /* tcp_snd_sack_ok */ 13886 13887 } else { 13888 /* 13889 * Here we perform congestion 13890 * avoidance, but NOT slow start. 13891 * This is known as the Fast 13892 * Recovery Algorithm. 13893 */ 13894 if (tcp->tcp_snd_sack_ok && 13895 tcp->tcp_notsack_list != NULL) { 13896 flags |= TH_NEED_SACK_REXMIT; 13897 tcp->tcp_pipe -= mss; 13898 if (tcp->tcp_pipe < 0) 13899 tcp->tcp_pipe = 0; 13900 } else { 13901 /* 13902 * We know that one more packet has 13903 * left the pipe thus we can update 13904 * cwnd. 13905 */ 13906 cwnd = tcp->tcp_cwnd + mss; 13907 if (cwnd > tcp->tcp_cwnd_max) 13908 cwnd = tcp->tcp_cwnd_max; 13909 tcp->tcp_cwnd = cwnd; 13910 if (tcp->tcp_unsent > 0) 13911 flags |= TH_XMIT_NEEDED; 13912 } 13913 } 13914 } 13915 } else if (tcp->tcp_zero_win_probe) { 13916 /* 13917 * If the window has opened, need to arrange 13918 * to send additional data. 13919 */ 13920 if (new_swnd != 0) { 13921 /* tcp_suna != tcp_snxt */ 13922 /* Packet contains a window update */ 13923 BUMP_MIB(&tcp_mib, tcpInWinUpdate); 13924 tcp->tcp_zero_win_probe = 0; 13925 tcp->tcp_timer_backoff = 0; 13926 tcp->tcp_ms_we_have_waited = 0; 13927 13928 /* 13929 * Transmit starting with tcp_suna since 13930 * the one byte probe is not ack'ed. 13931 * If TCP has sent more than one identical 13932 * probe, tcp_rexmit will be set. That means 13933 * tcp_ss_rexmit() will send out the one 13934 * byte along with new data. Otherwise, 13935 * fake the retransmission. 
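 *
 * Looking back over the duplicate-ACK handling above as a whole, the
 * ladder is roughly the following (illustrative sketch only; the SACK,
 * ECN, unsent-data and burst-limit refinements handled above are left
 * out):
 *
 *    if (++dupack_cnt < tcp_dupack_fast_retransmit) {
 *        cwnd += mss << (dupack_cnt - 1);
 *    } else if (dupack_cnt == tcp_dupack_fast_retransmit) {
 *        npkt = ((snxt - suna) >> 1) / mss;
 *        ssthresh = MAX(npkt, 2) * mss;
 *        cwnd = (npkt + dupack_cnt) * mss;
 *        rexmit_max = snxt;
 *    } else {
 *        cwnd = MIN(cwnd + mss, cwnd_max);
 *    }
 *
 * The first rung is RFC 3042 limited transmit: the temporary cwnd bump
 * lets exactly one new segment out per dup ACK while we wait to see
 * whether fast retransmit is really needed.  The middle rung enters
 * fast retransmit once dupack_cnt reaches tcp_dupack_fast_retransmit
 * (the classic three-duplicate threshold) and flags the first unacked
 * segment for retransmission.  The last rung is fast recovery: every
 * further duplicate means one more packet has left the network, so
 * cwnd is inflated by one mss.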
13936 */ 13937 flags |= TH_XMIT_NEEDED; 13938 if (!tcp->tcp_rexmit) { 13939 tcp->tcp_rexmit = B_TRUE; 13940 tcp->tcp_dupack_cnt = 0; 13941 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 13942 tcp->tcp_rexmit_max = tcp->tcp_suna + 1; 13943 } 13944 } 13945 } 13946 goto swnd_update; 13947 } 13948 13949 /* 13950 * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. 13951 * If the ACK value acks something that we have not yet sent, it might 13952 * be an old duplicate segment. Send an ACK to re-synchronize the 13953 * other side. 13954 * Note: reset in response to unacceptable ACK in SYN_RECEIVE 13955 * state is handled above, so we can always just drop the segment and 13956 * send an ACK here. 13957 * 13958 * Should we send ACKs in response to ACK only segments? 13959 */ 13960 if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { 13961 BUMP_MIB(&tcp_mib, tcpInAckUnsent); 13962 /* drop the received segment */ 13963 freemsg(mp); 13964 13965 /* 13966 * Send back an ACK. If tcp_drop_ack_unsent_cnt is 13967 * greater than 0, check if the number of such 13968 * bogus ACks is greater than that count. If yes, 13969 * don't send back any ACK. This prevents TCP from 13970 * getting into an ACK storm if somehow an attacker 13971 * successfully spoofs an acceptable segment to our 13972 * peer. 13973 */ 13974 if (tcp_drop_ack_unsent_cnt > 0 && 13975 ++tcp->tcp_in_ack_unsent > tcp_drop_ack_unsent_cnt) { 13976 TCP_STAT(tcp_in_ack_unsent_drop); 13977 return; 13978 } 13979 mp = tcp_ack_mp(tcp); 13980 if (mp != NULL) { 13981 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 13982 BUMP_LOCAL(tcp->tcp_obsegs); 13983 BUMP_MIB(&tcp_mib, tcpOutAck); 13984 tcp_send_data(tcp, tcp->tcp_wq, mp); 13985 } 13986 return; 13987 } 13988 13989 /* 13990 * TCP gets a new ACK, update the notsack'ed list to delete those 13991 * blocks that are covered by this ACK. 13992 */ 13993 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 13994 tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, 13995 &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); 13996 } 13997 13998 /* 13999 * If we got an ACK after fast retransmit, check to see 14000 * if it is a partial ACK. If it is not and the congestion 14001 * window was inflated to account for the other side's 14002 * cached packets, retract it. If it is, do Hoe's algorithm. 14003 */ 14004 if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) { 14005 ASSERT(tcp->tcp_rexmit == B_FALSE); 14006 if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { 14007 tcp->tcp_dupack_cnt = 0; 14008 /* 14009 * Restore the orig tcp_cwnd_ssthresh after 14010 * fast retransmit phase. 14011 */ 14012 if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { 14013 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; 14014 } 14015 tcp->tcp_rexmit_max = seg_ack; 14016 tcp->tcp_cwnd_cnt = 0; 14017 tcp->tcp_snd_burst = tcp->tcp_localnet ? 14018 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 14019 14020 /* 14021 * Remove all notsack info to avoid confusion with 14022 * the next fast retrasnmit/recovery phase. 14023 */ 14024 if (tcp->tcp_snd_sack_ok && 14025 tcp->tcp_notsack_list != NULL) { 14026 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 14027 } 14028 } else { 14029 if (tcp->tcp_snd_sack_ok && 14030 tcp->tcp_notsack_list != NULL) { 14031 flags |= TH_NEED_SACK_REXMIT; 14032 tcp->tcp_pipe -= mss; 14033 if (tcp->tcp_pipe < 0) 14034 tcp->tcp_pipe = 0; 14035 } else { 14036 /* 14037 * Hoe's algorithm: 14038 * 14039 * Retransmit the unack'ed segment and 14040 * restart fast recovery. 
Note that we 14041 * need to scale back tcp_cwnd to the 14042 * original value when we started fast 14043 * recovery. This is to prevent overly 14044 * aggressive behaviour in sending new 14045 * segments. 14046 */ 14047 tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + 14048 tcp_dupack_fast_retransmit * mss; 14049 tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; 14050 flags |= TH_REXMIT_NEEDED; 14051 } 14052 } 14053 } else { 14054 tcp->tcp_dupack_cnt = 0; 14055 if (tcp->tcp_rexmit) { 14056 /* 14057 * TCP is retranmitting. If the ACK ack's all 14058 * outstanding data, update tcp_rexmit_max and 14059 * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt 14060 * to the correct value. 14061 * 14062 * Note that SEQ_LEQ() is used. This is to avoid 14063 * unnecessary fast retransmit caused by dup ACKs 14064 * received when TCP does slow start retransmission 14065 * after a time out. During this phase, TCP may 14066 * send out segments which are already received. 14067 * This causes dup ACKs to be sent back. 14068 */ 14069 if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { 14070 if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { 14071 tcp->tcp_rexmit_nxt = seg_ack; 14072 } 14073 if (seg_ack != tcp->tcp_rexmit_max) { 14074 flags |= TH_XMIT_NEEDED; 14075 } 14076 } else { 14077 tcp->tcp_rexmit = B_FALSE; 14078 tcp->tcp_xmit_zc_clean = B_FALSE; 14079 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 14080 tcp->tcp_snd_burst = tcp->tcp_localnet ? 14081 TCP_CWND_INFINITE : TCP_CWND_NORMAL; 14082 } 14083 tcp->tcp_ms_we_have_waited = 0; 14084 } 14085 } 14086 14087 BUMP_MIB(&tcp_mib, tcpInAckSegs); 14088 UPDATE_MIB(&tcp_mib, tcpInAckBytes, bytes_acked); 14089 tcp->tcp_suna = seg_ack; 14090 if (tcp->tcp_zero_win_probe != 0) { 14091 tcp->tcp_zero_win_probe = 0; 14092 tcp->tcp_timer_backoff = 0; 14093 } 14094 14095 /* 14096 * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. 14097 * Note that it cannot be the SYN being ack'ed. The code flow 14098 * will not reach here. 14099 */ 14100 if (mp1 == NULL) { 14101 goto fin_acked; 14102 } 14103 14104 /* 14105 * Update the congestion window. 14106 * 14107 * If TCP is not ECN capable or TCP is ECN capable but the 14108 * congestion experience bit is not set, increase the tcp_cwnd as 14109 * usual. 14110 */ 14111 if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { 14112 cwnd = tcp->tcp_cwnd; 14113 add = mss; 14114 14115 if (cwnd >= tcp->tcp_cwnd_ssthresh) { 14116 /* 14117 * This is to prevent an increase of less than 1 MSS of 14118 * tcp_cwnd. With partial increase, tcp_wput_data() 14119 * may send out tinygrams in order to preserve mblk 14120 * boundaries. 14121 * 14122 * By initializing tcp_cwnd_cnt to new tcp_cwnd and 14123 * decrementing it by 1 MSS for every ACKs, tcp_cwnd is 14124 * increased by 1 MSS for every RTTs. 14125 */ 14126 if (tcp->tcp_cwnd_cnt <= 0) { 14127 tcp->tcp_cwnd_cnt = cwnd + add; 14128 } else { 14129 tcp->tcp_cwnd_cnt -= add; 14130 add = 0; 14131 } 14132 } 14133 tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); 14134 } 14135 14136 /* See if the latest urgent data has been acknowledged */ 14137 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && 14138 SEQ_GT(seg_ack, tcp->tcp_urg)) 14139 tcp->tcp_valid_bits &= ~TCP_URG_VALID; 14140 14141 /* Can we update the RTT estimates? */ 14142 if (tcp->tcp_snd_ts_ok) { 14143 /* Ignore zero timestamp echo-reply. */ 14144 if (tcpopt.tcp_opt_ts_ecr != 0) { 14145 tcp_set_rto(tcp, (int32_t)lbolt - 14146 (int32_t)tcpopt.tcp_opt_ts_ecr); 14147 } 14148 14149 /* If needed, restart the timer. 
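 *
 * The cwnd growth applied a little earlier in this function (for an ACK
 * that advances tcp_suna and carries no ECN echo) is the usual split
 * between slow start and congestion avoidance; a minimal sketch of it
 * (illustrative only):
 *
 *    add = mss;
 *    if (cwnd >= ssthresh) {
 *        if (cwnd_cnt <= 0) {
 *            cwnd_cnt = cwnd + add;
 *        } else {
 *            cwnd_cnt -= add;
 *            add = 0;
 *        }
 *    }
 *    cwnd = MIN(cwnd + add, cwnd_max);
 *
 * Below ssthresh every ACK grows the window by a full mss (slow start).
 * At or above ssthresh the byte counter cwnd_cnt is charged one mss per
 * ACK and refilled from the window itself, so cwnd grows by one mss per
 * round trip and never by a fraction of an mss, which keeps
 * tcp_wput_data() from emitting tinygrams to preserve mblk boundaries.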
*/ 14150 if (tcp->tcp_set_timer == 1) { 14151 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 14152 tcp->tcp_set_timer = 0; 14153 } 14154 /* 14155 * Update tcp_csuna in case the other side stops sending 14156 * us timestamps. 14157 */ 14158 tcp->tcp_csuna = tcp->tcp_snxt; 14159 } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { 14160 /* 14161 * An ACK sequence we haven't seen before, so get the RTT 14162 * and update the RTO. But first check if the timestamp is 14163 * valid to use. 14164 */ 14165 if ((mp1->b_next != NULL) && 14166 SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) 14167 tcp_set_rto(tcp, (int32_t)lbolt - 14168 (int32_t)(intptr_t)mp1->b_prev); 14169 else 14170 BUMP_MIB(&tcp_mib, tcpRttNoUpdate); 14171 14172 /* Remeber the last sequence to be ACKed */ 14173 tcp->tcp_csuna = seg_ack; 14174 if (tcp->tcp_set_timer == 1) { 14175 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 14176 tcp->tcp_set_timer = 0; 14177 } 14178 } else { 14179 BUMP_MIB(&tcp_mib, tcpRttNoUpdate); 14180 } 14181 14182 /* Eat acknowledged bytes off the xmit queue. */ 14183 for (;;) { 14184 mblk_t *mp2; 14185 uchar_t *wptr; 14186 14187 wptr = mp1->b_wptr; 14188 ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); 14189 bytes_acked -= (int)(wptr - mp1->b_rptr); 14190 if (bytes_acked < 0) { 14191 mp1->b_rptr = wptr + bytes_acked; 14192 /* 14193 * Set a new timestamp if all the bytes timed by the 14194 * old timestamp have been ack'ed. 14195 */ 14196 if (SEQ_GT(seg_ack, 14197 (uint32_t)(uintptr_t)(mp1->b_next))) { 14198 mp1->b_prev = (mblk_t *)(uintptr_t)lbolt; 14199 mp1->b_next = NULL; 14200 } 14201 break; 14202 } 14203 mp1->b_next = NULL; 14204 mp1->b_prev = NULL; 14205 mp2 = mp1; 14206 mp1 = mp1->b_cont; 14207 14208 /* 14209 * This notification is required for some zero-copy 14210 * clients to maintain a copy semantic. After the data 14211 * is ack'ed, client is safe to modify or reuse the buffer. 14212 */ 14213 if (tcp->tcp_snd_zcopy_aware && 14214 (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 14215 tcp_zcopy_notify(tcp); 14216 freeb(mp2); 14217 if (bytes_acked == 0) { 14218 if (mp1 == NULL) { 14219 /* Everything is ack'ed, clear the tail. */ 14220 tcp->tcp_xmit_tail = NULL; 14221 /* 14222 * Cancel the timer unless we are still 14223 * waiting for an ACK for the FIN packet. 14224 */ 14225 if (tcp->tcp_timer_tid != 0 && 14226 tcp->tcp_snxt == tcp->tcp_suna) { 14227 (void) TCP_TIMER_CANCEL(tcp, 14228 tcp->tcp_timer_tid); 14229 tcp->tcp_timer_tid = 0; 14230 } 14231 goto pre_swnd_update; 14232 } 14233 if (mp2 != tcp->tcp_xmit_tail) 14234 break; 14235 tcp->tcp_xmit_tail = mp1; 14236 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 14237 (uintptr_t)INT_MAX); 14238 tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr - 14239 mp1->b_rptr); 14240 break; 14241 } 14242 if (mp1 == NULL) { 14243 /* 14244 * More was acked but there is nothing more 14245 * outstanding. This means that the FIN was 14246 * just acked or that we're talking to a clown. 
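 *
 * Stripped of the timestamp, zero-copy and tcp_xmit_tail bookkeeping,
 * the loop above that eats acknowledged bytes off the transmit queue
 * reduces to the following (illustrative sketch; ex_blk and ex_freeb
 * stand in for mblk_t and freeb(9F)):
 *
 *    static struct ex_blk *
 *    ex_eat_acked(struct ex_blk *head, int32_t bytes_acked)
 *    {
 *        struct ex_blk *next;
 *        int32_t len;
 *
 *        while (head != NULL && bytes_acked > 0) {
 *            len = (int32_t)(head->b_wptr - head->b_rptr);
 *            if (bytes_acked < len) {
 *                head->b_rptr += bytes_acked;
 *                return (head);
 *            }
 *            bytes_acked -= len;
 *            next = head->b_cont;
 *            ex_freeb(head);
 *            head = next;
 *        }
 *        return (head);
 *    }
 *
 * Running out of blocks while acked bytes remain can only mean the FIN
 * was acked, or that the peer acked data we never sent; the real loop
 * handles the first at fin_acked and treats the second as send-queue
 * corruption.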
14247 */ 14248 fin_acked: 14249 ASSERT(tcp->tcp_fin_sent); 14250 tcp->tcp_xmit_tail = NULL; 14251 if (tcp->tcp_fin_sent) { 14252 /* FIN was acked - making progress */ 14253 if (tcp->tcp_ipversion == IPV6_VERSION && 14254 !tcp->tcp_fin_acked) 14255 tcp->tcp_ip_forward_progress = B_TRUE; 14256 tcp->tcp_fin_acked = B_TRUE; 14257 if (tcp->tcp_linger_tid != 0 && 14258 TCP_TIMER_CANCEL(tcp, 14259 tcp->tcp_linger_tid) >= 0) { 14260 tcp_stop_lingering(tcp); 14261 } 14262 } else { 14263 /* 14264 * We should never get here because 14265 * we have already checked that the 14266 * number of bytes ack'ed should be 14267 * smaller than or equal to what we 14268 * have sent so far (it is the 14269 * acceptability check of the ACK). 14270 * We can only get here if the send 14271 * queue is corrupted. 14272 * 14273 * Terminate the connection and 14274 * panic the system. It is better 14275 * for us to panic instead of 14276 * continuing to avoid other disaster. 14277 */ 14278 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 14279 tcp->tcp_rnxt, TH_RST|TH_ACK); 14280 panic("Memory corruption " 14281 "detected for connection %s.", 14282 tcp_display(tcp, NULL, 14283 DISP_ADDR_AND_PORT)); 14284 /*NOTREACHED*/ 14285 } 14286 goto pre_swnd_update; 14287 } 14288 ASSERT(mp2 != tcp->tcp_xmit_tail); 14289 } 14290 if (tcp->tcp_unsent) { 14291 flags |= TH_XMIT_NEEDED; 14292 } 14293 pre_swnd_update: 14294 tcp->tcp_xmit_head = mp1; 14295 swnd_update: 14296 /* 14297 * The following check is different from most other implementations. 14298 * For bi-directional transfer, when segments are dropped, the 14299 * "normal" check will not accept a window update in those 14300 * retransmitted segemnts. Failing to do that, TCP may send out 14301 * segments which are outside receiver's window. As TCP accepts 14302 * the ack in those retransmitted segments, if the window update in 14303 * the same segment is not accepted, TCP will incorrectly calculates 14304 * that it can send more segments. This can create a deadlock 14305 * with the receiver if its window becomes zero. 14306 */ 14307 if (SEQ_LT(tcp->tcp_swl2, seg_ack) || 14308 SEQ_LT(tcp->tcp_swl1, seg_seq) || 14309 (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) { 14310 /* 14311 * The criteria for update is: 14312 * 14313 * 1. the segment acknowledges some data. Or 14314 * 2. the segment is new, i.e. it has a higher seq num. Or 14315 * 3. the segment is not old and the advertised window is 14316 * larger than the previous advertised window. 14317 */ 14318 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd) 14319 flags |= TH_XMIT_NEEDED; 14320 tcp->tcp_swnd = new_swnd; 14321 if (new_swnd > tcp->tcp_max_swnd) 14322 tcp->tcp_max_swnd = new_swnd; 14323 tcp->tcp_swl1 = seg_seq; 14324 tcp->tcp_swl2 = seg_ack; 14325 } 14326 est: 14327 if (tcp->tcp_state > TCPS_ESTABLISHED) { 14328 14329 switch (tcp->tcp_state) { 14330 case TCPS_FIN_WAIT_1: 14331 if (tcp->tcp_fin_acked) { 14332 tcp->tcp_state = TCPS_FIN_WAIT_2; 14333 /* 14334 * We implement the non-standard BSD/SunOS 14335 * FIN_WAIT_2 flushing algorithm. 14336 * If there is no user attached to this 14337 * TCP endpoint, then this TCP struct 14338 * could hang around forever in FIN_WAIT_2 14339 * state if the peer forgets to send us 14340 * a FIN. To prevent this, we wait only 14341 * 2*MSL (a convenient time value) for 14342 * the FIN to arrive. If it doesn't show up, 14343 * we flush the TCP endpoint. This algorithm, 14344 * though a violation of RFC-793, has worked 14345 * for over 10 years in BSD systems. 
14346 * Note: SunOS 4.x waits 675 seconds before 14347 * flushing the FIN_WAIT_2 connection. 14348 */ 14349 TCP_TIMER_RESTART(tcp, 14350 tcp_fin_wait_2_flush_interval); 14351 } 14352 break; 14353 case TCPS_FIN_WAIT_2: 14354 break; /* Shutdown hook? */ 14355 case TCPS_LAST_ACK: 14356 freemsg(mp); 14357 if (tcp->tcp_fin_acked) { 14358 (void) tcp_clean_death(tcp, 0, 19); 14359 return; 14360 } 14361 goto xmit_check; 14362 case TCPS_CLOSING: 14363 if (tcp->tcp_fin_acked) { 14364 tcp->tcp_state = TCPS_TIME_WAIT; 14365 /* 14366 * Unconditionally clear the exclusive binding 14367 * bit so this TIME-WAIT connection won't 14368 * interfere with new ones. 14369 */ 14370 tcp->tcp_exclbind = 0; 14371 if (!TCP_IS_DETACHED(tcp)) { 14372 TCP_TIMER_RESTART(tcp, 14373 tcp_time_wait_interval); 14374 } else { 14375 tcp_time_wait_append(tcp); 14376 TCP_DBGSTAT(tcp_rput_time_wait); 14377 } 14378 } 14379 /*FALLTHRU*/ 14380 case TCPS_CLOSE_WAIT: 14381 freemsg(mp); 14382 goto xmit_check; 14383 default: 14384 ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); 14385 break; 14386 } 14387 } 14388 if (flags & TH_FIN) { 14389 /* Make sure we ack the fin */ 14390 flags |= TH_ACK_NEEDED; 14391 if (!tcp->tcp_fin_rcvd) { 14392 tcp->tcp_fin_rcvd = B_TRUE; 14393 tcp->tcp_rnxt++; 14394 tcph = tcp->tcp_tcph; 14395 U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); 14396 14397 /* 14398 * Generate the ordrel_ind at the end unless we 14399 * are an eager guy. 14400 * In the eager case tcp_rsrv will do this when run 14401 * after tcp_accept is done. 14402 */ 14403 if (tcp->tcp_listener == NULL && 14404 !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding)) 14405 flags |= TH_ORDREL_NEEDED; 14406 switch (tcp->tcp_state) { 14407 case TCPS_SYN_RCVD: 14408 case TCPS_ESTABLISHED: 14409 tcp->tcp_state = TCPS_CLOSE_WAIT; 14410 /* Keepalive? */ 14411 break; 14412 case TCPS_FIN_WAIT_1: 14413 if (!tcp->tcp_fin_acked) { 14414 tcp->tcp_state = TCPS_CLOSING; 14415 break; 14416 } 14417 /* FALLTHRU */ 14418 case TCPS_FIN_WAIT_2: 14419 tcp->tcp_state = TCPS_TIME_WAIT; 14420 /* 14421 * Unconditionally clear the exclusive binding 14422 * bit so this TIME-WAIT connection won't 14423 * interfere with new ones. 14424 */ 14425 tcp->tcp_exclbind = 0; 14426 if (!TCP_IS_DETACHED(tcp)) { 14427 TCP_TIMER_RESTART(tcp, 14428 tcp_time_wait_interval); 14429 } else { 14430 tcp_time_wait_append(tcp); 14431 TCP_DBGSTAT(tcp_rput_time_wait); 14432 } 14433 if (seg_len) { 14434 /* 14435 * implies data piggybacked on FIN. 14436 * break to handle data. 14437 */ 14438 break; 14439 } 14440 freemsg(mp); 14441 goto ack_check; 14442 } 14443 } 14444 } 14445 if (mp == NULL) 14446 goto xmit_check; 14447 if (seg_len == 0) { 14448 freemsg(mp); 14449 goto xmit_check; 14450 } 14451 if (mp->b_rptr == mp->b_wptr) { 14452 /* 14453 * The header has been consumed, so we remove the 14454 * zero-length mblk here. 14455 */ 14456 mp1 = mp; 14457 mp = mp->b_cont; 14458 freeb(mp1); 14459 } 14460 tcph = tcp->tcp_tcph; 14461 tcp->tcp_rack_cnt++; 14462 { 14463 uint32_t cur_max; 14464 14465 cur_max = tcp->tcp_rack_cur_max; 14466 if (tcp->tcp_rack_cnt >= cur_max) { 14467 /* 14468 * We have more unacked data than we should - send 14469 * an ACK now. 14470 */ 14471 flags |= TH_ACK_NEEDED; 14472 cur_max++; 14473 if (cur_max > tcp->tcp_rack_abs_max) 14474 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 14475 else 14476 tcp->tcp_rack_cur_max = cur_max; 14477 } else if (TCP_IS_DETACHED(tcp)) { 14478 /* We don't have an ACK timer for detached TCP. 
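 *
 * Pulling the surrounding checks together, the choice between acking
 * immediately and arming the delayed-ACK timer reduces to the
 * following (illustrative sketch; unacked stands for
 * tcp_rnxt - tcp_rack, the received bytes not yet acked):
 *
 *    static boolean_t
 *    ex_ack_now(uint32_t rack_cnt, uint32_t rack_cur_max,
 *        boolean_t detached, int seg_len, int mss, uint32_t unacked)
 *    {
 *        if (rack_cnt >= rack_cur_max)
 *            return (B_TRUE);
 *        if (detached)
 *            return (B_TRUE);
 *        if (seg_len < mss && unacked != 0 && (unacked % mss) != 0)
 *            return (B_TRUE);
 *        return (B_FALSE);
 *    }
 *
 * When the segment counter trips, tcp_rack_cur_max also creeps up
 * towards tcp_rack_abs_max; the duplicate-data path much earlier in
 * this function pushes it back to 1 (and lowers the absolute maximum)
 * when dup segments suggest our ACKs have been too sparse.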
*/ 14479 flags |= TH_ACK_NEEDED; 14480 } else if (seg_len < mss) { 14481 /* 14482 * If we get a segment that is less than an mss, and we 14483 * already have unacknowledged data, and the amount 14484 * unacknowledged is not a multiple of mss, then we 14485 * better generate an ACK now. Otherwise, this may be 14486 * the tail piece of a transaction, and we would rather 14487 * wait for the response. 14488 */ 14489 uint32_t udif; 14490 ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <= 14491 (uintptr_t)INT_MAX); 14492 udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack); 14493 if (udif && (udif % mss)) 14494 flags |= TH_ACK_NEEDED; 14495 else 14496 flags |= TH_ACK_TIMER_NEEDED; 14497 } else { 14498 /* Start delayed ack timer */ 14499 flags |= TH_ACK_TIMER_NEEDED; 14500 } 14501 } 14502 tcp->tcp_rnxt += seg_len; 14503 U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); 14504 14505 /* Update SACK list */ 14506 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 14507 tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, 14508 &(tcp->tcp_num_sack_blk)); 14509 } 14510 14511 if (tcp->tcp_urp_mp) { 14512 tcp->tcp_urp_mp->b_cont = mp; 14513 mp = tcp->tcp_urp_mp; 14514 tcp->tcp_urp_mp = NULL; 14515 /* Ready for a new signal. */ 14516 tcp->tcp_urp_last_valid = B_FALSE; 14517 #ifdef DEBUG 14518 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14519 "tcp_rput: sending exdata_ind %s", 14520 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 14521 #endif /* DEBUG */ 14522 } 14523 14524 /* 14525 * Check for ancillary data changes compared to last segment. 14526 */ 14527 if (tcp->tcp_ipv6_recvancillary != 0) { 14528 mp = tcp_rput_add_ancillary(tcp, mp, &ipp); 14529 if (mp == NULL) 14530 return; 14531 } 14532 14533 if (tcp->tcp_listener || tcp->tcp_hard_binding) { 14534 /* 14535 * Side queue inbound data until the accept happens. 14536 * tcp_accept/tcp_rput drains this when the accept happens. 14537 * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or 14538 * T_EXDATA_IND) it is queued on b_next. 14539 * XXX Make urgent data use this. Requires: 14540 * Removing tcp_listener check for TH_URG 14541 * Making M_PCPROTO and MARK messages skip the eager case 14542 */ 14543 14544 if (tcp->tcp_kssl_pending) { 14545 tcp_kssl_input(tcp, mp); 14546 } else { 14547 tcp_rcv_enqueue(tcp, mp, seg_len); 14548 } 14549 } else { 14550 if (mp->b_datap->db_type != M_DATA || 14551 (flags & TH_MARKNEXT_NEEDED)) { 14552 if (tcp->tcp_rcv_list != NULL) { 14553 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14554 } 14555 ASSERT(tcp->tcp_rcv_list == NULL || 14556 tcp->tcp_fused_sigurg); 14557 if (flags & TH_MARKNEXT_NEEDED) { 14558 #ifdef DEBUG 14559 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14560 "tcp_rput: sending MSGMARKNEXT %s", 14561 tcp_display(tcp, NULL, 14562 DISP_PORT_ONLY)); 14563 #endif /* DEBUG */ 14564 mp->b_flag |= MSGMARKNEXT; 14565 flags &= ~TH_MARKNEXT_NEEDED; 14566 } 14567 14568 /* Does this need SSL processing first? */ 14569 if ((tcp->tcp_kssl_ctx != NULL) && 14570 (DB_TYPE(mp) == M_DATA)) { 14571 tcp_kssl_input(tcp, mp); 14572 } else { 14573 putnext(tcp->tcp_rq, mp); 14574 if (!canputnext(tcp->tcp_rq)) 14575 tcp->tcp_rwnd -= seg_len; 14576 } 14577 } else if ((flags & (TH_PUSH|TH_FIN)) || 14578 tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_rq->q_hiwat >> 3) { 14579 if (tcp->tcp_rcv_list != NULL) { 14580 /* 14581 * Enqueue the new segment first and then 14582 * call tcp_rcv_drain() to send all data 14583 * up. The other way to do this is to 14584 * send all queued data up and then call 14585 * putnext() to send the new segment up. 
14586 * This way can remove the else part later 14587 * on. 14588 * 14589 * We don't this to avoid one more call to 14590 * canputnext() as tcp_rcv_drain() needs to 14591 * call canputnext(). 14592 */ 14593 tcp_rcv_enqueue(tcp, mp, seg_len); 14594 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14595 } else { 14596 /* Does this need SSL processing first? */ 14597 if ((tcp->tcp_kssl_ctx != NULL) && 14598 (DB_TYPE(mp) == M_DATA)) { 14599 tcp_kssl_input(tcp, mp); 14600 } else { 14601 putnext(tcp->tcp_rq, mp); 14602 if (!canputnext(tcp->tcp_rq)) 14603 tcp->tcp_rwnd -= seg_len; 14604 } 14605 } 14606 } else { 14607 /* 14608 * Enqueue all packets when processing an mblk 14609 * from the co queue and also enqueue normal packets. 14610 */ 14611 tcp_rcv_enqueue(tcp, mp, seg_len); 14612 } 14613 /* 14614 * Make sure the timer is running if we have data waiting 14615 * for a push bit. This provides resiliency against 14616 * implementations that do not correctly generate push bits. 14617 */ 14618 if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) { 14619 /* 14620 * The connection may be closed at this point, so don't 14621 * do anything for a detached tcp. 14622 */ 14623 if (!TCP_IS_DETACHED(tcp)) 14624 tcp->tcp_push_tid = TCP_TIMER(tcp, 14625 tcp_push_timer, 14626 MSEC_TO_TICK(tcp_push_timer_interval)); 14627 } 14628 } 14629 xmit_check: 14630 /* Is there anything left to do? */ 14631 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 14632 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| 14633 TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED| 14634 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 14635 goto done; 14636 14637 /* Any transmit work to do and a non-zero window? */ 14638 if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| 14639 TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { 14640 if (flags & TH_REXMIT_NEEDED) { 14641 uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; 14642 14643 BUMP_MIB(&tcp_mib, tcpOutFastRetrans); 14644 if (snd_size > mss) 14645 snd_size = mss; 14646 if (snd_size > tcp->tcp_swnd) 14647 snd_size = tcp->tcp_swnd; 14648 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, 14649 NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, 14650 B_TRUE); 14651 14652 if (mp1 != NULL) { 14653 tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt; 14654 tcp->tcp_csuna = tcp->tcp_snxt; 14655 BUMP_MIB(&tcp_mib, tcpRetransSegs); 14656 UPDATE_MIB(&tcp_mib, tcpRetransBytes, snd_size); 14657 TCP_RECORD_TRACE(tcp, mp1, 14658 TCP_TRACE_SEND_PKT); 14659 tcp_send_data(tcp, tcp->tcp_wq, mp1); 14660 } 14661 } 14662 if (flags & TH_NEED_SACK_REXMIT) { 14663 tcp_sack_rxmit(tcp, &flags); 14664 } 14665 /* 14666 * For TH_LIMIT_XMIT, tcp_wput_data() is called to send 14667 * out new segment. Note that tcp_rexmit should not be 14668 * set, otherwise TH_LIMIT_XMIT should not be set. 14669 */ 14670 if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { 14671 if (!tcp->tcp_rexmit) { 14672 tcp_wput_data(tcp, NULL, B_FALSE); 14673 } else { 14674 tcp_ss_rexmit(tcp); 14675 } 14676 } 14677 /* 14678 * Adjust tcp_cwnd back to normal value after sending 14679 * new data segments. 14680 */ 14681 if (flags & TH_LIMIT_XMIT) { 14682 tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); 14683 /* 14684 * This will restart the timer. Restarting the 14685 * timer is used to avoid a timeout before the 14686 * limited transmitted segment's ACK gets back. 14687 */ 14688 if (tcp->tcp_xmit_head != NULL) 14689 tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt; 14690 } 14691 14692 /* Anything more to do? 
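 * (Only the ack_check work can remain at this point:
 * TH_SEND_URP_MARK, TH_ACK_NEEDED, TH_ACK_TIMER_NEEDED and
 * TH_ORDREL_NEEDED; if none of those are set we are done.)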
*/ 14693 if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| 14694 TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) 14695 goto done; 14696 } 14697 ack_check: 14698 if (flags & TH_SEND_URP_MARK) { 14699 ASSERT(tcp->tcp_urp_mark_mp); 14700 /* 14701 * Send up any queued data and then send the mark message 14702 */ 14703 if (tcp->tcp_rcv_list != NULL) { 14704 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14705 } 14706 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 14707 14708 mp1 = tcp->tcp_urp_mark_mp; 14709 tcp->tcp_urp_mark_mp = NULL; 14710 #ifdef DEBUG 14711 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 14712 "tcp_rput: sending zero-length %s %s", 14713 ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" : 14714 "MSGNOTMARKNEXT"), 14715 tcp_display(tcp, NULL, DISP_PORT_ONLY)); 14716 #endif /* DEBUG */ 14717 putnext(tcp->tcp_rq, mp1); 14718 flags &= ~TH_SEND_URP_MARK; 14719 } 14720 if (flags & TH_ACK_NEEDED) { 14721 /* 14722 * Time to send an ack for some reason. 14723 */ 14724 mp1 = tcp_ack_mp(tcp); 14725 14726 if (mp1 != NULL) { 14727 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 14728 tcp_send_data(tcp, tcp->tcp_wq, mp1); 14729 BUMP_LOCAL(tcp->tcp_obsegs); 14730 BUMP_MIB(&tcp_mib, tcpOutAck); 14731 } 14732 if (tcp->tcp_ack_tid != 0) { 14733 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); 14734 tcp->tcp_ack_tid = 0; 14735 } 14736 } 14737 if (flags & TH_ACK_TIMER_NEEDED) { 14738 /* 14739 * Arrange for deferred ACK or push wait timeout. 14740 * Start timer if it is not already running. 14741 */ 14742 if (tcp->tcp_ack_tid == 0) { 14743 tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer, 14744 MSEC_TO_TICK(tcp->tcp_localnet ? 14745 (clock_t)tcp_local_dack_interval : 14746 (clock_t)tcp_deferred_ack_interval)); 14747 } 14748 } 14749 if (flags & TH_ORDREL_NEEDED) { 14750 /* 14751 * Send up the ordrel_ind unless we are an eager guy. 14752 * In the eager case tcp_rsrv will do this when run 14753 * after tcp_accept is done. 14754 */ 14755 ASSERT(tcp->tcp_listener == NULL); 14756 if (tcp->tcp_rcv_list != NULL) { 14757 /* 14758 * Push any mblk(s) enqueued from co processing. 14759 */ 14760 flags |= tcp_rcv_drain(tcp->tcp_rq, tcp); 14761 } 14762 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 14763 if ((mp1 = mi_tpi_ordrel_ind()) != NULL) { 14764 tcp->tcp_ordrel_done = B_TRUE; 14765 putnext(tcp->tcp_rq, mp1); 14766 if (tcp->tcp_deferred_clean_death) { 14767 /* 14768 * tcp_clean_death was deferred 14769 * for T_ORDREL_IND - do it now 14770 */ 14771 (void) tcp_clean_death(tcp, 14772 tcp->tcp_client_errno, 20); 14773 tcp->tcp_deferred_clean_death = B_FALSE; 14774 } 14775 } else { 14776 /* 14777 * Run the orderly release in the 14778 * service routine. 14779 */ 14780 qenable(tcp->tcp_rq); 14781 /* 14782 * Caveat(XXX): The machine may be so 14783 * overloaded that tcp_rsrv() is not scheduled 14784 * until after the endpoint has transitioned 14785 * to TCPS_TIME_WAIT 14786 * and tcp_time_wait_interval expires. Then 14787 * tcp_timer() will blow away state in tcp_t 14788 * and T_ORDREL_IND will never be delivered 14789 * upstream. Unlikely but potentially 14790 * a problem. 14791 */ 14792 } 14793 } 14794 done: 14795 ASSERT(!(flags & TH_MARKNEXT_NEEDED)); 14796 } 14797 14798 /* 14799 * This function does PAWS protection check. Returns B_TRUE if the 14800 * segment passes the PAWS test, else returns B_FALSE. 
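 *
 * (Illustrative outline, not part of the original comment: with
 * ts_val being the peer's timestamp carried in this segment and
 * tcp_ts_recent the most recent one we have accepted, the test below
 * amounts to
 *
 *	if (!(flags & TH_RST) && TSTMP_LT(ts_val, tcp_ts_recent)) {
 *		if (idle for less than PAWS_TIMEOUT)
 *			reject the segment;		PAWS failure
 *		else
 *			tcp_ts_recent = ts_val;		idle too long, resync
 *	}
 *
 * and a segment that arrives without a timestamp option, when we were
 * expecting one, makes us stop sending and parsing timestamps on this
 * connection altogether.)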
14801 */ 14802 boolean_t 14803 tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) 14804 { 14805 uint8_t flags; 14806 int options; 14807 uint8_t *up; 14808 14809 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 14810 /* 14811 * If timestamp option is aligned nicely, get values inline, 14812 * otherwise call general routine to parse. Only do that 14813 * if timestamp is the only option. 14814 */ 14815 if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH + 14816 TCPOPT_REAL_TS_LEN && 14817 OK_32PTR((up = ((uint8_t *)tcph) + 14818 TCP_MIN_HEADER_LENGTH)) && 14819 *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { 14820 tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); 14821 tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); 14822 14823 options = TCP_OPT_TSTAMP_PRESENT; 14824 } else { 14825 if (tcp->tcp_snd_sack_ok) { 14826 tcpoptp->tcp = tcp; 14827 } else { 14828 tcpoptp->tcp = NULL; 14829 } 14830 options = tcp_parse_options(tcph, tcpoptp); 14831 } 14832 14833 if (options & TCP_OPT_TSTAMP_PRESENT) { 14834 /* 14835 * Do PAWS per RFC 1323 section 4.2. Accept RST 14836 * regardless of the timestamp, page 18 RFC 1323.bis. 14837 */ 14838 if ((flags & TH_RST) == 0 && 14839 TSTMP_LT(tcpoptp->tcp_opt_ts_val, 14840 tcp->tcp_ts_recent)) { 14841 if (TSTMP_LT(lbolt64, tcp->tcp_last_rcv_lbolt + 14842 PAWS_TIMEOUT)) { 14843 /* This segment is not acceptable. */ 14844 return (B_FALSE); 14845 } else { 14846 /* 14847 * Connection has been idle for 14848 * too long. Reset the timestamp 14849 * and assume the segment is valid. 14850 */ 14851 tcp->tcp_ts_recent = 14852 tcpoptp->tcp_opt_ts_val; 14853 } 14854 } 14855 } else { 14856 /* 14857 * If we don't get a timestamp on every packet, we 14858 * figure we can't really trust 'em, so we stop sending 14859 * and parsing them. 14860 */ 14861 tcp->tcp_snd_ts_ok = B_FALSE; 14862 14863 tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 14864 tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; 14865 tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); 14866 tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); 14867 if (tcp->tcp_snd_sack_ok) { 14868 ASSERT(tcp->tcp_sack_info != NULL); 14869 tcp->tcp_max_sack_blk = 4; 14870 } 14871 } 14872 return (B_TRUE); 14873 } 14874 14875 /* 14876 * Attach ancillary data to a received TCP segments for the 14877 * ancillary pieces requested by the application that are 14878 * different than they were in the previous data segment. 14879 * 14880 * Save the "current" values once memory allocation is ok so that 14881 * when memory allocation fails we can just wait for the next data segment. 14882 */ 14883 static mblk_t * 14884 tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) 14885 { 14886 struct T_optdata_ind *todi; 14887 int optlen; 14888 uchar_t *optptr; 14889 struct T_opthdr *toh; 14890 uint_t addflag; /* Which pieces to add */ 14891 mblk_t *mp1; 14892 14893 optlen = 0; 14894 addflag = 0; 14895 /* If app asked for pktinfo and the index has changed ... */ 14896 if ((ipp->ipp_fields & IPPF_IFINDEX) && 14897 ipp->ipp_ifindex != tcp->tcp_recvifindex && 14898 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)) { 14899 optlen += sizeof (struct T_opthdr) + 14900 sizeof (struct in6_pktinfo); 14901 addflag |= TCP_IPV6_RECVPKTINFO; 14902 } 14903 /* If app asked for hoplimit and it has changed ... 
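 *
 * (Each of these checks, like the pktinfo one above and the ones that
 * follow, uses the same pattern: if the application enabled the option
 * and its value differs from the last one delivered upstream, grow
 * optlen by a struct T_opthdr plus the option value and record the
 * option in addflag; the T_OPTDATA_IND itself is only built further
 * below, once the total length is known.)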
*/ 14904 if ((ipp->ipp_fields & IPPF_HOPLIMIT) && 14905 ipp->ipp_hoplimit != tcp->tcp_recvhops && 14906 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPLIMIT)) { 14907 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 14908 addflag |= TCP_IPV6_RECVHOPLIMIT; 14909 } 14910 /* If app asked for tclass and it has changed ... */ 14911 if ((ipp->ipp_fields & IPPF_TCLASS) && 14912 ipp->ipp_tclass != tcp->tcp_recvtclass && 14913 (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)) { 14914 optlen += sizeof (struct T_opthdr) + sizeof (uint_t); 14915 addflag |= TCP_IPV6_RECVTCLASS; 14916 } 14917 /* 14918 * If app asked for hopbyhop headers and it has changed ... 14919 * For security labels, note that (1) security labels can't change on 14920 * a connected socket at all, (2) we're connected to at most one peer, 14921 * (3) if anything changes, then it must be some other extra option. 14922 */ 14923 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) && 14924 ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, 14925 (ipp->ipp_fields & IPPF_HOPOPTS), 14926 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { 14927 optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen - 14928 tcp->tcp_label_len; 14929 addflag |= TCP_IPV6_RECVHOPOPTS; 14930 if (!ip_allocbuf((void **)&tcp->tcp_hopopts, 14931 &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), 14932 ipp->ipp_hopopts, ipp->ipp_hopoptslen)) 14933 return (mp); 14934 } 14935 /* If app asked for dst headers before routing headers ... */ 14936 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTDSTOPTS) && 14937 ip_cmpbuf(tcp->tcp_rtdstopts, tcp->tcp_rtdstoptslen, 14938 (ipp->ipp_fields & IPPF_RTDSTOPTS), 14939 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) { 14940 optlen += sizeof (struct T_opthdr) + 14941 ipp->ipp_rtdstoptslen; 14942 addflag |= TCP_IPV6_RECVRTDSTOPTS; 14943 if (!ip_allocbuf((void **)&tcp->tcp_rtdstopts, 14944 &tcp->tcp_rtdstoptslen, (ipp->ipp_fields & IPPF_RTDSTOPTS), 14945 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) 14946 return (mp); 14947 } 14948 /* If app asked for routing headers and it has changed ... */ 14949 if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) && 14950 ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, 14951 (ipp->ipp_fields & IPPF_RTHDR), 14952 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { 14953 optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; 14954 addflag |= TCP_IPV6_RECVRTHDR; 14955 if (!ip_allocbuf((void **)&tcp->tcp_rthdr, 14956 &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), 14957 ipp->ipp_rthdr, ipp->ipp_rthdrlen)) 14958 return (mp); 14959 } 14960 /* If app asked for dest headers and it has changed ... */ 14961 if ((tcp->tcp_ipv6_recvancillary & 14962 (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) && 14963 ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, 14964 (ipp->ipp_fields & IPPF_DSTOPTS), 14965 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { 14966 optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; 14967 addflag |= TCP_IPV6_RECVDSTOPTS; 14968 if (!ip_allocbuf((void **)&tcp->tcp_dstopts, 14969 &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), 14970 ipp->ipp_dstopts, ipp->ipp_dstoptslen)) 14971 return (mp); 14972 } 14973 14974 if (optlen == 0) { 14975 /* Nothing to add */ 14976 return (mp); 14977 } 14978 mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED); 14979 if (mp1 == NULL) { 14980 /* 14981 * Defer sending ancillary data until the next TCP segment 14982 * arrives. 
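 *
 * (When the allocation does succeed, the message built below is laid
 * out, illustratively, as
 *
 *	M_PROTO:  [ struct T_optdata_ind ]
 *	          [ struct T_opthdr | option value ]	one entry per bit
 *	          [ struct T_opthdr | option value ]	set in addflag
 *	          ...
 *	b_cont -> the original data mblk(s)
 *
 * with each option's "last seen" value saved back into the tcp_t so
 * the next segment is only annotated again if something changes.)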
14983 */ 14984 return (mp); 14985 } 14986 mp1->b_cont = mp; 14987 mp = mp1; 14988 mp->b_wptr += sizeof (*todi) + optlen; 14989 mp->b_datap->db_type = M_PROTO; 14990 todi = (struct T_optdata_ind *)mp->b_rptr; 14991 todi->PRIM_type = T_OPTDATA_IND; 14992 todi->DATA_flag = 1; /* MORE data */ 14993 todi->OPT_length = optlen; 14994 todi->OPT_offset = sizeof (*todi); 14995 optptr = (uchar_t *)&todi[1]; 14996 /* 14997 * If app asked for pktinfo and the index has changed ... 14998 * Note that the local address never changes for the connection. 14999 */ 15000 if (addflag & TCP_IPV6_RECVPKTINFO) { 15001 struct in6_pktinfo *pkti; 15002 15003 toh = (struct T_opthdr *)optptr; 15004 toh->level = IPPROTO_IPV6; 15005 toh->name = IPV6_PKTINFO; 15006 toh->len = sizeof (*toh) + sizeof (*pkti); 15007 toh->status = 0; 15008 optptr += sizeof (*toh); 15009 pkti = (struct in6_pktinfo *)optptr; 15010 if (tcp->tcp_ipversion == IPV6_VERSION) 15011 pkti->ipi6_addr = tcp->tcp_ip6h->ip6_src; 15012 else 15013 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 15014 &pkti->ipi6_addr); 15015 pkti->ipi6_ifindex = ipp->ipp_ifindex; 15016 optptr += sizeof (*pkti); 15017 ASSERT(OK_32PTR(optptr)); 15018 /* Save as "last" value */ 15019 tcp->tcp_recvifindex = ipp->ipp_ifindex; 15020 } 15021 /* If app asked for hoplimit and it has changed ... */ 15022 if (addflag & TCP_IPV6_RECVHOPLIMIT) { 15023 toh = (struct T_opthdr *)optptr; 15024 toh->level = IPPROTO_IPV6; 15025 toh->name = IPV6_HOPLIMIT; 15026 toh->len = sizeof (*toh) + sizeof (uint_t); 15027 toh->status = 0; 15028 optptr += sizeof (*toh); 15029 *(uint_t *)optptr = ipp->ipp_hoplimit; 15030 optptr += sizeof (uint_t); 15031 ASSERT(OK_32PTR(optptr)); 15032 /* Save as "last" value */ 15033 tcp->tcp_recvhops = ipp->ipp_hoplimit; 15034 } 15035 /* If app asked for tclass and it has changed ... 
*/ 15036 if (addflag & TCP_IPV6_RECVTCLASS) { 15037 toh = (struct T_opthdr *)optptr; 15038 toh->level = IPPROTO_IPV6; 15039 toh->name = IPV6_TCLASS; 15040 toh->len = sizeof (*toh) + sizeof (uint_t); 15041 toh->status = 0; 15042 optptr += sizeof (*toh); 15043 *(uint_t *)optptr = ipp->ipp_tclass; 15044 optptr += sizeof (uint_t); 15045 ASSERT(OK_32PTR(optptr)); 15046 /* Save as "last" value */ 15047 tcp->tcp_recvtclass = ipp->ipp_tclass; 15048 } 15049 if (addflag & TCP_IPV6_RECVHOPOPTS) { 15050 toh = (struct T_opthdr *)optptr; 15051 toh->level = IPPROTO_IPV6; 15052 toh->name = IPV6_HOPOPTS; 15053 toh->len = sizeof (*toh) + ipp->ipp_hopoptslen - 15054 tcp->tcp_label_len; 15055 toh->status = 0; 15056 optptr += sizeof (*toh); 15057 bcopy((uchar_t *)ipp->ipp_hopopts + tcp->tcp_label_len, optptr, 15058 ipp->ipp_hopoptslen - tcp->tcp_label_len); 15059 optptr += ipp->ipp_hopoptslen - tcp->tcp_label_len; 15060 ASSERT(OK_32PTR(optptr)); 15061 /* Save as last value */ 15062 ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, 15063 (ipp->ipp_fields & IPPF_HOPOPTS), 15064 ipp->ipp_hopopts, ipp->ipp_hopoptslen); 15065 } 15066 if (addflag & TCP_IPV6_RECVRTDSTOPTS) { 15067 toh = (struct T_opthdr *)optptr; 15068 toh->level = IPPROTO_IPV6; 15069 toh->name = IPV6_RTHDRDSTOPTS; 15070 toh->len = sizeof (*toh) + ipp->ipp_rtdstoptslen; 15071 toh->status = 0; 15072 optptr += sizeof (*toh); 15073 bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen); 15074 optptr += ipp->ipp_rtdstoptslen; 15075 ASSERT(OK_32PTR(optptr)); 15076 /* Save as last value */ 15077 ip_savebuf((void **)&tcp->tcp_rtdstopts, 15078 &tcp->tcp_rtdstoptslen, 15079 (ipp->ipp_fields & IPPF_RTDSTOPTS), 15080 ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); 15081 } 15082 if (addflag & TCP_IPV6_RECVRTHDR) { 15083 toh = (struct T_opthdr *)optptr; 15084 toh->level = IPPROTO_IPV6; 15085 toh->name = IPV6_RTHDR; 15086 toh->len = sizeof (*toh) + ipp->ipp_rthdrlen; 15087 toh->status = 0; 15088 optptr += sizeof (*toh); 15089 bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen); 15090 optptr += ipp->ipp_rthdrlen; 15091 ASSERT(OK_32PTR(optptr)); 15092 /* Save as last value */ 15093 ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, 15094 (ipp->ipp_fields & IPPF_RTHDR), 15095 ipp->ipp_rthdr, ipp->ipp_rthdrlen); 15096 } 15097 if (addflag & (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) { 15098 toh = (struct T_opthdr *)optptr; 15099 toh->level = IPPROTO_IPV6; 15100 toh->name = IPV6_DSTOPTS; 15101 toh->len = sizeof (*toh) + ipp->ipp_dstoptslen; 15102 toh->status = 0; 15103 optptr += sizeof (*toh); 15104 bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen); 15105 optptr += ipp->ipp_dstoptslen; 15106 ASSERT(OK_32PTR(optptr)); 15107 /* Save as last value */ 15108 ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, 15109 (ipp->ipp_fields & IPPF_DSTOPTS), 15110 ipp->ipp_dstopts, ipp->ipp_dstoptslen); 15111 } 15112 ASSERT(optptr == mp->b_wptr); 15113 return (mp); 15114 } 15115 15116 15117 /* 15118 * Handle a *T_BIND_REQ that has failed either due to a T_ERROR_ACK 15119 * or a "bad" IRE detected by tcp_adapt_ire. 15120 * We can't tell if the failure was due to the laddr or the faddr 15121 * thus we clear out all addresses and ports. 
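 *
 * (Illustrative summary, not part of the original comment: on a
 * T_BIND_ACK the mblk is rewritten in place into a T_ERROR_ACK with
 * TLI_error = TSYSERR, UNIX_error = error and
 *
 *	ERROR_prim = (tcp_state >= TCPS_SYN_SENT) ? T_CONN_REQ
 *						  : O_T_BIND_REQ;
 *
 * after which the endpoint is unhashed, dropped back to TCPS_IDLE with
 * zeroed addresses and ports, and the T_ERROR_ACK is passed upstream.)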
15122 */ 15123 static void 15124 tcp_bind_failed(tcp_t *tcp, mblk_t *mp, int error) 15125 { 15126 queue_t *q = tcp->tcp_rq; 15127 tcph_t *tcph; 15128 struct T_error_ack *tea; 15129 conn_t *connp = tcp->tcp_connp; 15130 15131 15132 ASSERT(mp->b_datap->db_type == M_PCPROTO); 15133 15134 if (mp->b_cont) { 15135 freemsg(mp->b_cont); 15136 mp->b_cont = NULL; 15137 } 15138 tea = (struct T_error_ack *)mp->b_rptr; 15139 switch (tea->PRIM_type) { 15140 case T_BIND_ACK: 15141 /* 15142 * Need to unbind with classifier since we were just told that 15143 * our bind succeeded. 15144 */ 15145 tcp->tcp_hard_bound = B_FALSE; 15146 tcp->tcp_hard_binding = B_FALSE; 15147 15148 ipcl_hash_remove(connp); 15149 /* Reuse the mblk if possible */ 15150 ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >= 15151 sizeof (*tea)); 15152 mp->b_rptr = mp->b_datap->db_base; 15153 mp->b_wptr = mp->b_rptr + sizeof (*tea); 15154 tea = (struct T_error_ack *)mp->b_rptr; 15155 tea->PRIM_type = T_ERROR_ACK; 15156 tea->TLI_error = TSYSERR; 15157 tea->UNIX_error = error; 15158 if (tcp->tcp_state >= TCPS_SYN_SENT) { 15159 tea->ERROR_prim = T_CONN_REQ; 15160 } else { 15161 tea->ERROR_prim = O_T_BIND_REQ; 15162 } 15163 break; 15164 15165 case T_ERROR_ACK: 15166 if (tcp->tcp_state >= TCPS_SYN_SENT) 15167 tea->ERROR_prim = T_CONN_REQ; 15168 break; 15169 default: 15170 panic("tcp_bind_failed: unexpected TPI type"); 15171 /*NOTREACHED*/ 15172 } 15173 15174 tcp->tcp_state = TCPS_IDLE; 15175 if (tcp->tcp_ipversion == IPV4_VERSION) 15176 tcp->tcp_ipha->ipha_src = 0; 15177 else 15178 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); 15179 /* 15180 * Copy of the src addr. in tcp_t is needed since 15181 * the lookup funcs. can only look at tcp_t 15182 */ 15183 V6_SET_ZERO(tcp->tcp_ip_src_v6); 15184 15185 tcph = tcp->tcp_tcph; 15186 tcph->th_lport[0] = 0; 15187 tcph->th_lport[1] = 0; 15188 tcp_bind_hash_remove(tcp); 15189 bzero(&connp->u_port, sizeof (connp->u_port)); 15190 /* blow away saved option results if any */ 15191 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) 15192 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); 15193 15194 conn_delete_ire(tcp->tcp_connp, NULL); 15195 putnext(q, mp); 15196 } 15197 15198 /* 15199 * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA 15200 * messages. 15201 */ 15202 void 15203 tcp_rput_other(tcp_t *tcp, mblk_t *mp) 15204 { 15205 mblk_t *mp1; 15206 uchar_t *rptr = mp->b_rptr; 15207 queue_t *q = tcp->tcp_rq; 15208 struct T_error_ack *tea; 15209 uint32_t mss; 15210 mblk_t *syn_mp; 15211 mblk_t *mdti; 15212 int retval; 15213 mblk_t *ire_mp; 15214 15215 switch (mp->b_datap->db_type) { 15216 case M_PROTO: 15217 case M_PCPROTO: 15218 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 15219 if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) 15220 break; 15221 tea = (struct T_error_ack *)rptr; 15222 switch (tea->PRIM_type) { 15223 case T_BIND_ACK: 15224 /* 15225 * Adapt Multidata information, if any. The 15226 * following tcp_mdt_update routine will free 15227 * the message. 
15228 */ 15229 if ((mdti = tcp_mdt_info_mp(mp)) != NULL) { 15230 tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti-> 15231 b_rptr)->mdt_capab, B_TRUE); 15232 freemsg(mdti); 15233 } 15234 15235 /* Get the IRE, if we had requested for it */ 15236 ire_mp = tcp_ire_mp(mp); 15237 15238 if (tcp->tcp_hard_binding) { 15239 tcp->tcp_hard_binding = B_FALSE; 15240 tcp->tcp_hard_bound = B_TRUE; 15241 CL_INET_CONNECT(tcp); 15242 } else { 15243 if (ire_mp != NULL) 15244 freeb(ire_mp); 15245 goto after_syn_sent; 15246 } 15247 15248 retval = tcp_adapt_ire(tcp, ire_mp); 15249 if (ire_mp != NULL) 15250 freeb(ire_mp); 15251 if (retval == 0) { 15252 tcp_bind_failed(tcp, mp, 15253 (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? 15254 ENETUNREACH : EADDRNOTAVAIL)); 15255 return; 15256 } 15257 /* 15258 * Don't let an endpoint connect to itself. 15259 * Also checked in tcp_connect() but that 15260 * check can't handle the case when the 15261 * local IP address is INADDR_ANY. 15262 */ 15263 if (tcp->tcp_ipversion == IPV4_VERSION) { 15264 if ((tcp->tcp_ipha->ipha_dst == 15265 tcp->tcp_ipha->ipha_src) && 15266 (BE16_EQL(tcp->tcp_tcph->th_lport, 15267 tcp->tcp_tcph->th_fport))) { 15268 tcp_bind_failed(tcp, mp, EADDRNOTAVAIL); 15269 return; 15270 } 15271 } else { 15272 if (IN6_ARE_ADDR_EQUAL( 15273 &tcp->tcp_ip6h->ip6_dst, 15274 &tcp->tcp_ip6h->ip6_src) && 15275 (BE16_EQL(tcp->tcp_tcph->th_lport, 15276 tcp->tcp_tcph->th_fport))) { 15277 tcp_bind_failed(tcp, mp, EADDRNOTAVAIL); 15278 return; 15279 } 15280 } 15281 ASSERT(tcp->tcp_state == TCPS_SYN_SENT); 15282 /* 15283 * This should not be possible! Just for 15284 * defensive coding... 15285 */ 15286 if (tcp->tcp_state != TCPS_SYN_SENT) 15287 goto after_syn_sent; 15288 15289 if (is_system_labeled() && 15290 !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) { 15291 tcp_bind_failed(tcp, mp, EHOSTUNREACH); 15292 return; 15293 } 15294 15295 ASSERT(q == tcp->tcp_rq); 15296 /* 15297 * tcp_adapt_ire() does not adjust 15298 * for TCP/IP header length. 15299 */ 15300 mss = tcp->tcp_mss - tcp->tcp_hdr_len; 15301 15302 /* 15303 * Just make sure our rwnd is at 15304 * least tcp_recv_hiwat_mss * MSS 15305 * large, and round up to the nearest 15306 * MSS. 15307 * 15308 * We do the round up here because 15309 * we need to get the interface 15310 * MTU first before we can do the 15311 * round up. 15312 */ 15313 tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), 15314 tcp_recv_hiwat_minmss * mss); 15315 q->q_hiwat = tcp->tcp_rwnd; 15316 tcp_set_ws_value(tcp); 15317 U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), 15318 tcp->tcp_tcph->th_win); 15319 if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always) 15320 tcp->tcp_snd_ws_ok = B_TRUE; 15321 15322 /* 15323 * Set tcp_snd_ts_ok to true 15324 * so that tcp_xmit_mp will 15325 * include the timestamp 15326 * option in the SYN segment. 15327 */ 15328 if (tcp_tstamp_always || 15329 (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) { 15330 tcp->tcp_snd_ts_ok = B_TRUE; 15331 } 15332 15333 /* 15334 * tcp_snd_sack_ok can be set in 15335 * tcp_adapt_ire() if the sack metric 15336 * is set. So check it here also. 15337 */ 15338 if (tcp_sack_permitted == 2 || 15339 tcp->tcp_snd_sack_ok) { 15340 if (tcp->tcp_sack_info == NULL) { 15341 tcp->tcp_sack_info = 15342 kmem_cache_alloc(tcp_sack_info_cache, 15343 KM_SLEEP); 15344 } 15345 tcp->tcp_snd_sack_ok = B_TRUE; 15346 } 15347 15348 /* 15349 * Should we use ECN? Note that the current 15350 * default value (SunOS 5.9) of tcp_ecn_permitted 15351 * is 1. 
The reason for doing this is that there 15352 * are equipments out there that will drop ECN 15353 * enabled IP packets. Setting it to 1 avoids 15354 * compatibility problems. 15355 */ 15356 if (tcp_ecn_permitted == 2) 15357 tcp->tcp_ecn_ok = B_TRUE; 15358 15359 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 15360 syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 15361 tcp->tcp_iss, B_FALSE, NULL, B_FALSE); 15362 if (syn_mp) { 15363 cred_t *cr; 15364 pid_t pid; 15365 15366 /* 15367 * Obtain the credential from the 15368 * thread calling connect(); the credential 15369 * lives on in the second mblk which 15370 * originated from T_CONN_REQ and is echoed 15371 * with the T_BIND_ACK from ip. If none 15372 * can be found, default to the creator 15373 * of the socket. 15374 */ 15375 if (mp->b_cont == NULL || 15376 (cr = DB_CRED(mp->b_cont)) == NULL) { 15377 cr = tcp->tcp_cred; 15378 pid = tcp->tcp_cpid; 15379 } else { 15380 pid = DB_CPID(mp->b_cont); 15381 } 15382 15383 TCP_RECORD_TRACE(tcp, syn_mp, 15384 TCP_TRACE_SEND_PKT); 15385 mblk_setcred(syn_mp, cr); 15386 DB_CPID(syn_mp) = pid; 15387 tcp_send_data(tcp, tcp->tcp_wq, syn_mp); 15388 } 15389 after_syn_sent: 15390 /* 15391 * A trailer mblk indicates a waiting client upstream. 15392 * We complete here the processing begun in 15393 * either tcp_bind() or tcp_connect() by passing 15394 * upstream the reply message they supplied. 15395 */ 15396 mp1 = mp; 15397 mp = mp->b_cont; 15398 freeb(mp1); 15399 if (mp) 15400 break; 15401 return; 15402 case T_ERROR_ACK: 15403 if (tcp->tcp_debug) { 15404 (void) strlog(TCP_MOD_ID, 0, 1, 15405 SL_TRACE|SL_ERROR, 15406 "tcp_rput_other: case T_ERROR_ACK, " 15407 "ERROR_prim == %d", 15408 tea->ERROR_prim); 15409 } 15410 switch (tea->ERROR_prim) { 15411 case O_T_BIND_REQ: 15412 case T_BIND_REQ: 15413 tcp_bind_failed(tcp, mp, 15414 (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? 15415 ENETUNREACH : EADDRNOTAVAIL)); 15416 return; 15417 case T_UNBIND_REQ: 15418 tcp->tcp_hard_binding = B_FALSE; 15419 tcp->tcp_hard_bound = B_FALSE; 15420 if (mp->b_cont) { 15421 freemsg(mp->b_cont); 15422 mp->b_cont = NULL; 15423 } 15424 if (tcp->tcp_unbind_pending) 15425 tcp->tcp_unbind_pending = 0; 15426 else { 15427 /* From tcp_ip_unbind() - free */ 15428 freemsg(mp); 15429 return; 15430 } 15431 break; 15432 case T_SVR4_OPTMGMT_REQ: 15433 if (tcp->tcp_drop_opt_ack_cnt > 0) { 15434 /* T_OPTMGMT_REQ generated by TCP */ 15435 printf("T_SVR4_OPTMGMT_REQ failed " 15436 "%d/%d - dropped (cnt %d)\n", 15437 tea->TLI_error, tea->UNIX_error, 15438 tcp->tcp_drop_opt_ack_cnt); 15439 freemsg(mp); 15440 tcp->tcp_drop_opt_ack_cnt--; 15441 return; 15442 } 15443 break; 15444 } 15445 if (tea->ERROR_prim == T_SVR4_OPTMGMT_REQ && 15446 tcp->tcp_drop_opt_ack_cnt > 0) { 15447 printf("T_SVR4_OPTMGMT_REQ failed %d/%d " 15448 "- dropped (cnt %d)\n", 15449 tea->TLI_error, tea->UNIX_error, 15450 tcp->tcp_drop_opt_ack_cnt); 15451 freemsg(mp); 15452 tcp->tcp_drop_opt_ack_cnt--; 15453 return; 15454 } 15455 break; 15456 case T_OPTMGMT_ACK: 15457 if (tcp->tcp_drop_opt_ack_cnt > 0) { 15458 /* T_OPTMGMT_REQ generated by TCP */ 15459 freemsg(mp); 15460 tcp->tcp_drop_opt_ack_cnt--; 15461 return; 15462 } 15463 break; 15464 default: 15465 break; 15466 } 15467 break; 15468 case M_CTL: 15469 /* 15470 * ICMP messages. 15471 */ 15472 tcp_icmp_error(tcp, mp); 15473 return; 15474 case M_FLUSH: 15475 if (*rptr & FLUSHR) 15476 flushq(q, FLUSHDATA); 15477 break; 15478 default: 15479 break; 15480 } 15481 /* 15482 * Make sure we set this bit before sending the ACK for 15483 * bind. 
Otherwise accept could possibly run and free 15484 * this tcp struct. 15485 */ 15486 putnext(q, mp); 15487 } 15488 15489 /* 15490 * Called as the result of a qbufcall or a qtimeout to remedy a failure 15491 * to allocate a T_ordrel_ind in tcp_rsrv(). qenable(q) will make 15492 * tcp_rsrv() try again. 15493 */ 15494 static void 15495 tcp_ordrel_kick(void *arg) 15496 { 15497 conn_t *connp = (conn_t *)arg; 15498 tcp_t *tcp = connp->conn_tcp; 15499 15500 tcp->tcp_ordrelid = 0; 15501 tcp->tcp_timeout = B_FALSE; 15502 if (!TCP_IS_DETACHED(tcp) && tcp->tcp_rq != NULL && 15503 tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 15504 qenable(tcp->tcp_rq); 15505 } 15506 } 15507 15508 /* ARGSUSED */ 15509 static void 15510 tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) 15511 { 15512 conn_t *connp = (conn_t *)arg; 15513 tcp_t *tcp = connp->conn_tcp; 15514 queue_t *q = tcp->tcp_rq; 15515 uint_t thwin; 15516 15517 freeb(mp); 15518 15519 TCP_STAT(tcp_rsrv_calls); 15520 15521 if (TCP_IS_DETACHED(tcp) || q == NULL) { 15522 return; 15523 } 15524 15525 if (tcp->tcp_fused) { 15526 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 15527 15528 ASSERT(tcp->tcp_fused); 15529 ASSERT(peer_tcp != NULL && peer_tcp->tcp_fused); 15530 ASSERT(peer_tcp->tcp_loopback_peer == tcp); 15531 ASSERT(!TCP_IS_DETACHED(tcp)); 15532 ASSERT(tcp->tcp_connp->conn_sqp == 15533 peer_tcp->tcp_connp->conn_sqp); 15534 15535 /* 15536 * Normally we would not get backenabled in synchronous 15537 * streams mode, but in case this happens, we need to plug 15538 * synchronous streams during our drain to prevent a race 15539 * with tcp_fuse_rrw() or tcp_fuse_rinfop(). 15540 */ 15541 TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); 15542 if (tcp->tcp_rcv_list != NULL) 15543 (void) tcp_rcv_drain(tcp->tcp_rq, tcp); 15544 15545 tcp_clrqfull(peer_tcp); 15546 TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); 15547 TCP_STAT(tcp_fusion_backenabled); 15548 return; 15549 } 15550 15551 if (canputnext(q)) { 15552 tcp->tcp_rwnd = q->q_hiwat; 15553 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 15554 << tcp->tcp_rcv_ws; 15555 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 15556 /* 15557 * Send back a window update immediately if TCP is above 15558 * ESTABLISHED state and the increase of the rcv window 15559 * that the other side knows is at least 1 MSS after flow 15560 * control is lifted. 15561 */ 15562 if (tcp->tcp_state >= TCPS_ESTABLISHED && 15563 (q->q_hiwat - thwin >= tcp->tcp_mss)) { 15564 tcp_xmit_ctl(NULL, tcp, 15565 (tcp->tcp_swnd == 0) ? tcp->tcp_suna : 15566 tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 15567 BUMP_MIB(&tcp_mib, tcpOutWinUpdate); 15568 } 15569 } 15570 /* Handle a failure to allocate a T_ORDREL_IND here */ 15571 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 15572 ASSERT(tcp->tcp_listener == NULL); 15573 if (tcp->tcp_rcv_list != NULL) { 15574 (void) tcp_rcv_drain(q, tcp); 15575 } 15576 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 15577 mp = mi_tpi_ordrel_ind(); 15578 if (mp) { 15579 tcp->tcp_ordrel_done = B_TRUE; 15580 putnext(q, mp); 15581 if (tcp->tcp_deferred_clean_death) { 15582 /* 15583 * tcp_clean_death was deferred for 15584 * T_ORDREL_IND - do it now 15585 */ 15586 tcp->tcp_deferred_clean_death = B_FALSE; 15587 (void) tcp_clean_death(tcp, 15588 tcp->tcp_client_errno, 22); 15589 } 15590 } else if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) { 15591 /* 15592 * If there isn't already a timer running 15593 * start one. Use a 4 second 15594 * timer as a fallback since it can't fail. 
15595 */ 15596 tcp->tcp_timeout = B_TRUE; 15597 tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick, 15598 MSEC_TO_TICK(4000)); 15599 } 15600 } 15601 } 15602 15603 /* 15604 * The read side service routine is called mostly when we get back-enabled as a 15605 * result of flow control relief. Since we don't actually queue anything in 15606 * TCP, we have no data to send out of here. What we do is clear the receive 15607 * window, and send out a window update. 15608 * This routine is also called to drive an orderly release message upstream 15609 * if the attempt in tcp_rput failed. 15610 */ 15611 static void 15612 tcp_rsrv(queue_t *q) 15613 { 15614 conn_t *connp = Q_TO_CONN(q); 15615 tcp_t *tcp = connp->conn_tcp; 15616 mblk_t *mp; 15617 15618 /* No code does a putq on the read side */ 15619 ASSERT(q->q_first == NULL); 15620 15621 /* Nothing to do for the default queue */ 15622 if (q == tcp_g_q) { 15623 return; 15624 } 15625 15626 mp = allocb(0, BPRI_HI); 15627 if (mp == NULL) { 15628 /* 15629 * We are under memory pressure. Return for now and we 15630 * we will be called again later. 15631 */ 15632 if (!tcp->tcp_timeout && tcp->tcp_ordrelid == 0) { 15633 /* 15634 * If there isn't already a timer running 15635 * start one. Use a 4 second 15636 * timer as a fallback since it can't fail. 15637 */ 15638 tcp->tcp_timeout = B_TRUE; 15639 tcp->tcp_ordrelid = TCP_TIMER(tcp, tcp_ordrel_kick, 15640 MSEC_TO_TICK(4000)); 15641 } 15642 return; 15643 } 15644 CONN_INC_REF(connp); 15645 squeue_enter(connp->conn_sqp, mp, tcp_rsrv_input, connp, 15646 SQTAG_TCP_RSRV); 15647 } 15648 15649 /* 15650 * tcp_rwnd_set() is called to adjust the receive window to a desired value. 15651 * We do not allow the receive window to shrink. After setting rwnd, 15652 * set the flow control hiwat of the stream. 15653 * 15654 * This function is called in 2 cases: 15655 * 15656 * 1) Before data transfer begins, in tcp_accept_comm() for accepting a 15657 * connection (passive open) and in tcp_rput_data() for active connect. 15658 * This is called after tcp_mss_set() when the desired MSS value is known. 15659 * This makes sure that our window size is a mutiple of the other side's 15660 * MSS. 15661 * 2) Handling SO_RCVBUF option. 15662 * 15663 * It is ASSUMED that the requested size is a multiple of the current MSS. 15664 * 15665 * XXX - Should allow a lower rwnd than tcp_recv_hiwat_minmss * mss if the 15666 * user requests so. 15667 */ 15668 static int 15669 tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) 15670 { 15671 uint32_t mss = tcp->tcp_mss; 15672 uint32_t old_max_rwnd; 15673 uint32_t max_transmittable_rwnd; 15674 boolean_t tcp_detached = TCP_IS_DETACHED(tcp); 15675 15676 if (tcp->tcp_fused) { 15677 size_t sth_hiwat; 15678 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 15679 15680 ASSERT(peer_tcp != NULL); 15681 /* 15682 * Record the stream head's high water mark for 15683 * this endpoint; this is used for flow-control 15684 * purposes in tcp_fuse_output(). 15685 */ 15686 sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); 15687 if (!tcp_detached) 15688 (void) mi_set_sth_hiwat(tcp->tcp_rq, sth_hiwat); 15689 15690 /* 15691 * In the fusion case, the maxpsz stream head value of 15692 * our peer is set according to its send buffer size 15693 * and our receive buffer size; since the latter may 15694 * have changed we need to update the peer's maxpsz. 
15695 */ 15696 (void) tcp_maxpsz_set(peer_tcp, B_TRUE); 15697 return (rwnd); 15698 } 15699 15700 if (tcp_detached) 15701 old_max_rwnd = tcp->tcp_rwnd; 15702 else 15703 old_max_rwnd = tcp->tcp_rq->q_hiwat; 15704 15705 /* 15706 * Insist on a receive window that is at least 15707 * tcp_recv_hiwat_minmss * MSS (default 4 * MSS) to avoid 15708 * funny TCP interactions of Nagle algorithm, SWS avoidance 15709 * and delayed acknowledgement. 15710 */ 15711 rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss); 15712 15713 /* 15714 * If window size info has already been exchanged, TCP should not 15715 * shrink the window. Shrinking window is doable if done carefully. 15716 * We may add that support later. But so far there is not a real 15717 * need to do that. 15718 */ 15719 if (rwnd < old_max_rwnd && tcp->tcp_state > TCPS_SYN_SENT) { 15720 /* MSS may have changed, do a round up again. */ 15721 rwnd = MSS_ROUNDUP(old_max_rwnd, mss); 15722 } 15723 15724 /* 15725 * tcp_rcv_ws starts with TCP_MAX_WINSHIFT so the following check 15726 * can be applied even before the window scale option is decided. 15727 */ 15728 max_transmittable_rwnd = TCP_MAXWIN << tcp->tcp_rcv_ws; 15729 if (rwnd > max_transmittable_rwnd) { 15730 rwnd = max_transmittable_rwnd - 15731 (max_transmittable_rwnd % mss); 15732 if (rwnd < mss) 15733 rwnd = max_transmittable_rwnd; 15734 /* 15735 * If we're over the limit we may have to back down tcp_rwnd. 15736 * The increment below won't work for us. So we set all three 15737 * here and the increment below will have no effect. 15738 */ 15739 tcp->tcp_rwnd = old_max_rwnd = rwnd; 15740 } 15741 if (tcp->tcp_localnet) { 15742 tcp->tcp_rack_abs_max = 15743 MIN(tcp_local_dacks_max, rwnd / mss / 2); 15744 } else { 15745 /* 15746 * For a remote host on a different subnet (through a router), 15747 * we ack every other packet to be conforming to RFC1122. 15748 * tcp_deferred_acks_max is default to 2. 15749 */ 15750 tcp->tcp_rack_abs_max = 15751 MIN(tcp_deferred_acks_max, rwnd / mss / 2); 15752 } 15753 if (tcp->tcp_rack_cur_max > tcp->tcp_rack_abs_max) 15754 tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; 15755 else 15756 tcp->tcp_rack_cur_max = 0; 15757 /* 15758 * Increment the current rwnd by the amount the maximum grew (we 15759 * can not overwrite it since we might be in the middle of a 15760 * connection.) 15761 */ 15762 tcp->tcp_rwnd += rwnd - old_max_rwnd; 15763 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); 15764 if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) 15765 tcp->tcp_cwnd_max = rwnd; 15766 15767 if (tcp_detached) 15768 return (rwnd); 15769 /* 15770 * We set the maximum receive window into rq->q_hiwat. 15771 * This is not actually used for flow control. 15772 */ 15773 tcp->tcp_rq->q_hiwat = rwnd; 15774 /* 15775 * Set the Stream head high water mark. This doesn't have to be 15776 * here, since we are simply using default values, but we would 15777 * prefer to choose these values algorithmically, with a likely 15778 * relationship to rwnd. 15779 */ 15780 (void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd, tcp_sth_rcv_hiwat)); 15781 return (rwnd); 15782 } 15783 15784 /* 15785 * Return SNMP stuff in buffer in mpdata. 
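 *
 * (Illustrative note, not part of the original comment: the reply is
 * actually several messages, each sent with qreply(),
 *
 *	mpctl		fixed tcp_mib counters		MIB2_TCP, name 0
 *	mp_conn_ctl	IPv4 connection table		MIB2_TCP_CONN
 *	mp_attr_ctl	IPv4 MLP attributes, if any	EXPER_XPORT_MLP
 *	mp6_conn_ctl	IPv6 connection table		MIB2_TCP6_CONN
 *	mp6_attr_ctl	IPv6 MLP attributes, if any	EXPER_XPORT_MLP
 *
 * filled by one walk of ipcl_globalhash_fanout restricted to the
 * caller's zone.)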
15786 */ 15787 int 15788 tcp_snmp_get(queue_t *q, mblk_t *mpctl) 15789 { 15790 mblk_t *mpdata; 15791 mblk_t *mp_conn_ctl = NULL; 15792 mblk_t *mp_conn_tail; 15793 mblk_t *mp_attr_ctl = NULL; 15794 mblk_t *mp_attr_tail; 15795 mblk_t *mp6_conn_ctl = NULL; 15796 mblk_t *mp6_conn_tail; 15797 mblk_t *mp6_attr_ctl = NULL; 15798 mblk_t *mp6_attr_tail; 15799 struct opthdr *optp; 15800 mib2_tcpConnEntry_t tce; 15801 mib2_tcp6ConnEntry_t tce6; 15802 mib2_transportMLPEntry_t mlp; 15803 connf_t *connfp; 15804 conn_t *connp; 15805 int i; 15806 boolean_t ispriv; 15807 zoneid_t zoneid; 15808 int v4_conn_idx; 15809 int v6_conn_idx; 15810 15811 if (mpctl == NULL || 15812 (mpdata = mpctl->b_cont) == NULL || 15813 (mp_conn_ctl = copymsg(mpctl)) == NULL || 15814 (mp_attr_ctl = copymsg(mpctl)) == NULL || 15815 (mp6_conn_ctl = copymsg(mpctl)) == NULL || 15816 (mp6_attr_ctl = copymsg(mpctl)) == NULL) { 15817 freemsg(mp_conn_ctl); 15818 freemsg(mp_attr_ctl); 15819 freemsg(mp6_conn_ctl); 15820 freemsg(mp6_attr_ctl); 15821 return (0); 15822 } 15823 15824 /* build table of connections -- need count in fixed part */ 15825 SET_MIB(tcp_mib.tcpRtoAlgorithm, 4); /* vanj */ 15826 SET_MIB(tcp_mib.tcpRtoMin, tcp_rexmit_interval_min); 15827 SET_MIB(tcp_mib.tcpRtoMax, tcp_rexmit_interval_max); 15828 SET_MIB(tcp_mib.tcpMaxConn, -1); 15829 SET_MIB(tcp_mib.tcpCurrEstab, 0); 15830 15831 ispriv = 15832 secpolicy_net_config((Q_TO_CONN(q))->conn_cred, B_TRUE) == 0; 15833 zoneid = Q_TO_CONN(q)->conn_zoneid; 15834 15835 v4_conn_idx = v6_conn_idx = 0; 15836 mp_conn_tail = mp_attr_tail = mp6_conn_tail = mp6_attr_tail = NULL; 15837 15838 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 15839 15840 connfp = &ipcl_globalhash_fanout[i]; 15841 15842 connp = NULL; 15843 15844 while ((connp = 15845 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 15846 tcp_t *tcp; 15847 boolean_t needattr; 15848 15849 if (connp->conn_zoneid != zoneid) 15850 continue; /* not in this zone */ 15851 15852 tcp = connp->conn_tcp; 15853 UPDATE_MIB(&tcp_mib, tcpInSegs, tcp->tcp_ibsegs); 15854 tcp->tcp_ibsegs = 0; 15855 UPDATE_MIB(&tcp_mib, tcpOutSegs, tcp->tcp_obsegs); 15856 tcp->tcp_obsegs = 0; 15857 15858 tce6.tcp6ConnState = tce.tcpConnState = 15859 tcp_snmp_state(tcp); 15860 if (tce.tcpConnState == MIB2_TCP_established || 15861 tce.tcpConnState == MIB2_TCP_closeWait) 15862 BUMP_MIB(&tcp_mib, tcpCurrEstab); 15863 15864 needattr = B_FALSE; 15865 bzero(&mlp, sizeof (mlp)); 15866 if (connp->conn_mlp_type != mlptSingle) { 15867 if (connp->conn_mlp_type == mlptShared || 15868 connp->conn_mlp_type == mlptBoth) 15869 mlp.tme_flags |= MIB2_TMEF_SHARED; 15870 if (connp->conn_mlp_type == mlptPrivate || 15871 connp->conn_mlp_type == mlptBoth) 15872 mlp.tme_flags |= MIB2_TMEF_PRIVATE; 15873 needattr = B_TRUE; 15874 } 15875 if (connp->conn_peercred != NULL) { 15876 ts_label_t *tsl; 15877 15878 tsl = crgetlabel(connp->conn_peercred); 15879 mlp.tme_doi = label2doi(tsl); 15880 mlp.tme_label = *label2bslabel(tsl); 15881 needattr = B_TRUE; 15882 } 15883 15884 /* Create a message to report on IPv6 entries */ 15885 if (tcp->tcp_ipversion == IPV6_VERSION) { 15886 tce6.tcp6ConnLocalAddress = tcp->tcp_ip_src_v6; 15887 tce6.tcp6ConnRemAddress = tcp->tcp_remote_v6; 15888 tce6.tcp6ConnLocalPort = ntohs(tcp->tcp_lport); 15889 tce6.tcp6ConnRemPort = ntohs(tcp->tcp_fport); 15890 tce6.tcp6ConnIfIndex = tcp->tcp_bound_if; 15891 /* Don't want just anybody seeing these... 
*/ 15892 if (ispriv) { 15893 tce6.tcp6ConnEntryInfo.ce_snxt = 15894 tcp->tcp_snxt; 15895 tce6.tcp6ConnEntryInfo.ce_suna = 15896 tcp->tcp_suna; 15897 tce6.tcp6ConnEntryInfo.ce_rnxt = 15898 tcp->tcp_rnxt; 15899 tce6.tcp6ConnEntryInfo.ce_rack = 15900 tcp->tcp_rack; 15901 } else { 15902 /* 15903 * Netstat, unfortunately, uses this to 15904 * get send/receive queue sizes. How to fix? 15905 * Why not compute the difference only? 15906 */ 15907 tce6.tcp6ConnEntryInfo.ce_snxt = 15908 tcp->tcp_snxt - tcp->tcp_suna; 15909 tce6.tcp6ConnEntryInfo.ce_suna = 0; 15910 tce6.tcp6ConnEntryInfo.ce_rnxt = 15911 tcp->tcp_rnxt - tcp->tcp_rack; 15912 tce6.tcp6ConnEntryInfo.ce_rack = 0; 15913 } 15914 15915 tce6.tcp6ConnEntryInfo.ce_swnd = tcp->tcp_swnd; 15916 tce6.tcp6ConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; 15917 tce6.tcp6ConnEntryInfo.ce_rto = tcp->tcp_rto; 15918 tce6.tcp6ConnEntryInfo.ce_mss = tcp->tcp_mss; 15919 tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; 15920 15921 (void) snmp_append_data2(mp6_conn_ctl->b_cont, 15922 &mp6_conn_tail, (char *)&tce6, sizeof (tce6)); 15923 15924 mlp.tme_connidx = v6_conn_idx++; 15925 if (needattr) 15926 (void) snmp_append_data2(mp6_attr_ctl->b_cont, 15927 &mp6_attr_tail, (char *)&mlp, sizeof (mlp)); 15928 } 15929 /* 15930 * Create an IPv4 table entry for IPv4 entries and also 15931 * for IPv6 entries which are bound to in6addr_any 15932 * but don't have IPV6_V6ONLY set. 15933 * (i.e. anything an IPv4 peer could connect to) 15934 */ 15935 if (tcp->tcp_ipversion == IPV4_VERSION || 15936 (tcp->tcp_state <= TCPS_LISTEN && 15937 !tcp->tcp_connp->conn_ipv6_v6only && 15938 IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6))) { 15939 if (tcp->tcp_ipversion == IPV6_VERSION) { 15940 tce.tcpConnRemAddress = INADDR_ANY; 15941 tce.tcpConnLocalAddress = INADDR_ANY; 15942 } else { 15943 tce.tcpConnRemAddress = 15944 tcp->tcp_remote; 15945 tce.tcpConnLocalAddress = 15946 tcp->tcp_ip_src; 15947 } 15948 tce.tcpConnLocalPort = ntohs(tcp->tcp_lport); 15949 tce.tcpConnRemPort = ntohs(tcp->tcp_fport); 15950 /* Don't want just anybody seeing these... */ 15951 if (ispriv) { 15952 tce.tcpConnEntryInfo.ce_snxt = 15953 tcp->tcp_snxt; 15954 tce.tcpConnEntryInfo.ce_suna = 15955 tcp->tcp_suna; 15956 tce.tcpConnEntryInfo.ce_rnxt = 15957 tcp->tcp_rnxt; 15958 tce.tcpConnEntryInfo.ce_rack = 15959 tcp->tcp_rack; 15960 } else { 15961 /* 15962 * Netstat, unfortunately, uses this to 15963 * get send/receive queue sizes. How 15964 * to fix? 15965 * Why not compute the difference only? 
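 *
 * (For unprivileged readers the code below therefore reports only the
 * differences, ce_snxt = snxt - suna (bytes unacknowledged on the send
 * side) and ce_rnxt = rnxt - rack (bytes received but not yet acked),
 * with ce_suna and ce_rack zeroed, just as the IPv6 branch above does.)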
15966 */ 15967 tce.tcpConnEntryInfo.ce_snxt = 15968 tcp->tcp_snxt - tcp->tcp_suna; 15969 tce.tcpConnEntryInfo.ce_suna = 0; 15970 tce.tcpConnEntryInfo.ce_rnxt = 15971 tcp->tcp_rnxt - tcp->tcp_rack; 15972 tce.tcpConnEntryInfo.ce_rack = 0; 15973 } 15974 15975 tce.tcpConnEntryInfo.ce_swnd = tcp->tcp_swnd; 15976 tce.tcpConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; 15977 tce.tcpConnEntryInfo.ce_rto = tcp->tcp_rto; 15978 tce.tcpConnEntryInfo.ce_mss = tcp->tcp_mss; 15979 tce.tcpConnEntryInfo.ce_state = 15980 tcp->tcp_state; 15981 15982 (void) snmp_append_data2(mp_conn_ctl->b_cont, 15983 &mp_conn_tail, (char *)&tce, sizeof (tce)); 15984 15985 mlp.tme_connidx = v4_conn_idx++; 15986 if (needattr) 15987 (void) snmp_append_data2( 15988 mp_attr_ctl->b_cont, 15989 &mp_attr_tail, (char *)&mlp, 15990 sizeof (mlp)); 15991 } 15992 } 15993 } 15994 15995 /* fixed length structure for IPv4 and IPv6 counters */ 15996 SET_MIB(tcp_mib.tcpConnTableSize, sizeof (mib2_tcpConnEntry_t)); 15997 SET_MIB(tcp_mib.tcp6ConnTableSize, sizeof (mib2_tcp6ConnEntry_t)); 15998 optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; 15999 optp->level = MIB2_TCP; 16000 optp->name = 0; 16001 (void) snmp_append_data(mpdata, (char *)&tcp_mib, sizeof (tcp_mib)); 16002 optp->len = msgdsize(mpdata); 16003 qreply(q, mpctl); 16004 16005 /* table of connections... */ 16006 optp = (struct opthdr *)&mp_conn_ctl->b_rptr[ 16007 sizeof (struct T_optmgmt_ack)]; 16008 optp->level = MIB2_TCP; 16009 optp->name = MIB2_TCP_CONN; 16010 optp->len = msgdsize(mp_conn_ctl->b_cont); 16011 qreply(q, mp_conn_ctl); 16012 16013 /* table of MLP attributes... */ 16014 optp = (struct opthdr *)&mp_attr_ctl->b_rptr[ 16015 sizeof (struct T_optmgmt_ack)]; 16016 optp->level = MIB2_TCP; 16017 optp->name = EXPER_XPORT_MLP; 16018 optp->len = msgdsize(mp_attr_ctl->b_cont); 16019 if (optp->len == 0) 16020 freemsg(mp_attr_ctl); 16021 else 16022 qreply(q, mp_attr_ctl); 16023 16024 /* table of IPv6 connections... */ 16025 optp = (struct opthdr *)&mp6_conn_ctl->b_rptr[ 16026 sizeof (struct T_optmgmt_ack)]; 16027 optp->level = MIB2_TCP6; 16028 optp->name = MIB2_TCP6_CONN; 16029 optp->len = msgdsize(mp6_conn_ctl->b_cont); 16030 qreply(q, mp6_conn_ctl); 16031 16032 /* table of IPv6 MLP attributes... */ 16033 optp = (struct opthdr *)&mp6_attr_ctl->b_rptr[ 16034 sizeof (struct T_optmgmt_ack)]; 16035 optp->level = MIB2_TCP6; 16036 optp->name = EXPER_XPORT_MLP; 16037 optp->len = msgdsize(mp6_attr_ctl->b_cont); 16038 if (optp->len == 0) 16039 freemsg(mp6_attr_ctl); 16040 else 16041 qreply(q, mp6_attr_ctl); 16042 return (1); 16043 } 16044 16045 /* Return 0 if invalid set request, 1 otherwise, including non-tcp requests */ 16046 /* ARGSUSED */ 16047 int 16048 tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) 16049 { 16050 mib2_tcpConnEntry_t *tce = (mib2_tcpConnEntry_t *)ptr; 16051 16052 switch (level) { 16053 case MIB2_TCP: 16054 switch (name) { 16055 case 13: 16056 if (tce->tcpConnState != MIB2_TCP_deleteTCB) 16057 return (0); 16058 /* TODO: delete entry defined by tce */ 16059 return (1); 16060 default: 16061 return (0); 16062 } 16063 default: 16064 return (1); 16065 } 16066 } 16067 16068 /* Translate TCP state to MIB2 TCP state. 
*/ 16069 static int 16070 tcp_snmp_state(tcp_t *tcp) 16071 { 16072 if (tcp == NULL) 16073 return (0); 16074 16075 switch (tcp->tcp_state) { 16076 case TCPS_CLOSED: 16077 case TCPS_IDLE: /* RFC1213 doesn't have analogue for IDLE & BOUND */ 16078 case TCPS_BOUND: 16079 return (MIB2_TCP_closed); 16080 case TCPS_LISTEN: 16081 return (MIB2_TCP_listen); 16082 case TCPS_SYN_SENT: 16083 return (MIB2_TCP_synSent); 16084 case TCPS_SYN_RCVD: 16085 return (MIB2_TCP_synReceived); 16086 case TCPS_ESTABLISHED: 16087 return (MIB2_TCP_established); 16088 case TCPS_CLOSE_WAIT: 16089 return (MIB2_TCP_closeWait); 16090 case TCPS_FIN_WAIT_1: 16091 return (MIB2_TCP_finWait1); 16092 case TCPS_CLOSING: 16093 return (MIB2_TCP_closing); 16094 case TCPS_LAST_ACK: 16095 return (MIB2_TCP_lastAck); 16096 case TCPS_FIN_WAIT_2: 16097 return (MIB2_TCP_finWait2); 16098 case TCPS_TIME_WAIT: 16099 return (MIB2_TCP_timeWait); 16100 default: 16101 return (0); 16102 } 16103 } 16104 16105 static char tcp_report_header[] = 16106 "TCP " MI_COL_HDRPAD_STR 16107 "zone dest snxt suna " 16108 "swnd rnxt rack rwnd rto mss w sw rw t " 16109 "recent [lport,fport] state"; 16110 16111 /* 16112 * TCP status report triggered via the Named Dispatch mechanism. 16113 */ 16114 /* ARGSUSED */ 16115 static void 16116 tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, tcp_t *thisstream, 16117 cred_t *cr) 16118 { 16119 char hash[10], addrbuf[INET6_ADDRSTRLEN]; 16120 boolean_t ispriv = secpolicy_net_config(cr, B_TRUE) == 0; 16121 char cflag; 16122 in6_addr_t v6dst; 16123 char buf[80]; 16124 uint_t print_len, buf_len; 16125 16126 buf_len = mp->b_datap->db_lim - mp->b_wptr; 16127 if (buf_len <= 0) 16128 return; 16129 16130 if (hashval >= 0) 16131 (void) sprintf(hash, "%03d ", hashval); 16132 else 16133 hash[0] = '\0'; 16134 16135 /* 16136 * Note that we use the remote address in the tcp_b structure. 16137 * This means that it will print out the real destination address, 16138 * not the next hop's address if source routing is used. This 16139 * avoid the confusion on the output because user may not 16140 * know that source routing is used for a connection. 16141 */ 16142 if (tcp->tcp_ipversion == IPV4_VERSION) { 16143 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &v6dst); 16144 } else { 16145 v6dst = tcp->tcp_remote_v6; 16146 } 16147 (void) inet_ntop(AF_INET6, &v6dst, addrbuf, sizeof (addrbuf)); 16148 /* 16149 * the ispriv checks are so that normal users cannot determine 16150 * sequence number information using NDD. 16151 */ 16152 16153 if (TCP_IS_DETACHED(tcp)) 16154 cflag = '*'; 16155 else 16156 cflag = ' '; 16157 print_len = snprintf((char *)mp->b_wptr, buf_len, 16158 "%s " MI_COL_PTRFMT_STR "%d %s %08x %08x %010d %08x %08x " 16159 "%010d %05ld %05d %1d %02d %02d %1d %08x %s%c\n", 16160 hash, 16161 (void *)tcp, 16162 tcp->tcp_connp->conn_zoneid, 16163 addrbuf, 16164 (ispriv) ? tcp->tcp_snxt : 0, 16165 (ispriv) ? tcp->tcp_suna : 0, 16166 tcp->tcp_swnd, 16167 (ispriv) ? tcp->tcp_rnxt : 0, 16168 (ispriv) ? tcp->tcp_rack : 0, 16169 tcp->tcp_rwnd, 16170 tcp->tcp_rto, 16171 tcp->tcp_mss, 16172 tcp->tcp_snd_ws_ok, 16173 tcp->tcp_snd_ws, 16174 tcp->tcp_rcv_ws, 16175 tcp->tcp_snd_ts_ok, 16176 tcp->tcp_ts_recent, 16177 tcp_display(tcp, buf, DISP_PORT_ONLY), cflag); 16178 if (print_len < buf_len) { 16179 ((mblk_t *)mp)->b_wptr += print_len; 16180 } else { 16181 ((mblk_t *)mp)->b_wptr += buf_len; 16182 } 16183 } 16184 16185 /* 16186 * TCP status report (for listeners only) triggered via the Named Dispatch 16187 * mechanism. 
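 *
 * (Illustrative note, not part of the original comment: each line of
 * the report below carries the bind-hash bucket, the tcp_t pointer,
 * the zone id, the bound local address and port, the connection
 * request sequence number, the q0/q/max backlog counts, and a
 * trailing '*' when SYN-flood defense is active on that listener.)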
16188 */ 16189 /* ARGSUSED */ 16190 static void 16191 tcp_report_listener(mblk_t *mp, tcp_t *tcp, int hashval) 16192 { 16193 char addrbuf[INET6_ADDRSTRLEN]; 16194 in6_addr_t v6dst; 16195 uint_t print_len, buf_len; 16196 16197 buf_len = mp->b_datap->db_lim - mp->b_wptr; 16198 if (buf_len <= 0) 16199 return; 16200 16201 if (tcp->tcp_ipversion == IPV4_VERSION) { 16202 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6dst); 16203 (void) inet_ntop(AF_INET6, &v6dst, addrbuf, sizeof (addrbuf)); 16204 } else { 16205 (void) inet_ntop(AF_INET6, &tcp->tcp_ip6h->ip6_src, 16206 addrbuf, sizeof (addrbuf)); 16207 } 16208 print_len = snprintf((char *)mp->b_wptr, buf_len, 16209 "%03d " 16210 MI_COL_PTRFMT_STR 16211 "%d %s %05u %08u %d/%d/%d%c\n", 16212 hashval, (void *)tcp, 16213 tcp->tcp_connp->conn_zoneid, 16214 addrbuf, 16215 (uint_t)BE16_TO_U16(tcp->tcp_tcph->th_lport), 16216 tcp->tcp_conn_req_seqnum, 16217 tcp->tcp_conn_req_cnt_q0, tcp->tcp_conn_req_cnt_q, 16218 tcp->tcp_conn_req_max, 16219 tcp->tcp_syn_defense ? '*' : ' '); 16220 if (print_len < buf_len) { 16221 ((mblk_t *)mp)->b_wptr += print_len; 16222 } else { 16223 ((mblk_t *)mp)->b_wptr += buf_len; 16224 } 16225 } 16226 16227 /* TCP status report triggered via the Named Dispatch mechanism. */ 16228 /* ARGSUSED */ 16229 static int 16230 tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16231 { 16232 tcp_t *tcp; 16233 int i; 16234 conn_t *connp; 16235 connf_t *connfp; 16236 zoneid_t zoneid; 16237 16238 /* 16239 * Because of the ndd constraint, at most we can have 64K buffer 16240 * to put in all TCP info. So to be more efficient, just 16241 * allocate a 64K buffer here, assuming we need that large buffer. 16242 * This may be a problem as any user can read tcp_status. Therefore 16243 * we limit the rate of doing this using tcp_ndd_get_info_interval. 16244 * This should be OK as normal users should not do this too often. 16245 */ 16246 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16247 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16248 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16249 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16250 return (0); 16251 } 16252 } 16253 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16254 /* The following may work even if we cannot get a large buf. */ 16255 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16256 return (0); 16257 } 16258 16259 (void) mi_mpprintf(mp, "%s", tcp_report_header); 16260 16261 zoneid = Q_TO_CONN(q)->conn_zoneid; 16262 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 16263 16264 connfp = &ipcl_globalhash_fanout[i]; 16265 16266 connp = NULL; 16267 16268 while ((connp = 16269 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 16270 tcp = connp->conn_tcp; 16271 if (zoneid != GLOBAL_ZONEID && 16272 zoneid != connp->conn_zoneid) 16273 continue; 16274 tcp_report_item(mp->b_cont, tcp, -1, tcp, 16275 cr); 16276 } 16277 16278 } 16279 16280 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16281 return (0); 16282 } 16283 16284 /* TCP status report triggered via the Named Dispatch mechanism. */ 16285 /* ARGSUSED */ 16286 static int 16287 tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16288 { 16289 tf_t *tbf; 16290 tcp_t *tcp; 16291 int i; 16292 zoneid_t zoneid; 16293 16294 /* Refer to comments in tcp_status_report(). 
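 * (These hash reports are read through the same Named Dispatch path as
 * the tcp_status report above; a hypothetical invocation would look
 * like "ndd /dev/tcp tcp_status", with the exact ND variable names
 * being the ones this file registers at initialization.)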
*/ 16295 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16296 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16297 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16298 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16299 return (0); 16300 } 16301 } 16302 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16303 /* The following may work even if we cannot get a large buf. */ 16304 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16305 return (0); 16306 } 16307 16308 (void) mi_mpprintf(mp, " %s", tcp_report_header); 16309 16310 zoneid = Q_TO_CONN(q)->conn_zoneid; 16311 16312 for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { 16313 tbf = &tcp_bind_fanout[i]; 16314 mutex_enter(&tbf->tf_lock); 16315 for (tcp = tbf->tf_tcp; tcp != NULL; 16316 tcp = tcp->tcp_bind_hash) { 16317 if (zoneid != GLOBAL_ZONEID && 16318 zoneid != tcp->tcp_connp->conn_zoneid) 16319 continue; 16320 CONN_INC_REF(tcp->tcp_connp); 16321 tcp_report_item(mp->b_cont, tcp, i, 16322 Q_TO_TCP(q), cr); 16323 CONN_DEC_REF(tcp->tcp_connp); 16324 } 16325 mutex_exit(&tbf->tf_lock); 16326 } 16327 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16328 return (0); 16329 } 16330 16331 /* TCP status report triggered via the Named Dispatch mechanism. */ 16332 /* ARGSUSED */ 16333 static int 16334 tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16335 { 16336 connf_t *connfp; 16337 conn_t *connp; 16338 tcp_t *tcp; 16339 int i; 16340 zoneid_t zoneid; 16341 16342 /* Refer to comments in tcp_status_report(). */ 16343 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16344 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16345 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16346 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16347 return (0); 16348 } 16349 } 16350 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16351 /* The following may work even if we cannot get a large buf. */ 16352 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16353 return (0); 16354 } 16355 16356 (void) mi_mpprintf(mp, 16357 " TCP " MI_COL_HDRPAD_STR 16358 "zone IP addr port seqnum backlog (q0/q/max)"); 16359 16360 zoneid = Q_TO_CONN(q)->conn_zoneid; 16361 16362 for (i = 0; i < ipcl_bind_fanout_size; i++) { 16363 connfp = &ipcl_bind_fanout[i]; 16364 connp = NULL; 16365 while ((connp = 16366 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 16367 tcp = connp->conn_tcp; 16368 if (zoneid != GLOBAL_ZONEID && 16369 zoneid != connp->conn_zoneid) 16370 continue; 16371 tcp_report_listener(mp->b_cont, tcp, i); 16372 } 16373 } 16374 16375 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16376 return (0); 16377 } 16378 16379 /* TCP status report triggered via the Named Dispatch mechanism. */ 16380 /* ARGSUSED */ 16381 static int 16382 tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16383 { 16384 connf_t *connfp; 16385 conn_t *connp; 16386 tcp_t *tcp; 16387 int i; 16388 zoneid_t zoneid; 16389 16390 /* Refer to comments in tcp_status_report(). */ 16391 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16392 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16393 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16394 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16395 return (0); 16396 } 16397 } 16398 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16399 /* The following may work even if we cannot get a large buf. 
*/ 16400 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16401 return (0); 16402 } 16403 16404 (void) mi_mpprintf(mp, "tcp_conn_hash_size = %d", 16405 ipcl_conn_fanout_size); 16406 (void) mi_mpprintf(mp, " %s", tcp_report_header); 16407 16408 zoneid = Q_TO_CONN(q)->conn_zoneid; 16409 16410 for (i = 0; i < ipcl_conn_fanout_size; i++) { 16411 connfp = &ipcl_conn_fanout[i]; 16412 connp = NULL; 16413 while ((connp = 16414 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 16415 tcp = connp->conn_tcp; 16416 if (zoneid != GLOBAL_ZONEID && 16417 zoneid != connp->conn_zoneid) 16418 continue; 16419 tcp_report_item(mp->b_cont, tcp, i, 16420 Q_TO_TCP(q), cr); 16421 } 16422 } 16423 16424 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16425 return (0); 16426 } 16427 16428 /* TCP status report triggered via the Named Dispatch mechanism. */ 16429 /* ARGSUSED */ 16430 static int 16431 tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 16432 { 16433 tf_t *tf; 16434 tcp_t *tcp; 16435 int i; 16436 zoneid_t zoneid; 16437 16438 /* Refer to comments in tcp_status_report(). */ 16439 if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { 16440 if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < 16441 drv_usectohz(tcp_ndd_get_info_interval * 1000)) { 16442 (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); 16443 return (0); 16444 } 16445 } 16446 if ((mp->b_cont = allocb(ND_MAX_BUF_LEN, BPRI_HI)) == NULL) { 16447 /* The following may work even if we cannot get a large buf. */ 16448 (void) mi_mpprintf(mp, NDD_OUT_OF_BUF_MSG); 16449 return (0); 16450 } 16451 16452 (void) mi_mpprintf(mp, " %s", tcp_report_header); 16453 16454 zoneid = Q_TO_CONN(q)->conn_zoneid; 16455 16456 for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { 16457 tf = &tcp_acceptor_fanout[i]; 16458 mutex_enter(&tf->tf_lock); 16459 for (tcp = tf->tf_tcp; tcp != NULL; 16460 tcp = tcp->tcp_acceptor_hash) { 16461 if (zoneid != GLOBAL_ZONEID && 16462 zoneid != tcp->tcp_connp->conn_zoneid) 16463 continue; 16464 tcp_report_item(mp->b_cont, tcp, i, 16465 Q_TO_TCP(q), cr); 16466 } 16467 mutex_exit(&tf->tf_lock); 16468 } 16469 tcp_last_ndd_get_info_time = ddi_get_lbolt(); 16470 return (0); 16471 } 16472 16473 /* 16474 * tcp_timer is the timer service routine. It handles the retransmission, 16475 * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out 16476 * from the state of the tcp instance what kind of action needs to be done 16477 * at the time it is called. 
16478 */ 16479 static void 16480 tcp_timer(void *arg) 16481 { 16482 mblk_t *mp; 16483 clock_t first_threshold; 16484 clock_t second_threshold; 16485 clock_t ms; 16486 uint32_t mss; 16487 conn_t *connp = (conn_t *)arg; 16488 tcp_t *tcp = connp->conn_tcp; 16489 16490 tcp->tcp_timer_tid = 0; 16491 16492 if (tcp->tcp_fused) 16493 return; 16494 16495 first_threshold = tcp->tcp_first_timer_threshold; 16496 second_threshold = tcp->tcp_second_timer_threshold; 16497 switch (tcp->tcp_state) { 16498 case TCPS_IDLE: 16499 case TCPS_BOUND: 16500 case TCPS_LISTEN: 16501 return; 16502 case TCPS_SYN_RCVD: { 16503 tcp_t *listener = tcp->tcp_listener; 16504 16505 if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) { 16506 ASSERT(tcp->tcp_rq == listener->tcp_rq); 16507 /* it's our first timeout */ 16508 tcp->tcp_syn_rcvd_timeout = 1; 16509 mutex_enter(&listener->tcp_eager_lock); 16510 listener->tcp_syn_rcvd_timeout++; 16511 if (!listener->tcp_syn_defense && 16512 (listener->tcp_syn_rcvd_timeout > 16513 (tcp_conn_req_max_q0 >> 2)) && 16514 (tcp_conn_req_max_q0 > 200)) { 16515 /* We may be under attack. Put on a defense. */ 16516 listener->tcp_syn_defense = B_TRUE; 16517 cmn_err(CE_WARN, "High TCP connect timeout " 16518 "rate! System (port %d) may be under a " 16519 "SYN flood attack!", 16520 BE16_TO_U16(listener->tcp_tcph->th_lport)); 16521 16522 listener->tcp_ip_addr_cache = kmem_zalloc( 16523 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t), 16524 KM_NOSLEEP); 16525 } 16526 mutex_exit(&listener->tcp_eager_lock); 16527 } 16528 } 16529 /* FALLTHRU */ 16530 case TCPS_SYN_SENT: 16531 first_threshold = tcp->tcp_first_ctimer_threshold; 16532 second_threshold = tcp->tcp_second_ctimer_threshold; 16533 break; 16534 case TCPS_ESTABLISHED: 16535 case TCPS_FIN_WAIT_1: 16536 case TCPS_CLOSING: 16537 case TCPS_CLOSE_WAIT: 16538 case TCPS_LAST_ACK: 16539 /* If we have data to rexmit */ 16540 if (tcp->tcp_suna != tcp->tcp_snxt) { 16541 clock_t time_to_wait; 16542 16543 BUMP_MIB(&tcp_mib, tcpTimRetrans); 16544 if (!tcp->tcp_xmit_head) 16545 break; 16546 time_to_wait = lbolt - 16547 (clock_t)tcp->tcp_xmit_head->b_prev; 16548 time_to_wait = tcp->tcp_rto - 16549 TICK_TO_MSEC(time_to_wait); 16550 /* 16551 * If the timer fires too early, 1 clock tick earlier, 16552 * restart the timer. 16553 */ 16554 if (time_to_wait > msec_per_tick) { 16555 TCP_STAT(tcp_timer_fire_early); 16556 TCP_TIMER_RESTART(tcp, time_to_wait); 16557 return; 16558 } 16559 /* 16560 * When we probe zero windows, we force the swnd open. 16561 * If our peer acks with a closed window swnd will be 16562 * set to zero by tcp_rput(). As long as we are 16563 * receiving acks tcp_rput will 16564 * reset 'tcp_ms_we_have_waited' so as not to trip the 16565 * first and second interval actions. NOTE: the timer 16566 * interval is allowed to continue its exponential 16567 * backoff. 16568 */ 16569 if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { 16570 if (tcp->tcp_debug) { 16571 (void) strlog(TCP_MOD_ID, 0, 1, 16572 SL_TRACE, "tcp_timer: zero win"); 16573 } 16574 } else { 16575 /* 16576 * After retransmission, we need to do 16577 * slow start. Set the ssthresh to one 16578 * half of current effective window and 16579 * cwnd to one MSS. Also reset 16580 * tcp_cwnd_cnt. 16581 * 16582 * Note that if tcp_ssthresh is reduced because 16583 * of ECN, do not reduce it again unless it is 16584 * already one window of data away (tcp_cwr 16585 * should then be cleared) or this is a 16586 * timeout for a retransmitted segment. 
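Restated as standalone arithmetic, this is a sketch with illustrative names; it shows only the first-timeout case, while the code below starts the halving from the previous ssthresh once the timer has already backed off:

#include <stdint.h>

/* Returns the new congestion window and stores the new slow start threshold. */
static uint32_t
collapse_cwnd_after_rto(uint32_t snxt, uint32_t suna, uint32_t mss,
    uint32_t *ssthresh)
{
        uint32_t npkt = ((snxt - suna) >> 1) / mss;     /* half the flight, in segments */

        if (npkt < 2)
                npkt = 2;                               /* never below two segments */
        *ssthresh = npkt * mss;
        return (mss);                                   /* cwnd collapses to one segment */
}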
16587 */ 16588 uint32_t npkt; 16589 16590 if (!tcp->tcp_cwr || tcp->tcp_rexmit) { 16591 npkt = ((tcp->tcp_timer_backoff ? 16592 tcp->tcp_cwnd_ssthresh : 16593 tcp->tcp_snxt - 16594 tcp->tcp_suna) >> 1) / tcp->tcp_mss; 16595 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * 16596 tcp->tcp_mss; 16597 } 16598 tcp->tcp_cwnd = tcp->tcp_mss; 16599 tcp->tcp_cwnd_cnt = 0; 16600 if (tcp->tcp_ecn_ok) { 16601 tcp->tcp_cwr = B_TRUE; 16602 tcp->tcp_cwr_snd_max = tcp->tcp_snxt; 16603 tcp->tcp_ecn_cwr_sent = B_FALSE; 16604 } 16605 } 16606 break; 16607 } 16608 /* 16609 * We have something to send yet we cannot send. The 16610 * reason can be: 16611 * 16612 * 1. Zero send window: we need to do zero window probe. 16613 * 2. Zero cwnd: because of ECN, we need to "clock out 16614 * segments. 16615 * 3. SWS avoidance: receiver may have shrunk window, 16616 * reset our knowledge. 16617 * 16618 * Note that condition 2 can happen with either 1 or 16619 * 3. But 1 and 3 are exclusive. 16620 */ 16621 if (tcp->tcp_unsent != 0) { 16622 if (tcp->tcp_cwnd == 0) { 16623 /* 16624 * Set tcp_cwnd to 1 MSS so that a 16625 * new segment can be sent out. We 16626 * are "clocking out" new data when 16627 * the network is really congested. 16628 */ 16629 ASSERT(tcp->tcp_ecn_ok); 16630 tcp->tcp_cwnd = tcp->tcp_mss; 16631 } 16632 if (tcp->tcp_swnd == 0) { 16633 /* Extend window for zero window probe */ 16634 tcp->tcp_swnd++; 16635 tcp->tcp_zero_win_probe = B_TRUE; 16636 BUMP_MIB(&tcp_mib, tcpOutWinProbe); 16637 } else { 16638 /* 16639 * Handle timeout from sender SWS avoidance. 16640 * Reset our knowledge of the max send window 16641 * since the receiver might have reduced its 16642 * receive buffer. Avoid setting tcp_max_swnd 16643 * to one since that will essentially disable 16644 * the SWS checks. 16645 * 16646 * Note that since we don't have a SWS 16647 * state variable, if the timeout is set 16648 * for ECN but not for SWS, this 16649 * code will also be executed. This is 16650 * fine as tcp_max_swnd is updated 16651 * constantly and it will not affect 16652 * anything. 16653 */ 16654 tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2); 16655 } 16656 tcp_wput_data(tcp, NULL, B_FALSE); 16657 return; 16658 } 16659 /* Is there a FIN that needs to be to re retransmitted? */ 16660 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 16661 !tcp->tcp_fin_acked) 16662 break; 16663 /* Nothing to do, return without restarting timer. */ 16664 TCP_STAT(tcp_timer_fire_miss); 16665 return; 16666 case TCPS_FIN_WAIT_2: 16667 /* 16668 * User closed the TCP endpoint and peer ACK'ed our FIN. 16669 * We waited some time for for peer's FIN, but it hasn't 16670 * arrived. We flush the connection now to avoid 16671 * case where the peer has rebooted. 16672 */ 16673 if (TCP_IS_DETACHED(tcp)) { 16674 (void) tcp_clean_death(tcp, 0, 23); 16675 } else { 16676 TCP_TIMER_RESTART(tcp, tcp_fin_wait_2_flush_interval); 16677 } 16678 return; 16679 case TCPS_TIME_WAIT: 16680 (void) tcp_clean_death(tcp, 0, 24); 16681 return; 16682 default: 16683 if (tcp->tcp_debug) { 16684 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, 16685 "tcp_timer: strange state (%d) %s", 16686 tcp->tcp_state, tcp_display(tcp, NULL, 16687 DISP_PORT_ONLY)); 16688 } 16689 return; 16690 } 16691 if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { 16692 /* 16693 * For zero window probe, we need to send indefinitely, 16694 * unless we have not heard from the other side for some 16695 * time... 
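The test that follows can be read as a small predicate; a sketch with illustrative names (it is only reached once tcp_ms_we_have_waited has exceeded the second threshold):

static int
rexmit_should_abort(int zero_win_probe, long ms_since_last_recv,
    long second_threshold)
{
        if (!zero_win_probe)
                return (1);     /* ordinary retransmission: give up (ETIMEDOUT) */
        /* a zero window probe keeps going while the peer is still answering */
        return (ms_since_last_recv > second_threshold);
}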
16696 */ 16697 if ((tcp->tcp_zero_win_probe == 0) || 16698 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) > 16699 second_threshold)) { 16700 BUMP_MIB(&tcp_mib, tcpTimRetransDrop); 16701 /* 16702 * If TCP is in SYN_RCVD state, send back a 16703 * RST|ACK as BSD does. Note that tcp_zero_win_probe 16704 * should be zero in TCPS_SYN_RCVD state. 16705 */ 16706 if (tcp->tcp_state == TCPS_SYN_RCVD) { 16707 tcp_xmit_ctl("tcp_timer: RST sent on timeout " 16708 "in SYN_RCVD", 16709 tcp, tcp->tcp_snxt, 16710 tcp->tcp_rnxt, TH_RST | TH_ACK); 16711 } 16712 (void) tcp_clean_death(tcp, 16713 tcp->tcp_client_errno ? 16714 tcp->tcp_client_errno : ETIMEDOUT, 25); 16715 return; 16716 } else { 16717 /* 16718 * Set tcp_ms_we_have_waited to second_threshold 16719 * so that in next timeout, we will do the above 16720 * check (lbolt - tcp_last_recv_time). This is 16721 * also to avoid overflow. 16722 * 16723 * We don't need to decrement tcp_timer_backoff 16724 * to avoid overflow because it will be decremented 16725 * later if new timeout value is greater than 16726 * tcp_rexmit_interval_max. In the case when 16727 * tcp_rexmit_interval_max is greater than 16728 * second_threshold, it means that we will wait 16729 * longer than second_threshold to send the next 16730 * window probe. 16731 */ 16732 tcp->tcp_ms_we_have_waited = second_threshold; 16733 } 16734 } else if (ms > first_threshold) { 16735 if (tcp->tcp_snd_zcopy_aware && (!tcp->tcp_xmit_zc_clean) && 16736 tcp->tcp_xmit_head != NULL) { 16737 tcp->tcp_xmit_head = 16738 tcp_zcopy_backoff(tcp, tcp->tcp_xmit_head, 1); 16739 } 16740 /* 16741 * We have been retransmitting for too long... The RTT 16742 * we calculated is probably incorrect. Reinitialize it. 16743 * Need to compensate for 0 tcp_rtt_sa. Reset 16744 * tcp_rtt_update so that we won't accidentally cache a 16745 * bad value. But only do this if this is not a zero 16746 * window probe. 16747 */ 16748 if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { 16749 tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + 16750 (tcp->tcp_rtt_sa >> 5); 16751 tcp->tcp_rtt_sa = 0; 16752 tcp_ip_notify(tcp); 16753 tcp->tcp_rtt_update = 0; 16754 } 16755 } 16756 tcp->tcp_timer_backoff++; 16757 if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + 16758 tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < 16759 tcp_rexmit_interval_min) { 16760 /* 16761 * This means the original RTO is tcp_rexmit_interval_min. 16762 * So we will use tcp_rexmit_interval_min as the RTO value 16763 * and do the backoff. 16764 */ 16765 ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff; 16766 } else { 16767 ms <<= tcp->tcp_timer_backoff; 16768 } 16769 if (ms > tcp_rexmit_interval_max) { 16770 ms = tcp_rexmit_interval_max; 16771 /* 16772 * ms is at max, decrement tcp_timer_backoff to avoid 16773 * overflow. 16774 */ 16775 tcp->tcp_timer_backoff--; 16776 } 16777 tcp->tcp_ms_we_have_waited += ms; 16778 if (tcp->tcp_zero_win_probe == 0) { 16779 tcp->tcp_rto = ms; 16780 } 16781 TCP_TIMER_RESTART(tcp, ms); 16782 /* 16783 * This is after a timeout and tcp_rto is backed off. Set 16784 * tcp_set_timer to 1 so that next time RTO is updated, we will 16785 * restart the timer with a correct value. 
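Summarizing the interval computation just performed (a sketch; illustrative names, values in milliseconds, rtt_sa and rtt_sd are TCP's scaled RTT average and deviation, and backoff has already been incremented for this timeout):

static long
next_rexmit_timeout(long rtt_sa, long rtt_sd, long extra, int *backoff,
    long rto_min, long rto_max)
{
        long ms = (rtt_sa >> 3) + (rtt_sa >> 5) + rtt_sd + extra;

        if (ms < rto_min)
                ms = rto_min << *backoff;       /* original RTO was pinned at the floor */
        else
                ms <<= *backoff;                /* exponential backoff */
        if (ms > rto_max) {
                ms = rto_max;
                (*backoff)--;                   /* keep the shift count from growing unboundedly */
        }
        return (ms);
}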
16786 */ 16787 tcp->tcp_set_timer = 1; 16788 mss = tcp->tcp_snxt - tcp->tcp_suna; 16789 if (mss > tcp->tcp_mss) 16790 mss = tcp->tcp_mss; 16791 if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) 16792 mss = tcp->tcp_swnd; 16793 16794 if ((mp = tcp->tcp_xmit_head) != NULL) 16795 mp->b_prev = (mblk_t *)lbolt; 16796 mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, 16797 B_TRUE); 16798 16799 /* 16800 * When slow start after retransmission begins, start with 16801 * this seq no. tcp_rexmit_max marks the end of special slow 16802 * start phase. tcp_snd_burst controls how many segments 16803 * can be sent because of an ack. 16804 */ 16805 tcp->tcp_rexmit_nxt = tcp->tcp_suna; 16806 tcp->tcp_snd_burst = TCP_CWND_SS; 16807 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 16808 (tcp->tcp_unsent == 0)) { 16809 tcp->tcp_rexmit_max = tcp->tcp_fss; 16810 } else { 16811 tcp->tcp_rexmit_max = tcp->tcp_snxt; 16812 } 16813 tcp->tcp_rexmit = B_TRUE; 16814 tcp->tcp_dupack_cnt = 0; 16815 16816 /* 16817 * Remove all rexmit SACK blk to start from fresh. 16818 */ 16819 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 16820 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list); 16821 tcp->tcp_num_notsack_blk = 0; 16822 tcp->tcp_cnt_notsack_list = 0; 16823 } 16824 if (mp == NULL) { 16825 return; 16826 } 16827 /* Attach credentials to retransmitted initial SYNs. */ 16828 if (tcp->tcp_state == TCPS_SYN_SENT) { 16829 mblk_setcred(mp, tcp->tcp_cred); 16830 DB_CPID(mp) = tcp->tcp_cpid; 16831 } 16832 16833 tcp->tcp_csuna = tcp->tcp_snxt; 16834 BUMP_MIB(&tcp_mib, tcpRetransSegs); 16835 UPDATE_MIB(&tcp_mib, tcpRetransBytes, mss); 16836 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 16837 tcp_send_data(tcp, tcp->tcp_wq, mp); 16838 16839 } 16840 16841 /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ 16842 static void 16843 tcp_unbind(tcp_t *tcp, mblk_t *mp) 16844 { 16845 conn_t *connp; 16846 16847 switch (tcp->tcp_state) { 16848 case TCPS_BOUND: 16849 case TCPS_LISTEN: 16850 break; 16851 default: 16852 tcp_err_ack(tcp, mp, TOUTSTATE, 0); 16853 return; 16854 } 16855 16856 /* 16857 * Need to clean up all the eagers since after the unbind, segments 16858 * will no longer be delivered to this listener stream. 16859 */ 16860 mutex_enter(&tcp->tcp_eager_lock); 16861 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { 16862 tcp_eager_cleanup(tcp, 0); 16863 } 16864 mutex_exit(&tcp->tcp_eager_lock); 16865 16866 if (tcp->tcp_ipversion == IPV4_VERSION) { 16867 tcp->tcp_ipha->ipha_src = 0; 16868 } else { 16869 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); 16870 } 16871 V6_SET_ZERO(tcp->tcp_ip_src_v6); 16872 bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport)); 16873 tcp_bind_hash_remove(tcp); 16874 tcp->tcp_state = TCPS_IDLE; 16875 tcp->tcp_mdt = B_FALSE; 16876 /* Send M_FLUSH according to TPI */ 16877 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); 16878 connp = tcp->tcp_connp; 16879 connp->conn_mdt_ok = B_FALSE; 16880 ipcl_hash_remove(connp); 16881 bzero(&connp->conn_ports, sizeof (connp->conn_ports)); 16882 mp = mi_tpi_ok_ack_alloc(mp); 16883 putnext(tcp->tcp_rq, mp); 16884 } 16885 16886 /* 16887 * Don't let port fall into the privileged range. 16888 * Since the extra privileged ports can be arbitrary we also 16889 * ensure that we exclude those from consideration. 16890 * tcp_g_epriv_ports is not sorted thus we loop over it until 16891 * there are no changes. 
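For example, that scan can be written as a small standalone helper (illustrative names; the kernel version also re-checks the anonymous range via its retry label after each bump):

#include <stdint.h>

static uint16_t
skip_excluded_ports(uint16_t port, const uint16_t *excl, int nexcl)
{
        int i, changed;

        do {
                changed = 0;
                for (i = 0; i < nexcl; i++) {
                        if (port == excl[i]) {
                                port++;         /* step past the excluded port */
                                changed = 1;
                        }
                }
        } while (changed);      /* a full pass with no change means we are done */
        return (port);
}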
16892 * 16893 * Note: No locks are held when inspecting tcp_g_*epriv_ports 16894 * but instead the code relies on: 16895 * - the fact that the address of the array and its size never changes 16896 * - the atomic assignment of the elements of the array 16897 * 16898 * Returns 0 if there are no more ports available. 16899 * 16900 * TS note: skip multilevel ports. 16901 */ 16902 static in_port_t 16903 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) 16904 { 16905 int i; 16906 boolean_t restart = B_FALSE; 16907 16908 if (random && tcp_random_anon_port != 0) { 16909 (void) random_get_pseudo_bytes((uint8_t *)&port, 16910 sizeof (in_port_t)); 16911 /* 16912 * Unless changed by a sys admin, the smallest anon port 16913 * is 32768 and the largest anon port is 65535. It is 16914 * very likely (50%) for the random port to be smaller 16915 * than the smallest anon port. When that happens, 16916 * add port % (anon port range) to the smallest anon 16917 * port to get the random port. It should fall into the 16918 * valid anon port range. 16919 */ 16920 if (port < tcp_smallest_anon_port) { 16921 port = tcp_smallest_anon_port + 16922 port % (tcp_largest_anon_port - 16923 tcp_smallest_anon_port); 16924 } 16925 } 16926 16927 retry: 16928 if (port < tcp_smallest_anon_port) 16929 port = (in_port_t)tcp_smallest_anon_port; 16930 16931 if (port > tcp_largest_anon_port) { 16932 if (restart) 16933 return (0); 16934 restart = B_TRUE; 16935 port = (in_port_t)tcp_smallest_anon_port; 16936 } 16937 16938 if (port < tcp_smallest_nonpriv_port) 16939 port = (in_port_t)tcp_smallest_nonpriv_port; 16940 16941 for (i = 0; i < tcp_g_num_epriv_ports; i++) { 16942 if (port == tcp_g_epriv_ports[i]) { 16943 port++; 16944 /* 16945 * Make sure whether the port is in the 16946 * valid range. 16947 */ 16948 goto retry; 16949 } 16950 } 16951 if (is_system_labeled() && 16952 (i = tsol_next_port(crgetzone(tcp->tcp_cred), port, 16953 IPPROTO_TCP, B_TRUE)) != 0) { 16954 port = i; 16955 goto retry; 16956 } 16957 return (port); 16958 } 16959 16960 /* 16961 * Return the next anonymous port in the privileged port range for 16962 * bind checking. It starts at IPPORT_RESERVED - 1 and goes 16963 * downwards. This is the same behavior as documented in the userland 16964 * library call rresvport(3N). 16965 * 16966 * TS note: skip multilevel ports. 16967 */ 16968 static in_port_t 16969 tcp_get_next_priv_port(const tcp_t *tcp) 16970 { 16971 static in_port_t next_priv_port = IPPORT_RESERVED - 1; 16972 in_port_t nextport; 16973 boolean_t restart = B_FALSE; 16974 16975 retry: 16976 if (next_priv_port < tcp_min_anonpriv_port || 16977 next_priv_port >= IPPORT_RESERVED) { 16978 next_priv_port = IPPORT_RESERVED - 1; 16979 if (restart) 16980 return (0); 16981 restart = B_TRUE; 16982 } 16983 if (is_system_labeled() && 16984 (nextport = tsol_next_port(crgetzone(tcp->tcp_cred), 16985 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { 16986 next_priv_port = nextport; 16987 goto retry; 16988 } 16989 return (next_priv_port--); 16990 } 16991 16992 /* The write side r/w procedure. */ 16993 16994 #if CCS_STATS 16995 struct { 16996 struct { 16997 int64_t count, bytes; 16998 } tot, hit; 16999 } wrw_stats; 17000 #endif 17001 17002 /* 17003 * Call by tcp_wput() to handle all non data, except M_PROTO and M_PCPROTO, 17004 * messages. 
17005 */ 17006 /* ARGSUSED */ 17007 static void 17008 tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) 17009 { 17010 conn_t *connp = (conn_t *)arg; 17011 tcp_t *tcp = connp->conn_tcp; 17012 queue_t *q = tcp->tcp_wq; 17013 17014 ASSERT(DB_TYPE(mp) != M_IOCTL); 17015 /* 17016 * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close. 17017 * Once the close starts, streamhead and sockfs will not let any data 17018 * packets come down (close ensures that there are no threads using the 17019 * queue and no new threads will come down) but since qprocsoff() 17020 * hasn't happened yet, a M_FLUSH or some non data message might 17021 * get reflected back (in response to our own FLUSHRW) and get 17022 * processed after tcp_close() is done. The conn would still be valid 17023 * because a ref would have added but we need to check the state 17024 * before actually processing the packet. 17025 */ 17026 if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) { 17027 freemsg(mp); 17028 return; 17029 } 17030 17031 switch (DB_TYPE(mp)) { 17032 case M_IOCDATA: 17033 tcp_wput_iocdata(tcp, mp); 17034 break; 17035 case M_FLUSH: 17036 tcp_wput_flush(tcp, mp); 17037 break; 17038 default: 17039 CALL_IP_WPUT(connp, q, mp); 17040 break; 17041 } 17042 } 17043 17044 /* 17045 * The TCP fast path write put procedure. 17046 * NOTE: the logic of the fast path is duplicated from tcp_wput_data() 17047 */ 17048 /* ARGSUSED */ 17049 void 17050 tcp_output(void *arg, mblk_t *mp, void *arg2) 17051 { 17052 int len; 17053 int hdrlen; 17054 int plen; 17055 mblk_t *mp1; 17056 uchar_t *rptr; 17057 uint32_t snxt; 17058 tcph_t *tcph; 17059 struct datab *db; 17060 uint32_t suna; 17061 uint32_t mss; 17062 ipaddr_t *dst; 17063 ipaddr_t *src; 17064 uint32_t sum; 17065 int usable; 17066 conn_t *connp = (conn_t *)arg; 17067 tcp_t *tcp = connp->conn_tcp; 17068 uint32_t msize; 17069 17070 /* 17071 * Try and ASSERT the minimum possible references on the 17072 * conn early enough. Since we are executing on write side, 17073 * the connection is obviously not detached and that means 17074 * there is a ref each for TCP and IP. Since we are behind 17075 * the squeue, the minimum references needed are 3. If the 17076 * conn is in classifier hash list, there should be an 17077 * extra ref for that (we check both the possibilities). 17078 */ 17079 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 17080 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 17081 17082 ASSERT(DB_TYPE(mp) == M_DATA); 17083 msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 17084 17085 mutex_enter(&connp->conn_lock); 17086 tcp->tcp_squeue_bytes -= msize; 17087 mutex_exit(&connp->conn_lock); 17088 17089 /* Bypass tcp protocol for fused tcp loopback */ 17090 if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 17091 return; 17092 17093 mss = tcp->tcp_mss; 17094 if (tcp->tcp_xmit_zc_clean) 17095 mp = tcp_zcopy_backoff(tcp, mp, 0); 17096 17097 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 17098 len = (int)(mp->b_wptr - mp->b_rptr); 17099 17100 /* 17101 * Criteria for fast path: 17102 * 17103 * 1. no unsent data 17104 * 2. single mblk in request 17105 * 3. connection established 17106 * 4. data in mblk 17107 * 5. len <= mss 17108 * 6. 
no tcp_valid bits 17109 */ 17110 if ((tcp->tcp_unsent != 0) || 17111 (tcp->tcp_cork) || 17112 (mp->b_cont != NULL) || 17113 (tcp->tcp_state != TCPS_ESTABLISHED) || 17114 (len == 0) || 17115 (len > mss) || 17116 (tcp->tcp_valid_bits != 0)) { 17117 tcp_wput_data(tcp, mp, B_FALSE); 17118 return; 17119 } 17120 17121 ASSERT(tcp->tcp_xmit_tail_unsent == 0); 17122 ASSERT(tcp->tcp_fin_sent == 0); 17123 17124 /* queue new packet onto retransmission queue */ 17125 if (tcp->tcp_xmit_head == NULL) { 17126 tcp->tcp_xmit_head = mp; 17127 } else { 17128 tcp->tcp_xmit_last->b_cont = mp; 17129 } 17130 tcp->tcp_xmit_last = mp; 17131 tcp->tcp_xmit_tail = mp; 17132 17133 /* find out how much we can send */ 17134 /* BEGIN CSTYLED */ 17135 /* 17136 * un-acked usable 17137 * |--------------|-----------------| 17138 * tcp_suna tcp_snxt tcp_suna+tcp_swnd 17139 */ 17140 /* END CSTYLED */ 17141 17142 /* start sending from tcp_snxt */ 17143 snxt = tcp->tcp_snxt; 17144 17145 /* 17146 * Check to see if this connection has been idled for some 17147 * time and no ACK is expected. If it is, we need to slow 17148 * start again to get back the connection's "self-clock" as 17149 * described in VJ's paper. 17150 * 17151 * Refer to the comment in tcp_mss_set() for the calculation 17152 * of tcp_cwnd after idle. 17153 */ 17154 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && 17155 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { 17156 SET_TCP_INIT_CWND(tcp, mss, tcp_slow_start_after_idle); 17157 } 17158 17159 usable = tcp->tcp_swnd; /* tcp window size */ 17160 if (usable > tcp->tcp_cwnd) 17161 usable = tcp->tcp_cwnd; /* congestion window smaller */ 17162 usable -= snxt; /* subtract stuff already sent */ 17163 suna = tcp->tcp_suna; 17164 usable += suna; 17165 /* usable can be < 0 if the congestion window is smaller */ 17166 if (len > usable) { 17167 /* Can't send complete M_DATA in one shot */ 17168 goto slow; 17169 } 17170 17171 if (tcp->tcp_flow_stopped && 17172 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 17173 tcp_clrqfull(tcp); 17174 } 17175 17176 /* 17177 * determine if anything to send (Nagle). 17178 * 17179 * 1. len < tcp_mss (i.e. small) 17180 * 2. unacknowledged data present 17181 * 3. len < nagle limit 17182 * 4. last packet sent < nagle limit (previous packet sent) 17183 */ 17184 if ((len < mss) && (snxt != suna) && 17185 (len < (int)tcp->tcp_naglim) && 17186 (tcp->tcp_last_sent_len < tcp->tcp_naglim)) { 17187 /* 17188 * This was the first unsent packet and normally 17189 * mss < xmit_hiwater so there is no need to worry 17190 * about flow control. The next packet will go 17191 * through the flow control check in tcp_wput_data(). 
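Restating the Nagle test enumerated above as a standalone predicate (a sketch; names are illustrative, and when it is false the segment is sent immediately):

#include <stdint.h>

/* Nonzero: hold the small write back until outstanding data is acknowledged. */
static int
nagle_defer(int len, uint32_t snxt, uint32_t suna, uint32_t mss,
    uint32_t naglim, uint32_t last_sent_len)
{
        return (len < (int)mss &&       /* 1. segment is small */
            snxt != suna &&             /* 2. unacknowledged data outstanding */
            len < (int)naglim &&        /* 3. below the Nagle limit */
            last_sent_len < naglim);    /* 4. previous segment was small too */
}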
17192 */ 17193 /* leftover work from above */ 17194 tcp->tcp_unsent = len; 17195 tcp->tcp_xmit_tail_unsent = len; 17196 17197 return; 17198 } 17199 17200 /* len <= tcp->tcp_mss && len == unsent so no silly window */ 17201 17202 if (snxt == suna) { 17203 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 17204 } 17205 17206 /* we have always sent something */ 17207 tcp->tcp_rack_cnt = 0; 17208 17209 tcp->tcp_snxt = snxt + len; 17210 tcp->tcp_rack = tcp->tcp_rnxt; 17211 17212 if ((mp1 = dupb(mp)) == 0) 17213 goto no_memory; 17214 mp->b_prev = (mblk_t *)(uintptr_t)lbolt; 17215 mp->b_next = (mblk_t *)(uintptr_t)snxt; 17216 17217 /* adjust tcp header information */ 17218 tcph = tcp->tcp_tcph; 17219 tcph->th_flags[0] = (TH_ACK|TH_PUSH); 17220 17221 sum = len + tcp->tcp_tcp_hdr_len + tcp->tcp_sum; 17222 sum = (sum >> 16) + (sum & 0xFFFF); 17223 U16_TO_ABE16(sum, tcph->th_sum); 17224 17225 U32_TO_ABE32(snxt, tcph->th_seq); 17226 17227 BUMP_MIB(&tcp_mib, tcpOutDataSegs); 17228 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); 17229 BUMP_LOCAL(tcp->tcp_obsegs); 17230 17231 /* Update the latest receive window size in TCP header. */ 17232 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 17233 tcph->th_win); 17234 17235 tcp->tcp_last_sent_len = (ushort_t)len; 17236 17237 plen = len + tcp->tcp_hdr_len; 17238 17239 if (tcp->tcp_ipversion == IPV4_VERSION) { 17240 tcp->tcp_ipha->ipha_length = htons(plen); 17241 } else { 17242 tcp->tcp_ip6h->ip6_plen = htons(plen - 17243 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 17244 } 17245 17246 /* see if we need to allocate a mblk for the headers */ 17247 hdrlen = tcp->tcp_hdr_len; 17248 rptr = mp1->b_rptr - hdrlen; 17249 db = mp1->b_datap; 17250 if ((db->db_ref != 2) || rptr < db->db_base || 17251 (!OK_32PTR(rptr))) { 17252 /* NOTE: we assume allocb returns an OK_32PTR */ 17253 mp = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 17254 tcp_wroff_xtra, BPRI_MED); 17255 if (!mp) { 17256 freemsg(mp1); 17257 goto no_memory; 17258 } 17259 mp->b_cont = mp1; 17260 mp1 = mp; 17261 /* Leave room for Link Level header */ 17262 /* hdrlen = tcp->tcp_hdr_len; */ 17263 rptr = &mp1->b_rptr[tcp_wroff_xtra]; 17264 mp1->b_wptr = &rptr[hdrlen]; 17265 } 17266 mp1->b_rptr = rptr; 17267 17268 /* Fill in the timestamp option. */ 17269 if (tcp->tcp_snd_ts_ok) { 17270 U32_TO_BE32((uint32_t)lbolt, 17271 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 17272 U32_TO_BE32(tcp->tcp_ts_recent, 17273 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 17274 } else { 17275 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 17276 } 17277 17278 /* copy header into outgoing packet */ 17279 dst = (ipaddr_t *)rptr; 17280 src = (ipaddr_t *)tcp->tcp_iphc; 17281 dst[0] = src[0]; 17282 dst[1] = src[1]; 17283 dst[2] = src[2]; 17284 dst[3] = src[3]; 17285 dst[4] = src[4]; 17286 dst[5] = src[5]; 17287 dst[6] = src[6]; 17288 dst[7] = src[7]; 17289 dst[8] = src[8]; 17290 dst[9] = src[9]; 17291 if (hdrlen -= 40) { 17292 hdrlen >>= 2; 17293 dst += 10; 17294 src += 10; 17295 do { 17296 *dst++ = *src++; 17297 } while (--hdrlen); 17298 } 17299 17300 /* 17301 * Set the ECN info in the TCP header. Note that this 17302 * is not the template header. 
17303 */ 17304 if (tcp->tcp_ecn_ok) { 17305 SET_ECT(tcp, rptr); 17306 17307 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 17308 if (tcp->tcp_ecn_echo_on) 17309 tcph->th_flags[0] |= TH_ECE; 17310 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 17311 tcph->th_flags[0] |= TH_CWR; 17312 tcp->tcp_ecn_cwr_sent = B_TRUE; 17313 } 17314 } 17315 17316 if (tcp->tcp_ip_forward_progress) { 17317 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 17318 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 17319 tcp->tcp_ip_forward_progress = B_FALSE; 17320 } 17321 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); 17322 tcp_send_data(tcp, tcp->tcp_wq, mp1); 17323 return; 17324 17325 /* 17326 * If we ran out of memory, we pretend to have sent the packet 17327 * and that it was lost on the wire. 17328 */ 17329 no_memory: 17330 return; 17331 17332 slow: 17333 /* leftover work from above */ 17334 tcp->tcp_unsent = len; 17335 tcp->tcp_xmit_tail_unsent = len; 17336 tcp_wput_data(tcp, NULL, B_FALSE); 17337 } 17338 17339 /* 17340 * The function called through squeue to get behind eager's perimeter to 17341 * finish the accept processing. 17342 */ 17343 /* ARGSUSED */ 17344 void 17345 tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) 17346 { 17347 conn_t *connp = (conn_t *)arg; 17348 tcp_t *tcp = connp->conn_tcp; 17349 queue_t *q = tcp->tcp_rq; 17350 mblk_t *mp1; 17351 mblk_t *stropt_mp = mp; 17352 struct stroptions *stropt; 17353 uint_t thwin; 17354 17355 /* 17356 * Drop the eager's ref on the listener, that was placed when 17357 * this eager began life in tcp_conn_request. 17358 */ 17359 CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); 17360 17361 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) { 17362 /* 17363 * Someone blewoff the eager before we could finish 17364 * the accept. 17365 * 17366 * The only reason eager exists it because we put in 17367 * a ref on it when conn ind went up. We need to send 17368 * a disconnect indication up while the last reference 17369 * on the eager will be dropped by the squeue when we 17370 * return. 17371 */ 17372 ASSERT(tcp->tcp_listener == NULL); 17373 if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) { 17374 struct T_discon_ind *tdi; 17375 17376 (void) putnextctl1(q, M_FLUSH, FLUSHRW); 17377 /* 17378 * Let us reuse the incoming mblk to avoid memory 17379 * allocation failure problems. We know that the 17380 * size of the incoming mblk i.e. stroptions is greater 17381 * than sizeof T_discon_ind. So the reallocb below 17382 * can't fail. 
17383 */ 17384 freemsg(mp->b_cont); 17385 mp->b_cont = NULL; 17386 ASSERT(DB_REF(mp) == 1); 17387 mp = reallocb(mp, sizeof (struct T_discon_ind), 17388 B_FALSE); 17389 ASSERT(mp != NULL); 17390 DB_TYPE(mp) = M_PROTO; 17391 ((union T_primitives *)mp->b_rptr)->type = T_DISCON_IND; 17392 tdi = (struct T_discon_ind *)mp->b_rptr; 17393 if (tcp->tcp_issocket) { 17394 tdi->DISCON_reason = ECONNREFUSED; 17395 tdi->SEQ_number = 0; 17396 } else { 17397 tdi->DISCON_reason = ENOPROTOOPT; 17398 tdi->SEQ_number = 17399 tcp->tcp_conn_req_seqnum; 17400 } 17401 mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind); 17402 putnext(q, mp); 17403 } else { 17404 freemsg(mp); 17405 } 17406 if (tcp->tcp_hard_binding) { 17407 tcp->tcp_hard_binding = B_FALSE; 17408 tcp->tcp_hard_bound = B_TRUE; 17409 } 17410 tcp->tcp_detached = B_FALSE; 17411 return; 17412 } 17413 17414 mp1 = stropt_mp->b_cont; 17415 stropt_mp->b_cont = NULL; 17416 ASSERT(DB_TYPE(stropt_mp) == M_SETOPTS); 17417 stropt = (struct stroptions *)stropt_mp->b_rptr; 17418 17419 while (mp1 != NULL) { 17420 mp = mp1; 17421 mp1 = mp1->b_cont; 17422 mp->b_cont = NULL; 17423 tcp->tcp_drop_opt_ack_cnt++; 17424 CALL_IP_WPUT(connp, tcp->tcp_wq, mp); 17425 } 17426 mp = NULL; 17427 17428 /* 17429 * For a loopback connection with tcp_direct_sockfs on, note that 17430 * we don't have to protect tcp_rcv_list yet because synchronous 17431 * streams has not yet been enabled and tcp_fuse_rrw() cannot 17432 * possibly race with us. 17433 */ 17434 17435 /* 17436 * Set the max window size (tcp_rq->q_hiwat) of the acceptor 17437 * properly. This is the first time we know of the acceptor' 17438 * queue. So we do it here. 17439 */ 17440 if (tcp->tcp_rcv_list == NULL) { 17441 /* 17442 * Recv queue is empty, tcp_rwnd should not have changed. 17443 * That means it should be equal to the listener's tcp_rwnd. 17444 */ 17445 tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd; 17446 } else { 17447 #ifdef DEBUG 17448 uint_t cnt = 0; 17449 17450 mp1 = tcp->tcp_rcv_list; 17451 while ((mp = mp1) != NULL) { 17452 mp1 = mp->b_next; 17453 cnt += msgdsize(mp); 17454 } 17455 ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt); 17456 #endif 17457 /* There is some data, add them back to get the max. */ 17458 tcp->tcp_rq->q_hiwat = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; 17459 } 17460 17461 stropt->so_flags = SO_HIWAT; 17462 stropt->so_hiwat = MAX(q->q_hiwat, tcp_sth_rcv_hiwat); 17463 17464 stropt->so_flags |= SO_MAXBLK; 17465 stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); 17466 17467 /* 17468 * This is the first time we run on the correct 17469 * queue after tcp_accept. So fix all the q parameters 17470 * here. 17471 */ 17472 /* Allocate room for SACK options if needed. */ 17473 stropt->so_flags |= SO_WROFF; 17474 if (tcp->tcp_fused) { 17475 ASSERT(tcp->tcp_loopback); 17476 ASSERT(tcp->tcp_loopback_peer != NULL); 17477 /* 17478 * For fused tcp loopback, set the stream head's write 17479 * offset value to zero since we won't be needing any room 17480 * for TCP/IP headers. This would also improve performance 17481 * since it would reduce the amount of work done by kmem. 17482 * Non-fused tcp loopback case is handled separately below. 17483 */ 17484 stropt->so_wroff = 0; 17485 /* 17486 * Record the stream head's high water mark for this endpoint; 17487 * this is used for flow-control purposes in tcp_fuse_output(). 17488 */ 17489 stropt->so_hiwat = tcp_fuse_set_rcv_hiwat(tcp, q->q_hiwat); 17490 /* 17491 * Update the peer's transmit parameters according to 17492 * our recently calculated high water mark value. 
17493 */ 17494 (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); 17495 } else if (tcp->tcp_snd_sack_ok) { 17496 stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + 17497 (tcp->tcp_loopback ? 0 : tcp_wroff_xtra); 17498 } else { 17499 stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : 17500 tcp_wroff_xtra); 17501 } 17502 17503 /* 17504 * If this endpoint is handling SSL, then reserve extra 17505 * offset and space at the end. 17506 * Also have the stream head allocate SSL3_MAX_RECORD_LEN packets, 17507 * overriding the previous setting. The extra cost of signing and 17508 * encrypting multiple MSS-size records (12 of them with Ethernet), 17509 * instead of a single contiguous one by the stream head 17510 * largely outweighs the statistical reduction of ACKs, when 17511 * applicable. The peer will also save on decryption and verification 17512 * costs. 17513 */ 17514 if (tcp->tcp_kssl_ctx != NULL) { 17515 stropt->so_wroff += SSL3_WROFFSET; 17516 17517 stropt->so_flags |= SO_TAIL; 17518 stropt->so_tail = SSL3_MAX_TAIL_LEN; 17519 17520 stropt->so_maxblk = SSL3_MAX_RECORD_LEN; 17521 } 17522 17523 /* Send the options up */ 17524 putnext(q, stropt_mp); 17525 17526 /* 17527 * Pass up any data and/or a fin that has been received. 17528 * 17529 * Adjust receive window in case it had decreased 17530 * (because there is data <=> tcp_rcv_list != NULL) 17531 * while the connection was detached. Note that 17532 * in case the eager was flow-controlled, w/o this 17533 * code, the rwnd may never open up again! 17534 */ 17535 if (tcp->tcp_rcv_list != NULL) { 17536 /* We drain directly in case of fused tcp loopback */ 17537 if (!tcp->tcp_fused && canputnext(q)) { 17538 tcp->tcp_rwnd = q->q_hiwat; 17539 thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) 17540 << tcp->tcp_rcv_ws; 17541 thwin -= tcp->tcp_rnxt - tcp->tcp_rack; 17542 if (tcp->tcp_state >= TCPS_ESTABLISHED && 17543 (q->q_hiwat - thwin >= tcp->tcp_mss)) { 17544 tcp_xmit_ctl(NULL, 17545 tcp, (tcp->tcp_swnd == 0) ? 17546 tcp->tcp_suna : tcp->tcp_snxt, 17547 tcp->tcp_rnxt, TH_ACK); 17548 BUMP_MIB(&tcp_mib, tcpOutWinUpdate); 17549 } 17550 17551 } 17552 (void) tcp_rcv_drain(q, tcp); 17553 17554 /* 17555 * For fused tcp loopback, back-enable peer endpoint 17556 * if it's currently flow-controlled. 17557 */ 17558 if (tcp->tcp_fused && 17559 tcp->tcp_loopback_peer->tcp_flow_stopped) { 17560 tcp_t *peer_tcp = tcp->tcp_loopback_peer; 17561 17562 ASSERT(peer_tcp != NULL); 17563 ASSERT(peer_tcp->tcp_fused); 17564 17565 tcp_clrqfull(peer_tcp); 17566 TCP_STAT(tcp_fusion_backenabled); 17567 } 17568 } 17569 ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); 17570 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) { 17571 mp = mi_tpi_ordrel_ind(); 17572 if (mp) { 17573 tcp->tcp_ordrel_done = B_TRUE; 17574 putnext(q, mp); 17575 if (tcp->tcp_deferred_clean_death) { 17576 /* 17577 * tcp_clean_death was deferred 17578 * for T_ORDREL_IND - do it now 17579 */ 17580 (void) tcp_clean_death(tcp, 17581 tcp->tcp_client_errno, 21); 17582 tcp->tcp_deferred_clean_death = B_FALSE; 17583 } 17584 } else { 17585 /* 17586 * Run the orderly release in the 17587 * service routine.
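Stepping back to the window-update check above: whether raising the stream head's high-water mark opens the receive window enough to be worth an immediate ACK reduces to a short predicate. A sketch with illustrative names, where "advertised" is the unscaled window last placed in the template TCP header:

#include <stdint.h>

static int
worth_sending_window_update(uint32_t advertised, uint32_t rcv_ws,
    uint32_t rnxt, uint32_t rack, uint32_t new_hiwat, uint32_t mss)
{
        uint32_t thwin = (advertised << rcv_ws) - (rnxt - rack);

        return (new_hiwat - thwin >= mss);      /* window opens by at least one MSS */
}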
17588 */ 17589 qenable(q); 17590 } 17591 } 17592 if (tcp->tcp_hard_binding) { 17593 tcp->tcp_hard_binding = B_FALSE; 17594 tcp->tcp_hard_bound = B_TRUE; 17595 } 17596 17597 tcp->tcp_detached = B_FALSE; 17598 17599 /* We can enable synchronous streams now */ 17600 if (tcp->tcp_fused) { 17601 tcp_fuse_syncstr_enable_pair(tcp); 17602 } 17603 17604 if (tcp->tcp_ka_enabled) { 17605 tcp->tcp_ka_last_intrvl = 0; 17606 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, 17607 MSEC_TO_TICK(tcp->tcp_ka_interval)); 17608 } 17609 17610 /* 17611 * At this point, eager is fully established and will 17612 * have the following references - 17613 * 17614 * 2 references for connection to exist (1 for TCP and 1 for IP). 17615 * 1 reference for the squeue which will be dropped by the squeue as 17616 * soon as this function returns. 17617 * There will be 1 additional reference for being in classifier 17618 * hash list provided something bad hasn't happened. 17619 */ 17620 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 17621 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 17622 } 17623 17624 /* 17625 * The function called through squeue to get behind listener's perimeter to 17626 * send a deferred conn_ind. 17627 */ 17628 /* ARGSUSED */ 17629 void 17630 tcp_send_pending(void *arg, mblk_t *mp, void *arg2) 17631 { 17632 conn_t *connp = (conn_t *)arg; 17633 tcp_t *listener = connp->conn_tcp; 17634 17635 if (listener->tcp_state == TCPS_CLOSED || 17636 TCP_IS_DETACHED(listener)) { 17637 /* 17638 * If listener has closed, it would have caused a 17639 * cleanup/blowoff to happen for the eager. 17640 */ 17641 tcp_t *tcp; 17642 struct T_conn_ind *conn_ind; 17643 17644 conn_ind = (struct T_conn_ind *)mp->b_rptr; 17645 bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, 17646 conn_ind->OPT_length); 17647 /* 17648 * We need to drop the ref on eager that was put in 17649 * tcp_rput_data() before trying to send the conn_ind 17650 * to listener. The conn_ind was deferred in tcp_send_conn_ind 17651 * and tcp_wput_accept() is sending this deferred conn_ind but 17652 * listener is closed so we drop the ref. 17653 */ 17654 CONN_DEC_REF(tcp->tcp_connp); 17655 freemsg(mp); 17656 return; 17657 } 17658 putnext(listener->tcp_rq, mp); 17659 } 17660 17661 17662 /* 17663 * This is the STREAMS entry point for T_CONN_RES coming down on 17664 * Acceptor STREAM when sockfs listener does accept processing. 17665 * Read the block comment on top of tcp_conn_request(). 17666 */ 17667 void 17668 tcp_wput_accept(queue_t *q, mblk_t *mp) 17669 { 17670 queue_t *rq = RD(q); 17671 struct T_conn_res *conn_res; 17672 tcp_t *eager; 17673 tcp_t *listener; 17674 struct T_ok_ack *ok; 17675 t_scalar_t PRIM_type; 17676 mblk_t *opt_mp; 17677 conn_t *econnp; 17678 17679 ASSERT(DB_TYPE(mp) == M_PROTO); 17680 17681 conn_res = (struct T_conn_res *)mp->b_rptr; 17682 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); 17683 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { 17684 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 17685 if (mp != NULL) 17686 putnext(rq, mp); 17687 return; 17688 } 17689 switch (conn_res->PRIM_type) { 17690 case O_T_CONN_RES: 17691 case T_CONN_RES: 17692 /* 17693 * We pass up an err ack if allocb fails. This will 17694 * cause sockfs to issue a T_DISCON_REQ which will cause 17695 * tcp_eager_blowoff to be called. sockfs will then call 17696 * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
17697 * we need to do the allocb up here because we have to 17698 * make sure rq->q_qinfo->qi_qclose still points to the 17699 * correct function (tcpclose_accept) in case allocb 17700 * fails. 17701 */ 17702 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI); 17703 if (opt_mp == NULL) { 17704 mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); 17705 if (mp != NULL) 17706 putnext(rq, mp); 17707 return; 17708 } 17709 17710 bcopy(mp->b_rptr + conn_res->OPT_offset, 17711 &eager, conn_res->OPT_length); 17712 PRIM_type = conn_res->PRIM_type; 17713 mp->b_datap->db_type = M_PCPROTO; 17714 mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); 17715 ok = (struct T_ok_ack *)mp->b_rptr; 17716 ok->PRIM_type = T_OK_ACK; 17717 ok->CORRECT_prim = PRIM_type; 17718 econnp = eager->tcp_connp; 17719 econnp->conn_dev = (dev_t)q->q_ptr; 17720 eager->tcp_rq = rq; 17721 eager->tcp_wq = q; 17722 rq->q_ptr = econnp; 17723 rq->q_qinfo = &tcp_rinit; 17724 q->q_ptr = econnp; 17725 q->q_qinfo = &tcp_winit; 17726 listener = eager->tcp_listener; 17727 eager->tcp_issocket = B_TRUE; 17728 econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; 17729 17730 /* Put the ref for IP */ 17731 CONN_INC_REF(econnp); 17732 17733 /* 17734 * We should have minimum of 3 references on the conn 17735 * at this point. One each for TCP and IP and one for 17736 * the T_conn_ind that was sent up when the 3-way handshake 17737 * completed. In the normal case we would also have another 17738 * reference (making a total of 4) for the conn being in the 17739 * classifier hash list. However the eager could have received 17740 * an RST subsequently and tcp_closei_local could have removed 17741 * the eager from the classifier hash list, hence we can't 17742 * assert that reference. 17743 */ 17744 ASSERT(econnp->conn_ref >= 3); 17745 17746 /* 17747 * Send the new local address also up to sockfs. There 17748 * should already be enough space in the mp that came 17749 * down from soaccept(). 17750 */ 17751 if (eager->tcp_family == AF_INET) { 17752 sin_t *sin; 17753 17754 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 17755 (sizeof (struct T_ok_ack) + sizeof (sin_t))); 17756 sin = (sin_t *)mp->b_wptr; 17757 mp->b_wptr += sizeof (sin_t); 17758 sin->sin_family = AF_INET; 17759 sin->sin_port = eager->tcp_lport; 17760 sin->sin_addr.s_addr = eager->tcp_ipha->ipha_src; 17761 } else { 17762 sin6_t *sin6; 17763 17764 ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= 17765 sizeof (struct T_ok_ack) + sizeof (sin6_t)); 17766 sin6 = (sin6_t *)mp->b_wptr; 17767 mp->b_wptr += sizeof (sin6_t); 17768 sin6->sin6_family = AF_INET6; 17769 sin6->sin6_port = eager->tcp_lport; 17770 if (eager->tcp_ipversion == IPV4_VERSION) { 17771 sin6->sin6_flowinfo = 0; 17772 IN6_IPADDR_TO_V4MAPPED( 17773 eager->tcp_ipha->ipha_src, 17774 &sin6->sin6_addr); 17775 } else { 17776 ASSERT(eager->tcp_ip6h != NULL); 17777 sin6->sin6_flowinfo = 17778 eager->tcp_ip6h->ip6_vcf & 17779 ~IPV6_VERS_AND_FLOW_MASK; 17780 sin6->sin6_addr = eager->tcp_ip6h->ip6_src; 17781 } 17782 sin6->sin6_scope_id = 0; 17783 sin6->__sin6_src_id = 0; 17784 } 17785 17786 putnext(rq, mp); 17787 17788 opt_mp->b_datap->db_type = M_SETOPTS; 17789 opt_mp->b_wptr += sizeof (struct stroptions); 17790 17791 /* 17792 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO 17793 * from listener to acceptor. The message is chained on the 17794 * bind_mp which tcp_rput_other will send down to IP. 
17795 */ 17796 if (listener->tcp_bound_if != 0) { 17797 /* allocate optmgmt req */ 17798 mp = tcp_setsockopt_mp(IPPROTO_IPV6, 17799 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if, 17800 sizeof (int)); 17801 if (mp != NULL) 17802 linkb(opt_mp, mp); 17803 } 17804 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { 17805 uint_t on = 1; 17806 17807 /* allocate optmgmt req */ 17808 mp = tcp_setsockopt_mp(IPPROTO_IPV6, 17809 IPV6_RECVPKTINFO, (char *)&on, sizeof (on)); 17810 if (mp != NULL) 17811 linkb(opt_mp, mp); 17812 } 17813 17814 17815 mutex_enter(&listener->tcp_eager_lock); 17816 17817 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { 17818 17819 tcp_t *tail; 17820 tcp_t *tcp; 17821 mblk_t *mp1; 17822 17823 tcp = listener->tcp_eager_prev_q0; 17824 /* 17825 * listener->tcp_eager_prev_q0 points to the TAIL of the 17826 * deferred T_conn_ind queue. We need to get to the head 17827 * of the queue in order to send up T_conn_ind the same 17828 * order as how the 3WHS is completed. 17829 */ 17830 while (tcp != listener) { 17831 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 && 17832 !tcp->tcp_kssl_pending) 17833 break; 17834 else 17835 tcp = tcp->tcp_eager_prev_q0; 17836 } 17837 /* None of the pending eagers can be sent up now */ 17838 if (tcp == listener) 17839 goto no_more_eagers; 17840 17841 mp1 = tcp->tcp_conn.tcp_eager_conn_ind; 17842 tcp->tcp_conn.tcp_eager_conn_ind = NULL; 17843 /* Move from q0 to q */ 17844 ASSERT(listener->tcp_conn_req_cnt_q0 > 0); 17845 listener->tcp_conn_req_cnt_q0--; 17846 listener->tcp_conn_req_cnt_q++; 17847 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = 17848 tcp->tcp_eager_prev_q0; 17849 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = 17850 tcp->tcp_eager_next_q0; 17851 tcp->tcp_eager_prev_q0 = NULL; 17852 tcp->tcp_eager_next_q0 = NULL; 17853 tcp->tcp_conn_def_q0 = B_FALSE; 17854 17855 /* 17856 * Insert at end of the queue because sockfs sends 17857 * down T_CONN_RES in chronological order. Leaving 17858 * the older conn indications at front of the queue 17859 * helps reducing search time. 17860 */ 17861 tail = listener->tcp_eager_last_q; 17862 if (tail != NULL) { 17863 tail->tcp_eager_next_q = tcp; 17864 } else { 17865 listener->tcp_eager_next_q = tcp; 17866 } 17867 listener->tcp_eager_last_q = tcp; 17868 tcp->tcp_eager_next_q = NULL; 17869 17870 /* Need to get inside the listener perimeter */ 17871 CONN_INC_REF(listener->tcp_connp); 17872 squeue_fill(listener->tcp_connp->conn_sqp, mp1, 17873 tcp_send_pending, listener->tcp_connp, 17874 SQTAG_TCP_SEND_PENDING); 17875 } 17876 no_more_eagers: 17877 tcp_eager_unlink(eager); 17878 mutex_exit(&listener->tcp_eager_lock); 17879 17880 /* 17881 * At this point, the eager is detached from the listener 17882 * but we still have an extra refs on eager (apart from the 17883 * usual tcp references). The ref was placed in tcp_rput_data 17884 * before sending the conn_ind in tcp_send_conn_ind. 17885 * The ref will be dropped in tcp_accept_finish(). 
17886 */ 17887 squeue_enter_nodrain(econnp->conn_sqp, opt_mp, 17888 tcp_accept_finish, econnp, SQTAG_TCP_ACCEPT_FINISH_Q0); 17889 return; 17890 default: 17891 mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); 17892 if (mp != NULL) 17893 putnext(rq, mp); 17894 return; 17895 } 17896 } 17897 17898 void 17899 tcp_wput(queue_t *q, mblk_t *mp) 17900 { 17901 conn_t *connp = Q_TO_CONN(q); 17902 tcp_t *tcp; 17903 void (*output_proc)(); 17904 t_scalar_t type; 17905 uchar_t *rptr; 17906 struct iocblk *iocp; 17907 uint32_t msize; 17908 17909 ASSERT(connp->conn_ref >= 2); 17910 17911 switch (DB_TYPE(mp)) { 17912 case M_DATA: 17913 tcp = connp->conn_tcp; 17914 ASSERT(tcp != NULL); 17915 17916 msize = msgdsize(mp); 17917 17918 mutex_enter(&connp->conn_lock); 17919 CONN_INC_REF_LOCKED(connp); 17920 17921 tcp->tcp_squeue_bytes += msize; 17922 if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { 17923 mutex_exit(&connp->conn_lock); 17924 tcp_setqfull(tcp); 17925 } else 17926 mutex_exit(&connp->conn_lock); 17927 17928 (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, 17929 tcp_output, connp, SQTAG_TCP_OUTPUT); 17930 return; 17931 case M_PROTO: 17932 case M_PCPROTO: 17933 /* 17934 * if it is a snmp message, don't get behind the squeue 17935 */ 17936 tcp = connp->conn_tcp; 17937 rptr = mp->b_rptr; 17938 if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 17939 type = ((union T_primitives *)rptr)->type; 17940 } else { 17941 if (tcp->tcp_debug) { 17942 (void) strlog(TCP_MOD_ID, 0, 1, 17943 SL_ERROR|SL_TRACE, 17944 "tcp_wput_proto, dropping one..."); 17945 } 17946 freemsg(mp); 17947 return; 17948 } 17949 if (type == T_SVR4_OPTMGMT_REQ) { 17950 cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred); 17951 if (snmpcom_req(q, mp, tcp_snmp_set, tcp_snmp_get, 17952 cr)) { 17953 /* 17954 * This was a SNMP request 17955 */ 17956 return; 17957 } else { 17958 output_proc = tcp_wput_proto; 17959 } 17960 } else { 17961 output_proc = tcp_wput_proto; 17962 } 17963 break; 17964 case M_IOCTL: 17965 /* 17966 * Most ioctls can be processed right away without going via 17967 * squeues - process them right here. Those that do require 17968 * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK) 17969 * are processed by tcp_wput_ioctl(). 17970 */ 17971 iocp = (struct iocblk *)mp->b_rptr; 17972 tcp = connp->conn_tcp; 17973 17974 switch (iocp->ioc_cmd) { 17975 case TCP_IOC_ABORT_CONN: 17976 tcp_ioctl_abort_conn(q, mp); 17977 return; 17978 case TI_GETPEERNAME: 17979 if (tcp->tcp_state < TCPS_SYN_RCVD) { 17980 iocp->ioc_error = ENOTCONN; 17981 iocp->ioc_count = 0; 17982 mp->b_datap->db_type = M_IOCACK; 17983 qreply(q, mp); 17984 return; 17985 } 17986 /* FALLTHRU */ 17987 case TI_GETMYNAME: 17988 mi_copyin(q, mp, NULL, 17989 SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); 17990 return; 17991 case ND_SET: 17992 /* nd_getset does the necessary checks */ 17993 case ND_GET: 17994 if (!nd_getset(q, tcp_g_nd, mp)) { 17995 CALL_IP_WPUT(connp, q, mp); 17996 return; 17997 } 17998 qreply(q, mp); 17999 return; 18000 case TCP_IOC_DEFAULT_Q: 18001 /* 18002 * Wants to be the default wq. Check the credentials 18003 * first, the rest is executed via squeue. 
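Stepping back to the M_DATA case at the top of tcp_wput(): the accounting it does, together with the tcp_clrqfull() call in tcp_output(), amounts to a high/low watermark discipline. A single-threaded sketch with illustrative names (the kernel does this per connection under conn_lock):

static int stream_marked_full;

static void
account_queued_write(long *unsent, long bytes, long hiwater)
{
        *unsent += bytes;
        if (*unsent > hiwater)
                stream_marked_full = 1;         /* like tcp_setqfull() */
}

static void
account_drained(long *unsent, long bytes, long lowater)
{
        *unsent -= bytes;
        if (stream_marked_full && *unsent <= lowater)
                stream_marked_full = 0;         /* like tcp_clrqfull() */
}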
18004 */ 18005 if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { 18006 iocp->ioc_error = EPERM; 18007 iocp->ioc_count = 0; 18008 mp->b_datap->db_type = M_IOCACK; 18009 qreply(q, mp); 18010 return; 18011 } 18012 output_proc = tcp_wput_ioctl; 18013 break; 18014 default: 18015 output_proc = tcp_wput_ioctl; 18016 break; 18017 } 18018 break; 18019 default: 18020 output_proc = tcp_wput_nondata; 18021 break; 18022 } 18023 18024 CONN_INC_REF(connp); 18025 (*tcp_squeue_wput_proc)(connp->conn_sqp, mp, 18026 output_proc, connp, SQTAG_TCP_WPUT_OTHER); 18027 } 18028 18029 /* 18030 * Initial STREAMS write side put() procedure for sockets. It tries to 18031 * handle the T_CAPABILITY_REQ which sockfs sends down while setting 18032 * up the socket without using the squeue. Non T_CAPABILITY_REQ messages 18033 * are handled by tcp_wput() as usual. 18034 * 18035 * All further messages will also be handled by tcp_wput() because we cannot 18036 * be sure that the above short cut is safe later. 18037 */ 18038 static void 18039 tcp_wput_sock(queue_t *wq, mblk_t *mp) 18040 { 18041 conn_t *connp = Q_TO_CONN(wq); 18042 tcp_t *tcp = connp->conn_tcp; 18043 struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr; 18044 18045 ASSERT(wq->q_qinfo == &tcp_sock_winit); 18046 wq->q_qinfo = &tcp_winit; 18047 18048 ASSERT(IPCL_IS_TCP(connp)); 18049 ASSERT(TCP_IS_SOCKET(tcp)); 18050 18051 if (DB_TYPE(mp) == M_PCPROTO && 18052 MBLKL(mp) == sizeof (struct T_capability_req) && 18053 car->PRIM_type == T_CAPABILITY_REQ) { 18054 tcp_capability_req(tcp, mp); 18055 return; 18056 } 18057 18058 tcp_wput(wq, mp); 18059 } 18060 18061 static boolean_t 18062 tcp_zcopy_check(tcp_t *tcp) 18063 { 18064 conn_t *connp = tcp->tcp_connp; 18065 ire_t *ire; 18066 boolean_t zc_enabled = B_FALSE; 18067 18068 if (do_tcpzcopy == 2) 18069 zc_enabled = B_TRUE; 18070 else if (tcp->tcp_ipversion == IPV4_VERSION && 18071 IPCL_IS_CONNECTED(connp) && 18072 (connp->conn_flags & IPCL_CHECK_POLICY) == 0 && 18073 connp->conn_dontroute == 0 && 18074 !connp->conn_nexthop_set && 18075 connp->conn_xmit_if_ill == NULL && 18076 connp->conn_nofailover_ill == NULL && 18077 do_tcpzcopy == 1) { 18078 /* 18079 * the checks above closely resemble the fast path checks 18080 * in tcp_send_data(). 
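Taken together, the branches above encode a tri-state do_tcpzcopy policy. A sketch with illustrative names: 0 leaves zero-copy off, 2 forces it on, and 1 enables it only when the simple IPv4 connected fast path applies and the outgoing interface advertises zero-copy capability.

static int
zcopy_enabled(int do_tcpzcopy, int fastpath_ok, int ill_zcopy_capable)
{
        if (do_tcpzcopy == 2)
                return (1);                     /* forced on */
        return (do_tcpzcopy == 1 && fastpath_ok && ill_zcopy_capable);
}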
18081 */ 18082 mutex_enter(&connp->conn_lock); 18083 ire = connp->conn_ire_cache; 18084 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 18085 if (ire != NULL && !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18086 IRE_REFHOLD(ire); 18087 if (ire->ire_stq != NULL) { 18088 ill_t *ill = (ill_t *)ire->ire_stq->q_ptr; 18089 18090 zc_enabled = ill && (ill->ill_capabilities & 18091 ILL_CAPAB_ZEROCOPY) && 18092 (ill->ill_zerocopy_capab-> 18093 ill_zerocopy_flags != 0); 18094 } 18095 IRE_REFRELE(ire); 18096 } 18097 mutex_exit(&connp->conn_lock); 18098 } 18099 tcp->tcp_snd_zcopy_on = zc_enabled; 18100 if (!TCP_IS_DETACHED(tcp)) { 18101 if (zc_enabled) { 18102 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMSAFE); 18103 TCP_STAT(tcp_zcopy_on); 18104 } else { 18105 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); 18106 TCP_STAT(tcp_zcopy_off); 18107 } 18108 } 18109 return (zc_enabled); 18110 } 18111 18112 static mblk_t * 18113 tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp) 18114 { 18115 if (do_tcpzcopy == 2) 18116 return (bp); 18117 else if (tcp->tcp_snd_zcopy_on) { 18118 tcp->tcp_snd_zcopy_on = B_FALSE; 18119 if (!TCP_IS_DETACHED(tcp)) { 18120 (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); 18121 TCP_STAT(tcp_zcopy_disable); 18122 } 18123 } 18124 return (tcp_zcopy_backoff(tcp, bp, 0)); 18125 } 18126 18127 /* 18128 * Backoff from a zero-copy mblk by copying data to a new mblk and freeing 18129 * the original desballoca'ed segmapped mblk. 18130 */ 18131 static mblk_t * 18132 tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist) 18133 { 18134 mblk_t *head, *tail, *nbp; 18135 if (IS_VMLOANED_MBLK(bp)) { 18136 TCP_STAT(tcp_zcopy_backoff); 18137 if ((head = copyb(bp)) == NULL) { 18138 /* fail to backoff; leave it for the next backoff */ 18139 tcp->tcp_xmit_zc_clean = B_FALSE; 18140 return (bp); 18141 } 18142 if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 18143 if (fix_xmitlist) 18144 tcp_zcopy_notify(tcp); 18145 else 18146 head->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 18147 } 18148 nbp = bp->b_cont; 18149 if (fix_xmitlist) { 18150 head->b_prev = bp->b_prev; 18151 head->b_next = bp->b_next; 18152 if (tcp->tcp_xmit_tail == bp) 18153 tcp->tcp_xmit_tail = head; 18154 } 18155 bp->b_next = NULL; 18156 bp->b_prev = NULL; 18157 freeb(bp); 18158 } else { 18159 head = bp; 18160 nbp = bp->b_cont; 18161 } 18162 tail = head; 18163 while (nbp) { 18164 if (IS_VMLOANED_MBLK(nbp)) { 18165 TCP_STAT(tcp_zcopy_backoff); 18166 if ((tail->b_cont = copyb(nbp)) == NULL) { 18167 tcp->tcp_xmit_zc_clean = B_FALSE; 18168 tail->b_cont = nbp; 18169 return (head); 18170 } 18171 tail = tail->b_cont; 18172 if (nbp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { 18173 if (fix_xmitlist) 18174 tcp_zcopy_notify(tcp); 18175 else 18176 tail->b_datap->db_struioflag |= 18177 STRUIO_ZCNOTIFY; 18178 } 18179 bp = nbp; 18180 nbp = nbp->b_cont; 18181 if (fix_xmitlist) { 18182 tail->b_prev = bp->b_prev; 18183 tail->b_next = bp->b_next; 18184 if (tcp->tcp_xmit_tail == bp) 18185 tcp->tcp_xmit_tail = tail; 18186 } 18187 bp->b_next = NULL; 18188 bp->b_prev = NULL; 18189 freeb(bp); 18190 } else { 18191 tail->b_cont = nbp; 18192 tail = nbp; 18193 nbp = nbp->b_cont; 18194 } 18195 } 18196 if (fix_xmitlist) { 18197 tcp->tcp_xmit_last = tail; 18198 tcp->tcp_xmit_zc_clean = B_TRUE; 18199 } 18200 return (head); 18201 } 18202 18203 static void 18204 tcp_zcopy_notify(tcp_t *tcp) 18205 { 18206 struct stdata *stp; 18207 18208 if (tcp->tcp_detached) 18209 return; 18210 stp = STREAM(tcp->tcp_rq); 18211 mutex_enter(&stp->sd_lock); 18212 stp->sd_flag |= 
STZCNOTIFY; 18213 cv_broadcast(&stp->sd_zcopy_wait); 18214 mutex_exit(&stp->sd_lock); 18215 } 18216 18217 static void 18218 tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) 18219 { 18220 ipha_t *ipha; 18221 ipaddr_t src; 18222 ipaddr_t dst; 18223 uint32_t cksum; 18224 ire_t *ire; 18225 uint16_t *up; 18226 ill_t *ill; 18227 conn_t *connp = tcp->tcp_connp; 18228 uint32_t hcksum_txflags = 0; 18229 mblk_t *ire_fp_mp; 18230 uint_t ire_fp_mp_len; 18231 18232 ASSERT(DB_TYPE(mp) == M_DATA); 18233 18234 if (DB_CRED(mp) == NULL) 18235 mblk_setcred(mp, CONN_CRED(connp)); 18236 18237 ipha = (ipha_t *)mp->b_rptr; 18238 src = ipha->ipha_src; 18239 dst = ipha->ipha_dst; 18240 18241 /* 18242 * Drop off fast path for IPv6 and also if options are present or 18243 * we need to resolve a TS label. 18244 */ 18245 if (tcp->tcp_ipversion != IPV4_VERSION || 18246 !IPCL_IS_CONNECTED(connp) || 18247 (connp->conn_flags & IPCL_CHECK_POLICY) != 0 || 18248 connp->conn_dontroute || 18249 connp->conn_nexthop_set || 18250 connp->conn_xmit_if_ill != NULL || 18251 connp->conn_nofailover_ill != NULL || 18252 !connp->conn_ulp_labeled || 18253 ipha->ipha_ident == IP_HDR_INCLUDED || 18254 ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || 18255 IPP_ENABLED(IPP_LOCAL_OUT)) { 18256 if (tcp->tcp_snd_zcopy_aware) 18257 mp = tcp_zcopy_disable(tcp, mp); 18258 TCP_STAT(tcp_ip_send); 18259 CALL_IP_WPUT(connp, q, mp); 18260 return; 18261 } 18262 18263 mutex_enter(&connp->conn_lock); 18264 ire = connp->conn_ire_cache; 18265 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 18266 if (ire != NULL && ire->ire_addr == dst && 18267 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18268 IRE_REFHOLD(ire); 18269 mutex_exit(&connp->conn_lock); 18270 } else { 18271 boolean_t cached = B_FALSE; 18272 18273 /* force a recheck later on */ 18274 tcp->tcp_ire_ill_check_done = B_FALSE; 18275 18276 TCP_DBGSTAT(tcp_ire_null1); 18277 connp->conn_ire_cache = NULL; 18278 mutex_exit(&connp->conn_lock); 18279 if (ire != NULL) 18280 IRE_REFRELE_NOTR(ire); 18281 ire = ire_cache_lookup(dst, connp->conn_zoneid, 18282 MBLK_GETLABEL(mp)); 18283 if (ire == NULL) { 18284 if (tcp->tcp_snd_zcopy_aware) 18285 mp = tcp_zcopy_backoff(tcp, mp, 0); 18286 TCP_STAT(tcp_ire_null); 18287 CALL_IP_WPUT(connp, q, mp); 18288 return; 18289 } 18290 IRE_REFHOLD_NOTR(ire); 18291 /* 18292 * Since we are inside the squeue, there cannot be another 18293 * thread in TCP trying to set the conn_ire_cache now. The 18294 * check for IRE_MARK_CONDEMNED ensures that an interface 18295 * unplumb thread has not yet started cleaning up the conns. 18296 * Hence we don't need to grab the conn lock. 18297 */ 18298 if (!(connp->conn_state_flags & CONN_CLOSING)) { 18299 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 18300 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 18301 connp->conn_ire_cache = ire; 18302 cached = B_TRUE; 18303 } 18304 rw_exit(&ire->ire_bucket->irb_lock); 18305 } 18306 18307 /* 18308 * We can continue to use the ire but since it was 18309 * not cached, we should drop the extra reference. 18310 */ 18311 if (!cached) 18312 IRE_REFRELE_NOTR(ire); 18313 18314 /* 18315 * Rampart note: no need to select a new label here, since 18316 * labels are not allowed to change during the life of a TCP 18317 * connection. 18318 */ 18319 } 18320 18321 /* 18322 * The following if case identifies whether or not 18323 * we are forced to take the slowpath. 
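Before that test, note that the conn_ire_cache handling above follows a common check-then-recache pattern. A greatly simplified, user-level sketch (illustrative types; it ignores the bucket lock, the CONN_CLOSING check, and the tracked versus untracked reference distinction the real code maintains):

struct rt_entry {
        unsigned long   dst;
        int             condemned;      /* being torn down, do not cache */
        int             refcnt;
};

static struct rt_entry *
cached_route_lookup(struct rt_entry **cachep, unsigned long dst,
    struct rt_entry *(*lookup)(unsigned long))
{
        struct rt_entry *rt = *cachep;

        if (rt != NULL && rt->dst == dst && !rt->condemned) {
                rt->refcnt++;                   /* like IRE_REFHOLD() */
                return (rt);
        }
        *cachep = NULL;                         /* stale or missing: forget it */
        if ((rt = lookup(dst)) == NULL)
                return (NULL);                  /* caller falls back to the slow path */
        rt->refcnt++;
        if (!rt->condemned)
                *cachep = rt;                   /* re-cache for the next transmit */
        return (rt);
}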
18324 */ 18325 if (ire->ire_flags & RTF_MULTIRT || 18326 ire->ire_stq == NULL || 18327 ire->ire_max_frag < ntohs(ipha->ipha_length) || 18328 (ire->ire_nce != NULL && 18329 (ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) || 18330 (ire_fp_mp_len = MBLKL(ire_fp_mp)) > MBLKHEAD(mp)) { 18331 if (tcp->tcp_snd_zcopy_aware) 18332 mp = tcp_zcopy_disable(tcp, mp); 18333 TCP_STAT(tcp_ip_ire_send); 18334 IRE_REFRELE(ire); 18335 CALL_IP_WPUT(connp, q, mp); 18336 return; 18337 } 18338 18339 ill = ire_to_ill(ire); 18340 if (connp->conn_outgoing_ill != NULL) { 18341 ill_t *conn_outgoing_ill = NULL; 18342 /* 18343 * Choose a good ill in the group to send the packets on. 18344 */ 18345 ire = conn_set_outgoing_ill(connp, ire, &conn_outgoing_ill); 18346 ill = ire_to_ill(ire); 18347 } 18348 ASSERT(ill != NULL); 18349 18350 if (!tcp->tcp_ire_ill_check_done) { 18351 tcp_ire_ill_check(tcp, ire, ill, B_TRUE); 18352 tcp->tcp_ire_ill_check_done = B_TRUE; 18353 } 18354 18355 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); 18356 ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); 18357 #ifndef _BIG_ENDIAN 18358 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); 18359 #endif 18360 18361 /* 18362 * Check to see if we need to re-enable MDT for this connection 18363 * because it was previously disabled due to changes in the ill; 18364 * note that by doing it here, this re-enabling only applies when 18365 * the packet is not dispatched through CALL_IP_WPUT(). 18366 * 18367 * That means for IPv4, it is worth re-enabling MDT for the fastpath 18368 * case, since that's how we ended up here. For IPv6, we do the 18369 * re-enabling work in ip_xmit_v6(), albeit indirectly via squeue. 18370 */ 18371 if (connp->conn_mdt_ok && !tcp->tcp_mdt && ILL_MDT_USABLE(ill)) { 18372 /* 18373 * Restore MDT for this connection, so that next time around 18374 * it is eligible to go through tcp_multisend() path again. 18375 */ 18376 TCP_STAT(tcp_mdt_conn_resumed1); 18377 tcp->tcp_mdt = B_TRUE; 18378 ip1dbg(("tcp_send_data: reenabling MDT for connp %p on " 18379 "interface %s\n", (void *)connp, ill->ill_name)); 18380 } 18381 18382 if (tcp->tcp_snd_zcopy_aware) { 18383 if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || 18384 (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) 18385 mp = tcp_zcopy_disable(tcp, mp); 18386 /* 18387 * we shouldn't need to reset ipha as the mp containing 18388 * ipha should never be a zero-copy mp. 18389 */ 18390 } 18391 18392 if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { 18393 ASSERT(ill->ill_hcksum_capab != NULL); 18394 hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; 18395 } 18396 18397 /* pseudo-header checksum (do it in parts for IP header checksum) */ 18398 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); 18399 18400 ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); 18401 up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); 18402 18403 IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, 18404 IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); 18405 18406 /* Software checksum? 
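 * If IP_CKSUM_XMIT_FAST() left no hardware-offload flags set on the
 * mblk, the TCP checksum was just computed in software; account for
 * that below.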
*/
18407 if (DB_CKSUMFLAGS(mp) == 0) {
18408 TCP_STAT(tcp_out_sw_cksum);
18409 TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes,
18410 ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH);
18411 }
18412 
18413 ipha->ipha_fragment_offset_and_flags |=
18414 (uint32_t)htons(ire->ire_frag_flag);
18415 
18416 /* Calculate IP header checksum if hardware isn't capable */
18417 if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) {
18418 IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0],
18419 ((uint16_t *)ipha)[4]);
18420 }
18421 
18422 ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
18423 mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
18424 bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
18425 
18426 UPDATE_OB_PKT_COUNT(ire);
18427 ire->ire_last_used_time = lbolt;
18428 BUMP_MIB(&ip_mib, ipOutRequests);
18429 
18430 if (ILL_DLS_CAPABLE(ill)) {
18431 /*
18432 * Send the packet directly to DLD, where it may be queued
18433 * depending on the availability of transmit resources at
18434 * the media layer.
18435 */
18436 IP_DLS_ILL_TX(ill, mp);
18437 } else {
18438 putnext(ire->ire_stq, mp);
18439 }
18440 IRE_REFRELE(ire);
18441 }
18442 
18443 /*
18444 * This handles the case when the receiver has shrunk its window. Per RFC 1122,
18445 * if the receiver shrinks the window, i.e. moves the right window edge to the
18446 * left, we should not send new data, but should retransmit normally the
18447 * old unacked data between suna and suna + swnd. We might have sent data
18448 * that is now outside the new window; pretend that we didn't send it.
18449 */
18450 static void
18451 tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count)
18452 {
18453 uint32_t snxt = tcp->tcp_snxt;
18454 mblk_t *xmit_tail;
18455 int32_t offset;
18456 
18457 ASSERT(shrunk_count > 0);
18458 
18459 /* Pretend we didn't send the data outside the window */
18460 snxt -= shrunk_count;
18461 
18462 /* Get the mblk and the offset in it per the shrunk window */
18463 xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset);
18464 
18465 ASSERT(xmit_tail != NULL);
18466 
18467 /* Reset all the values per the now shrunk window */
18468 tcp->tcp_snxt = snxt;
18469 tcp->tcp_xmit_tail = xmit_tail;
18470 tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr - xmit_tail->b_rptr -
18471 offset;
18472 tcp->tcp_unsent += shrunk_count;
18473 
18474 if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
18475 /*
18476 * Make sure the timer is running so that we will probe a zero
18477 * window.
18478 */
18479 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
18480 }
18481 
18482 
18483 /*
18484 * The TCP normal data output path.
18485 * NOTE: the logic of the fast path is duplicated from this function.
18486 */
18487 static void
18488 tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent)
18489 {
18490 int len;
18491 mblk_t *local_time;
18492 mblk_t *mp1;
18493 uint32_t snxt;
18494 int tail_unsent;
18495 int tcpstate;
18496 int usable = 0;
18497 mblk_t *xmit_tail;
18498 queue_t *q = tcp->tcp_wq;
18499 int32_t mss;
18500 int32_t num_sack_blk = 0;
18501 int32_t tcp_hdr_len;
18502 int32_t tcp_tcp_hdr_len;
18503 int mdt_thres;
18504 int rc;
18505 
18506 tcpstate = tcp->tcp_state;
18507 if (mp == NULL) {
18508 /*
18509 * tcp_wput_data() with NULL mp should only be called when
18510 * there is unsent data.
18511 */
18512 ASSERT(tcp->tcp_unsent > 0);
18513 /* Really tacky... but we need this for detached closes.
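 * With a NULL mp we skip the queueing logic entirely and jump to
 * data_null, working off tcp_xmit_tail and tcp_unsent alone.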
*/ 18514 len = tcp->tcp_unsent; 18515 goto data_null; 18516 } 18517 18518 #if CCS_STATS 18519 wrw_stats.tot.count++; 18520 wrw_stats.tot.bytes += msgdsize(mp); 18521 #endif 18522 ASSERT(mp->b_datap->db_type == M_DATA); 18523 /* 18524 * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, 18525 * or before a connection attempt has begun. 18526 */ 18527 if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || 18528 (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 18529 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { 18530 #ifdef DEBUG 18531 cmn_err(CE_WARN, 18532 "tcp_wput_data: data after ordrel, %s", 18533 tcp_display(tcp, NULL, 18534 DISP_ADDR_AND_PORT)); 18535 #else 18536 if (tcp->tcp_debug) { 18537 (void) strlog(TCP_MOD_ID, 0, 1, 18538 SL_TRACE|SL_ERROR, 18539 "tcp_wput_data: data after ordrel, %s\n", 18540 tcp_display(tcp, NULL, 18541 DISP_ADDR_AND_PORT)); 18542 } 18543 #endif /* DEBUG */ 18544 } 18545 if (tcp->tcp_snd_zcopy_aware && 18546 (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0) 18547 tcp_zcopy_notify(tcp); 18548 freemsg(mp); 18549 if (tcp->tcp_flow_stopped && 18550 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 18551 tcp_clrqfull(tcp); 18552 } 18553 return; 18554 } 18555 18556 /* Strip empties */ 18557 for (;;) { 18558 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= 18559 (uintptr_t)INT_MAX); 18560 len = (int)(mp->b_wptr - mp->b_rptr); 18561 if (len > 0) 18562 break; 18563 mp1 = mp; 18564 mp = mp->b_cont; 18565 freeb(mp1); 18566 if (!mp) { 18567 return; 18568 } 18569 } 18570 18571 /* If we are the first on the list ... */ 18572 if (tcp->tcp_xmit_head == NULL) { 18573 tcp->tcp_xmit_head = mp; 18574 tcp->tcp_xmit_tail = mp; 18575 tcp->tcp_xmit_tail_unsent = len; 18576 } else { 18577 /* If tiny tx and room in txq tail, pullup to save mblks. */ 18578 struct datab *dp; 18579 18580 mp1 = tcp->tcp_xmit_last; 18581 if (len < tcp_tx_pull_len && 18582 (dp = mp1->b_datap)->db_ref == 1 && 18583 dp->db_lim - mp1->b_wptr >= len) { 18584 ASSERT(len > 0); 18585 ASSERT(!mp1->b_cont); 18586 if (len == 1) { 18587 *mp1->b_wptr++ = *mp->b_rptr; 18588 } else { 18589 bcopy(mp->b_rptr, mp1->b_wptr, len); 18590 mp1->b_wptr += len; 18591 } 18592 if (mp1 == tcp->tcp_xmit_tail) 18593 tcp->tcp_xmit_tail_unsent += len; 18594 mp1->b_cont = mp->b_cont; 18595 if (tcp->tcp_snd_zcopy_aware && 18596 (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) 18597 mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; 18598 freeb(mp); 18599 mp = mp1; 18600 } else { 18601 tcp->tcp_xmit_last->b_cont = mp; 18602 } 18603 len += tcp->tcp_unsent; 18604 } 18605 18606 /* Tack on however many more positive length mblks we have */ 18607 if ((mp1 = mp->b_cont) != NULL) { 18608 do { 18609 int tlen; 18610 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 18611 (uintptr_t)INT_MAX); 18612 tlen = (int)(mp1->b_wptr - mp1->b_rptr); 18613 if (tlen <= 0) { 18614 mp->b_cont = mp1->b_cont; 18615 freeb(mp1); 18616 } else { 18617 len += tlen; 18618 mp = mp1; 18619 } 18620 } while ((mp1 = mp->b_cont) != NULL); 18621 } 18622 tcp->tcp_xmit_last = mp; 18623 tcp->tcp_unsent = len; 18624 18625 if (urgent) 18626 usable = 1; 18627 18628 data_null: 18629 snxt = tcp->tcp_snxt; 18630 xmit_tail = tcp->tcp_xmit_tail; 18631 tail_unsent = tcp->tcp_xmit_tail_unsent; 18632 18633 /* 18634 * Note that tcp_mss has been adjusted to take into account the 18635 * timestamp option if applicable. Because SACK options do not 18636 * appear in every TCP segments and they are of variable lengths, 18637 * they cannot be included in tcp_mss. 
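 * (Each SACK block takes sizeof (sack_blk_t) bytes on the wire, plus
 * the option header and two NOPs for alignment; see opt_len below.)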
Thus we need to calculate
18638 * the actual segment length when we need to send a segment which
18639 * includes SACK options.
18640 */
18641 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
18642 int32_t opt_len;
18643 
18644 num_sack_blk = MIN(tcp->tcp_max_sack_blk,
18645 tcp->tcp_num_sack_blk);
18646 opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN *
18647 2 + TCPOPT_HEADER_LEN;
18648 mss = tcp->tcp_mss - opt_len;
18649 tcp_hdr_len = tcp->tcp_hdr_len + opt_len;
18650 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + opt_len;
18651 } else {
18652 mss = tcp->tcp_mss;
18653 tcp_hdr_len = tcp->tcp_hdr_len;
18654 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len;
18655 }
18656 
18657 if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
18658 (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) {
18659 SET_TCP_INIT_CWND(tcp, mss, tcp_slow_start_after_idle);
18660 }
18661 if (tcpstate == TCPS_SYN_RCVD) {
18662 /*
18663 * The three-way connection establishment handshake is not
18664 * complete yet. We want to queue the data for transmission
18665 * after entering ESTABLISHED state (RFC793). A jump to
18666 * "done" label effectively leaves data on the queue.
18667 */
18668 goto done;
18669 } else {
18670 int usable_r;
18671 
18672 /*
18673 * In the special case when cwnd is zero, which can only
18674 * happen if the connection is ECN capable, return now.
18675 * New segments are sent using tcp_timer(). The timer
18676 * is set in tcp_rput_data().
18677 */
18678 if (tcp->tcp_cwnd == 0) {
18679 /*
18680 * Note that tcp_cwnd is 0 before 3-way handshake is
18681 * finished.
18682 */
18683 ASSERT(tcp->tcp_ecn_ok ||
18684 tcp->tcp_state < TCPS_ESTABLISHED);
18685 return;
18686 }
18687 
18688 /* NOTE: trouble if xmitting while SYN not acked? */
18689 usable_r = snxt - tcp->tcp_suna;
18690 usable_r = tcp->tcp_swnd - usable_r;
18691 
18692 /*
18693 * Check if the receiver has shrunk the window. If
18694 * tcp_wput_data() with NULL mp is called, tcp_fin_sent
18695 * cannot be set as there is unsent data, so FIN cannot
18696 * be sent out. Otherwise, we need to take the FIN
18697 * into account as it consumes an "invisible" sequence number.
18698 */
18699 ASSERT(tcp->tcp_fin_sent == 0);
18700 if (usable_r < 0) {
18701 /*
18702 * The receiver has shrunk the window and we have sent
18703 * -usable_r data beyond the window; re-adjust.
18704 *
18705 * If TCP window scaling is enabled, there can be a
18706 * round-down error as the advertised receive window
18707 * is actually right shifted n bits. This means that
18708 * the information in the lower n bits is wiped out.
18709 * It will look like the window has shrunk. Do a check here to
18710 * see if the shrunk amount is actually within the
18711 * error in window calculation. If it is, just
18712 * return. Note that this check is inside the
18713 * shrunk window check. This makes sure that even
18714 * though tcp_process_shrunk_swnd() is not called,
18715 * we will stop further processing.
18716 */
18717 if ((-usable_r >> tcp->tcp_snd_ws) > 0) {
18718 tcp_process_shrunk_swnd(tcp, -usable_r);
18719 }
18720 return;
18721 }
18722 
18723 /* usable = MIN(swnd, cwnd) - unacked_bytes */
18724 if (tcp->tcp_swnd > tcp->tcp_cwnd)
18725 usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd;
18726 
18727 /* usable = MIN(usable, unsent) */
18728 if (usable_r > len)
18729 usable_r = len;
18730 
18731 /* usable = MAX(usable, {1 for urgent, 0 for data}) */
18732 if (usable_r > 0) {
18733 usable = usable_r;
18734 } else {
18735 /* Bypass all other unnecessary processing.
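 * Nothing is sendable right now; fall through to done so that the
 * xmit_tail bookkeeping still runs and the zero-window probe timer
 * can be restarted if needed.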
*/
18736 goto done;
18737 }
18738 }
18739 
18740 local_time = (mblk_t *)lbolt;
18741 
18742 /*
18743 * "Our" Nagle Algorithm. This is not the same as in the old
18744 * BSD. This is more in line with the true intent of Nagle.
18745 *
18746 * The conditions are:
18747 * 1. The amount of unsent data (or amount of data which can be
18748 * sent, whichever is smaller) is less than the Nagle limit.
18749 * 2. The last sent size is also less than the Nagle limit.
18750 * 3. There is unack'ed data.
18751 * 4. Urgent pointer is not set. Send urgent data ignoring the
18752 * Nagle algorithm. This reduces the probability that urgent
18753 * bytes get "merged" together.
18754 * 5. The app has not closed the connection. This eliminates the
18755 * wait time of the receiving side waiting for the last piece of
18756 * (small) data.
18757 *
18758 * If all are satisfied, exit without sending anything. Note
18759 * that the Nagle limit can be smaller than 1 MSS. The Nagle limit
18760 * is the smaller of 1 MSS and the global tcp_naglim_def (which
18761 * defaults to 4095).
18762 */
18763 if (usable < (int)tcp->tcp_naglim &&
18764 tcp->tcp_naglim > tcp->tcp_last_sent_len &&
18765 snxt != tcp->tcp_suna &&
18766 !(tcp->tcp_valid_bits & TCP_URG_VALID) &&
18767 !(tcp->tcp_valid_bits & TCP_FSS_VALID)) {
18768 goto done;
18769 }
18770 
18771 if (tcp->tcp_cork) {
18772 /*
18773 * If the tcp->tcp_cork option is set, then we have to force
18774 * TCP not to send partial segments (smaller than MSS bytes).
18775 * We round usable down to a multiple of the full MSS and
18776 * save the rest of the remaining data for later.
18777 */
18778 if (usable < mss)
18779 goto done;
18780 usable = (usable / mss) * mss;
18781 }
18782 
18783 /* Update the latest receive window size in TCP header. */
18784 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws,
18785 tcp->tcp_tcph->th_win);
18786 
18787 /*
18788 * Determine if it's worthwhile to attempt MDT, based on:
18789 *
18790 * 1. Simple TCP/IP{v4,v6} (no options).
18791 * 2. IPSEC/IPQoS processing is not needed for the TCP connection.
18792 * 3. The TCP connection is in ESTABLISHED state.
18793 * 4. The TCP is not detached.
18794 *
18795 * If any of the above conditions have changed during the
18796 * connection, stop using MDT and restore the stream head
18797 * parameters accordingly.
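 * (The check below tests the negation of each condition; any one of
 * them clears conn_mdt_ok and tcp_mdt, and for a non-detached TCP
 * also resets the stream head's maxpsz via tcp_maxpsz_set().)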
18798 */ 18799 if (tcp->tcp_mdt && 18800 ((tcp->tcp_ipversion == IPV4_VERSION && 18801 tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || 18802 (tcp->tcp_ipversion == IPV6_VERSION && 18803 tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) || 18804 tcp->tcp_state != TCPS_ESTABLISHED || 18805 TCP_IS_DETACHED(tcp) || !CONN_IS_MD_FASTPATH(tcp->tcp_connp) || 18806 CONN_IPSEC_OUT_ENCAPSULATED(tcp->tcp_connp) || 18807 IPP_ENABLED(IPP_LOCAL_OUT))) { 18808 tcp->tcp_connp->conn_mdt_ok = B_FALSE; 18809 tcp->tcp_mdt = B_FALSE; 18810 18811 /* Anything other than detached is considered pathological */ 18812 if (!TCP_IS_DETACHED(tcp)) { 18813 TCP_STAT(tcp_mdt_conn_halted1); 18814 (void) tcp_maxpsz_set(tcp, B_TRUE); 18815 } 18816 } 18817 18818 /* Use MDT if sendable amount is greater than the threshold */ 18819 if (tcp->tcp_mdt && 18820 (mdt_thres = mss << tcp_mdt_smss_threshold, usable > mdt_thres) && 18821 (tail_unsent > mdt_thres || (xmit_tail->b_cont != NULL && 18822 MBLKL(xmit_tail->b_cont) > mdt_thres)) && 18823 (tcp->tcp_valid_bits == 0 || 18824 tcp->tcp_valid_bits == TCP_FSS_VALID)) { 18825 ASSERT(tcp->tcp_connp->conn_mdt_ok); 18826 rc = tcp_multisend(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, 18827 num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, 18828 local_time, mdt_thres); 18829 } else { 18830 rc = tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, 18831 num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, 18832 local_time, INT_MAX); 18833 } 18834 18835 /* Pretend that all we were trying to send really got sent */ 18836 if (rc < 0 && tail_unsent < 0) { 18837 do { 18838 xmit_tail = xmit_tail->b_cont; 18839 xmit_tail->b_prev = local_time; 18840 ASSERT((uintptr_t)(xmit_tail->b_wptr - 18841 xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); 18842 tail_unsent += (int)(xmit_tail->b_wptr - 18843 xmit_tail->b_rptr); 18844 } while (tail_unsent < 0); 18845 } 18846 done:; 18847 tcp->tcp_xmit_tail = xmit_tail; 18848 tcp->tcp_xmit_tail_unsent = tail_unsent; 18849 len = tcp->tcp_snxt - snxt; 18850 if (len) { 18851 /* 18852 * If new data was sent, need to update the notsack 18853 * list, which is, afterall, data blocks that have 18854 * not been sack'ed by the receiver. New data is 18855 * not sack'ed. 18856 */ 18857 if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { 18858 /* len is a negative value. */ 18859 tcp->tcp_pipe -= len; 18860 tcp_notsack_update(&(tcp->tcp_notsack_list), 18861 tcp->tcp_snxt, snxt, 18862 &(tcp->tcp_num_notsack_blk), 18863 &(tcp->tcp_cnt_notsack_list)); 18864 } 18865 tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; 18866 tcp->tcp_rack = tcp->tcp_rnxt; 18867 tcp->tcp_rack_cnt = 0; 18868 if ((snxt + len) == tcp->tcp_suna) { 18869 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 18870 } 18871 } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { 18872 /* 18873 * Didn't send anything. Make sure the timer is running 18874 * so that we will probe a zero window. 18875 */ 18876 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 18877 } 18878 /* Note that len is the amount we just sent but with a negative sign */ 18879 tcp->tcp_unsent += len; 18880 if (tcp->tcp_flow_stopped) { 18881 if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { 18882 tcp_clrqfull(tcp); 18883 } 18884 } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) { 18885 tcp_setqfull(tcp); 18886 } 18887 } 18888 18889 /* 18890 * tcp_fill_header is called by tcp_send() and tcp_multisend() to fill the 18891 * outgoing TCP header with the template header, as well as other 18892 * options such as time-stamp, ECN and/or SACK. 
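 * rptr must point at 32-bit aligned memory with room for the template
 * TCP/IP headers plus any SACK options being appended; 'now' supplies
 * the timestamp value and num_sack_blk the number of SACK blocks.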
18893 */ 18894 static void 18895 tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) 18896 { 18897 tcph_t *tcp_tmpl, *tcp_h; 18898 uint32_t *dst, *src; 18899 int hdrlen; 18900 18901 ASSERT(OK_32PTR(rptr)); 18902 18903 /* Template header */ 18904 tcp_tmpl = tcp->tcp_tcph; 18905 18906 /* Header of outgoing packet */ 18907 tcp_h = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 18908 18909 /* dst and src are opaque 32-bit fields, used for copying */ 18910 dst = (uint32_t *)rptr; 18911 src = (uint32_t *)tcp->tcp_iphc; 18912 hdrlen = tcp->tcp_hdr_len; 18913 18914 /* Fill time-stamp option if needed */ 18915 if (tcp->tcp_snd_ts_ok) { 18916 U32_TO_BE32((uint32_t)now, 18917 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); 18918 U32_TO_BE32(tcp->tcp_ts_recent, 18919 (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); 18920 } else { 18921 ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); 18922 } 18923 18924 /* 18925 * Copy the template header; is this really more efficient than 18926 * calling bcopy()? For simple IPv4/TCP, it may be the case, 18927 * but perhaps not for other scenarios. 18928 */ 18929 dst[0] = src[0]; 18930 dst[1] = src[1]; 18931 dst[2] = src[2]; 18932 dst[3] = src[3]; 18933 dst[4] = src[4]; 18934 dst[5] = src[5]; 18935 dst[6] = src[6]; 18936 dst[7] = src[7]; 18937 dst[8] = src[8]; 18938 dst[9] = src[9]; 18939 if (hdrlen -= 40) { 18940 hdrlen >>= 2; 18941 dst += 10; 18942 src += 10; 18943 do { 18944 *dst++ = *src++; 18945 } while (--hdrlen); 18946 } 18947 18948 /* 18949 * Set the ECN info in the TCP header if it is not a zero 18950 * window probe. Zero window probe is only sent in 18951 * tcp_wput_data() and tcp_timer(). 18952 */ 18953 if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { 18954 SET_ECT(tcp, rptr); 18955 18956 if (tcp->tcp_ecn_echo_on) 18957 tcp_h->th_flags[0] |= TH_ECE; 18958 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 18959 tcp_h->th_flags[0] |= TH_CWR; 18960 tcp->tcp_ecn_cwr_sent = B_TRUE; 18961 } 18962 } 18963 18964 /* Fill in SACK options */ 18965 if (num_sack_blk > 0) { 18966 uchar_t *wptr = rptr + tcp->tcp_hdr_len; 18967 sack_blk_t *tmp; 18968 int32_t i; 18969 18970 wptr[0] = TCPOPT_NOP; 18971 wptr[1] = TCPOPT_NOP; 18972 wptr[2] = TCPOPT_SACK; 18973 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 18974 sizeof (sack_blk_t); 18975 wptr += TCPOPT_REAL_SACK_LEN; 18976 18977 tmp = tcp->tcp_sack_list; 18978 for (i = 0; i < num_sack_blk; i++) { 18979 U32_TO_BE32(tmp[i].begin, wptr); 18980 wptr += sizeof (tcp_seq); 18981 U32_TO_BE32(tmp[i].end, wptr); 18982 wptr += sizeof (tcp_seq); 18983 } 18984 tcp_h->th_offset_and_rsrvd[0] += 18985 ((num_sack_blk * 2 + 1) << 4); 18986 } 18987 } 18988 18989 /* 18990 * tcp_mdt_add_attrs() is called by tcp_multisend() in order to attach 18991 * the destination address and SAP attribute, and if necessary, the 18992 * hardware checksum offload attribute to a Multidata message. 
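 * Returns 0 on success, or -1 if an attribute could not be added (the
 * caller then abandons this Multidata and falls back to legacy send).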
18993 */ 18994 static int 18995 tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, 18996 const uint32_t start, const uint32_t stuff, const uint32_t end, 18997 const uint32_t flags) 18998 { 18999 /* Add global destination address & SAP attribute */ 19000 if (dlmp == NULL || !ip_md_addr_attr(mmd, NULL, dlmp)) { 19001 ip1dbg(("tcp_mdt_add_attrs: can't add global physical " 19002 "destination address+SAP\n")); 19003 19004 if (dlmp != NULL) 19005 TCP_STAT(tcp_mdt_allocfail); 19006 return (-1); 19007 } 19008 19009 /* Add global hwcksum attribute */ 19010 if (hwcksum && 19011 !ip_md_hcksum_attr(mmd, NULL, start, stuff, end, flags)) { 19012 ip1dbg(("tcp_mdt_add_attrs: can't add global hardware " 19013 "checksum attribute\n")); 19014 19015 TCP_STAT(tcp_mdt_allocfail); 19016 return (-1); 19017 } 19018 19019 return (0); 19020 } 19021 19022 /* 19023 * Smaller and private version of pdescinfo_t used specifically for TCP, 19024 * which allows for only two payload spans per packet. 19025 */ 19026 typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t; 19027 19028 /* 19029 * tcp_multisend() is called by tcp_wput_data() for Multidata Transmit 19030 * scheme, and returns one the following: 19031 * 19032 * -1 = failed allocation. 19033 * 0 = success; burst count reached, or usable send window is too small, 19034 * and that we'd rather wait until later before sending again. 19035 */ 19036 static int 19037 tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, 19038 const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, 19039 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 19040 const int mdt_thres) 19041 { 19042 mblk_t *md_mp_head, *md_mp, *md_pbuf, *md_pbuf_nxt, *md_hbuf; 19043 multidata_t *mmd; 19044 uint_t obsegs, obbytes, hdr_frag_sz; 19045 uint_t cur_hdr_off, cur_pld_off, base_pld_off, first_snxt; 19046 int num_burst_seg, max_pld; 19047 pdesc_t *pkt; 19048 tcp_pdescinfo_t tcp_pkt_info; 19049 pdescinfo_t *pkt_info; 19050 int pbuf_idx, pbuf_idx_nxt; 19051 int seg_len, len, spill, af; 19052 boolean_t add_buffer, zcopy, clusterwide; 19053 boolean_t rconfirm = B_FALSE; 19054 boolean_t done = B_FALSE; 19055 uint32_t cksum; 19056 uint32_t hwcksum_flags; 19057 ire_t *ire; 19058 ill_t *ill; 19059 ipha_t *ipha; 19060 ip6_t *ip6h; 19061 ipaddr_t src, dst; 19062 ill_zerocopy_capab_t *zc_cap = NULL; 19063 uint16_t *up; 19064 int err; 19065 conn_t *connp; 19066 19067 #ifdef _BIG_ENDIAN 19068 #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) 19069 #else 19070 #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7) 19071 #endif 19072 19073 #define PREP_NEW_MULTIDATA() { \ 19074 mmd = NULL; \ 19075 md_mp = md_hbuf = NULL; \ 19076 cur_hdr_off = 0; \ 19077 max_pld = tcp->tcp_mdt_max_pld; \ 19078 pbuf_idx = pbuf_idx_nxt = -1; \ 19079 add_buffer = B_TRUE; \ 19080 zcopy = B_FALSE; \ 19081 } 19082 19083 #define PREP_NEW_PBUF() { \ 19084 md_pbuf = md_pbuf_nxt = NULL; \ 19085 pbuf_idx = pbuf_idx_nxt = -1; \ 19086 cur_pld_off = 0; \ 19087 first_snxt = *snxt; \ 19088 ASSERT(*tail_unsent > 0); \ 19089 base_pld_off = MBLKL(*xmit_tail) - *tail_unsent; \ 19090 } 19091 19092 ASSERT(mdt_thres >= mss); 19093 ASSERT(*usable > 0 && *usable > mdt_thres); 19094 ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); 19095 ASSERT(!TCP_IS_DETACHED(tcp)); 19096 ASSERT(tcp->tcp_valid_bits == 0 || 19097 tcp->tcp_valid_bits == TCP_FSS_VALID); 19098 ASSERT((tcp->tcp_ipversion == IPV4_VERSION && 19099 tcp->tcp_ip_hdr_len == IP_SIMPLE_HDR_LENGTH) || 19100 
(tcp->tcp_ipversion == IPV6_VERSION && 19101 tcp->tcp_ip_hdr_len == IPV6_HDR_LEN)); 19102 19103 connp = tcp->tcp_connp; 19104 ASSERT(connp != NULL); 19105 ASSERT(CONN_IS_MD_FASTPATH(connp)); 19106 ASSERT(!CONN_IPSEC_OUT_ENCAPSULATED(connp)); 19107 19108 /* 19109 * Note that tcp will only declare at most 2 payload spans per 19110 * packet, which is much lower than the maximum allowable number 19111 * of packet spans per Multidata. For this reason, we use the 19112 * privately declared and smaller descriptor info structure, in 19113 * order to save some stack space. 19114 */ 19115 pkt_info = (pdescinfo_t *)&tcp_pkt_info; 19116 19117 af = (tcp->tcp_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6; 19118 if (af == AF_INET) { 19119 dst = tcp->tcp_ipha->ipha_dst; 19120 src = tcp->tcp_ipha->ipha_src; 19121 ASSERT(!CLASSD(dst)); 19122 } 19123 ASSERT(af == AF_INET || 19124 !IN6_IS_ADDR_MULTICAST(&tcp->tcp_ip6h->ip6_dst)); 19125 19126 obsegs = obbytes = 0; 19127 num_burst_seg = tcp->tcp_snd_burst; 19128 md_mp_head = NULL; 19129 PREP_NEW_MULTIDATA(); 19130 19131 /* 19132 * Before we go on further, make sure there is an IRE that we can 19133 * use, and that the ILL supports MDT. Otherwise, there's no point 19134 * in proceeding any further, and we should just hand everything 19135 * off to the legacy path. 19136 */ 19137 mutex_enter(&connp->conn_lock); 19138 ire = connp->conn_ire_cache; 19139 ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); 19140 if (ire != NULL && ((af == AF_INET && ire->ire_addr == dst) || 19141 (af == AF_INET6 && IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, 19142 &tcp->tcp_ip6h->ip6_dst))) && 19143 !(ire->ire_marks & IRE_MARK_CONDEMNED)) { 19144 IRE_REFHOLD(ire); 19145 mutex_exit(&connp->conn_lock); 19146 } else { 19147 boolean_t cached = B_FALSE; 19148 ts_label_t *tsl; 19149 19150 /* force a recheck later on */ 19151 tcp->tcp_ire_ill_check_done = B_FALSE; 19152 19153 TCP_DBGSTAT(tcp_ire_null1); 19154 connp->conn_ire_cache = NULL; 19155 mutex_exit(&connp->conn_lock); 19156 19157 /* Release the old ire */ 19158 if (ire != NULL) 19159 IRE_REFRELE_NOTR(ire); 19160 19161 tsl = crgetlabel(CONN_CRED(connp)); 19162 ire = (af == AF_INET) ? 19163 ire_cache_lookup(dst, connp->conn_zoneid, tsl) : 19164 ire_cache_lookup_v6(&tcp->tcp_ip6h->ip6_dst, 19165 connp->conn_zoneid, tsl); 19166 19167 if (ire == NULL) { 19168 TCP_STAT(tcp_ire_null); 19169 goto legacy_send_no_md; 19170 } 19171 19172 IRE_REFHOLD_NOTR(ire); 19173 /* 19174 * Since we are inside the squeue, there cannot be another 19175 * thread in TCP trying to set the conn_ire_cache now. The 19176 * check for IRE_MARK_CONDEMNED ensures that an interface 19177 * unplumb thread has not yet started cleaning up the conns. 19178 * Hence we don't need to grab the conn lock. 19179 */ 19180 if (!(connp->conn_state_flags & CONN_CLOSING)) { 19181 rw_enter(&ire->ire_bucket->irb_lock, RW_READER); 19182 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { 19183 connp->conn_ire_cache = ire; 19184 cached = B_TRUE; 19185 } 19186 rw_exit(&ire->ire_bucket->irb_lock); 19187 } 19188 19189 /* 19190 * We can continue to use the ire but since it was not 19191 * cached, we should drop the extra reference. 
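 * (The reference returned by the ire_cache_lookup{,_v6}() above is
 * still held; it is released via IRE_REFRELE(), either on the legacy
 * fallback paths or at the end of this routine.)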
19192 */ 19193 if (!cached) 19194 IRE_REFRELE_NOTR(ire); 19195 } 19196 19197 ASSERT(ire != NULL); 19198 ASSERT(af != AF_INET || ire->ire_ipversion == IPV4_VERSION); 19199 ASSERT(af == AF_INET || !IN6_IS_ADDR_V4MAPPED(&(ire->ire_addr_v6))); 19200 ASSERT(af == AF_INET || ire->ire_nce != NULL); 19201 ASSERT(!(ire->ire_type & IRE_BROADCAST)); 19202 /* 19203 * If we do support loopback for MDT (which requires modifications 19204 * to the receiving paths), the following assertions should go away, 19205 * and we would be sending the Multidata to loopback conn later on. 19206 */ 19207 ASSERT(!IRE_IS_LOCAL(ire)); 19208 ASSERT(ire->ire_stq != NULL); 19209 19210 ill = ire_to_ill(ire); 19211 ASSERT(ill != NULL); 19212 ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL); 19213 19214 if (!tcp->tcp_ire_ill_check_done) { 19215 tcp_ire_ill_check(tcp, ire, ill, B_TRUE); 19216 tcp->tcp_ire_ill_check_done = B_TRUE; 19217 } 19218 19219 /* 19220 * If the underlying interface conditions have changed, or if the 19221 * new interface does not support MDT, go back to legacy path. 19222 */ 19223 if (!ILL_MDT_USABLE(ill) || (ire->ire_flags & RTF_MULTIRT) != 0) { 19224 /* don't go through this path anymore for this connection */ 19225 TCP_STAT(tcp_mdt_conn_halted2); 19226 tcp->tcp_mdt = B_FALSE; 19227 ip1dbg(("tcp_multisend: disabling MDT for connp %p on " 19228 "interface %s\n", (void *)connp, ill->ill_name)); 19229 /* IRE will be released prior to returning */ 19230 goto legacy_send_no_md; 19231 } 19232 19233 if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) 19234 zc_cap = ill->ill_zerocopy_capab; 19235 19236 /* 19237 * Check if we can take tcp fast-path. Note that "incomplete" 19238 * ire's (where the link-layer for next hop is not resolved 19239 * or where the fast-path header in nce_fp_mp is not available 19240 * yet) are sent down the legacy (slow) path. 19241 * NOTE: We should fix ip_xmit_v4 to handle M_MULTIDATA 19242 */ 19243 if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { 19244 /* IRE will be released prior to returning */ 19245 goto legacy_send_no_md; 19246 } 19247 19248 /* go to legacy path if interface doesn't support zerocopy */ 19249 if (tcp->tcp_snd_zcopy_aware && do_tcpzcopy != 2 && 19250 (zc_cap == NULL || zc_cap->ill_zerocopy_flags == 0)) { 19251 /* IRE will be released prior to returning */ 19252 goto legacy_send_no_md; 19253 } 19254 19255 /* does the interface support hardware checksum offload? */ 19256 hwcksum_flags = 0; 19257 if (ILL_HCKSUM_CAPABLE(ill) && 19258 (ill->ill_hcksum_capab->ill_hcksum_txflags & 19259 (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL | 19260 HCKSUM_IPHDRCKSUM)) && dohwcksum) { 19261 if (ill->ill_hcksum_capab->ill_hcksum_txflags & 19262 HCKSUM_IPHDRCKSUM) 19263 hwcksum_flags = HCK_IPV4_HDRCKSUM; 19264 19265 if (ill->ill_hcksum_capab->ill_hcksum_txflags & 19266 (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) 19267 hwcksum_flags |= HCK_FULLCKSUM; 19268 else if (ill->ill_hcksum_capab->ill_hcksum_txflags & 19269 HCKSUM_INET_PARTIAL) 19270 hwcksum_flags |= HCK_PARTIALCKSUM; 19271 } 19272 19273 /* 19274 * Each header fragment consists of the leading extra space, 19275 * followed by the TCP/IP header, and the trailing extra space. 19276 * We make sure that each header fragment begins on a 32-bit 19277 * aligned memory address (tcp_mdt_hdr_head is already 32-bit 19278 * aligned in tcp_mdt_update). 
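 * Each packet's headers thus start at a fixed stride within the header
 * buffer, i.e. hdr_base = md_hbuf->b_rptr + N * hdr_frag_sz for the
 * N'th packet of this Multidata.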
19279 */ 19280 hdr_frag_sz = roundup((tcp->tcp_mdt_hdr_head + tcp_hdr_len + 19281 tcp->tcp_mdt_hdr_tail), 4); 19282 19283 /* are we starting from the beginning of data block? */ 19284 if (*tail_unsent == 0) { 19285 *xmit_tail = (*xmit_tail)->b_cont; 19286 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= (uintptr_t)INT_MAX); 19287 *tail_unsent = (int)MBLKL(*xmit_tail); 19288 } 19289 19290 /* 19291 * Here we create one or more Multidata messages, each made up of 19292 * one header buffer and up to N payload buffers. This entire 19293 * operation is done within two loops: 19294 * 19295 * The outer loop mostly deals with creating the Multidata message, 19296 * as well as the header buffer that gets added to it. It also 19297 * links the Multidata messages together such that all of them can 19298 * be sent down to the lower layer in a single putnext call; this 19299 * linking behavior depends on the tcp_mdt_chain tunable. 19300 * 19301 * The inner loop takes an existing Multidata message, and adds 19302 * one or more (up to tcp_mdt_max_pld) payload buffers to it. It 19303 * packetizes those buffers by filling up the corresponding header 19304 * buffer fragments with the proper IP and TCP headers, and by 19305 * describing the layout of each packet in the packet descriptors 19306 * that get added to the Multidata. 19307 */ 19308 do { 19309 /* 19310 * If usable send window is too small, or data blocks in 19311 * transmit list are smaller than our threshold (i.e. app 19312 * performs large writes followed by small ones), we hand 19313 * off the control over to the legacy path. Note that we'll 19314 * get back the control once it encounters a large block. 19315 */ 19316 if (*usable < mss || (*tail_unsent <= mdt_thres && 19317 (*xmit_tail)->b_cont != NULL && 19318 MBLKL((*xmit_tail)->b_cont) <= mdt_thres)) { 19319 /* send down what we've got so far */ 19320 if (md_mp_head != NULL) { 19321 tcp_multisend_data(tcp, ire, ill, md_mp_head, 19322 obsegs, obbytes, &rconfirm); 19323 } 19324 /* 19325 * Pass control over to tcp_send(), but tell it to 19326 * return to us once a large-size transmission is 19327 * possible. 19328 */ 19329 TCP_STAT(tcp_mdt_legacy_small); 19330 if ((err = tcp_send(q, tcp, mss, tcp_hdr_len, 19331 tcp_tcp_hdr_len, num_sack_blk, usable, snxt, 19332 tail_unsent, xmit_tail, local_time, 19333 mdt_thres)) <= 0) { 19334 /* burst count reached, or alloc failed */ 19335 IRE_REFRELE(ire); 19336 return (err); 19337 } 19338 19339 /* tcp_send() may have sent everything, so check */ 19340 if (*usable <= 0) { 19341 IRE_REFRELE(ire); 19342 return (0); 19343 } 19344 19345 TCP_STAT(tcp_mdt_legacy_ret); 19346 /* 19347 * We may have delivered the Multidata, so make sure 19348 * to re-initialize before the next round. 19349 */ 19350 md_mp_head = NULL; 19351 obsegs = obbytes = 0; 19352 num_burst_seg = tcp->tcp_snd_burst; 19353 PREP_NEW_MULTIDATA(); 19354 19355 /* are we starting from the beginning of data block? */ 19356 if (*tail_unsent == 0) { 19357 *xmit_tail = (*xmit_tail)->b_cont; 19358 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 19359 (uintptr_t)INT_MAX); 19360 *tail_unsent = (int)MBLKL(*xmit_tail); 19361 } 19362 } 19363 19364 /* 19365 * max_pld limits the number of mblks in tcp's transmit 19366 * queue that can be added to a Multidata message. Once 19367 * this counter reaches zero, no more additional mblks 19368 * can be added to it. What happens afterwards depends 19369 * on whether or not we are set to chain the Multidata 19370 * messages. 
If we are to link them together, reset 19371 * max_pld to its original value (tcp_mdt_max_pld) and 19372 * prepare to create a new Multidata message which will 19373 * get linked to md_mp_head. Else, leave it alone and 19374 * let the inner loop break on its own. 19375 */ 19376 if (tcp_mdt_chain && max_pld == 0) 19377 PREP_NEW_MULTIDATA(); 19378 19379 /* adding a payload buffer; re-initialize values */ 19380 if (add_buffer) 19381 PREP_NEW_PBUF(); 19382 19383 /* 19384 * If we don't have a Multidata, either because we just 19385 * (re)entered this outer loop, or after we branched off 19386 * to tcp_send above, setup the Multidata and header 19387 * buffer to be used. 19388 */ 19389 if (md_mp == NULL) { 19390 int md_hbuflen; 19391 uint32_t start, stuff; 19392 19393 /* 19394 * Calculate Multidata header buffer size large enough 19395 * to hold all of the headers that can possibly be 19396 * sent at this moment. We'd rather over-estimate 19397 * the size than running out of space; this is okay 19398 * since this buffer is small anyway. 19399 */ 19400 md_hbuflen = (howmany(*usable, mss) + 1) * hdr_frag_sz; 19401 19402 /* 19403 * Start and stuff offset for partial hardware 19404 * checksum offload; these are currently for IPv4. 19405 * For full checksum offload, they are set to zero. 19406 */ 19407 if ((hwcksum_flags & HCK_PARTIALCKSUM)) { 19408 if (af == AF_INET) { 19409 start = IP_SIMPLE_HDR_LENGTH; 19410 stuff = IP_SIMPLE_HDR_LENGTH + 19411 TCP_CHECKSUM_OFFSET; 19412 } else { 19413 start = IPV6_HDR_LEN; 19414 stuff = IPV6_HDR_LEN + 19415 TCP_CHECKSUM_OFFSET; 19416 } 19417 } else { 19418 start = stuff = 0; 19419 } 19420 19421 /* 19422 * Create the header buffer, Multidata, as well as 19423 * any necessary attributes (destination address, 19424 * SAP and hardware checksum offload) that should 19425 * be associated with the Multidata message. 19426 */ 19427 ASSERT(cur_hdr_off == 0); 19428 if ((md_hbuf = allocb(md_hbuflen, BPRI_HI)) == NULL || 19429 ((md_hbuf->b_wptr += md_hbuflen), 19430 (mmd = mmd_alloc(md_hbuf, &md_mp, 19431 KM_NOSLEEP)) == NULL) || (tcp_mdt_add_attrs(mmd, 19432 /* fastpath mblk */ 19433 ire->ire_nce->nce_res_mp, 19434 /* hardware checksum enabled */ 19435 (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)), 19436 /* hardware checksum offsets */ 19437 start, stuff, 0, 19438 /* hardware checksum flag */ 19439 hwcksum_flags) != 0)) { 19440 legacy_send: 19441 if (md_mp != NULL) { 19442 /* Unlink message from the chain */ 19443 if (md_mp_head != NULL) { 19444 err = (intptr_t)rmvb(md_mp_head, 19445 md_mp); 19446 /* 19447 * We can't assert that rmvb 19448 * did not return -1, since we 19449 * may get here before linkb 19450 * happens. We do, however, 19451 * check if we just removed the 19452 * only element in the list. 19453 */ 19454 if (err == 0) 19455 md_mp_head = NULL; 19456 } 19457 /* md_hbuf gets freed automatically */ 19458 TCP_STAT(tcp_mdt_discarded); 19459 freeb(md_mp); 19460 } else { 19461 /* Either allocb or mmd_alloc failed */ 19462 TCP_STAT(tcp_mdt_allocfail); 19463 if (md_hbuf != NULL) 19464 freeb(md_hbuf); 19465 } 19466 19467 /* send down what we've got so far */ 19468 if (md_mp_head != NULL) { 19469 tcp_multisend_data(tcp, ire, ill, 19470 md_mp_head, obsegs, obbytes, 19471 &rconfirm); 19472 } 19473 legacy_send_no_md: 19474 if (ire != NULL) 19475 IRE_REFRELE(ire); 19476 /* 19477 * Too bad; let the legacy path handle this. 19478 * We specify INT_MAX for the threshold, since 19479 * we gave up with the Multidata processings 19480 * and let the old path have it all. 
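 * With mdt_thres set to INT_MAX, tcp_send() will never return 1 to
 * request another Multidata attempt for this call.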
19481 */ 19482 TCP_STAT(tcp_mdt_legacy_all); 19483 return (tcp_send(q, tcp, mss, tcp_hdr_len, 19484 tcp_tcp_hdr_len, num_sack_blk, usable, 19485 snxt, tail_unsent, xmit_tail, local_time, 19486 INT_MAX)); 19487 } 19488 19489 /* link to any existing ones, if applicable */ 19490 TCP_STAT(tcp_mdt_allocd); 19491 if (md_mp_head == NULL) { 19492 md_mp_head = md_mp; 19493 } else if (tcp_mdt_chain) { 19494 TCP_STAT(tcp_mdt_linked); 19495 linkb(md_mp_head, md_mp); 19496 } 19497 } 19498 19499 ASSERT(md_mp_head != NULL); 19500 ASSERT(tcp_mdt_chain || md_mp_head->b_cont == NULL); 19501 ASSERT(md_mp != NULL && mmd != NULL); 19502 ASSERT(md_hbuf != NULL); 19503 19504 /* 19505 * Packetize the transmittable portion of the data block; 19506 * each data block is essentially added to the Multidata 19507 * as a payload buffer. We also deal with adding more 19508 * than one payload buffers, which happens when the remaining 19509 * packetized portion of the current payload buffer is less 19510 * than MSS, while the next data block in transmit queue 19511 * has enough data to make up for one. This "spillover" 19512 * case essentially creates a split-packet, where portions 19513 * of the packet's payload fragments may span across two 19514 * virtually discontiguous address blocks. 19515 */ 19516 seg_len = mss; 19517 do { 19518 len = seg_len; 19519 19520 ASSERT(len > 0); 19521 ASSERT(max_pld >= 0); 19522 ASSERT(!add_buffer || cur_pld_off == 0); 19523 19524 /* 19525 * First time around for this payload buffer; note 19526 * in the case of a spillover, the following has 19527 * been done prior to adding the split-packet 19528 * descriptor to Multidata, and we don't want to 19529 * repeat the process. 19530 */ 19531 if (add_buffer) { 19532 ASSERT(mmd != NULL); 19533 ASSERT(md_pbuf == NULL); 19534 ASSERT(md_pbuf_nxt == NULL); 19535 ASSERT(pbuf_idx == -1 && pbuf_idx_nxt == -1); 19536 19537 /* 19538 * Have we reached the limit? We'd get to 19539 * this case when we're not chaining the 19540 * Multidata messages together, and since 19541 * we're done, terminate this loop. 19542 */ 19543 if (max_pld == 0) 19544 break; /* done */ 19545 19546 if ((md_pbuf = dupb(*xmit_tail)) == NULL) { 19547 TCP_STAT(tcp_mdt_allocfail); 19548 goto legacy_send; /* out_of_mem */ 19549 } 19550 19551 if (IS_VMLOANED_MBLK(md_pbuf) && !zcopy && 19552 zc_cap != NULL) { 19553 if (!ip_md_zcopy_attr(mmd, NULL, 19554 zc_cap->ill_zerocopy_flags)) { 19555 freeb(md_pbuf); 19556 TCP_STAT(tcp_mdt_allocfail); 19557 /* out_of_mem */ 19558 goto legacy_send; 19559 } 19560 zcopy = B_TRUE; 19561 } 19562 19563 md_pbuf->b_rptr += base_pld_off; 19564 19565 /* 19566 * Add a payload buffer to the Multidata; this 19567 * operation must not fail, or otherwise our 19568 * logic in this routine is broken. There 19569 * is no memory allocation done by the 19570 * routine, so any returned failure simply 19571 * tells us that we've done something wrong. 19572 * 19573 * A failure tells us that either we're adding 19574 * the same payload buffer more than once, or 19575 * we're trying to add more buffers than 19576 * allowed (max_pld calculation is wrong). 19577 * None of the above cases should happen, and 19578 * we panic because either there's horrible 19579 * heap corruption, and/or programming mistake. 
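 * (mmd_addpldbuf() simply returns the index of the newly added payload
 * buffer, or -1 on failure; the index is later used when describing
 * payload spans for each packet descriptor.)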
19580 */ 19581 pbuf_idx = mmd_addpldbuf(mmd, md_pbuf); 19582 if (pbuf_idx < 0) { 19583 cmn_err(CE_PANIC, "tcp_multisend: " 19584 "payload buffer logic error " 19585 "detected for tcp %p mmd %p " 19586 "pbuf %p (%d)\n", 19587 (void *)tcp, (void *)mmd, 19588 (void *)md_pbuf, pbuf_idx); 19589 } 19590 19591 ASSERT(max_pld > 0); 19592 --max_pld; 19593 add_buffer = B_FALSE; 19594 } 19595 19596 ASSERT(md_mp_head != NULL); 19597 ASSERT(md_pbuf != NULL); 19598 ASSERT(md_pbuf_nxt == NULL); 19599 ASSERT(pbuf_idx != -1); 19600 ASSERT(pbuf_idx_nxt == -1); 19601 ASSERT(*usable > 0); 19602 19603 /* 19604 * We spillover to the next payload buffer only 19605 * if all of the following is true: 19606 * 19607 * 1. There is not enough data on the current 19608 * payload buffer to make up `len', 19609 * 2. We are allowed to send `len', 19610 * 3. The next payload buffer length is large 19611 * enough to accomodate `spill'. 19612 */ 19613 if ((spill = len - *tail_unsent) > 0 && 19614 *usable >= len && 19615 MBLKL((*xmit_tail)->b_cont) >= spill && 19616 max_pld > 0) { 19617 md_pbuf_nxt = dupb((*xmit_tail)->b_cont); 19618 if (md_pbuf_nxt == NULL) { 19619 TCP_STAT(tcp_mdt_allocfail); 19620 goto legacy_send; /* out_of_mem */ 19621 } 19622 19623 if (IS_VMLOANED_MBLK(md_pbuf_nxt) && !zcopy && 19624 zc_cap != NULL) { 19625 if (!ip_md_zcopy_attr(mmd, NULL, 19626 zc_cap->ill_zerocopy_flags)) { 19627 freeb(md_pbuf_nxt); 19628 TCP_STAT(tcp_mdt_allocfail); 19629 /* out_of_mem */ 19630 goto legacy_send; 19631 } 19632 zcopy = B_TRUE; 19633 } 19634 19635 /* 19636 * See comments above on the first call to 19637 * mmd_addpldbuf for explanation on the panic. 19638 */ 19639 pbuf_idx_nxt = mmd_addpldbuf(mmd, md_pbuf_nxt); 19640 if (pbuf_idx_nxt < 0) { 19641 panic("tcp_multisend: " 19642 "next payload buffer logic error " 19643 "detected for tcp %p mmd %p " 19644 "pbuf %p (%d)\n", 19645 (void *)tcp, (void *)mmd, 19646 (void *)md_pbuf_nxt, pbuf_idx_nxt); 19647 } 19648 19649 ASSERT(max_pld > 0); 19650 --max_pld; 19651 } else if (spill > 0) { 19652 /* 19653 * If there's a spillover, but the following 19654 * xmit_tail couldn't give us enough octets 19655 * to reach "len", then stop the current 19656 * Multidata creation and let the legacy 19657 * tcp_send() path take over. We don't want 19658 * to send the tiny segment as part of this 19659 * Multidata for performance reasons; instead, 19660 * we let the legacy path deal with grouping 19661 * it with the subsequent small mblks. 19662 */ 19663 if (*usable >= len && 19664 MBLKL((*xmit_tail)->b_cont) < spill) { 19665 max_pld = 0; 19666 break; /* done */ 19667 } 19668 19669 /* 19670 * We can't spillover, and we are near 19671 * the end of the current payload buffer, 19672 * so send what's left. 19673 */ 19674 ASSERT(*tail_unsent > 0); 19675 len = *tail_unsent; 19676 } 19677 19678 /* tail_unsent is negated if there is a spillover */ 19679 *tail_unsent -= len; 19680 *usable -= len; 19681 ASSERT(*usable >= 0); 19682 19683 if (*usable < mss) 19684 seg_len = *usable; 19685 /* 19686 * Sender SWS avoidance; see comments in tcp_send(); 19687 * everything else is the same, except that we only 19688 * do this here if there is no more data to be sent 19689 * following the current xmit_tail. We don't check 19690 * for 1-byte urgent data because we shouldn't get 19691 * here if TCP_URG_VALID is set. 
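 * In short (roughly): hold back a sub-MSS remainder when it is less
 * than half the largest window the peer has offered and more unsent
 * data than that remainder will still be left after this send; the
 * retransmit timer is restarted when nothing is outstanding so the
 * held-back data is not stranded.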
19692 */ 19693 if (*usable > 0 && *usable < mss && 19694 ((md_pbuf_nxt == NULL && 19695 (*xmit_tail)->b_cont == NULL) || 19696 (md_pbuf_nxt != NULL && 19697 (*xmit_tail)->b_cont->b_cont == NULL)) && 19698 seg_len < (tcp->tcp_max_swnd >> 1) && 19699 (tcp->tcp_unsent - 19700 ((*snxt + len) - tcp->tcp_snxt)) > seg_len && 19701 !tcp->tcp_zero_win_probe) { 19702 if ((*snxt + len) == tcp->tcp_snxt && 19703 (*snxt + len) == tcp->tcp_suna) { 19704 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 19705 } 19706 done = B_TRUE; 19707 } 19708 19709 /* 19710 * Prime pump for IP's checksumming on our behalf; 19711 * include the adjustment for a source route if any. 19712 * Do this only for software/partial hardware checksum 19713 * offload, as this field gets zeroed out later for 19714 * the full hardware checksum offload case. 19715 */ 19716 if (!(hwcksum_flags & HCK_FULLCKSUM)) { 19717 cksum = len + tcp_tcp_hdr_len + tcp->tcp_sum; 19718 cksum = (cksum >> 16) + (cksum & 0xFFFF); 19719 U16_TO_ABE16(cksum, tcp->tcp_tcph->th_sum); 19720 } 19721 19722 U32_TO_ABE32(*snxt, tcp->tcp_tcph->th_seq); 19723 *snxt += len; 19724 19725 tcp->tcp_tcph->th_flags[0] = TH_ACK; 19726 /* 19727 * We set the PUSH bit only if TCP has no more buffered 19728 * data to be transmitted (or if sender SWS avoidance 19729 * takes place), as opposed to setting it for every 19730 * last packet in the burst. 19731 */ 19732 if (done || 19733 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) == 0) 19734 tcp->tcp_tcph->th_flags[0] |= TH_PUSH; 19735 19736 /* 19737 * Set FIN bit if this is our last segment; snxt 19738 * already includes its length, and it will not 19739 * be adjusted after this point. 19740 */ 19741 if (tcp->tcp_valid_bits == TCP_FSS_VALID && 19742 *snxt == tcp->tcp_fss) { 19743 if (!tcp->tcp_fin_acked) { 19744 tcp->tcp_tcph->th_flags[0] |= TH_FIN; 19745 BUMP_MIB(&tcp_mib, tcpOutControl); 19746 } 19747 if (!tcp->tcp_fin_sent) { 19748 tcp->tcp_fin_sent = B_TRUE; 19749 /* 19750 * tcp state must be ESTABLISHED 19751 * in order for us to get here in 19752 * the first place. 19753 */ 19754 tcp->tcp_state = TCPS_FIN_WAIT_1; 19755 19756 /* 19757 * Upon returning from this routine, 19758 * tcp_wput_data() will set tcp_snxt 19759 * to be equal to snxt + tcp_fin_sent. 19760 * This is essentially the same as 19761 * setting it to tcp_fss + 1. 
19762 */ 19763 } 19764 } 19765 19766 tcp->tcp_last_sent_len = (ushort_t)len; 19767 19768 len += tcp_hdr_len; 19769 if (tcp->tcp_ipversion == IPV4_VERSION) 19770 tcp->tcp_ipha->ipha_length = htons(len); 19771 else 19772 tcp->tcp_ip6h->ip6_plen = htons(len - 19773 ((char *)&tcp->tcp_ip6h[1] - 19774 tcp->tcp_iphc)); 19775 19776 pkt_info->flags = (PDESC_HBUF_REF | PDESC_PBUF_REF); 19777 19778 /* setup header fragment */ 19779 PDESC_HDR_ADD(pkt_info, 19780 md_hbuf->b_rptr + cur_hdr_off, /* base */ 19781 tcp->tcp_mdt_hdr_head, /* head room */ 19782 tcp_hdr_len, /* len */ 19783 tcp->tcp_mdt_hdr_tail); /* tail room */ 19784 19785 ASSERT(pkt_info->hdr_lim - pkt_info->hdr_base == 19786 hdr_frag_sz); 19787 ASSERT(MBLKIN(md_hbuf, 19788 (pkt_info->hdr_base - md_hbuf->b_rptr), 19789 PDESC_HDRSIZE(pkt_info))); 19790 19791 /* setup first payload fragment */ 19792 PDESC_PLD_INIT(pkt_info); 19793 PDESC_PLD_SPAN_ADD(pkt_info, 19794 pbuf_idx, /* index */ 19795 md_pbuf->b_rptr + cur_pld_off, /* start */ 19796 tcp->tcp_last_sent_len); /* len */ 19797 19798 /* create a split-packet in case of a spillover */ 19799 if (md_pbuf_nxt != NULL) { 19800 ASSERT(spill > 0); 19801 ASSERT(pbuf_idx_nxt > pbuf_idx); 19802 ASSERT(!add_buffer); 19803 19804 md_pbuf = md_pbuf_nxt; 19805 md_pbuf_nxt = NULL; 19806 pbuf_idx = pbuf_idx_nxt; 19807 pbuf_idx_nxt = -1; 19808 cur_pld_off = spill; 19809 19810 /* trim out first payload fragment */ 19811 PDESC_PLD_SPAN_TRIM(pkt_info, 0, spill); 19812 19813 /* setup second payload fragment */ 19814 PDESC_PLD_SPAN_ADD(pkt_info, 19815 pbuf_idx, /* index */ 19816 md_pbuf->b_rptr, /* start */ 19817 spill); /* len */ 19818 19819 if ((*xmit_tail)->b_next == NULL) { 19820 /* 19821 * Store the lbolt used for RTT 19822 * estimation. We can only record one 19823 * timestamp per mblk so we do it when 19824 * we reach the end of the payload 19825 * buffer. Also we only take a new 19826 * timestamp sample when the previous 19827 * timed data from the same mblk has 19828 * been ack'ed. 19829 */ 19830 (*xmit_tail)->b_prev = local_time; 19831 (*xmit_tail)->b_next = 19832 (mblk_t *)(uintptr_t)first_snxt; 19833 } 19834 19835 first_snxt = *snxt - spill; 19836 19837 /* 19838 * Advance xmit_tail; usable could be 0 by 19839 * the time we got here, but we made sure 19840 * above that we would only spillover to 19841 * the next data block if usable includes 19842 * the spilled-over amount prior to the 19843 * subtraction. Therefore, we are sure 19844 * that xmit_tail->b_cont can't be NULL. 19845 */ 19846 ASSERT((*xmit_tail)->b_cont != NULL); 19847 *xmit_tail = (*xmit_tail)->b_cont; 19848 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 19849 (uintptr_t)INT_MAX); 19850 *tail_unsent = (int)MBLKL(*xmit_tail) - spill; 19851 } else { 19852 cur_pld_off += tcp->tcp_last_sent_len; 19853 } 19854 19855 /* 19856 * Fill in the header using the template header, and 19857 * add options such as time-stamp, ECN and/or SACK, 19858 * as needed. 19859 */ 19860 tcp_fill_header(tcp, pkt_info->hdr_rptr, 19861 (clock_t)local_time, num_sack_blk); 19862 19863 /* take care of some IP header businesses */ 19864 if (af == AF_INET) { 19865 ipha = (ipha_t *)pkt_info->hdr_rptr; 19866 19867 ASSERT(OK_32PTR((uchar_t *)ipha)); 19868 ASSERT(PDESC_HDRL(pkt_info) >= 19869 IP_SIMPLE_HDR_LENGTH); 19870 ASSERT(ipha->ipha_version_and_hdr_length == 19871 IP_SIMPLE_HDR_VERSION); 19872 19873 /* 19874 * Assign ident value for current packet; see 19875 * related comments in ip_wput_ire() about the 19876 * contract private interface with clustering 19877 * group. 
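 * If the clustering hooks are not registered, fall back to the
 * per-ire atomic ident counter below.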
19878 */ 19879 clusterwide = B_FALSE; 19880 if (cl_inet_ipident != NULL) { 19881 ASSERT(cl_inet_isclusterwide != NULL); 19882 if ((*cl_inet_isclusterwide)(IPPROTO_IP, 19883 AF_INET, 19884 (uint8_t *)(uintptr_t)src)) { 19885 ipha->ipha_ident = 19886 (*cl_inet_ipident) 19887 (IPPROTO_IP, AF_INET, 19888 (uint8_t *)(uintptr_t)src, 19889 (uint8_t *)(uintptr_t)dst); 19890 clusterwide = B_TRUE; 19891 } 19892 } 19893 19894 if (!clusterwide) { 19895 ipha->ipha_ident = (uint16_t) 19896 atomic_add_32_nv( 19897 &ire->ire_ident, 1); 19898 } 19899 #ifndef _BIG_ENDIAN 19900 ipha->ipha_ident = (ipha->ipha_ident << 8) | 19901 (ipha->ipha_ident >> 8); 19902 #endif 19903 } else { 19904 ip6h = (ip6_t *)pkt_info->hdr_rptr; 19905 19906 ASSERT(OK_32PTR((uchar_t *)ip6h)); 19907 ASSERT(IPVER(ip6h) == IPV6_VERSION); 19908 ASSERT(ip6h->ip6_nxt == IPPROTO_TCP); 19909 ASSERT(PDESC_HDRL(pkt_info) >= 19910 (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET + 19911 TCP_CHECKSUM_SIZE)); 19912 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 19913 19914 if (tcp->tcp_ip_forward_progress) { 19915 rconfirm = B_TRUE; 19916 tcp->tcp_ip_forward_progress = B_FALSE; 19917 } 19918 } 19919 19920 /* at least one payload span, and at most two */ 19921 ASSERT(pkt_info->pld_cnt > 0 && pkt_info->pld_cnt < 3); 19922 19923 /* add the packet descriptor to Multidata */ 19924 if ((pkt = mmd_addpdesc(mmd, pkt_info, &err, 19925 KM_NOSLEEP)) == NULL) { 19926 /* 19927 * Any failure other than ENOMEM indicates 19928 * that we have passed in invalid pkt_info 19929 * or parameters to mmd_addpdesc, which must 19930 * not happen. 19931 * 19932 * EINVAL is a result of failure on boundary 19933 * checks against the pkt_info contents. It 19934 * should not happen, and we panic because 19935 * either there's horrible heap corruption, 19936 * and/or programming mistake. 
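 * (ENOMEM alone is tolerated here; in that case we bail out to
 * legacy_send and let tcp_send() take over.)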
19937 */ 19938 if (err != ENOMEM) { 19939 cmn_err(CE_PANIC, "tcp_multisend: " 19940 "pdesc logic error detected for " 19941 "tcp %p mmd %p pinfo %p (%d)\n", 19942 (void *)tcp, (void *)mmd, 19943 (void *)pkt_info, err); 19944 } 19945 TCP_STAT(tcp_mdt_addpdescfail); 19946 goto legacy_send; /* out_of_mem */ 19947 } 19948 ASSERT(pkt != NULL); 19949 19950 /* calculate IP header and TCP checksums */ 19951 if (af == AF_INET) { 19952 /* calculate pseudo-header checksum */ 19953 cksum = (dst >> 16) + (dst & 0xFFFF) + 19954 (src >> 16) + (src & 0xFFFF); 19955 19956 /* offset for TCP header checksum */ 19957 up = IPH_TCPH_CHECKSUMP(ipha, 19958 IP_SIMPLE_HDR_LENGTH); 19959 } else { 19960 up = (uint16_t *)&ip6h->ip6_src; 19961 19962 /* calculate pseudo-header checksum */ 19963 cksum = up[0] + up[1] + up[2] + up[3] + 19964 up[4] + up[5] + up[6] + up[7] + 19965 up[8] + up[9] + up[10] + up[11] + 19966 up[12] + up[13] + up[14] + up[15]; 19967 19968 /* Fold the initial sum */ 19969 cksum = (cksum & 0xffff) + (cksum >> 16); 19970 19971 up = (uint16_t *)(((uchar_t *)ip6h) + 19972 IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET); 19973 } 19974 19975 if (hwcksum_flags & HCK_FULLCKSUM) { 19976 /* clear checksum field for hardware */ 19977 *up = 0; 19978 } else if (hwcksum_flags & HCK_PARTIALCKSUM) { 19979 uint32_t sum; 19980 19981 /* pseudo-header checksumming */ 19982 sum = *up + cksum + IP_TCP_CSUM_COMP; 19983 sum = (sum & 0xFFFF) + (sum >> 16); 19984 *up = (sum & 0xFFFF) + (sum >> 16); 19985 } else { 19986 /* software checksumming */ 19987 TCP_STAT(tcp_out_sw_cksum); 19988 TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, 19989 tcp->tcp_hdr_len + tcp->tcp_last_sent_len); 19990 *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len, 19991 cksum + IP_TCP_CSUM_COMP); 19992 if (*up == 0) 19993 *up = 0xFFFF; 19994 } 19995 19996 /* IPv4 header checksum */ 19997 if (af == AF_INET) { 19998 ipha->ipha_fragment_offset_and_flags |= 19999 (uint32_t)htons(ire->ire_frag_flag); 20000 20001 if (hwcksum_flags & HCK_IPV4_HDRCKSUM) { 20002 ipha->ipha_hdr_checksum = 0; 20003 } else { 20004 IP_HDR_CKSUM(ipha, cksum, 20005 ((uint32_t *)ipha)[0], 20006 ((uint16_t *)ipha)[4]); 20007 } 20008 } 20009 20010 /* advance header offset */ 20011 cur_hdr_off += hdr_frag_sz; 20012 20013 obbytes += tcp->tcp_last_sent_len; 20014 ++obsegs; 20015 } while (!done && *usable > 0 && --num_burst_seg > 0 && 20016 *tail_unsent > 0); 20017 20018 if ((*xmit_tail)->b_next == NULL) { 20019 /* 20020 * Store the lbolt used for RTT estimation. We can only 20021 * record one timestamp per mblk so we do it when we 20022 * reach the end of the payload buffer. Also we only 20023 * take a new timestamp sample when the previous timed 20024 * data from the same mblk has been ack'ed. 20025 */ 20026 (*xmit_tail)->b_prev = local_time; 20027 (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)first_snxt; 20028 } 20029 20030 ASSERT(*tail_unsent >= 0); 20031 if (*tail_unsent > 0) { 20032 /* 20033 * We got here because we broke out of the above 20034 * loop due to of one of the following cases: 20035 * 20036 * 1. len < adjusted MSS (i.e. small), 20037 * 2. Sender SWS avoidance, 20038 * 3. max_pld is zero. 20039 * 20040 * We are done for this Multidata, so trim our 20041 * last payload buffer (if any) accordingly. 
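 * Since md_pbuf was dupb()'d from xmit_tail, trimming its b_wptr only
 * changes the Multidata's view of the data, not the original mblk.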
20042 */ 20043 if (md_pbuf != NULL) 20044 md_pbuf->b_wptr -= *tail_unsent; 20045 } else if (*usable > 0) { 20046 *xmit_tail = (*xmit_tail)->b_cont; 20047 ASSERT((uintptr_t)MBLKL(*xmit_tail) <= 20048 (uintptr_t)INT_MAX); 20049 *tail_unsent = (int)MBLKL(*xmit_tail); 20050 add_buffer = B_TRUE; 20051 } 20052 } while (!done && *usable > 0 && num_burst_seg > 0 && 20053 (tcp_mdt_chain || max_pld > 0)); 20054 20055 /* send everything down */ 20056 tcp_multisend_data(tcp, ire, ill, md_mp_head, obsegs, obbytes, 20057 &rconfirm); 20058 20059 #undef PREP_NEW_MULTIDATA 20060 #undef PREP_NEW_PBUF 20061 #undef IPVER 20062 20063 IRE_REFRELE(ire); 20064 return (0); 20065 } 20066 20067 /* 20068 * A wrapper function for sending one or more Multidata messages down to 20069 * the module below ip; this routine does not release the reference of the 20070 * IRE (caller does that). This routine is analogous to tcp_send_data(). 20071 */ 20072 static void 20073 tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, 20074 const uint_t obsegs, const uint_t obbytes, boolean_t *rconfirm) 20075 { 20076 uint64_t delta; 20077 nce_t *nce; 20078 20079 ASSERT(ire != NULL && ill != NULL); 20080 ASSERT(ire->ire_stq != NULL); 20081 ASSERT(md_mp_head != NULL); 20082 ASSERT(rconfirm != NULL); 20083 20084 /* adjust MIBs and IRE timestamp */ 20085 TCP_RECORD_TRACE(tcp, md_mp_head, TCP_TRACE_SEND_PKT); 20086 tcp->tcp_obsegs += obsegs; 20087 UPDATE_MIB(&tcp_mib, tcpOutDataSegs, obsegs); 20088 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, obbytes); 20089 TCP_STAT_UPDATE(tcp_mdt_pkt_out, obsegs); 20090 20091 if (tcp->tcp_ipversion == IPV4_VERSION) { 20092 TCP_STAT_UPDATE(tcp_mdt_pkt_out_v4, obsegs); 20093 UPDATE_MIB(&ip_mib, ipOutRequests, obsegs); 20094 } else { 20095 TCP_STAT_UPDATE(tcp_mdt_pkt_out_v6, obsegs); 20096 UPDATE_MIB(&ip6_mib, ipv6OutRequests, obsegs); 20097 } 20098 20099 ire->ire_ob_pkt_count += obsegs; 20100 if (ire->ire_ipif != NULL) 20101 atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, obsegs); 20102 ire->ire_last_used_time = lbolt; 20103 20104 /* send it down */ 20105 putnext(ire->ire_stq, md_mp_head); 20106 20107 /* we're done for TCP/IPv4 */ 20108 if (tcp->tcp_ipversion == IPV4_VERSION) 20109 return; 20110 20111 nce = ire->ire_nce; 20112 20113 ASSERT(nce != NULL); 20114 ASSERT(!(nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT))); 20115 ASSERT(nce->nce_state != ND_INCOMPLETE); 20116 20117 /* reachability confirmation? */ 20118 if (*rconfirm) { 20119 nce->nce_last = TICK_TO_MSEC(lbolt64); 20120 if (nce->nce_state != ND_REACHABLE) { 20121 mutex_enter(&nce->nce_lock); 20122 nce->nce_state = ND_REACHABLE; 20123 nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; 20124 mutex_exit(&nce->nce_lock); 20125 (void) untimeout(nce->nce_timeout_id); 20126 if (ip_debug > 2) { 20127 /* ip1dbg */ 20128 pr_addr_dbg("tcp_multisend_data: state " 20129 "for %s changed to REACHABLE\n", 20130 AF_INET6, &ire->ire_addr_v6); 20131 } 20132 } 20133 /* reset transport reachability confirmation */ 20134 *rconfirm = B_FALSE; 20135 } 20136 20137 delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; 20138 ip1dbg(("tcp_multisend_data: delta = %" PRId64 20139 " ill_reachable_time = %d \n", delta, ill->ill_reachable_time)); 20140 20141 if (delta > (uint64_t)ill->ill_reachable_time) { 20142 mutex_enter(&nce->nce_lock); 20143 switch (nce->nce_state) { 20144 case ND_REACHABLE: 20145 case ND_STALE: 20146 /* 20147 * ND_REACHABLE is identical to ND_STALE in this 20148 * specific case. 
If reachable time has expired for 20149 * this neighbor (delta is greater than reachable 20150 * time), conceptually, the neighbor cache is no 20151 * longer in REACHABLE state, but already in STALE 20152 * state. So the correct transition here is to 20153 * ND_DELAY. 20154 */ 20155 nce->nce_state = ND_DELAY; 20156 mutex_exit(&nce->nce_lock); 20157 NDP_RESTART_TIMER(nce, delay_first_probe_time); 20158 if (ip_debug > 3) { 20159 /* ip2dbg */ 20160 pr_addr_dbg("tcp_multisend_data: state " 20161 "for %s changed to DELAY\n", 20162 AF_INET6, &ire->ire_addr_v6); 20163 } 20164 break; 20165 case ND_DELAY: 20166 case ND_PROBE: 20167 mutex_exit(&nce->nce_lock); 20168 /* Timers have already started */ 20169 break; 20170 case ND_UNREACHABLE: 20171 /* 20172 * ndp timer has detected that this nce is 20173 * unreachable and initiated deleting this nce 20174 * and all its associated IREs. This is a race 20175 * where we found the ire before it was deleted 20176 * and have just sent out a packet using this 20177 * unreachable nce. 20178 */ 20179 mutex_exit(&nce->nce_lock); 20180 break; 20181 default: 20182 ASSERT(0); 20183 } 20184 } 20185 } 20186 20187 /* 20188 * tcp_send() is called by tcp_wput_data() for non-Multidata transmission 20189 * scheme, and returns one of the following: 20190 * 20191 * -1 = failed allocation. 20192 * 0 = success; burst count reached, or usable send window is too small, 20193 * and that we'd rather wait until later before sending again. 20194 * 1 = success; we are called from tcp_multisend(), and both usable send 20195 * window and tail_unsent are greater than the MDT threshold, and thus 20196 * Multidata Transmit should be used instead. 20197 */ 20198 static int 20199 tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, 20200 const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, 20201 uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, 20202 const int mdt_thres) 20203 { 20204 int num_burst_seg = tcp->tcp_snd_burst; 20205 20206 for (;;) { 20207 struct datab *db; 20208 tcph_t *tcph; 20209 uint32_t sum; 20210 mblk_t *mp, *mp1; 20211 uchar_t *rptr; 20212 int len; 20213 20214 /* 20215 * If we're called by tcp_multisend(), and the amount of 20216 * sendable data as well as the size of current xmit_tail 20217 * is beyond the MDT threshold, return to the caller and 20218 * let the large data transmit be done using MDT. 20219 */ 20220 if (*usable > 0 && *usable > mdt_thres && 20221 (*tail_unsent > mdt_thres || (*tail_unsent == 0 && 20222 MBLKL((*xmit_tail)->b_cont) > mdt_thres))) { 20223 ASSERT(tcp->tcp_mdt); 20224 return (1); /* success; do large send */ 20225 } 20226 20227 if (num_burst_seg-- == 0) 20228 break; /* success; burst count reached */ 20229 20230 len = mss; 20231 if (len > *usable) { 20232 len = *usable; 20233 if (len <= 0) { 20234 /* Terminate the loop */ 20235 break; /* success; too small */ 20236 } 20237 /* 20238 * Sender silly-window avoidance. 20239 * Ignore this if we are going to send a 20240 * zero window probe out. 20241 * 20242 * TODO: force data into microscopic window? 20243 * ==> (!pushed || (unsent > usable)) 20244 */ 20245 if (len < (tcp->tcp_max_swnd >> 1) && 20246 (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len && 20247 !((tcp->tcp_valid_bits & TCP_URG_VALID) && 20248 len == 1) && (! tcp->tcp_zero_win_probe)) { 20249 /* 20250 * If the retransmit timer is not running 20251 * we start it so that we will retransmit 20252 * in the case when the the receiver has 20253 * decremented the window. 
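 * (This only matters when nothing is outstanding, which is
 * what the test below checks; in that case no retransmit
 * timer is pending, and without one a lost window update
 * from the peer could leave the connection stalled behind
 * SWS avoidance.)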
20254 */ 20255 if (*snxt == tcp->tcp_snxt && 20256 *snxt == tcp->tcp_suna) { 20257 /* 20258 * We are not supposed to send 20259 * anything. So let's wait a little 20260 * bit longer before breaking SWS 20261 * avoidance. 20262 * 20263 * What should the value be? 20264 * Suggestion: MAX(init rexmit time, 20265 * tcp->tcp_rto) 20266 */ 20267 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 20268 } 20269 break; /* success; too small */ 20270 } 20271 } 20272 20273 tcph = tcp->tcp_tcph; 20274 20275 *usable -= len; /* Approximate - can be adjusted later */ 20276 if (*usable > 0) 20277 tcph->th_flags[0] = TH_ACK; 20278 else 20279 tcph->th_flags[0] = (TH_ACK | TH_PUSH); 20280 20281 /* 20282 * Prime pump for IP's checksumming on our behalf 20283 * Include the adjustment for a source route if any. 20284 */ 20285 sum = len + tcp_tcp_hdr_len + tcp->tcp_sum; 20286 sum = (sum >> 16) + (sum & 0xFFFF); 20287 U16_TO_ABE16(sum, tcph->th_sum); 20288 20289 U32_TO_ABE32(*snxt, tcph->th_seq); 20290 20291 /* 20292 * Branch off to tcp_xmit_mp() if any of the VALID bits is 20293 * set. For the case when TCP_FSS_VALID is the only valid 20294 * bit (normal active close), branch off only when we think 20295 * that the FIN flag needs to be set. Note for this case, 20296 * that (snxt + len) may not reflect the actual seg_len, 20297 * as len may be further reduced in tcp_xmit_mp(). If len 20298 * gets modified, we will end up here again. 20299 */ 20300 if (tcp->tcp_valid_bits != 0 && 20301 (tcp->tcp_valid_bits != TCP_FSS_VALID || 20302 ((*snxt + len) == tcp->tcp_fss))) { 20303 uchar_t *prev_rptr; 20304 uint32_t prev_snxt = tcp->tcp_snxt; 20305 20306 if (*tail_unsent == 0) { 20307 ASSERT((*xmit_tail)->b_cont != NULL); 20308 *xmit_tail = (*xmit_tail)->b_cont; 20309 prev_rptr = (*xmit_tail)->b_rptr; 20310 *tail_unsent = (int)((*xmit_tail)->b_wptr - 20311 (*xmit_tail)->b_rptr); 20312 } else { 20313 prev_rptr = (*xmit_tail)->b_rptr; 20314 (*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr - 20315 *tail_unsent; 20316 } 20317 mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL, 20318 *snxt, B_FALSE, (uint32_t *)&len, B_FALSE); 20319 /* Restore tcp_snxt so we get amount sent right. */ 20320 tcp->tcp_snxt = prev_snxt; 20321 if (prev_rptr == (*xmit_tail)->b_rptr) { 20322 /* 20323 * If the previous timestamp is still in use, 20324 * don't stomp on it. 20325 */ 20326 if ((*xmit_tail)->b_next == NULL) { 20327 (*xmit_tail)->b_prev = local_time; 20328 (*xmit_tail)->b_next = 20329 (mblk_t *)(uintptr_t)(*snxt); 20330 } 20331 } else 20332 (*xmit_tail)->b_rptr = prev_rptr; 20333 20334 if (mp == NULL) 20335 return (-1); 20336 mp1 = mp->b_cont; 20337 20338 tcp->tcp_last_sent_len = (ushort_t)len; 20339 while (mp1->b_cont) { 20340 *xmit_tail = (*xmit_tail)->b_cont; 20341 (*xmit_tail)->b_prev = local_time; 20342 (*xmit_tail)->b_next = 20343 (mblk_t *)(uintptr_t)(*snxt); 20344 mp1 = mp1->b_cont; 20345 } 20346 *snxt += len; 20347 *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; 20348 BUMP_LOCAL(tcp->tcp_obsegs); 20349 BUMP_MIB(&tcp_mib, tcpOutDataSegs); 20350 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); 20351 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 20352 tcp_send_data(tcp, q, mp); 20353 continue; 20354 } 20355 20356 *snxt += len; /* Adjust later if we don't send all of len */ 20357 BUMP_MIB(&tcp_mib, tcpOutDataSegs); 20358 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); 20359 20360 if (*tail_unsent) { 20361 /* Are the bytes above us in flight? 
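 * (rptr below marks the start of the still-unsent data in
 * this mblk; if it differs from b_rptr, the leading bytes of
 * the mblk belong to a segment already handed to IP, so the
 * headroom cannot be reused and a separate header mblk is
 * allocated at must_alloc instead.)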
*/ 20362 rptr = (*xmit_tail)->b_wptr - *tail_unsent; 20363 if (rptr != (*xmit_tail)->b_rptr) { 20364 *tail_unsent -= len; 20365 tcp->tcp_last_sent_len = (ushort_t)len; 20366 len += tcp_hdr_len; 20367 if (tcp->tcp_ipversion == IPV4_VERSION) 20368 tcp->tcp_ipha->ipha_length = htons(len); 20369 else 20370 tcp->tcp_ip6h->ip6_plen = 20371 htons(len - 20372 ((char *)&tcp->tcp_ip6h[1] - 20373 tcp->tcp_iphc)); 20374 mp = dupb(*xmit_tail); 20375 if (!mp) 20376 return (-1); /* out_of_mem */ 20377 mp->b_rptr = rptr; 20378 /* 20379 * If the old timestamp is no longer in use, 20380 * sample a new timestamp now. 20381 */ 20382 if ((*xmit_tail)->b_next == NULL) { 20383 (*xmit_tail)->b_prev = local_time; 20384 (*xmit_tail)->b_next = 20385 (mblk_t *)(uintptr_t)(*snxt-len); 20386 } 20387 goto must_alloc; 20388 } 20389 } else { 20390 *xmit_tail = (*xmit_tail)->b_cont; 20391 ASSERT((uintptr_t)((*xmit_tail)->b_wptr - 20392 (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX); 20393 *tail_unsent = (int)((*xmit_tail)->b_wptr - 20394 (*xmit_tail)->b_rptr); 20395 } 20396 20397 (*xmit_tail)->b_prev = local_time; 20398 (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len); 20399 20400 *tail_unsent -= len; 20401 tcp->tcp_last_sent_len = (ushort_t)len; 20402 20403 len += tcp_hdr_len; 20404 if (tcp->tcp_ipversion == IPV4_VERSION) 20405 tcp->tcp_ipha->ipha_length = htons(len); 20406 else 20407 tcp->tcp_ip6h->ip6_plen = htons(len - 20408 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 20409 20410 mp = dupb(*xmit_tail); 20411 if (!mp) 20412 return (-1); /* out_of_mem */ 20413 20414 len = tcp_hdr_len; 20415 /* 20416 * There are four reasons to allocate a new hdr mblk: 20417 * 1) The bytes above us are in use by another packet 20418 * 2) We don't have good alignment 20419 * 3) The mblk is being shared 20420 * 4) We don't have enough room for a header 20421 */ 20422 rptr = mp->b_rptr - len; 20423 if (!OK_32PTR(rptr) || 20424 ((db = mp->b_datap), db->db_ref != 2) || 20425 rptr < db->db_base) { 20426 /* NOTE: we assume allocb returns an OK_32PTR */ 20427 20428 must_alloc:; 20429 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + 20430 tcp_wroff_xtra, BPRI_MED); 20431 if (!mp1) { 20432 freemsg(mp); 20433 return (-1); /* out_of_mem */ 20434 } 20435 mp1->b_cont = mp; 20436 mp = mp1; 20437 /* Leave room for Link Level header */ 20438 len = tcp_hdr_len; 20439 rptr = &mp->b_rptr[tcp_wroff_xtra]; 20440 mp->b_wptr = &rptr[len]; 20441 } 20442 20443 /* 20444 * Fill in the header using the template header, and add 20445 * options such as time-stamp, ECN and/or SACK, as needed. 20446 */ 20447 tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); 20448 20449 mp->b_rptr = rptr; 20450 20451 if (*tail_unsent) { 20452 int spill = *tail_unsent; 20453 20454 mp1 = mp->b_cont; 20455 if (!mp1) 20456 mp1 = mp; 20457 20458 /* 20459 * If we're a little short, tack on more mblks until 20460 * there is no more spillover. 20461 */ 20462 while (spill < 0) { 20463 mblk_t *nmp; 20464 int nmpsz; 20465 20466 nmp = (*xmit_tail)->b_cont; 20467 nmpsz = MBLKL(nmp); 20468 20469 /* 20470 * Excess data in mblk; can we split it? 20471 * If MDT is enabled for the connection, 20472 * keep on splitting as this is a transient 20473 * send path. 20474 */ 20475 if (!tcp->tcp_mdt && (spill + nmpsz > 0)) { 20476 /* 20477 * Don't split if stream head was 20478 * told to break up larger writes 20479 * into smaller ones. 
20480 */ 20481 if (tcp->tcp_maxpsz > 0) 20482 break; 20483 20484 /* 20485 * Next mblk is less than SMSS/2 20486 * rounded up to nearest 64-byte; 20487 * let it get sent as part of the 20488 * next segment. 20489 */ 20490 if (tcp->tcp_localnet && 20491 !tcp->tcp_cork && 20492 (nmpsz < roundup((mss >> 1), 64))) 20493 break; 20494 } 20495 20496 *xmit_tail = nmp; 20497 ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX); 20498 /* Stash for rtt use later */ 20499 (*xmit_tail)->b_prev = local_time; 20500 (*xmit_tail)->b_next = 20501 (mblk_t *)(uintptr_t)(*snxt - len); 20502 mp1->b_cont = dupb(*xmit_tail); 20503 mp1 = mp1->b_cont; 20504 20505 spill += nmpsz; 20506 if (mp1 == NULL) { 20507 *tail_unsent = spill; 20508 freemsg(mp); 20509 return (-1); /* out_of_mem */ 20510 } 20511 } 20512 20513 /* Trim back any surplus on the last mblk */ 20514 if (spill >= 0) { 20515 mp1->b_wptr -= spill; 20516 *tail_unsent = spill; 20517 } else { 20518 /* 20519 * We did not send everything we could in 20520 * order to remain within the b_cont limit. 20521 */ 20522 *usable -= spill; 20523 *snxt += spill; 20524 tcp->tcp_last_sent_len += spill; 20525 UPDATE_MIB(&tcp_mib, tcpOutDataBytes, spill); 20526 /* 20527 * Adjust the checksum 20528 */ 20529 tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); 20530 sum += spill; 20531 sum = (sum >> 16) + (sum & 0xFFFF); 20532 U16_TO_ABE16(sum, tcph->th_sum); 20533 if (tcp->tcp_ipversion == IPV4_VERSION) { 20534 sum = ntohs( 20535 ((ipha_t *)rptr)->ipha_length) + 20536 spill; 20537 ((ipha_t *)rptr)->ipha_length = 20538 htons(sum); 20539 } else { 20540 sum = ntohs( 20541 ((ip6_t *)rptr)->ip6_plen) + 20542 spill; 20543 ((ip6_t *)rptr)->ip6_plen = 20544 htons(sum); 20545 } 20546 *tail_unsent = 0; 20547 } 20548 } 20549 if (tcp->tcp_ip_forward_progress) { 20550 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 20551 *(uint32_t *)mp->b_rptr |= IP_FORWARD_PROG; 20552 tcp->tcp_ip_forward_progress = B_FALSE; 20553 } 20554 20555 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 20556 tcp_send_data(tcp, q, mp); 20557 BUMP_LOCAL(tcp->tcp_obsegs); 20558 } 20559 20560 return (0); 20561 } 20562 20563 /* Unlink and return any mblk that looks like it contains a MDT info */ 20564 static mblk_t * 20565 tcp_mdt_info_mp(mblk_t *mp) 20566 { 20567 mblk_t *prev_mp; 20568 20569 for (;;) { 20570 prev_mp = mp; 20571 /* no more to process? */ 20572 if ((mp = mp->b_cont) == NULL) 20573 break; 20574 20575 switch (DB_TYPE(mp)) { 20576 case M_CTL: 20577 if (*(uint32_t *)mp->b_rptr != MDT_IOC_INFO_UPDATE) 20578 continue; 20579 ASSERT(prev_mp != NULL); 20580 prev_mp->b_cont = mp->b_cont; 20581 mp->b_cont = NULL; 20582 return (mp); 20583 default: 20584 break; 20585 } 20586 } 20587 return (mp); 20588 } 20589 20590 /* MDT info update routine, called when IP notifies us about MDT */ 20591 static void 20592 tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first) 20593 { 20594 boolean_t prev_state; 20595 20596 /* 20597 * IP is telling us to abort MDT on this connection? We know 20598 * this because the capability is only turned off when IP 20599 * encounters some pathological cases, e.g. link-layer change 20600 * where the new driver doesn't support MDT, or in situation 20601 * where MDT usage on the link-layer has been switched off. 20602 * IP would not have sent us the initial MDT_IOC_INFO_UPDATE 20603 * if the link-layer doesn't support MDT, and if it does, it 20604 * will indicate that the feature is to be turned on. 
20605 */ 20606 prev_state = tcp->tcp_mdt; 20607 tcp->tcp_mdt = (mdt_capab->ill_mdt_on != 0); 20608 if (!tcp->tcp_mdt && !first) { 20609 TCP_STAT(tcp_mdt_conn_halted3); 20610 ip1dbg(("tcp_mdt_update: disabling MDT for connp %p\n", 20611 (void *)tcp->tcp_connp)); 20612 } 20613 20614 /* 20615 * We currently only support MDT on simple TCP/{IPv4,IPv6}, 20616 * so disable MDT otherwise. The checks are done here 20617 * and in tcp_wput_data(). 20618 */ 20619 if (tcp->tcp_mdt && 20620 (tcp->tcp_ipversion == IPV4_VERSION && 20621 tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || 20622 (tcp->tcp_ipversion == IPV6_VERSION && 20623 tcp->tcp_ip_hdr_len != IPV6_HDR_LEN)) 20624 tcp->tcp_mdt = B_FALSE; 20625 20626 if (tcp->tcp_mdt) { 20627 if (mdt_capab->ill_mdt_version != MDT_VERSION_2) { 20628 cmn_err(CE_NOTE, "tcp_mdt_update: unknown MDT " 20629 "version (%d), expected version is %d", 20630 mdt_capab->ill_mdt_version, MDT_VERSION_2); 20631 tcp->tcp_mdt = B_FALSE; 20632 return; 20633 } 20634 20635 /* 20636 * We need the driver to be able to handle at least three 20637 * spans per packet in order for tcp MDT to be utilized. 20638 * The first is for the header portion, while the rest are 20639 * needed to handle a packet that straddles across two 20640 * virtually non-contiguous buffers; a typical tcp packet 20641 * therefore consists of only two spans. Note that we take 20642 * a zero as "don't care". 20643 */ 20644 if (mdt_capab->ill_mdt_span_limit > 0 && 20645 mdt_capab->ill_mdt_span_limit < 3) { 20646 tcp->tcp_mdt = B_FALSE; 20647 return; 20648 } 20649 20650 /* a zero means driver wants default value */ 20651 tcp->tcp_mdt_max_pld = MIN(mdt_capab->ill_mdt_max_pld, 20652 tcp_mdt_max_pbufs); 20653 if (tcp->tcp_mdt_max_pld == 0) 20654 tcp->tcp_mdt_max_pld = tcp_mdt_max_pbufs; 20655 20656 /* ensure 32-bit alignment */ 20657 tcp->tcp_mdt_hdr_head = roundup(MAX(tcp_mdt_hdr_head_min, 20658 mdt_capab->ill_mdt_hdr_head), 4); 20659 tcp->tcp_mdt_hdr_tail = roundup(MAX(tcp_mdt_hdr_tail_min, 20660 mdt_capab->ill_mdt_hdr_tail), 4); 20661 20662 if (!first && !prev_state) { 20663 TCP_STAT(tcp_mdt_conn_resumed2); 20664 ip1dbg(("tcp_mdt_update: reenabling MDT for connp %p\n", 20665 (void *)tcp->tcp_connp)); 20666 } 20667 } 20668 } 20669 20670 static void 20671 tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_mdt) 20672 { 20673 conn_t *connp = tcp->tcp_connp; 20674 20675 ASSERT(ire != NULL); 20676 20677 /* 20678 * We may be in the fastpath here, and although we essentially do 20679 * similar checks as in ip_bind_connected{_v6}/ip_mdinfo_return, 20680 * we try to keep things as brief as possible. After all, these 20681 * are only best-effort checks, and we do more thorough ones prior 20682 * to calling tcp_multisend(). 20683 */ 20684 if (ip_multidata_outbound && check_mdt && 20685 !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && 20686 ill != NULL && ILL_MDT_CAPABLE(ill) && 20687 !CONN_IPSEC_OUT_ENCAPSULATED(connp) && 20688 !(ire->ire_flags & RTF_MULTIRT) && 20689 !IPP_ENABLED(IPP_LOCAL_OUT) && 20690 CONN_IS_MD_FASTPATH(connp)) { 20691 /* Remember the result */ 20692 connp->conn_mdt_ok = B_TRUE; 20693 20694 ASSERT(ill->ill_mdt_capab != NULL); 20695 if (!ill->ill_mdt_capab->ill_mdt_on) { 20696 /* 20697 * If MDT has been previously turned off in the past, 20698 * and we currently can do MDT (due to IPQoS policy 20699 * removal, etc.) then enable it for this interface. 
20700 */ 20701 ill->ill_mdt_capab->ill_mdt_on = 1; 20702 ip1dbg(("tcp_ire_ill_check: connp %p enables MDT for " 20703 "interface %s\n", (void *)connp, ill->ill_name)); 20704 } 20705 tcp_mdt_update(tcp, ill->ill_mdt_capab, B_TRUE); 20706 } 20707 20708 /* 20709 * The goal is to reduce the number of generated tcp segments by 20710 * setting the maxpsz multiplier to 0; this will have an effect on 20711 * tcp_maxpsz_set(). With this behavior, tcp will pack more data 20712 * into each packet, up to SMSS bytes. Doing this reduces the number 20713 * of outbound segments and incoming ACKs, thus allowing for better 20714 * network and system performance. In contrast, the legacy behavior 20715 * may result in sending less than SMSS size, because the last mblk 20716 * for some packets may have more data than needed to make up SMSS, 20717 * and the legacy code refused to "split" it. 20718 * 20719 * We apply the new behavior in the following situations: 20720 * 20721 * 1) Loopback connections, 20722 * 2) Connections in which the remote peer is not on the local subnet, 20723 * 3) Local subnet connections over the bge interface (see below). 20724 * 20725 * Ideally, we would like this behavior to apply for interfaces other 20726 * than bge. However, doing so would negatively impact drivers which 20727 * perform dynamic mapping and unmapping of DMA resources, the number 20728 * of which increases when the maxpsz multiplier is set to 0 (more mblks per 20729 * packet will be generated by tcp). The bge driver does not suffer 20730 * from this, as it copies the mblks into pre-mapped buffers, and 20731 * therefore does not require more I/O resources than before. 20732 * 20733 * Otherwise, this behavior is present on all network interfaces when 20734 * the destination endpoint is non-local, since reducing the number 20735 * of packets in general is good for the network. 20736 * 20737 * TODO We need to remove this hard-coded conditional for bge once 20738 * a better "self-tuning" mechanism, or a way to comprehend 20739 * the driver transmit strategy, is devised. Until the solution 20740 * is found and well understood, we live with this hack. 20741 */ 20742 if (!tcp_static_maxpsz && 20743 (tcp->tcp_loopback || !tcp->tcp_localnet || 20744 (ill->ill_name_length > 3 && bcmp(ill->ill_name, "bge", 3) == 0))) { 20745 /* override the default value */ 20746 tcp->tcp_maxpsz = 0; 20747 20748 ip3dbg(("tcp_ire_ill_check: connp %p tcp_maxpsz %d on " 20749 "interface %s\n", (void *)connp, tcp->tcp_maxpsz, 20750 ill != NULL ? ill->ill_name : ipif_loopback_name)); 20751 } 20752 20753 /* set the stream head parameters accordingly */ 20754 (void) tcp_maxpsz_set(tcp, B_TRUE); 20755 } 20756 20757 /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */ 20758 static void 20759 tcp_wput_flush(tcp_t *tcp, mblk_t *mp) 20760 { 20761 uchar_t fval = *mp->b_rptr; 20762 mblk_t *tail; 20763 queue_t *q = tcp->tcp_wq; 20764 20765 /* TODO: How should flush interact with urgent data? */ 20766 if ((fval & FLUSHW) && tcp->tcp_xmit_head && 20767 !(tcp->tcp_valid_bits & TCP_URG_VALID)) { 20768 /* 20769 * Flush only data that has not yet been put on the wire. If 20770 * we flush data that we have already transmitted, life, as we 20771 * know it, may come to an end.
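 * (Sequence numbers that have been transmitted are already
 * committed to the peer; retransmission and ACK processing
 * would then reference data that no longer exists.)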
20772 */ 20773 tail = tcp->tcp_xmit_tail; 20774 tail->b_wptr -= tcp->tcp_xmit_tail_unsent; 20775 tcp->tcp_xmit_tail_unsent = 0; 20776 tcp->tcp_unsent = 0; 20777 if (tail->b_wptr != tail->b_rptr) 20778 tail = tail->b_cont; 20779 if (tail) { 20780 mblk_t **excess = &tcp->tcp_xmit_head; 20781 for (;;) { 20782 mblk_t *mp1 = *excess; 20783 if (mp1 == tail) 20784 break; 20785 tcp->tcp_xmit_tail = mp1; 20786 tcp->tcp_xmit_last = mp1; 20787 excess = &mp1->b_cont; 20788 } 20789 *excess = NULL; 20790 tcp_close_mpp(&tail); 20791 if (tcp->tcp_snd_zcopy_aware) 20792 tcp_zcopy_notify(tcp); 20793 } 20794 /* 20795 * We have no unsent data, so unsent must be less than 20796 * tcp_xmit_lowater, so re-enable flow. 20797 */ 20798 if (tcp->tcp_flow_stopped) { 20799 tcp_clrqfull(tcp); 20800 } 20801 } 20802 /* 20803 * TODO: you can't just flush these, you have to increase rwnd for one 20804 * thing. For another, how should urgent data interact? 20805 */ 20806 if (fval & FLUSHR) { 20807 *mp->b_rptr = fval & ~FLUSHW; 20808 /* XXX */ 20809 qreply(q, mp); 20810 return; 20811 } 20812 freemsg(mp); 20813 } 20814 20815 /* 20816 * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA 20817 * messages. 20818 */ 20819 static void 20820 tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) 20821 { 20822 mblk_t *mp1; 20823 STRUCT_HANDLE(strbuf, sb); 20824 uint16_t port; 20825 queue_t *q = tcp->tcp_wq; 20826 in6_addr_t v6addr; 20827 ipaddr_t v4addr; 20828 uint32_t flowinfo = 0; 20829 int addrlen; 20830 20831 /* Make sure it is one of ours. */ 20832 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 20833 case TI_GETMYNAME: 20834 case TI_GETPEERNAME: 20835 break; 20836 default: 20837 CALL_IP_WPUT(tcp->tcp_connp, q, mp); 20838 return; 20839 } 20840 switch (mi_copy_state(q, mp, &mp1)) { 20841 case -1: 20842 return; 20843 case MI_COPY_CASE(MI_COPY_IN, 1): 20844 break; 20845 case MI_COPY_CASE(MI_COPY_OUT, 1): 20846 /* Copy out the strbuf. */ 20847 mi_copyout(q, mp); 20848 return; 20849 case MI_COPY_CASE(MI_COPY_OUT, 2): 20850 /* All done. */ 20851 mi_copy_done(q, mp, 0); 20852 return; 20853 default: 20854 mi_copy_done(q, mp, EPROTO); 20855 return; 20856 } 20857 /* Check alignment of the strbuf */ 20858 if (!OK_32PTR(mp1->b_rptr)) { 20859 mi_copy_done(q, mp, EINVAL); 20860 return; 20861 } 20862 20863 STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, 20864 (void *)mp1->b_rptr); 20865 addrlen = tcp->tcp_family == AF_INET ? 
sizeof (sin_t) : sizeof (sin6_t); 20866 20867 if (STRUCT_FGET(sb, maxlen) < addrlen) { 20868 mi_copy_done(q, mp, EINVAL); 20869 return; 20870 } 20871 switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { 20872 case TI_GETMYNAME: 20873 if (tcp->tcp_family == AF_INET) { 20874 if (tcp->tcp_ipversion == IPV4_VERSION) { 20875 v4addr = tcp->tcp_ipha->ipha_src; 20876 } else { 20877 /* can't return an address in this case */ 20878 v4addr = 0; 20879 } 20880 } else { 20881 /* tcp->tcp_family == AF_INET6 */ 20882 if (tcp->tcp_ipversion == IPV4_VERSION) { 20883 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 20884 &v6addr); 20885 } else { 20886 v6addr = tcp->tcp_ip6h->ip6_src; 20887 } 20888 } 20889 port = tcp->tcp_lport; 20890 break; 20891 case TI_GETPEERNAME: 20892 if (tcp->tcp_family == AF_INET) { 20893 if (tcp->tcp_ipversion == IPV4_VERSION) { 20894 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6, 20895 v4addr); 20896 } else { 20897 /* can't return an address in this case */ 20898 v4addr = 0; 20899 } 20900 } else { 20901 /* tcp->tcp_family == AF_INET6) */ 20902 v6addr = tcp->tcp_remote_v6; 20903 if (tcp->tcp_ipversion == IPV6_VERSION) { 20904 /* 20905 * No flowinfo if tcp->tcp_ipversion is v4. 20906 * 20907 * flowinfo was already initialized to zero 20908 * where it was declared above, so only 20909 * set it if ipversion is v6. 20910 */ 20911 flowinfo = tcp->tcp_ip6h->ip6_vcf & 20912 ~IPV6_VERS_AND_FLOW_MASK; 20913 } 20914 } 20915 port = tcp->tcp_fport; 20916 break; 20917 default: 20918 mi_copy_done(q, mp, EPROTO); 20919 return; 20920 } 20921 mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); 20922 if (!mp1) 20923 return; 20924 20925 if (tcp->tcp_family == AF_INET) { 20926 sin_t *sin; 20927 20928 STRUCT_FSET(sb, len, (int)sizeof (sin_t)); 20929 sin = (sin_t *)mp1->b_rptr; 20930 mp1->b_wptr = (uchar_t *)&sin[1]; 20931 *sin = sin_null; 20932 sin->sin_family = AF_INET; 20933 sin->sin_addr.s_addr = v4addr; 20934 sin->sin_port = port; 20935 } else { 20936 /* tcp->tcp_family == AF_INET6 */ 20937 sin6_t *sin6; 20938 20939 STRUCT_FSET(sb, len, (int)sizeof (sin6_t)); 20940 sin6 = (sin6_t *)mp1->b_rptr; 20941 mp1->b_wptr = (uchar_t *)&sin6[1]; 20942 *sin6 = sin6_null; 20943 sin6->sin6_family = AF_INET6; 20944 sin6->sin6_flowinfo = flowinfo; 20945 sin6->sin6_addr = v6addr; 20946 sin6->sin6_port = port; 20947 } 20948 /* Copy out the address */ 20949 mi_copyout(q, mp); 20950 } 20951 20952 /* 20953 * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL 20954 * messages. 20955 */ 20956 /* ARGSUSED */ 20957 static void 20958 tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) 20959 { 20960 conn_t *connp = (conn_t *)arg; 20961 tcp_t *tcp = connp->conn_tcp; 20962 queue_t *q = tcp->tcp_wq; 20963 struct iocblk *iocp; 20964 20965 ASSERT(DB_TYPE(mp) == M_IOCTL); 20966 /* 20967 * Try and ASSERT the minimum possible references on the 20968 * conn early enough. Since we are executing on write side, 20969 * the connection is obviously not detached and that means 20970 * there is a ref each for TCP and IP. Since we are behind 20971 * the squeue, the minimum references needed are 3. If the 20972 * conn is in classifier hash list, there should be an 20973 * extra ref for that (we check both the possibilities). 20974 */ 20975 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 20976 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 20977 20978 iocp = (struct iocblk *)mp->b_rptr; 20979 switch (iocp->ioc_cmd) { 20980 case TCP_IOC_DEFAULT_Q: 20981 /* Wants to be the default wq. 
*/ 20982 if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { 20983 iocp->ioc_error = EPERM; 20984 iocp->ioc_count = 0; 20985 mp->b_datap->db_type = M_IOCACK; 20986 qreply(q, mp); 20987 return; 20988 } 20989 tcp_def_q_set(tcp, mp); 20990 return; 20991 case _SIOCSOCKFALLBACK: 20992 /* 20993 * Either sockmod is about to be popped and the socket 20994 * would now be treated as a plain stream, or a module 20995 * is about to be pushed so we could no longer use read- 20996 * side synchronous streams for fused loopback tcp. 20997 * Drain any queued data and disable direct sockfs 20998 * interface from now on. 20999 */ 21000 if (!tcp->tcp_issocket) { 21001 DB_TYPE(mp) = M_IOCNAK; 21002 iocp->ioc_error = EINVAL; 21003 } else { 21004 #ifdef _ILP32 21005 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q); 21006 #else 21007 tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; 21008 #endif 21009 /* 21010 * Insert this socket into the acceptor hash. 21011 * We might need it for T_CONN_RES message 21012 */ 21013 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); 21014 21015 if (tcp->tcp_fused) { 21016 /* 21017 * This is a fused loopback tcp; disable 21018 * read-side synchronous streams interface 21019 * and drain any queued data. It is okay 21020 * to do this for non-synchronous streams 21021 * fused tcp as well. 21022 */ 21023 tcp_fuse_disable_pair(tcp, B_FALSE); 21024 } 21025 tcp->tcp_issocket = B_FALSE; 21026 TCP_STAT(tcp_sock_fallback); 21027 21028 DB_TYPE(mp) = M_IOCACK; 21029 iocp->ioc_error = 0; 21030 } 21031 iocp->ioc_count = 0; 21032 iocp->ioc_rval = 0; 21033 qreply(q, mp); 21034 return; 21035 } 21036 CALL_IP_WPUT(connp, q, mp); 21037 } 21038 21039 /* 21040 * This routine is called by tcp_wput() to handle all TPI requests. 21041 */ 21042 /* ARGSUSED */ 21043 static void 21044 tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) 21045 { 21046 conn_t *connp = (conn_t *)arg; 21047 tcp_t *tcp = connp->conn_tcp; 21048 union T_primitives *tprim = (union T_primitives *)mp->b_rptr; 21049 uchar_t *rptr; 21050 t_scalar_t type; 21051 int len; 21052 cred_t *cr = DB_CREDDEF(mp, tcp->tcp_cred); 21053 21054 /* 21055 * Try and ASSERT the minimum possible references on the 21056 * conn early enough. Since we are executing on write side, 21057 * the connection is obviously not detached and that means 21058 * there is a ref each for TCP and IP. Since we are behind 21059 * the squeue, the minimum references needed are 3. If the 21060 * conn is in classifier hash list, there should be an 21061 * extra ref for that (we check both the possibilities). 21062 */ 21063 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || 21064 (connp->conn_fanout == NULL && connp->conn_ref >= 3)); 21065 21066 rptr = mp->b_rptr; 21067 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); 21068 if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { 21069 type = ((union T_primitives *)rptr)->type; 21070 if (type == T_EXDATA_REQ) { 21071 uint32_t msize = msgdsize(mp->b_cont); 21072 21073 len = msize - 1; 21074 if (len < 0) { 21075 freemsg(mp); 21076 return; 21077 } 21078 /* 21079 * Try to force urgent data out on the wire. 21080 * Even if we have unsent data this will 21081 * at least send the urgent flag. 21082 * XXX does not handle more flag correctly. 
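 * (The arithmetic below leaves tcp_urg at the sequence
 * number of the last byte of the urgent data: tcp_snxt plus
 * the bytes already queued plus this message, minus one.)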
21083 */ 21084 len += tcp->tcp_unsent; 21085 len += tcp->tcp_snxt; 21086 tcp->tcp_urg = len; 21087 tcp->tcp_valid_bits |= TCP_URG_VALID; 21088 21089 /* Bypass tcp protocol for fused tcp loopback */ 21090 if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) 21091 return; 21092 } else if (type != T_DATA_REQ) { 21093 goto non_urgent_data; 21094 } 21095 /* TODO: options, flags, ... from user */ 21096 /* Set length to zero for reclamation below */ 21097 tcp_wput_data(tcp, mp->b_cont, B_TRUE); 21098 freeb(mp); 21099 return; 21100 } else { 21101 if (tcp->tcp_debug) { 21102 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 21103 "tcp_wput_proto, dropping one..."); 21104 } 21105 freemsg(mp); 21106 return; 21107 } 21108 21109 non_urgent_data: 21110 21111 switch ((int)tprim->type) { 21112 case T_SSL_PROXY_BIND_REQ: /* an SSL proxy endpoint bind request */ 21113 /* 21114 * save the kssl_ent_t from the next block, and convert this 21115 * back to a normal bind_req. 21116 */ 21117 if (mp->b_cont != NULL) { 21118 ASSERT(MBLKL(mp->b_cont) >= sizeof (kssl_ent_t)); 21119 21120 if (tcp->tcp_kssl_ent != NULL) { 21121 kssl_release_ent(tcp->tcp_kssl_ent, NULL, 21122 KSSL_NO_PROXY); 21123 tcp->tcp_kssl_ent = NULL; 21124 } 21125 bcopy(mp->b_cont->b_rptr, &tcp->tcp_kssl_ent, 21126 sizeof (kssl_ent_t)); 21127 kssl_hold_ent(tcp->tcp_kssl_ent); 21128 freemsg(mp->b_cont); 21129 mp->b_cont = NULL; 21130 } 21131 tprim->type = T_BIND_REQ; 21132 21133 /* FALLTHROUGH */ 21134 case O_T_BIND_REQ: /* bind request */ 21135 case T_BIND_REQ: /* new semantics bind request */ 21136 tcp_bind(tcp, mp); 21137 break; 21138 case T_UNBIND_REQ: /* unbind request */ 21139 tcp_unbind(tcp, mp); 21140 break; 21141 case O_T_CONN_RES: /* old connection response XXX */ 21142 case T_CONN_RES: /* connection response */ 21143 tcp_accept(tcp, mp); 21144 break; 21145 case T_CONN_REQ: /* connection request */ 21146 tcp_connect(tcp, mp); 21147 break; 21148 case T_DISCON_REQ: /* disconnect request */ 21149 tcp_disconnect(tcp, mp); 21150 break; 21151 case T_CAPABILITY_REQ: 21152 tcp_capability_req(tcp, mp); /* capability request */ 21153 break; 21154 case T_INFO_REQ: /* information request */ 21155 tcp_info_req(tcp, mp); 21156 break; 21157 case T_SVR4_OPTMGMT_REQ: /* manage options req */ 21158 /* Only IP is allowed to return meaningful value */ 21159 (void) svr4_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj); 21160 break; 21161 case T_OPTMGMT_REQ: 21162 /* 21163 * Note: no support for snmpcom_req() through new 21164 * T_OPTMGMT_REQ. See comments in ip.c 21165 */ 21166 /* Only IP is allowed to return meaningful value */ 21167 (void) tpi_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj); 21168 break; 21169 21170 case T_UNITDATA_REQ: /* unitdata request */ 21171 tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 21172 break; 21173 case T_ORDREL_REQ: /* orderly release req */ 21174 freemsg(mp); 21175 21176 if (tcp->tcp_fused) 21177 tcp_unfuse(tcp); 21178 21179 if (tcp_xmit_end(tcp) != 0) { 21180 /* 21181 * We were crossing FINs and got a reset from 21182 * the other side. Just ignore it. 
21183 */ 21184 if (tcp->tcp_debug) { 21185 (void) strlog(TCP_MOD_ID, 0, 1, 21186 SL_ERROR|SL_TRACE, 21187 "tcp_wput_proto, T_ORDREL_REQ out of " 21188 "state %s", 21189 tcp_display(tcp, NULL, 21190 DISP_ADDR_AND_PORT)); 21191 } 21192 } 21193 break; 21194 case T_ADDR_REQ: 21195 tcp_addr_req(tcp, mp); 21196 break; 21197 default: 21198 if (tcp->tcp_debug) { 21199 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 21200 "tcp_wput_proto, bogus TPI msg, type %d", 21201 tprim->type); 21202 } 21203 /* 21204 * We used to M_ERROR. Sending TNOTSUPPORT gives the user 21205 * a chance to recover. 21206 */ 21207 tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); 21208 break; 21209 } 21210 } 21211 21212 /* 21213 * The TCP write service routine should never be called... 21214 */ 21215 /* ARGSUSED */ 21216 static void 21217 tcp_wsrv(queue_t *q) 21218 { 21219 TCP_STAT(tcp_wsrv_called); 21220 } 21221 21222 /* Non-overlapping byte exchanger */ 21223 static void 21224 tcp_xchg(uchar_t *a, uchar_t *b, int len) 21225 { 21226 uchar_t uch; 21227 21228 while (len-- > 0) { 21229 uch = a[len]; 21230 a[len] = b[len]; 21231 b[len] = uch; 21232 } 21233 } 21234 21235 /* 21236 * Send out a control packet on the tcp connection specified. This routine 21237 * is typically called where we need a simple ACK or RST generated. 21238 */ 21239 static void 21240 tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) 21241 { 21242 uchar_t *rptr; 21243 tcph_t *tcph; 21244 ipha_t *ipha = NULL; 21245 ip6_t *ip6h = NULL; 21246 uint32_t sum; 21247 int tcp_hdr_len; 21248 int tcp_ip_hdr_len; 21249 mblk_t *mp; 21250 21251 /* 21252 * Save sum for use in source route later. 21253 */ 21254 ASSERT(tcp != NULL); 21255 sum = tcp->tcp_tcp_hdr_len + tcp->tcp_sum; 21256 tcp_hdr_len = tcp->tcp_hdr_len; 21257 tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; 21258 21259 /* If a text string is passed in with the request, pass it to strlog. */ 21260 if (str != NULL && tcp->tcp_debug) { 21261 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 21262 "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", 21263 str, seq, ack, ctl); 21264 } 21265 mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 21266 BPRI_MED); 21267 if (mp == NULL) { 21268 return; 21269 } 21270 rptr = &mp->b_rptr[tcp_wroff_xtra]; 21271 mp->b_rptr = rptr; 21272 mp->b_wptr = &rptr[tcp_hdr_len]; 21273 bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); 21274 21275 if (tcp->tcp_ipversion == IPV4_VERSION) { 21276 ipha = (ipha_t *)rptr; 21277 ipha->ipha_length = htons(tcp_hdr_len); 21278 } else { 21279 ip6h = (ip6_t *)rptr; 21280 ASSERT(tcp != NULL); 21281 ip6h->ip6_plen = htons(tcp->tcp_hdr_len - 21282 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 21283 } 21284 tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; 21285 tcph->th_flags[0] = (uint8_t)ctl; 21286 if (ctl & TH_RST) { 21287 BUMP_MIB(&tcp_mib, tcpOutRsts); 21288 BUMP_MIB(&tcp_mib, tcpOutControl); 21289 /* 21290 * Don't send TSopt w/ TH_RST packets per RFC 1323.
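 * (The template header copied in above includes the
 * timestamp option when tcp_snd_ts_ok; the code below strips
 * it by shrinking the header by TCPOPT_REAL_TS_LEN bytes,
 * i.e. three 32-bit words, hence the (3 << 4) adjustment of
 * the data offset and the matching checksum correction.)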
21291 */ 21292 if (tcp->tcp_snd_ts_ok && 21293 tcp->tcp_state > TCPS_SYN_SENT) { 21294 mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; 21295 *(mp->b_wptr) = TCPOPT_EOL; 21296 if (tcp->tcp_ipversion == IPV4_VERSION) { 21297 ipha->ipha_length = htons(tcp_hdr_len - 21298 TCPOPT_REAL_TS_LEN); 21299 } else { 21300 ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - 21301 TCPOPT_REAL_TS_LEN); 21302 } 21303 tcph->th_offset_and_rsrvd[0] -= (3 << 4); 21304 sum -= TCPOPT_REAL_TS_LEN; 21305 } 21306 } 21307 if (ctl & TH_ACK) { 21308 if (tcp->tcp_snd_ts_ok) { 21309 U32_TO_BE32(lbolt, 21310 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 21311 U32_TO_BE32(tcp->tcp_ts_recent, 21312 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 21313 } 21314 21315 /* Update the latest receive window size in TCP header. */ 21316 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 21317 tcph->th_win); 21318 tcp->tcp_rack = ack; 21319 tcp->tcp_rack_cnt = 0; 21320 BUMP_MIB(&tcp_mib, tcpOutAck); 21321 } 21322 BUMP_LOCAL(tcp->tcp_obsegs); 21323 U32_TO_BE32(seq, tcph->th_seq); 21324 U32_TO_BE32(ack, tcph->th_ack); 21325 /* 21326 * Include the adjustment for a source route if any. 21327 */ 21328 sum = (sum >> 16) + (sum & 0xFFFF); 21329 U16_TO_BE16(sum, tcph->th_sum); 21330 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 21331 tcp_send_data(tcp, tcp->tcp_wq, mp); 21332 } 21333 21334 /* 21335 * If this routine returns B_TRUE, TCP can generate a RST in response 21336 * to a segment. If it returns B_FALSE, TCP should not respond. 21337 */ 21338 static boolean_t 21339 tcp_send_rst_chk(void) 21340 { 21341 clock_t now; 21342 21343 /* 21344 * TCP needs to protect itself from generating too many RSTs. 21345 * This can be a DoS attack by sending us random segments 21346 * soliciting RSTs. 21347 * 21348 * What we do here is to have a limit of tcp_rst_sent_rate RSTs 21349 * in each 1 second interval. In this way, TCP still generate 21350 * RSTs in normal cases but when under attack, the impact is 21351 * limited. 21352 */ 21353 if (tcp_rst_sent_rate_enabled != 0) { 21354 now = lbolt; 21355 /* lbolt can wrap around. */ 21356 if ((tcp_last_rst_intrvl > now) || 21357 (TICK_TO_MSEC(now - tcp_last_rst_intrvl) > 1*SECONDS)) { 21358 tcp_last_rst_intrvl = now; 21359 tcp_rst_cnt = 1; 21360 } else if (++tcp_rst_cnt > tcp_rst_sent_rate) { 21361 return (B_FALSE); 21362 } 21363 } 21364 return (B_TRUE); 21365 } 21366 21367 /* 21368 * Send down the advice IP ioctl to tell IP to mark an IRE temporary. 21369 */ 21370 static void 21371 tcp_ip_ire_mark_advice(tcp_t *tcp) 21372 { 21373 mblk_t *mp; 21374 ipic_t *ipic; 21375 21376 if (tcp->tcp_ipversion == IPV4_VERSION) { 21377 mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, 21378 &ipic); 21379 } else { 21380 mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, 21381 &ipic); 21382 } 21383 if (mp == NULL) 21384 return; 21385 ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; 21386 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 21387 } 21388 21389 /* 21390 * Return an IP advice ioctl mblk and set ipic to be the pointer 21391 * to the advice structure. 
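 * (The message built here is an M_IOCTL of type IP_IOCTL
 * whose b_cont carries the ipic_t immediately followed by
 * the address; callers such as tcp_ip_ire_mark_advice() and
 * tcp_xmit_end() fill in the remaining ipic fields before
 * passing it down to IP.)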
21392 */ 21393 static mblk_t * 21394 tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic) 21395 { 21396 struct iocblk *ioc; 21397 mblk_t *mp, *mp1; 21398 21399 mp = allocb(sizeof (ipic_t) + addr_len, BPRI_HI); 21400 if (mp == NULL) 21401 return (NULL); 21402 bzero(mp->b_rptr, sizeof (ipic_t) + addr_len); 21403 *ipic = (ipic_t *)mp->b_rptr; 21404 (*ipic)->ipic_cmd = IP_IOC_IRE_ADVISE_NO_REPLY; 21405 (*ipic)->ipic_addr_offset = sizeof (ipic_t); 21406 21407 bcopy(addr, *ipic + 1, addr_len); 21408 21409 (*ipic)->ipic_addr_length = addr_len; 21410 mp->b_wptr = &mp->b_rptr[sizeof (ipic_t) + addr_len]; 21411 21412 mp1 = mkiocb(IP_IOCTL); 21413 if (mp1 == NULL) { 21414 freemsg(mp); 21415 return (NULL); 21416 } 21417 mp1->b_cont = mp; 21418 ioc = (struct iocblk *)mp1->b_rptr; 21419 ioc->ioc_count = sizeof (ipic_t) + addr_len; 21420 21421 return (mp1); 21422 } 21423 21424 /* 21425 * Generate a reset based on an inbound packet for which there is no active 21426 * tcp state that we can find. 21427 * 21428 * IPSEC NOTE : Try to send the reply with the same protection as it came 21429 * in. We still have the ipsec_mp that the packet was attached to. Thus 21430 * the packet will go out at the same level of protection as it came in by 21431 * converting the IPSEC_IN to IPSEC_OUT. 21432 */ 21433 static void 21434 tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, 21435 uint32_t ack, int ctl, uint_t ip_hdr_len) 21436 { 21437 ipha_t *ipha = NULL; 21438 ip6_t *ip6h = NULL; 21439 ushort_t len; 21440 tcph_t *tcph; 21441 int i; 21442 mblk_t *ipsec_mp; 21443 boolean_t mctl_present; 21444 ipic_t *ipic; 21445 ipaddr_t v4addr; 21446 in6_addr_t v6addr; 21447 int addr_len; 21448 void *addr; 21449 queue_t *q = tcp_g_q; 21450 tcp_t *tcp = Q_TO_TCP(q); 21451 cred_t *cr; 21452 21453 if (!tcp_send_rst_chk()) { 21454 tcp_rst_unsent++; 21455 freemsg(mp); 21456 return; 21457 } 21458 21459 if (mp->b_datap->db_type == M_CTL) { 21460 ipsec_mp = mp; 21461 mp = mp->b_cont; 21462 mctl_present = B_TRUE; 21463 } else { 21464 ipsec_mp = mp; 21465 mctl_present = B_FALSE; 21466 } 21467 21468 if (str && q && tcp_dbg) { 21469 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, 21470 "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " 21471 "flags 0x%x", 21472 str, seq, ack, ctl); 21473 } 21474 if (mp->b_datap->db_ref != 1) { 21475 mblk_t *mp1 = copyb(mp); 21476 freemsg(mp); 21477 mp = mp1; 21478 if (!mp) { 21479 if (mctl_present) 21480 freeb(ipsec_mp); 21481 return; 21482 } else { 21483 if (mctl_present) { 21484 ipsec_mp->b_cont = mp; 21485 } else { 21486 ipsec_mp = mp; 21487 } 21488 } 21489 } else if (mp->b_cont) { 21490 freemsg(mp->b_cont); 21491 mp->b_cont = NULL; 21492 } 21493 /* 21494 * We skip reversing source route here. 21495 * (for now we replace all IP options with EOL) 21496 */ 21497 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21498 ipha = (ipha_t *)mp->b_rptr; 21499 for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) 21500 mp->b_rptr[i] = IPOPT_EOL; 21501 /* 21502 * Make sure that src address isn't flagrantly invalid. 21503 * Not all broadcast address checking for the src address 21504 * is possible, since we don't know the netmask of the src 21505 * addr. No check for destination address is done, since 21506 * IP will not pass up a packet with a broadcast dest 21507 * address to TCP. Similar checks are done below for IPv6. 
21508 */ 21509 if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || 21510 CLASSD(ipha->ipha_src)) { 21511 freemsg(ipsec_mp); 21512 BUMP_MIB(&ip_mib, ipInDiscards); 21513 return; 21514 } 21515 } else { 21516 ip6h = (ip6_t *)mp->b_rptr; 21517 21518 if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || 21519 IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { 21520 freemsg(ipsec_mp); 21521 BUMP_MIB(&ip6_mib, ipv6InDiscards); 21522 return; 21523 } 21524 21525 /* Remove any extension headers assuming partial overlay */ 21526 if (ip_hdr_len > IPV6_HDR_LEN) { 21527 uint8_t *to; 21528 21529 to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; 21530 ovbcopy(ip6h, to, IPV6_HDR_LEN); 21531 mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; 21532 ip_hdr_len = IPV6_HDR_LEN; 21533 ip6h = (ip6_t *)mp->b_rptr; 21534 ip6h->ip6_nxt = IPPROTO_TCP; 21535 } 21536 } 21537 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; 21538 if (tcph->th_flags[0] & TH_RST) { 21539 freemsg(ipsec_mp); 21540 return; 21541 } 21542 tcph->th_offset_and_rsrvd[0] = (5 << 4); 21543 len = ip_hdr_len + sizeof (tcph_t); 21544 mp->b_wptr = &mp->b_rptr[len]; 21545 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21546 ipha->ipha_length = htons(len); 21547 /* Swap addresses */ 21548 v4addr = ipha->ipha_src; 21549 ipha->ipha_src = ipha->ipha_dst; 21550 ipha->ipha_dst = v4addr; 21551 ipha->ipha_ident = 0; 21552 ipha->ipha_ttl = (uchar_t)tcp_ipv4_ttl; 21553 addr_len = IP_ADDR_LEN; 21554 addr = &v4addr; 21555 } else { 21556 /* No ip6i_t in this case */ 21557 ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); 21558 /* Swap addresses */ 21559 v6addr = ip6h->ip6_src; 21560 ip6h->ip6_src = ip6h->ip6_dst; 21561 ip6h->ip6_dst = v6addr; 21562 ip6h->ip6_hops = (uchar_t)tcp_ipv6_hoplimit; 21563 addr_len = IPV6_ADDR_LEN; 21564 addr = &v6addr; 21565 } 21566 tcp_xchg(tcph->th_fport, tcph->th_lport, 2); 21567 U32_TO_BE32(ack, tcph->th_ack); 21568 U32_TO_BE32(seq, tcph->th_seq); 21569 U16_TO_BE16(0, tcph->th_win); 21570 U16_TO_BE16(sizeof (tcph_t), tcph->th_sum); 21571 tcph->th_flags[0] = (uint8_t)ctl; 21572 if (ctl & TH_RST) { 21573 BUMP_MIB(&tcp_mib, tcpOutRsts); 21574 BUMP_MIB(&tcp_mib, tcpOutControl); 21575 } 21576 21577 /* IP trusts us to set up labels when required. */ 21578 if (is_system_labeled() && (cr = DB_CRED(mp)) != NULL && 21579 crgetlabel(cr) != NULL) { 21580 int err, adjust; 21581 21582 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) 21583 err = tsol_check_label(cr, &mp, &adjust, 21584 tcp->tcp_connp->conn_mac_exempt); 21585 else 21586 err = tsol_check_label_v6(cr, &mp, &adjust, 21587 tcp->tcp_connp->conn_mac_exempt); 21588 if (mctl_present) 21589 ipsec_mp->b_cont = mp; 21590 else 21591 ipsec_mp = mp; 21592 if (err != 0) { 21593 freemsg(ipsec_mp); 21594 return; 21595 } 21596 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21597 ipha = (ipha_t *)mp->b_rptr; 21598 adjust += ntohs(ipha->ipha_length); 21599 ipha->ipha_length = htons(adjust); 21600 } else { 21601 ip6h = (ip6_t *)mp->b_rptr; 21602 } 21603 } 21604 21605 if (mctl_present) { 21606 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; 21607 21608 ASSERT(ii->ipsec_in_type == IPSEC_IN); 21609 if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h)) { 21610 return; 21611 } 21612 } 21613 /* 21614 * NOTE: one might consider tracing a TCP packet here, but 21615 * this function has no active TCP state and no tcp structure 21616 * that has a trace buffer. If we traced here, we would have 21617 * to keep a local trace buffer in tcp_record_trace(). 
21618 * 21619 * TSol note: The mblk that contains the incoming packet was 21620 * reused by tcp_xmit_listener_reset, so it already contains 21621 * the right credentials and we don't need to call mblk_setcred. 21622 * Also the conn's cred is not right since it is associated 21623 * with tcp_g_q. 21624 */ 21625 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp); 21626 21627 /* 21628 * Tell IP to mark the IRE used for this destination temporary. 21629 * This way, we can limit our exposure to DoS attack because IP 21630 * creates an IRE for each destination. If there are too many, 21631 * the time to do any routing lookup will be extremely long. And 21632 * the lookup can be in interrupt context. 21633 * 21634 * Note that in normal circumstances, this marking should not 21635 * affect anything. It would be nice if only 1 message is 21636 * needed to inform IP that the IRE created for this RST should 21637 * not be added to the cache table. But there is currently 21638 * not such communication mechanism between TCP and IP. So 21639 * the best we can do now is to send the advice ioctl to IP 21640 * to mark the IRE temporary. 21641 */ 21642 if ((mp = tcp_ip_advise_mblk(addr, addr_len, &ipic)) != NULL) { 21643 ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; 21644 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 21645 } 21646 } 21647 21648 /* 21649 * Initiate closedown sequence on an active connection. (May be called as 21650 * writer.) Return value zero for OK return, non-zero for error return. 21651 */ 21652 static int 21653 tcp_xmit_end(tcp_t *tcp) 21654 { 21655 ipic_t *ipic; 21656 mblk_t *mp; 21657 21658 if (tcp->tcp_state < TCPS_SYN_RCVD || 21659 tcp->tcp_state > TCPS_CLOSE_WAIT) { 21660 /* 21661 * Invalid state, only states TCPS_SYN_RCVD, 21662 * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid 21663 */ 21664 return (-1); 21665 } 21666 21667 tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; 21668 tcp->tcp_valid_bits |= TCP_FSS_VALID; 21669 /* 21670 * If there is nothing more unsent, send the FIN now. 21671 * Otherwise, it will go out with the last segment. 21672 */ 21673 if (tcp->tcp_unsent == 0) { 21674 mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, 21675 tcp->tcp_fss, B_FALSE, NULL, B_FALSE); 21676 21677 if (mp) { 21678 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 21679 tcp_send_data(tcp, tcp->tcp_wq, mp); 21680 } else { 21681 /* 21682 * Couldn't allocate msg. Pretend we got it out. 21683 * Wait for rexmit timeout. 21684 */ 21685 tcp->tcp_snxt = tcp->tcp_fss + 1; 21686 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 21687 } 21688 21689 /* 21690 * If needed, update tcp_rexmit_snxt as tcp_snxt is 21691 * changed. 21692 */ 21693 if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { 21694 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 21695 } 21696 } else { 21697 /* 21698 * If tcp->tcp_cork is set, then the data will not get sent, 21699 * so we have to check that and unset it first. 21700 */ 21701 if (tcp->tcp_cork) 21702 tcp->tcp_cork = B_FALSE; 21703 tcp_wput_data(tcp, NULL, B_FALSE); 21704 } 21705 21706 /* 21707 * If TCP does not get enough samples of RTT or tcp_rtt_updates 21708 * is 0, don't update the cache. 21709 */ 21710 if (tcp_rtt_updates == 0 || tcp->tcp_rtt_update < tcp_rtt_updates) 21711 return (0); 21712 21713 /* 21714 * NOTE: should not update if source routes i.e. if tcp_remote if 21715 * different from the destination. 
21716 */ 21717 if (tcp->tcp_ipversion == IPV4_VERSION) { 21718 if (tcp->tcp_remote != tcp->tcp_ipha->ipha_dst) { 21719 return (0); 21720 } 21721 mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, 21722 &ipic); 21723 } else { 21724 if (!(IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, 21725 &tcp->tcp_ip6h->ip6_dst))) { 21726 return (0); 21727 } 21728 mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, 21729 &ipic); 21730 } 21731 21732 /* Record route attributes in the IRE for use by future connections. */ 21733 if (mp == NULL) 21734 return (0); 21735 21736 /* 21737 * We do not have a good algorithm to update ssthresh at this time. 21738 * So don't do any update. 21739 */ 21740 ipic->ipic_rtt = tcp->tcp_rtt_sa; 21741 ipic->ipic_rtt_sd = tcp->tcp_rtt_sd; 21742 21743 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); 21744 return (0); 21745 } 21746 21747 /* 21748 * Generate a "no listener here" RST in response to an "unknown" segment. 21749 * Note that we are reusing the incoming mp to construct the outgoing 21750 * RST. 21751 */ 21752 void 21753 tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len) 21754 { 21755 uchar_t *rptr; 21756 uint32_t seg_len; 21757 tcph_t *tcph; 21758 uint32_t seg_seq; 21759 uint32_t seg_ack; 21760 uint_t flags; 21761 mblk_t *ipsec_mp; 21762 ipha_t *ipha; 21763 ip6_t *ip6h; 21764 boolean_t mctl_present = B_FALSE; 21765 boolean_t check = B_TRUE; 21766 boolean_t policy_present; 21767 21768 TCP_STAT(tcp_no_listener); 21769 21770 ipsec_mp = mp; 21771 21772 if (mp->b_datap->db_type == M_CTL) { 21773 ipsec_in_t *ii; 21774 21775 mctl_present = B_TRUE; 21776 mp = mp->b_cont; 21777 21778 ii = (ipsec_in_t *)ipsec_mp->b_rptr; 21779 ASSERT(ii->ipsec_in_type == IPSEC_IN); 21780 if (ii->ipsec_in_dont_check) { 21781 check = B_FALSE; 21782 if (!ii->ipsec_in_secure) { 21783 freeb(ipsec_mp); 21784 mctl_present = B_FALSE; 21785 ipsec_mp = mp; 21786 } 21787 } 21788 } 21789 21790 if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { 21791 policy_present = ipsec_inbound_v4_policy_present; 21792 ipha = (ipha_t *)mp->b_rptr; 21793 ip6h = NULL; 21794 } else { 21795 policy_present = ipsec_inbound_v6_policy_present; 21796 ipha = NULL; 21797 ip6h = (ip6_t *)mp->b_rptr; 21798 } 21799 21800 if (check && policy_present) { 21801 /* 21802 * The conn_t parameter is NULL because we already know 21803 * nobody's home. 21804 */ 21805 ipsec_mp = ipsec_check_global_policy( 21806 ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present); 21807 if (ipsec_mp == NULL) 21808 return; 21809 } 21810 if (is_system_labeled() && !tsol_can_reply_error(mp)) { 21811 DTRACE_PROBE2( 21812 tx__ip__log__error__nolistener__tcp, 21813 char *, "Could not reply with RST to mp(1)", 21814 mblk_t *, mp); 21815 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); 21816 freemsg(ipsec_mp); 21817 return; 21818 } 21819 21820 rptr = mp->b_rptr; 21821 21822 tcph = (tcph_t *)&rptr[ip_hdr_len]; 21823 seg_seq = BE32_TO_U32(tcph->th_seq); 21824 seg_ack = BE32_TO_U32(tcph->th_ack); 21825 flags = tcph->th_flags[0]; 21826 21827 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); 21828 if (flags & TH_RST) { 21829 freemsg(ipsec_mp); 21830 } else if (flags & TH_ACK) { 21831 tcp_xmit_early_reset("no tcp, reset", 21832 ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len); 21833 } else { 21834 if (flags & TH_SYN) { 21835 seg_len++; 21836 } else { 21837 /* 21838 * Here we violate the RFC. Note that a normal 21839 * TCP will never send a segment without the ACK 21840 * flag, except for RST or SYN segment. This 21841 * segment is neither. 
Just drop it on the 21842 * floor. 21843 */ 21844 freemsg(ipsec_mp); 21845 tcp_rst_unsent++; 21846 return; 21847 } 21848 21849 tcp_xmit_early_reset("no tcp, reset/ack", 21850 ipsec_mp, 0, seg_seq + seg_len, 21851 TH_RST | TH_ACK, ip_hdr_len); 21852 } 21853 } 21854 21855 /* 21856 * tcp_xmit_mp is called to return a pointer to an mblk chain complete with 21857 * ip and tcp header ready to pass down to IP. If the mp passed in is 21858 * non-NULL, then up to max_to_send bytes of data will be dup'ed off that 21859 * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary 21860 * otherwise it will dup partial mblks.) 21861 * Otherwise, an appropriate ACK packet will be generated. This 21862 * routine is not usually called to send new data for the first time. It 21863 * is mostly called out of the timer for retransmits, and to generate ACKs. 21864 * 21865 * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will 21866 * be adjusted by *offset. And after dupb(), the offset and the ending mblk 21867 * of the original mblk chain will be returned in *offset and *end_mp. 21868 */ 21869 static mblk_t * 21870 tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, 21871 mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, 21872 boolean_t rexmit) 21873 { 21874 int data_length; 21875 int32_t off = 0; 21876 uint_t flags; 21877 mblk_t *mp1; 21878 mblk_t *mp2; 21879 uchar_t *rptr; 21880 tcph_t *tcph; 21881 int32_t num_sack_blk = 0; 21882 int32_t sack_opt_len = 0; 21883 21884 /* Allocate for our maximum TCP header + link-level */ 21885 mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, 21886 BPRI_MED); 21887 if (!mp1) 21888 return (NULL); 21889 data_length = 0; 21890 21891 /* 21892 * Note that tcp_mss has been adjusted to take into account the 21893 * timestamp option if applicable. Because SACK options do not 21894 * appear in every TCP segments and they are of variable lengths, 21895 * they cannot be included in tcp_mss. Thus we need to calculate 21896 * the actual segment length when we need to send a segment which 21897 * includes SACK options. 21898 */ 21899 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 21900 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 21901 tcp->tcp_num_sack_blk); 21902 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 21903 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 21904 if (max_to_send + sack_opt_len > tcp->tcp_mss) 21905 max_to_send -= sack_opt_len; 21906 } 21907 21908 if (offset != NULL) { 21909 off = *offset; 21910 /* We use offset as an indicator that end_mp is not NULL. */ 21911 *end_mp = NULL; 21912 } 21913 for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { 21914 /* This could be faster with cooperation from downstream */ 21915 if (mp2 != mp1 && !sendall && 21916 data_length + (int)(mp->b_wptr - mp->b_rptr) > 21917 max_to_send) 21918 /* 21919 * Don't send the next mblk since the whole mblk 21920 * does not fit. 
21921 */ 21922 break; 21923 mp2->b_cont = dupb(mp); 21924 mp2 = mp2->b_cont; 21925 if (!mp2) { 21926 freemsg(mp1); 21927 return (NULL); 21928 } 21929 mp2->b_rptr += off; 21930 ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= 21931 (uintptr_t)INT_MAX); 21932 21933 data_length += (int)(mp2->b_wptr - mp2->b_rptr); 21934 if (data_length > max_to_send) { 21935 mp2->b_wptr -= data_length - max_to_send; 21936 data_length = max_to_send; 21937 off = mp2->b_wptr - mp->b_rptr; 21938 break; 21939 } else { 21940 off = 0; 21941 } 21942 } 21943 if (offset != NULL) { 21944 *offset = off; 21945 *end_mp = mp; 21946 } 21947 if (seg_len != NULL) { 21948 *seg_len = data_length; 21949 } 21950 21951 /* Update the latest receive window size in TCP header. */ 21952 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 21953 tcp->tcp_tcph->th_win); 21954 21955 rptr = mp1->b_rptr + tcp_wroff_xtra; 21956 mp1->b_rptr = rptr; 21957 mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; 21958 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 21959 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 21960 U32_TO_ABE32(seq, tcph->th_seq); 21961 21962 /* 21963 * Use tcp_unsent to determine if the PUSH bit should be used assumes 21964 * that this function was called from tcp_wput_data. Thus, when called 21965 * to retransmit data the setting of the PUSH bit may appear some 21966 * what random in that it might get set when it should not. This 21967 * should not pose any performance issues. 21968 */ 21969 if (data_length != 0 && (tcp->tcp_unsent == 0 || 21970 tcp->tcp_unsent == data_length)) { 21971 flags = TH_ACK | TH_PUSH; 21972 } else { 21973 flags = TH_ACK; 21974 } 21975 21976 if (tcp->tcp_ecn_ok) { 21977 if (tcp->tcp_ecn_echo_on) 21978 flags |= TH_ECE; 21979 21980 /* 21981 * Only set ECT bit and ECN_CWR if a segment contains new data. 21982 * There is no TCP flow control for non-data segments, and 21983 * only data segment is transmitted reliably. 21984 */ 21985 if (data_length > 0 && !rexmit) { 21986 SET_ECT(tcp, rptr); 21987 if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { 21988 flags |= TH_CWR; 21989 tcp->tcp_ecn_cwr_sent = B_TRUE; 21990 } 21991 } 21992 } 21993 21994 if (tcp->tcp_valid_bits) { 21995 uint32_t u1; 21996 21997 if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && 21998 seq == tcp->tcp_iss) { 21999 uchar_t *wptr; 22000 22001 /* 22002 * If TCP_ISS_VALID and the seq number is tcp_iss, 22003 * TCP can only be in SYN-SENT, SYN-RCVD or 22004 * FIN-WAIT-1 state. It can be FIN-WAIT-1 if 22005 * our SYN is not ack'ed but the app closes this 22006 * TCP connection. 22007 */ 22008 ASSERT(tcp->tcp_state == TCPS_SYN_SENT || 22009 tcp->tcp_state == TCPS_SYN_RCVD || 22010 tcp->tcp_state == TCPS_FIN_WAIT_1); 22011 22012 /* 22013 * Tack on the MSS option. It is always needed 22014 * for both active and passive open. 22015 * 22016 * MSS option value should be interface MTU - MIN 22017 * TCP/IP header according to RFC 793 as it means 22018 * the maximum segment size TCP can receive. But 22019 * to get around some broken middle boxes/end hosts 22020 * out there, we allow the option value to be the 22021 * same as the MSS option size on the peer side. 22022 * In this way, the other side will not send 22023 * anything larger than they can receive. 22024 * 22025 * Note that for SYN_SENT state, the ndd param 22026 * tcp_use_smss_as_mss_opt has no effect as we 22027 * don't know the peer's MSS option value. So 22028 * the only case we need to take care of is in 22029 * SYN_RCVD state, which is done later. 
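 *
 * Illustrative arithmetic only (the 1500-byte MTU is an assumed
 * example value; the header constants are the real ones): for an
 * Ethernet interface with tcp_if_mtu = 1500, the option value
 * computed below is
 *
 *	IPv4:	1500 - 20 (IP_SIMPLE_HDR_LENGTH)
 *		     - 20 (TCP_MIN_HEADER_LENGTH) = 1460
 *	IPv6:	1500 - 40 (IPV6_HDR_LEN)
 *		     - 20 (TCP_MIN_HEADER_LENGTH) = 1440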
22030 */ 22031 wptr = mp1->b_wptr; 22032 wptr[0] = TCPOPT_MAXSEG; 22033 wptr[1] = TCPOPT_MAXSEG_LEN; 22034 wptr += 2; 22035 u1 = tcp->tcp_if_mtu - 22036 (tcp->tcp_ipversion == IPV4_VERSION ? 22037 IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - 22038 TCP_MIN_HEADER_LENGTH; 22039 U16_TO_BE16(u1, wptr); 22040 mp1->b_wptr = wptr + 2; 22041 /* Update the offset to cover the additional word */ 22042 tcph->th_offset_and_rsrvd[0] += (1 << 4); 22043 22044 /* 22045 * Note that the following way of filling in 22046 * TCP options are not optimal. Some NOPs can 22047 * be saved. But there is no need at this time 22048 * to optimize it. When it is needed, we will 22049 * do it. 22050 */ 22051 switch (tcp->tcp_state) { 22052 case TCPS_SYN_SENT: 22053 flags = TH_SYN; 22054 22055 if (tcp->tcp_snd_ts_ok) { 22056 uint32_t llbolt = (uint32_t)lbolt; 22057 22058 wptr = mp1->b_wptr; 22059 wptr[0] = TCPOPT_NOP; 22060 wptr[1] = TCPOPT_NOP; 22061 wptr[2] = TCPOPT_TSTAMP; 22062 wptr[3] = TCPOPT_TSTAMP_LEN; 22063 wptr += 4; 22064 U32_TO_BE32(llbolt, wptr); 22065 wptr += 4; 22066 ASSERT(tcp->tcp_ts_recent == 0); 22067 U32_TO_BE32(0L, wptr); 22068 mp1->b_wptr += TCPOPT_REAL_TS_LEN; 22069 tcph->th_offset_and_rsrvd[0] += 22070 (3 << 4); 22071 } 22072 22073 /* 22074 * Set up all the bits to tell other side 22075 * we are ECN capable. 22076 */ 22077 if (tcp->tcp_ecn_ok) { 22078 flags |= (TH_ECE | TH_CWR); 22079 } 22080 break; 22081 case TCPS_SYN_RCVD: 22082 flags |= TH_SYN; 22083 22084 /* 22085 * Reset the MSS option value to be SMSS 22086 * We should probably add back the bytes 22087 * for timestamp option and IPsec. We 22088 * don't do that as this is a workaround 22089 * for broken middle boxes/end hosts, it 22090 * is better for us to be more cautious. 22091 * They may not take these things into 22092 * account in their SMSS calculation. Thus 22093 * the peer's calculated SMSS may be smaller 22094 * than what it can be. This should be OK. 22095 */ 22096 if (tcp_use_smss_as_mss_opt) { 22097 u1 = tcp->tcp_mss; 22098 U16_TO_BE16(u1, wptr); 22099 } 22100 22101 /* 22102 * If the other side is ECN capable, reply 22103 * that we are also ECN capable. 22104 */ 22105 if (tcp->tcp_ecn_ok) 22106 flags |= TH_ECE; 22107 break; 22108 default: 22109 /* 22110 * The above ASSERT() makes sure that this 22111 * must be FIN-WAIT-1 state. Our SYN has 22112 * not been ack'ed so retransmit it. 22113 */ 22114 flags |= TH_SYN; 22115 break; 22116 } 22117 22118 if (tcp->tcp_snd_ws_ok) { 22119 wptr = mp1->b_wptr; 22120 wptr[0] = TCPOPT_NOP; 22121 wptr[1] = TCPOPT_WSCALE; 22122 wptr[2] = TCPOPT_WS_LEN; 22123 wptr[3] = (uchar_t)tcp->tcp_rcv_ws; 22124 mp1->b_wptr += TCPOPT_REAL_WS_LEN; 22125 tcph->th_offset_and_rsrvd[0] += (1 << 4); 22126 } 22127 22128 if (tcp->tcp_snd_sack_ok) { 22129 wptr = mp1->b_wptr; 22130 wptr[0] = TCPOPT_NOP; 22131 wptr[1] = TCPOPT_NOP; 22132 wptr[2] = TCPOPT_SACK_PERMITTED; 22133 wptr[3] = TCPOPT_SACK_OK_LEN; 22134 mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; 22135 tcph->th_offset_and_rsrvd[0] += (1 << 4); 22136 } 22137 22138 /* allocb() of adequate mblk assures space */ 22139 ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= 22140 (uintptr_t)INT_MAX); 22141 u1 = (int)(mp1->b_wptr - mp1->b_rptr); 22142 /* 22143 * Get IP set to checksum on our behalf 22144 * Include the adjustment for a source route if any. 
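 *
 * The code below folds the 32-bit partial sum into 16 bits using
 * standard ones-complement carry folding.  A stand-alone sketch of
 * the same idea (purely illustrative, not used anywhere in this
 * file):
 *
 *	static uint16_t
 *	csum_fold(uint32_t sum)
 *	{
 *		sum = (sum >> 16) + (sum & 0xffff);
 *		sum += (sum >> 16);
 *		return ((uint16_t)sum);
 *	}
 *
 * Here only a partial sum (header length plus tcp_sum) is primed into
 * th_sum; IP completes the real checksum on transmit, as noted above.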
22145 */ 22146 u1 += tcp->tcp_sum; 22147 u1 = (u1 >> 16) + (u1 & 0xFFFF); 22148 U16_TO_BE16(u1, tcph->th_sum); 22149 BUMP_MIB(&tcp_mib, tcpOutControl); 22150 } 22151 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && 22152 (seq + data_length) == tcp->tcp_fss) { 22153 if (!tcp->tcp_fin_acked) { 22154 flags |= TH_FIN; 22155 BUMP_MIB(&tcp_mib, tcpOutControl); 22156 } 22157 if (!tcp->tcp_fin_sent) { 22158 tcp->tcp_fin_sent = B_TRUE; 22159 switch (tcp->tcp_state) { 22160 case TCPS_SYN_RCVD: 22161 case TCPS_ESTABLISHED: 22162 tcp->tcp_state = TCPS_FIN_WAIT_1; 22163 break; 22164 case TCPS_CLOSE_WAIT: 22165 tcp->tcp_state = TCPS_LAST_ACK; 22166 break; 22167 } 22168 if (tcp->tcp_suna == tcp->tcp_snxt) 22169 TCP_TIMER_RESTART(tcp, tcp->tcp_rto); 22170 tcp->tcp_snxt = tcp->tcp_fss + 1; 22171 } 22172 } 22173 /* 22174 * Note the trick here. u1 is unsigned. When tcp_urg 22175 * is smaller than seq, u1 will become a very huge value. 22176 * So the comparison will fail. Also note that tcp_urp 22177 * should be positive, see RFC 793 page 17. 22178 */ 22179 u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION; 22180 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 && 22181 u1 < (uint32_t)(64 * 1024)) { 22182 flags |= TH_URG; 22183 BUMP_MIB(&tcp_mib, tcpOutUrg); 22184 U32_TO_ABE16(u1, tcph->th_urp); 22185 } 22186 } 22187 tcph->th_flags[0] = (uchar_t)flags; 22188 tcp->tcp_rack = tcp->tcp_rnxt; 22189 tcp->tcp_rack_cnt = 0; 22190 22191 if (tcp->tcp_snd_ts_ok) { 22192 if (tcp->tcp_state != TCPS_SYN_SENT) { 22193 uint32_t llbolt = (uint32_t)lbolt; 22194 22195 U32_TO_BE32(llbolt, 22196 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 22197 U32_TO_BE32(tcp->tcp_ts_recent, 22198 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 22199 } 22200 } 22201 22202 if (num_sack_blk > 0) { 22203 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 22204 sack_blk_t *tmp; 22205 int32_t i; 22206 22207 wptr[0] = TCPOPT_NOP; 22208 wptr[1] = TCPOPT_NOP; 22209 wptr[2] = TCPOPT_SACK; 22210 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 22211 sizeof (sack_blk_t); 22212 wptr += TCPOPT_REAL_SACK_LEN; 22213 22214 tmp = tcp->tcp_sack_list; 22215 for (i = 0; i < num_sack_blk; i++) { 22216 U32_TO_BE32(tmp[i].begin, wptr); 22217 wptr += sizeof (tcp_seq); 22218 U32_TO_BE32(tmp[i].end, wptr); 22219 wptr += sizeof (tcp_seq); 22220 } 22221 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); 22222 } 22223 ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); 22224 data_length += (int)(mp1->b_wptr - rptr); 22225 if (tcp->tcp_ipversion == IPV4_VERSION) { 22226 ((ipha_t *)rptr)->ipha_length = htons(data_length); 22227 } else { 22228 ip6_t *ip6 = (ip6_t *)(rptr + 22229 (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 22230 sizeof (ip6i_t) : 0)); 22231 22232 ip6->ip6_plen = htons(data_length - 22233 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 22234 } 22235 22236 /* 22237 * Prime pump for IP 22238 * Include the adjustment for a source route if any. 22239 */ 22240 data_length -= tcp->tcp_ip_hdr_len; 22241 data_length += tcp->tcp_sum; 22242 data_length = (data_length >> 16) + (data_length & 0xFFFF); 22243 U16_TO_ABE16(data_length, tcph->th_sum); 22244 if (tcp->tcp_ip_forward_progress) { 22245 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 22246 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 22247 tcp->tcp_ip_forward_progress = B_FALSE; 22248 } 22249 return (mp1); 22250 } 22251 22252 /* This function handles the push timeout. 
*/ 22253 void 22254 tcp_push_timer(void *arg) 22255 { 22256 conn_t *connp = (conn_t *)arg; 22257 tcp_t *tcp = connp->conn_tcp; 22258 22259 TCP_DBGSTAT(tcp_push_timer_cnt); 22260 22261 ASSERT(tcp->tcp_listener == NULL); 22262 22263 /* 22264 * We need to plug synchronous streams during our drain to prevent 22265 * a race with tcp_fuse_rrw() or tcp_fusion_rinfop(). 22266 */ 22267 TCP_FUSE_SYNCSTR_PLUG_DRAIN(tcp); 22268 tcp->tcp_push_tid = 0; 22269 if ((tcp->tcp_rcv_list != NULL) && 22270 (tcp_rcv_drain(tcp->tcp_rq, tcp) == TH_ACK_NEEDED)) 22271 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); 22272 TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); 22273 } 22274 22275 /* 22276 * This function handles delayed ACK timeout. 22277 */ 22278 static void 22279 tcp_ack_timer(void *arg) 22280 { 22281 conn_t *connp = (conn_t *)arg; 22282 tcp_t *tcp = connp->conn_tcp; 22283 mblk_t *mp; 22284 22285 TCP_DBGSTAT(tcp_ack_timer_cnt); 22286 22287 tcp->tcp_ack_tid = 0; 22288 22289 if (tcp->tcp_fused) 22290 return; 22291 22292 /* 22293 * Do not send ACK if there is no outstanding unack'ed data. 22294 */ 22295 if (tcp->tcp_rnxt == tcp->tcp_rack) { 22296 return; 22297 } 22298 22299 if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) { 22300 /* 22301 * Make sure we don't allow deferred ACKs to result in 22302 * timer-based ACKing. If we have held off an ACK 22303 * when there was more than an mss here, and the timer 22304 * goes off, we have to worry about the possibility 22305 * that the sender isn't doing slow-start, or is out 22306 * of step with us for some other reason. We fall 22307 * permanently back in the direction of 22308 * ACK-every-other-packet as suggested in RFC 1122. 22309 */ 22310 if (tcp->tcp_rack_abs_max > 2) 22311 tcp->tcp_rack_abs_max--; 22312 tcp->tcp_rack_cur_max = 2; 22313 } 22314 mp = tcp_ack_mp(tcp); 22315 22316 if (mp != NULL) { 22317 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); 22318 BUMP_LOCAL(tcp->tcp_obsegs); 22319 BUMP_MIB(&tcp_mib, tcpOutAck); 22320 BUMP_MIB(&tcp_mib, tcpOutAckDelayed); 22321 tcp_send_data(tcp, tcp->tcp_wq, mp); 22322 } 22323 } 22324 22325 22326 /* Generate an ACK-only (no data) segment for a TCP endpoint */ 22327 static mblk_t * 22328 tcp_ack_mp(tcp_t *tcp) 22329 { 22330 uint32_t seq_no; 22331 22332 /* 22333 * There are a few cases to be considered while setting the sequence no. 22334 * Essentially, we can come here while processing an unacceptable pkt 22335 * in the TCPS_SYN_RCVD state, in which case we set the sequence number 22336 * to snxt (per RFC 793), note the swnd wouldn't have been set yet. 22337 * If we are here for a zero window probe, stick with suna. In all 22338 * other cases, we check if suna + swnd encompasses snxt and set 22339 * the sequence number to snxt, if so. If snxt falls outside the 22340 * window (the receiver probably shrunk its window), we will go with 22341 * suna + swnd, otherwise the sequence no will be unacceptable to the 22342 * receiver. 22343 */ 22344 if (tcp->tcp_zero_win_probe) { 22345 seq_no = tcp->tcp_suna; 22346 } else if (tcp->tcp_state == TCPS_SYN_RCVD) { 22347 ASSERT(tcp->tcp_swnd == 0); 22348 seq_no = tcp->tcp_snxt; 22349 } else { 22350 seq_no = SEQ_GT(tcp->tcp_snxt, 22351 (tcp->tcp_suna + tcp->tcp_swnd)) ? 22352 (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt; 22353 } 22354 22355 if (tcp->tcp_valid_bits) { 22356 /* 22357 * For the complex case where we have to send some 22358 * controls (FIN or SYN), let tcp_xmit_mp do it. 
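 *
 * Worked example of the sequence number selection above (numbers are
 * invented): with tcp_suna = 1000, tcp_swnd = 500 and tcp_snxt =
 * 1600, snxt lies beyond the right edge of the peer's window (1500),
 * so the segment is sent with sequence number 1500 (suna + swnd);
 * with tcp_snxt = 1400 it would be used unchanged.  The call below
 * passes a NULL mp and a zero length, so tcp_xmit_mp() builds a
 * header-only segment at seq_no carrying the owed SYN or FIN control
 * bits plus the ACK.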
22359 */ 22360 return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE, 22361 NULL, B_FALSE)); 22362 } else { 22363 /* Generate a simple ACK */ 22364 int data_length; 22365 uchar_t *rptr; 22366 tcph_t *tcph; 22367 mblk_t *mp1; 22368 int32_t tcp_hdr_len; 22369 int32_t tcp_tcp_hdr_len; 22370 int32_t num_sack_blk = 0; 22371 int32_t sack_opt_len; 22372 22373 /* 22374 * Allocate space for TCP + IP headers 22375 * and link-level header 22376 */ 22377 if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { 22378 num_sack_blk = MIN(tcp->tcp_max_sack_blk, 22379 tcp->tcp_num_sack_blk); 22380 sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + 22381 TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; 22382 tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; 22383 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + sack_opt_len; 22384 } else { 22385 tcp_hdr_len = tcp->tcp_hdr_len; 22386 tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; 22387 } 22388 mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, BPRI_MED); 22389 if (!mp1) 22390 return (NULL); 22391 22392 /* Update the latest receive window size in TCP header. */ 22393 U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, 22394 tcp->tcp_tcph->th_win); 22395 /* copy in prototype TCP + IP header */ 22396 rptr = mp1->b_rptr + tcp_wroff_xtra; 22397 mp1->b_rptr = rptr; 22398 mp1->b_wptr = rptr + tcp_hdr_len; 22399 bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); 22400 22401 tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; 22402 22403 /* Set the TCP sequence number. */ 22404 U32_TO_ABE32(seq_no, tcph->th_seq); 22405 22406 /* Set up the TCP flag field. */ 22407 tcph->th_flags[0] = (uchar_t)TH_ACK; 22408 if (tcp->tcp_ecn_echo_on) 22409 tcph->th_flags[0] |= TH_ECE; 22410 22411 tcp->tcp_rack = tcp->tcp_rnxt; 22412 tcp->tcp_rack_cnt = 0; 22413 22414 /* fill in timestamp option if in use */ 22415 if (tcp->tcp_snd_ts_ok) { 22416 uint32_t llbolt = (uint32_t)lbolt; 22417 22418 U32_TO_BE32(llbolt, 22419 (char *)tcph+TCP_MIN_HEADER_LENGTH+4); 22420 U32_TO_BE32(tcp->tcp_ts_recent, 22421 (char *)tcph+TCP_MIN_HEADER_LENGTH+8); 22422 } 22423 22424 /* Fill in SACK options */ 22425 if (num_sack_blk > 0) { 22426 uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; 22427 sack_blk_t *tmp; 22428 int32_t i; 22429 22430 wptr[0] = TCPOPT_NOP; 22431 wptr[1] = TCPOPT_NOP; 22432 wptr[2] = TCPOPT_SACK; 22433 wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * 22434 sizeof (sack_blk_t); 22435 wptr += TCPOPT_REAL_SACK_LEN; 22436 22437 tmp = tcp->tcp_sack_list; 22438 for (i = 0; i < num_sack_blk; i++) { 22439 U32_TO_BE32(tmp[i].begin, wptr); 22440 wptr += sizeof (tcp_seq); 22441 U32_TO_BE32(tmp[i].end, wptr); 22442 wptr += sizeof (tcp_seq); 22443 } 22444 tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) 22445 << 4); 22446 } 22447 22448 if (tcp->tcp_ipversion == IPV4_VERSION) { 22449 ((ipha_t *)rptr)->ipha_length = htons(tcp_hdr_len); 22450 } else { 22451 /* Check for ip6i_t header in sticky hdrs */ 22452 ip6_t *ip6 = (ip6_t *)(rptr + 22453 (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 22454 sizeof (ip6i_t) : 0)); 22455 22456 ip6->ip6_plen = htons(tcp_hdr_len - 22457 ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); 22458 } 22459 22460 /* 22461 * Prime pump for checksum calculation in IP. Include the 22462 * adjustment for a source route if any. 
22463 */ 22464 data_length = tcp_tcp_hdr_len + tcp->tcp_sum; 22465 data_length = (data_length >> 16) + (data_length & 0xFFFF); 22466 U16_TO_ABE16(data_length, tcph->th_sum); 22467 22468 if (tcp->tcp_ip_forward_progress) { 22469 ASSERT(tcp->tcp_ipversion == IPV6_VERSION); 22470 *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; 22471 tcp->tcp_ip_forward_progress = B_FALSE; 22472 } 22473 return (mp1); 22474 } 22475 } 22476 22477 /* 22478 * To create a temporary tcp structure for inserting into bind hash list. 22479 * The parameter is assumed to be in network byte order, ready for use. 22480 */ 22481 /* ARGSUSED */ 22482 static tcp_t * 22483 tcp_alloc_temp_tcp(in_port_t port) 22484 { 22485 conn_t *connp; 22486 tcp_t *tcp; 22487 22488 connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP); 22489 if (connp == NULL) 22490 return (NULL); 22491 22492 tcp = connp->conn_tcp; 22493 22494 /* 22495 * Only initialize the necessary info in those structures. Note 22496 * that since INADDR_ANY is all 0, we do not need to set 22497 * tcp_bound_source to INADDR_ANY here. 22498 */ 22499 tcp->tcp_state = TCPS_BOUND; 22500 tcp->tcp_lport = port; 22501 tcp->tcp_exclbind = 1; 22502 tcp->tcp_reserved_port = 1; 22503 22504 /* Just for place holding... */ 22505 tcp->tcp_ipversion = IPV4_VERSION; 22506 22507 return (tcp); 22508 } 22509 22510 /* 22511 * To remove a port range specified by lo_port and hi_port from the 22512 * reserved port ranges. This is one of the three public functions of 22513 * the reserved port interface. Note that a port range has to be removed 22514 * as a whole. Ports in a range cannot be removed individually. 22515 * 22516 * Params: 22517 * in_port_t lo_port: the beginning port of the reserved port range to 22518 * be deleted. 22519 * in_port_t hi_port: the ending port of the reserved port range to 22520 * be deleted. 22521 * 22522 * Return: 22523 * B_TRUE if the deletion is successful, B_FALSE otherwise. 22524 */ 22525 boolean_t 22526 tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port) 22527 { 22528 int i, j; 22529 int size; 22530 tcp_t **temp_tcp_array; 22531 tcp_t *tcp; 22532 22533 rw_enter(&tcp_reserved_port_lock, RW_WRITER); 22534 22535 /* First make sure that the port ranage is indeed reserved. */ 22536 for (i = 0; i < tcp_reserved_port_array_size; i++) { 22537 if (tcp_reserved_port[i].lo_port == lo_port) { 22538 hi_port = tcp_reserved_port[i].hi_port; 22539 temp_tcp_array = tcp_reserved_port[i].temp_tcp_array; 22540 break; 22541 } 22542 } 22543 if (i == tcp_reserved_port_array_size) { 22544 rw_exit(&tcp_reserved_port_lock); 22545 return (B_FALSE); 22546 } 22547 22548 /* 22549 * Remove the range from the array. This simple loop is possible 22550 * because port ranges are inserted in ascending order. 22551 */ 22552 for (j = i; j < tcp_reserved_port_array_size - 1; j++) { 22553 tcp_reserved_port[j].lo_port = tcp_reserved_port[j+1].lo_port; 22554 tcp_reserved_port[j].hi_port = tcp_reserved_port[j+1].hi_port; 22555 tcp_reserved_port[j].temp_tcp_array = 22556 tcp_reserved_port[j+1].temp_tcp_array; 22557 } 22558 22559 /* Remove all the temporary tcp structures. 
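 *
 * Concrete illustration (range invented): if the range being deleted
 * is 10000-10003, temp_tcp_array holds four placeholder tcp_t's, one
 * bound to each of those ports by tcp_reserved_port_add().  The loop
 * below unhooks each one from its bind hash bucket and drops the
 * conn reference taken when it was created, after which the array
 * itself is freed.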
*/ 22560 size = hi_port - lo_port + 1; 22561 while (size > 0) { 22562 tcp = temp_tcp_array[size - 1]; 22563 ASSERT(tcp != NULL); 22564 tcp_bind_hash_remove(tcp); 22565 CONN_DEC_REF(tcp->tcp_connp); 22566 size--; 22567 } 22568 kmem_free(temp_tcp_array, (hi_port - lo_port + 1) * sizeof (tcp_t *)); 22569 tcp_reserved_port_array_size--; 22570 rw_exit(&tcp_reserved_port_lock); 22571 return (B_TRUE); 22572 } 22573 22574 /* 22575 * Macro to remove temporary tcp structure from the bind hash list. The 22576 * first parameter is the list of tcp to be removed. The second parameter 22577 * is the number of tcps in the array. 22578 */ 22579 #define TCP_TMP_TCP_REMOVE(tcp_array, num) \ 22580 { \ 22581 while ((num) > 0) { \ 22582 tcp_t *tcp = (tcp_array)[(num) - 1]; \ 22583 tf_t *tbf; \ 22584 tcp_t *tcpnext; \ 22585 tbf = &tcp_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)]; \ 22586 mutex_enter(&tbf->tf_lock); \ 22587 tcpnext = tcp->tcp_bind_hash; \ 22588 if (tcpnext) { \ 22589 tcpnext->tcp_ptpbhn = \ 22590 tcp->tcp_ptpbhn; \ 22591 } \ 22592 *tcp->tcp_ptpbhn = tcpnext; \ 22593 mutex_exit(&tbf->tf_lock); \ 22594 kmem_free(tcp, sizeof (tcp_t)); \ 22595 (tcp_array)[(num) - 1] = NULL; \ 22596 (num)--; \ 22597 } \ 22598 } 22599 22600 /* 22601 * The public interface for other modules to call to reserve a port range 22602 * in TCP. The caller passes in how large a port range it wants. TCP 22603 * will try to find a range and return it via lo_port and hi_port. This is 22604 * used by NCA's nca_conn_init. 22605 * NCA can only be used in the global zone so this only affects the global 22606 * zone's ports. 22607 * 22608 * Params: 22609 * int size: the size of the port range to be reserved. 22610 * in_port_t *lo_port (referenced): returns the beginning port of the 22611 * reserved port range added. 22612 * in_port_t *hi_port (referenced): returns the ending port of the 22613 * reserved port range added. 22614 * 22615 * Return: 22616 * B_TRUE if the port reservation is successful, B_FALSE otherwise. 22617 */ 22618 boolean_t 22619 tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) 22620 { 22621 tcp_t *tcp; 22622 tcp_t *tmp_tcp; 22623 tcp_t **temp_tcp_array; 22624 tf_t *tbf; 22625 in_port_t net_port; 22626 in_port_t port; 22627 int32_t cur_size; 22628 int i, j; 22629 boolean_t used; 22630 tcp_rport_t tmp_ports[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE]; 22631 zoneid_t zoneid = GLOBAL_ZONEID; 22632 22633 /* Sanity check. */ 22634 if (size <= 0 || size > TCP_RESERVED_PORTS_RANGE_MAX) { 22635 return (B_FALSE); 22636 } 22637 22638 rw_enter(&tcp_reserved_port_lock, RW_WRITER); 22639 if (tcp_reserved_port_array_size == TCP_RESERVED_PORTS_ARRAY_MAX_SIZE) { 22640 rw_exit(&tcp_reserved_port_lock); 22641 return (B_FALSE); 22642 } 22643 22644 /* 22645 * Find the starting port to try. Since the port ranges are ordered 22646 * in the reserved port array, we can do a simple search here. 22647 */ 22648 *lo_port = TCP_SMALLEST_RESERVED_PORT; 22649 *hi_port = TCP_LARGEST_RESERVED_PORT; 22650 for (i = 0; i < tcp_reserved_port_array_size; 22651 *lo_port = tcp_reserved_port[i].hi_port + 1, i++) { 22652 if (tcp_reserved_port[i].lo_port - *lo_port >= size) { 22653 *hi_port = tcp_reserved_port[i].lo_port - 1; 22654 break; 22655 } 22656 } 22657 /* No available port range. 
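 *
 * Worked example of the search above (port numbers invented, and
 * ignoring the actual TCP_SMALLEST/LARGEST_RESERVED_PORT bounds):
 * with size = 50, an existing range ending at 10131 and the next one
 * starting at 10200, the loop advances *lo_port to 10132 after the
 * first range; since 10200 - 10132 >= 50 it stops with *hi_port =
 * 10199, and the scan below then checks that none of those ports is
 * already bound.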
*/ 22658 if (i == tcp_reserved_port_array_size && *hi_port - *lo_port < size) { 22659 rw_exit(&tcp_reserved_port_lock); 22660 return (B_FALSE); 22661 } 22662 22663 temp_tcp_array = kmem_zalloc(size * sizeof (tcp_t *), KM_NOSLEEP); 22664 if (temp_tcp_array == NULL) { 22665 rw_exit(&tcp_reserved_port_lock); 22666 return (B_FALSE); 22667 } 22668 22669 /* Go thru the port range to see if some ports are already bound. */ 22670 for (port = *lo_port, cur_size = 0; 22671 cur_size < size && port <= *hi_port; 22672 cur_size++, port++) { 22673 used = B_FALSE; 22674 net_port = htons(port); 22675 tbf = &tcp_bind_fanout[TCP_BIND_HASH(net_port)]; 22676 mutex_enter(&tbf->tf_lock); 22677 for (tcp = tbf->tf_tcp; tcp != NULL; 22678 tcp = tcp->tcp_bind_hash) { 22679 if (IPCL_ZONE_MATCH(tcp->tcp_connp, zoneid) && 22680 net_port == tcp->tcp_lport) { 22681 /* 22682 * A port is already bound. Search again 22683 * starting from port + 1. Release all 22684 * temporary tcps. 22685 */ 22686 mutex_exit(&tbf->tf_lock); 22687 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); 22688 *lo_port = port + 1; 22689 cur_size = -1; 22690 used = B_TRUE; 22691 break; 22692 } 22693 } 22694 if (!used) { 22695 if ((tmp_tcp = tcp_alloc_temp_tcp(net_port)) == NULL) { 22696 /* 22697 * Allocation failure. Just fail the request. 22698 * Need to remove all those temporary tcp 22699 * structures. 22700 */ 22701 mutex_exit(&tbf->tf_lock); 22702 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); 22703 rw_exit(&tcp_reserved_port_lock); 22704 kmem_free(temp_tcp_array, 22705 (hi_port - lo_port + 1) * 22706 sizeof (tcp_t *)); 22707 return (B_FALSE); 22708 } 22709 temp_tcp_array[cur_size] = tmp_tcp; 22710 tcp_bind_hash_insert(tbf, tmp_tcp, B_TRUE); 22711 mutex_exit(&tbf->tf_lock); 22712 } 22713 } 22714 22715 /* 22716 * The current range is not large enough. We can actually do another 22717 * search if this search is done between 2 reserved port ranges. But 22718 * for first release, we just stop here and return saying that no port 22719 * range is available. 22720 */ 22721 if (cur_size < size) { 22722 TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); 22723 rw_exit(&tcp_reserved_port_lock); 22724 kmem_free(temp_tcp_array, size * sizeof (tcp_t *)); 22725 return (B_FALSE); 22726 } 22727 *hi_port = port - 1; 22728 22729 /* 22730 * Insert range into array in ascending order. Since this function 22731 * must not be called often, we choose to use the simplest method. 22732 * The above array should not consume excessive stack space as 22733 * the size must be very small. If in future releases, we find 22734 * that we should provide more reserved port ranges, this function 22735 * has to be modified to be more efficient. 
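 *
 * Illustrative example of the insertion below (ranges invented): if
 * the array currently holds [9000-9009] and [11000-11009] and the
 * new range is [10000-10049], the copy loop emits [9000-9009] first,
 * then the new range when it sees 10000 < 11000 (with i == j still
 * true), then [11000-11009], and the bcopy leaves the array sorted
 * by lo_port.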
22736 */ 22737 if (tcp_reserved_port_array_size == 0) { 22738 tcp_reserved_port[0].lo_port = *lo_port; 22739 tcp_reserved_port[0].hi_port = *hi_port; 22740 tcp_reserved_port[0].temp_tcp_array = temp_tcp_array; 22741 } else { 22742 for (i = 0, j = 0; i < tcp_reserved_port_array_size; i++, j++) { 22743 if (*lo_port < tcp_reserved_port[i].lo_port && i == j) { 22744 tmp_ports[j].lo_port = *lo_port; 22745 tmp_ports[j].hi_port = *hi_port; 22746 tmp_ports[j].temp_tcp_array = temp_tcp_array; 22747 j++; 22748 } 22749 tmp_ports[j].lo_port = tcp_reserved_port[i].lo_port; 22750 tmp_ports[j].hi_port = tcp_reserved_port[i].hi_port; 22751 tmp_ports[j].temp_tcp_array = 22752 tcp_reserved_port[i].temp_tcp_array; 22753 } 22754 if (j == i) { 22755 tmp_ports[j].lo_port = *lo_port; 22756 tmp_ports[j].hi_port = *hi_port; 22757 tmp_ports[j].temp_tcp_array = temp_tcp_array; 22758 } 22759 bcopy(tmp_ports, tcp_reserved_port, sizeof (tmp_ports)); 22760 } 22761 tcp_reserved_port_array_size++; 22762 rw_exit(&tcp_reserved_port_lock); 22763 return (B_TRUE); 22764 } 22765 22766 /* 22767 * Check to see if a port is in any reserved port range. 22768 * 22769 * Params: 22770 * in_port_t port: the port to be verified. 22771 * 22772 * Return: 22773 * B_TRUE is the port is inside a reserved port range, B_FALSE otherwise. 22774 */ 22775 boolean_t 22776 tcp_reserved_port_check(in_port_t port) 22777 { 22778 int i; 22779 22780 rw_enter(&tcp_reserved_port_lock, RW_READER); 22781 for (i = 0; i < tcp_reserved_port_array_size; i++) { 22782 if (port >= tcp_reserved_port[i].lo_port || 22783 port <= tcp_reserved_port[i].hi_port) { 22784 rw_exit(&tcp_reserved_port_lock); 22785 return (B_TRUE); 22786 } 22787 } 22788 rw_exit(&tcp_reserved_port_lock); 22789 return (B_FALSE); 22790 } 22791 22792 /* 22793 * To list all reserved port ranges. This is the function to handle 22794 * ndd tcp_reserved_port_list. 22795 */ 22796 /* ARGSUSED */ 22797 static int 22798 tcp_reserved_port_list(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 22799 { 22800 int i; 22801 22802 rw_enter(&tcp_reserved_port_lock, RW_READER); 22803 if (tcp_reserved_port_array_size > 0) 22804 (void) mi_mpprintf(mp, "The following ports are reserved:"); 22805 else 22806 (void) mi_mpprintf(mp, "No port is reserved."); 22807 for (i = 0; i < tcp_reserved_port_array_size; i++) { 22808 (void) mi_mpprintf(mp, "%d-%d", 22809 tcp_reserved_port[i].lo_port, tcp_reserved_port[i].hi_port); 22810 } 22811 rw_exit(&tcp_reserved_port_lock); 22812 return (0); 22813 } 22814 22815 /* 22816 * Hash list insertion routine for tcp_t structures. 22817 * Inserts entries with the ones bound to a specific IP address first 22818 * followed by those bound to INADDR_ANY. 22819 */ 22820 static void 22821 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) 22822 { 22823 tcp_t **tcpp; 22824 tcp_t *tcpnext; 22825 22826 if (tcp->tcp_ptpbhn != NULL) { 22827 ASSERT(!caller_holds_lock); 22828 tcp_bind_hash_remove(tcp); 22829 } 22830 tcpp = &tbf->tf_tcp; 22831 if (!caller_holds_lock) { 22832 mutex_enter(&tbf->tf_lock); 22833 } else { 22834 ASSERT(MUTEX_HELD(&tbf->tf_lock)); 22835 } 22836 tcpnext = tcpp[0]; 22837 if (tcpnext) { 22838 /* 22839 * If the new tcp bound to the INADDR_ANY address 22840 * and the first one in the list is not bound to 22841 * INADDR_ANY we skip all entries until we find the 22842 * first one bound to INADDR_ANY. 22843 * This makes sure that applications binding to a 22844 * specific address get preference over those binding to 22845 * INADDR_ANY. 
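 *
 * Illustrative example of the resulting order (addresses invented):
 * if the bucket already holds an entry bound to 192.0.2.1 followed by
 * one bound to INADDR_ANY, inserting a new INADDR_ANY tcp makes the
 * walk below skip past 192.0.2.1, giving
 *
 *	192.0.2.1 -> new INADDR_ANY tcp -> old INADDR_ANY tcp
 *
 * whereas a new tcp bound to a specific address simply goes at the
 * head of the bucket.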
22846 */ 22847 if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) && 22848 !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) { 22849 while ((tcpnext = tcpp[0]) != NULL && 22850 !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) 22851 tcpp = &(tcpnext->tcp_bind_hash); 22852 if (tcpnext) 22853 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash; 22854 } else 22855 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash; 22856 } 22857 tcp->tcp_bind_hash = tcpnext; 22858 tcp->tcp_ptpbhn = tcpp; 22859 tcpp[0] = tcp; 22860 if (!caller_holds_lock) 22861 mutex_exit(&tbf->tf_lock); 22862 } 22863 22864 /* 22865 * Hash list removal routine for tcp_t structures. 22866 */ 22867 static void 22868 tcp_bind_hash_remove(tcp_t *tcp) 22869 { 22870 tcp_t *tcpnext; 22871 kmutex_t *lockp; 22872 22873 if (tcp->tcp_ptpbhn == NULL) 22874 return; 22875 22876 /* 22877 * Extract the lock pointer in case there are concurrent 22878 * hash_remove's for this instance. 22879 */ 22880 ASSERT(tcp->tcp_lport != 0); 22881 lockp = &tcp_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock; 22882 22883 ASSERT(lockp != NULL); 22884 mutex_enter(lockp); 22885 if (tcp->tcp_ptpbhn) { 22886 tcpnext = tcp->tcp_bind_hash; 22887 if (tcpnext) { 22888 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 22889 tcp->tcp_bind_hash = NULL; 22890 } 22891 *tcp->tcp_ptpbhn = tcpnext; 22892 tcp->tcp_ptpbhn = NULL; 22893 } 22894 mutex_exit(lockp); 22895 } 22896 22897 22898 /* 22899 * Hash list lookup routine for tcp_t structures. 22900 * Returns with a CONN_INC_REF tcp structure. Caller must do a CONN_DEC_REF. 22901 */ 22902 static tcp_t * 22903 tcp_acceptor_hash_lookup(t_uscalar_t id) 22904 { 22905 tf_t *tf; 22906 tcp_t *tcp; 22907 22908 tf = &tcp_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 22909 mutex_enter(&tf->tf_lock); 22910 for (tcp = tf->tf_tcp; tcp != NULL; 22911 tcp = tcp->tcp_acceptor_hash) { 22912 if (tcp->tcp_acceptor_id == id) { 22913 CONN_INC_REF(tcp->tcp_connp); 22914 mutex_exit(&tf->tf_lock); 22915 return (tcp); 22916 } 22917 } 22918 mutex_exit(&tf->tf_lock); 22919 return (NULL); 22920 } 22921 22922 22923 /* 22924 * Hash list insertion routine for tcp_t structures. 22925 */ 22926 void 22927 tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp) 22928 { 22929 tf_t *tf; 22930 tcp_t **tcpp; 22931 tcp_t *tcpnext; 22932 22933 tf = &tcp_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; 22934 22935 if (tcp->tcp_ptpahn != NULL) 22936 tcp_acceptor_hash_remove(tcp); 22937 tcpp = &tf->tf_tcp; 22938 mutex_enter(&tf->tf_lock); 22939 tcpnext = tcpp[0]; 22940 if (tcpnext) 22941 tcpnext->tcp_ptpahn = &tcp->tcp_acceptor_hash; 22942 tcp->tcp_acceptor_hash = tcpnext; 22943 tcp->tcp_ptpahn = tcpp; 22944 tcpp[0] = tcp; 22945 tcp->tcp_acceptor_lockp = &tf->tf_lock; /* For tcp_*_hash_remove */ 22946 mutex_exit(&tf->tf_lock); 22947 } 22948 22949 /* 22950 * Hash list removal routine for tcp_t structures. 22951 */ 22952 static void 22953 tcp_acceptor_hash_remove(tcp_t *tcp) 22954 { 22955 tcp_t *tcpnext; 22956 kmutex_t *lockp; 22957 22958 /* 22959 * Extract the lock pointer in case there are concurrent 22960 * hash_remove's for this instance. 
22961 */ 22962 lockp = tcp->tcp_acceptor_lockp; 22963 22964 if (tcp->tcp_ptpahn == NULL) 22965 return; 22966 22967 ASSERT(lockp != NULL); 22968 mutex_enter(lockp); 22969 if (tcp->tcp_ptpahn) { 22970 tcpnext = tcp->tcp_acceptor_hash; 22971 if (tcpnext) { 22972 tcpnext->tcp_ptpahn = tcp->tcp_ptpahn; 22973 tcp->tcp_acceptor_hash = NULL; 22974 } 22975 *tcp->tcp_ptpahn = tcpnext; 22976 tcp->tcp_ptpahn = NULL; 22977 } 22978 mutex_exit(lockp); 22979 tcp->tcp_acceptor_lockp = NULL; 22980 } 22981 22982 /* ARGSUSED */ 22983 static int 22984 tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) 22985 { 22986 int error = 0; 22987 int retval; 22988 char *end; 22989 22990 tcp_hsp_t *hsp; 22991 tcp_hsp_t *hspprev; 22992 22993 ipaddr_t addr = 0; /* Address we're looking for */ 22994 in6_addr_t v6addr; /* Address we're looking for */ 22995 uint32_t hash; /* Hash of that address */ 22996 22997 /* 22998 * If the following variables are still zero after parsing the input 22999 * string, the user didn't specify them and we don't change them in 23000 * the HSP. 23001 */ 23002 23003 ipaddr_t mask = 0; /* Subnet mask */ 23004 in6_addr_t v6mask; 23005 long sendspace = 0; /* Send buffer size */ 23006 long recvspace = 0; /* Receive buffer size */ 23007 long timestamp = 0; /* Originate TCP TSTAMP option, 1 = yes */ 23008 boolean_t delete = B_FALSE; /* User asked to delete this HSP */ 23009 23010 rw_enter(&tcp_hsp_lock, RW_WRITER); 23011 23012 /* Parse and validate address */ 23013 if (af == AF_INET) { 23014 retval = inet_pton(af, value, &addr); 23015 if (retval == 1) 23016 IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); 23017 } else if (af == AF_INET6) { 23018 retval = inet_pton(af, value, &v6addr); 23019 } else { 23020 error = EINVAL; 23021 goto done; 23022 } 23023 if (retval == 0) { 23024 error = EINVAL; 23025 goto done; 23026 } 23027 23028 while ((*value) && *value != ' ') 23029 value++; 23030 23031 /* Parse individual keywords, set variables if found */ 23032 while (*value) { 23033 /* Skip leading blanks */ 23034 23035 while (*value == ' ' || *value == '\t') 23036 value++; 23037 23038 /* If at end of string, we're done */ 23039 23040 if (!*value) 23041 break; 23042 23043 /* We have a word, figure out what it is */ 23044 23045 if (strncmp("mask", value, 4) == 0) { 23046 value += 4; 23047 while (*value == ' ' || *value == '\t') 23048 value++; 23049 /* Parse subnet mask */ 23050 if (af == AF_INET) { 23051 retval = inet_pton(af, value, &mask); 23052 if (retval == 1) { 23053 V4MASK_TO_V6(mask, v6mask); 23054 } 23055 } else if (af == AF_INET6) { 23056 retval = inet_pton(af, value, &v6mask); 23057 } 23058 if (retval != 1) { 23059 error = EINVAL; 23060 goto done; 23061 } 23062 while ((*value) && *value != ' ') 23063 value++; 23064 } else if (strncmp("sendspace", value, 9) == 0) { 23065 value += 9; 23066 23067 if (ddi_strtol(value, &end, 0, &sendspace) != 0 || 23068 sendspace < TCP_XMIT_HIWATER || 23069 sendspace >= (1L<<30)) { 23070 error = EINVAL; 23071 goto done; 23072 } 23073 value = end; 23074 } else if (strncmp("recvspace", value, 9) == 0) { 23075 value += 9; 23076 23077 if (ddi_strtol(value, &end, 0, &recvspace) != 0 || 23078 recvspace < TCP_RECV_HIWATER || 23079 recvspace >= (1L<<30)) { 23080 error = EINVAL; 23081 goto done; 23082 } 23083 value = end; 23084 } else if (strncmp("timestamp", value, 9) == 0) { 23085 value += 9; 23086 23087 if (ddi_strtol(value, &end, 0, ×tamp) != 0 || 23088 timestamp < 0 || timestamp > 1) { 23089 error = EINVAL; 23090 goto done; 23091 } 23092 23093 /* 23094 * We 
increment timestamp so we know it's been set; 23095 * this is undone when we put it in the HSP 23096 */ 23097 timestamp++; 23098 value = end; 23099 } else if (strncmp("delete", value, 6) == 0) { 23100 value += 6; 23101 delete = B_TRUE; 23102 } else { 23103 error = EINVAL; 23104 goto done; 23105 } 23106 } 23107 23108 /* Hash address for lookup */ 23109 23110 hash = TCP_HSP_HASH(addr); 23111 23112 if (delete) { 23113 /* 23114 * Note that deletes don't return an error if the thing 23115 * we're trying to delete isn't there. 23116 */ 23117 if (tcp_hsp_hash == NULL) 23118 goto done; 23119 hsp = tcp_hsp_hash[hash]; 23120 23121 if (hsp) { 23122 if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, 23123 &v6addr)) { 23124 tcp_hsp_hash[hash] = hsp->tcp_hsp_next; 23125 mi_free((char *)hsp); 23126 } else { 23127 hspprev = hsp; 23128 while ((hsp = hsp->tcp_hsp_next) != NULL) { 23129 if (IN6_ARE_ADDR_EQUAL( 23130 &hsp->tcp_hsp_addr_v6, &v6addr)) { 23131 hspprev->tcp_hsp_next = 23132 hsp->tcp_hsp_next; 23133 mi_free((char *)hsp); 23134 break; 23135 } 23136 hspprev = hsp; 23137 } 23138 } 23139 } 23140 } else { 23141 /* 23142 * We're adding/modifying an HSP. If we haven't already done 23143 * so, allocate the hash table. 23144 */ 23145 23146 if (!tcp_hsp_hash) { 23147 tcp_hsp_hash = (tcp_hsp_t **) 23148 mi_zalloc(sizeof (tcp_hsp_t *) * TCP_HSP_HASH_SIZE); 23149 if (!tcp_hsp_hash) { 23150 error = EINVAL; 23151 goto done; 23152 } 23153 } 23154 23155 /* Get head of hash chain */ 23156 23157 hsp = tcp_hsp_hash[hash]; 23158 23159 /* Try to find pre-existing hsp on hash chain */ 23160 /* Doesn't handle CIDR prefixes. */ 23161 while (hsp) { 23162 if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, &v6addr)) 23163 break; 23164 hsp = hsp->tcp_hsp_next; 23165 } 23166 23167 /* 23168 * If we didn't, create one with default values and put it 23169 * at head of hash chain 23170 */ 23171 23172 if (!hsp) { 23173 hsp = (tcp_hsp_t *)mi_zalloc(sizeof (tcp_hsp_t)); 23174 if (!hsp) { 23175 error = EINVAL; 23176 goto done; 23177 } 23178 hsp->tcp_hsp_next = tcp_hsp_hash[hash]; 23179 tcp_hsp_hash[hash] = hsp; 23180 } 23181 23182 /* Set values that the user asked us to change */ 23183 23184 hsp->tcp_hsp_addr_v6 = v6addr; 23185 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) 23186 hsp->tcp_hsp_vers = IPV4_VERSION; 23187 else 23188 hsp->tcp_hsp_vers = IPV6_VERSION; 23189 hsp->tcp_hsp_subnet_v6 = v6mask; 23190 if (sendspace > 0) 23191 hsp->tcp_hsp_sendspace = sendspace; 23192 if (recvspace > 0) 23193 hsp->tcp_hsp_recvspace = recvspace; 23194 if (timestamp > 0) 23195 hsp->tcp_hsp_tstamp = timestamp - 1; 23196 } 23197 23198 done: 23199 rw_exit(&tcp_hsp_lock); 23200 return (error); 23201 } 23202 23203 /* Set callback routine passed to nd_load by tcp_param_register. */ 23204 /* ARGSUSED */ 23205 static int 23206 tcp_host_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) 23207 { 23208 return (tcp_host_param_setvalue(q, mp, value, cp, AF_INET)); 23209 } 23210 /* ARGSUSED */ 23211 static int 23212 tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 23213 cred_t *cr) 23214 { 23215 return (tcp_host_param_setvalue(q, mp, value, cp, AF_INET6)); 23216 } 23217 23218 /* TCP host parameters report triggered via the Named Dispatch mechanism. 
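 *
 * Sketch of how these parameters are typically driven from userland
 * (the ndd variable name shown is assumed, not taken from this file;
 * only the value syntax comes from the parser above):
 *
 *	ndd -set /dev/tcp tcp_host_param \
 *	    '192.0.2.7 sendspace 65536 recvspace 65536 timestamp 1'
 *	ndd -get /dev/tcp tcp_host_param
 *	ndd -set /dev/tcp tcp_host_param '192.0.2.7 delete'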
*/ 23219 /* ARGSUSED */ 23220 static int 23221 tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) 23222 { 23223 tcp_hsp_t *hsp; 23224 int i; 23225 char addrbuf[INET6_ADDRSTRLEN], subnetbuf[INET6_ADDRSTRLEN]; 23226 23227 rw_enter(&tcp_hsp_lock, RW_READER); 23228 (void) mi_mpprintf(mp, 23229 "Hash HSP " MI_COL_HDRPAD_STR 23230 "Address Subnet Mask Send Receive TStamp"); 23231 if (tcp_hsp_hash) { 23232 for (i = 0; i < TCP_HSP_HASH_SIZE; i++) { 23233 hsp = tcp_hsp_hash[i]; 23234 while (hsp) { 23235 if (hsp->tcp_hsp_vers == IPV4_VERSION) { 23236 (void) inet_ntop(AF_INET, 23237 &hsp->tcp_hsp_addr, 23238 addrbuf, sizeof (addrbuf)); 23239 (void) inet_ntop(AF_INET, 23240 &hsp->tcp_hsp_subnet, 23241 subnetbuf, sizeof (subnetbuf)); 23242 } else { 23243 (void) inet_ntop(AF_INET6, 23244 &hsp->tcp_hsp_addr_v6, 23245 addrbuf, sizeof (addrbuf)); 23246 (void) inet_ntop(AF_INET6, 23247 &hsp->tcp_hsp_subnet_v6, 23248 subnetbuf, sizeof (subnetbuf)); 23249 } 23250 (void) mi_mpprintf(mp, 23251 " %03d " MI_COL_PTRFMT_STR 23252 "%s %s %010d %010d %d", 23253 i, 23254 (void *)hsp, 23255 addrbuf, 23256 subnetbuf, 23257 hsp->tcp_hsp_sendspace, 23258 hsp->tcp_hsp_recvspace, 23259 hsp->tcp_hsp_tstamp); 23260 23261 hsp = hsp->tcp_hsp_next; 23262 } 23263 } 23264 } 23265 rw_exit(&tcp_hsp_lock); 23266 return (0); 23267 } 23268 23269 23270 /* Data for fast netmask macro used by tcp_hsp_lookup */ 23271 23272 static ipaddr_t netmasks[] = { 23273 IN_CLASSA_NET, IN_CLASSA_NET, IN_CLASSB_NET, 23274 IN_CLASSC_NET | IN_CLASSD_NET /* Class C,D,E */ 23275 }; 23276 23277 #define netmask(addr) (netmasks[(ipaddr_t)(addr) >> 30]) 23278 23279 /* 23280 * XXX This routine should go away and instead we should use the metrics 23281 * associated with the routes to determine the default sndspace and rcvspace. 23282 */ 23283 static tcp_hsp_t * 23284 tcp_hsp_lookup(ipaddr_t addr) 23285 { 23286 tcp_hsp_t *hsp = NULL; 23287 23288 /* Quick check without acquiring the lock. */ 23289 if (tcp_hsp_hash == NULL) 23290 return (NULL); 23291 23292 rw_enter(&tcp_hsp_lock, RW_READER); 23293 23294 /* This routine finds the best-matching HSP for address addr. */ 23295 23296 if (tcp_hsp_hash) { 23297 int i; 23298 ipaddr_t srchaddr; 23299 tcp_hsp_t *hsp_net; 23300 23301 /* We do three passes: host, network, and subnet. */ 23302 23303 srchaddr = addr; 23304 23305 for (i = 1; i <= 3; i++) { 23306 /* Look for exact match on srchaddr */ 23307 23308 hsp = tcp_hsp_hash[TCP_HSP_HASH(srchaddr)]; 23309 while (hsp) { 23310 if (hsp->tcp_hsp_vers == IPV4_VERSION && 23311 hsp->tcp_hsp_addr == srchaddr) 23312 break; 23313 hsp = hsp->tcp_hsp_next; 23314 } 23315 ASSERT(hsp == NULL || 23316 hsp->tcp_hsp_vers == IPV4_VERSION); 23317 23318 /* 23319 * If this is the first pass: 23320 * If we found a match, great, return it. 23321 * If not, search for the network on the second pass. 23322 */ 23323 23324 if (i == 1) 23325 if (hsp) 23326 break; 23327 else 23328 { 23329 srchaddr = addr & netmask(addr); 23330 continue; 23331 } 23332 23333 /* 23334 * If this is the second pass: 23335 * If we found a match, but there's a subnet mask, 23336 * save the match but try again using the subnet 23337 * mask on the third pass. 23338 * Otherwise, return whatever we found. 23339 */ 23340 23341 if (i == 2) { 23342 if (hsp && hsp->tcp_hsp_subnet) { 23343 hsp_net = hsp; 23344 srchaddr = addr & hsp->tcp_hsp_subnet; 23345 continue; 23346 } else { 23347 break; 23348 } 23349 } 23350 23351 /* 23352 * This must be the third pass. 
If we didn't find 23353 * anything, return the saved network HSP instead. 23354 */ 23355 23356 if (!hsp) 23357 hsp = hsp_net; 23358 } 23359 } 23360 23361 rw_exit(&tcp_hsp_lock); 23362 return (hsp); 23363 } 23364 23365 /* 23366 * XXX Equally broken as the IPv4 routine. Doesn't handle longest 23367 * match lookup. 23368 */ 23369 static tcp_hsp_t * 23370 tcp_hsp_lookup_ipv6(in6_addr_t *v6addr) 23371 { 23372 tcp_hsp_t *hsp = NULL; 23373 23374 /* Quick check without acquiring the lock. */ 23375 if (tcp_hsp_hash == NULL) 23376 return (NULL); 23377 23378 rw_enter(&tcp_hsp_lock, RW_READER); 23379 23380 /* This routine finds the best-matching HSP for address addr. */ 23381 23382 if (tcp_hsp_hash) { 23383 int i; 23384 in6_addr_t v6srchaddr; 23385 tcp_hsp_t *hsp_net; 23386 23387 /* We do three passes: host, network, and subnet. */ 23388 23389 v6srchaddr = *v6addr; 23390 23391 for (i = 1; i <= 3; i++) { 23392 /* Look for exact match on srchaddr */ 23393 23394 hsp = tcp_hsp_hash[TCP_HSP_HASH( 23395 V4_PART_OF_V6(v6srchaddr))]; 23396 while (hsp) { 23397 if (hsp->tcp_hsp_vers == IPV6_VERSION && 23398 IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, 23399 &v6srchaddr)) 23400 break; 23401 hsp = hsp->tcp_hsp_next; 23402 } 23403 23404 /* 23405 * If this is the first pass: 23406 * If we found a match, great, return it. 23407 * If not, search for the network on the second pass. 23408 */ 23409 23410 if (i == 1) 23411 if (hsp) 23412 break; 23413 else { 23414 /* Assume a 64 bit mask */ 23415 v6srchaddr.s6_addr32[0] = 23416 v6addr->s6_addr32[0]; 23417 v6srchaddr.s6_addr32[1] = 23418 v6addr->s6_addr32[1]; 23419 v6srchaddr.s6_addr32[2] = 0; 23420 v6srchaddr.s6_addr32[3] = 0; 23421 continue; 23422 } 23423 23424 /* 23425 * If this is the second pass: 23426 * If we found a match, but there's a subnet mask, 23427 * save the match but try again using the subnet 23428 * mask on the third pass. 23429 * Otherwise, return whatever we found. 23430 */ 23431 23432 if (i == 2) { 23433 ASSERT(hsp == NULL || 23434 hsp->tcp_hsp_vers == IPV6_VERSION); 23435 if (hsp && 23436 !IN6_IS_ADDR_UNSPECIFIED( 23437 &hsp->tcp_hsp_subnet_v6)) { 23438 hsp_net = hsp; 23439 V6_MASK_COPY(*v6addr, 23440 hsp->tcp_hsp_subnet_v6, v6srchaddr); 23441 continue; 23442 } else { 23443 break; 23444 } 23445 } 23446 23447 /* 23448 * This must be the third pass. If we didn't find 23449 * anything, return the saved network HSP instead. 23450 */ 23451 23452 if (!hsp) 23453 hsp = hsp_net; 23454 } 23455 } 23456 23457 rw_exit(&tcp_hsp_lock); 23458 return (hsp); 23459 } 23460 23461 /* 23462 * Type three generator adapted from the random() function in 4.4 BSD: 23463 */ 23464 23465 /* 23466 * Copyright (c) 1983, 1993 23467 * The Regents of the University of California. All rights reserved. 23468 * 23469 * Redistribution and use in source and binary forms, with or without 23470 * modification, are permitted provided that the following conditions 23471 * are met: 23472 * 1. Redistributions of source code must retain the above copyright 23473 * notice, this list of conditions and the following disclaimer. 23474 * 2. Redistributions in binary form must reproduce the above copyright 23475 * notice, this list of conditions and the following disclaimer in the 23476 * documentation and/or other materials provided with the distribution. 23477 * 3. All advertising materials mentioning features or use of this software 23478 * must display the following acknowledgement: 23479 * This product includes software developed by the University of 23480 * California, Berkeley and its contributors. 
23481 * 4. Neither the name of the University nor the names of its contributors 23482 * may be used to endorse or promote products derived from this software 23483 * without specific prior written permission. 23484 * 23485 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23486 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23487 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23488 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23489 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23490 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23491 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23492 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23493 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23494 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23495 * SUCH DAMAGE. 23496 */ 23497 23498 /* Type 3 -- x**31 + x**3 + 1 */ 23499 #define DEG_3 31 23500 #define SEP_3 3 23501 23502 23503 /* Protected by tcp_random_lock */ 23504 static int tcp_randtbl[DEG_3 + 1]; 23505 23506 static int *tcp_random_fptr = &tcp_randtbl[SEP_3 + 1]; 23507 static int *tcp_random_rptr = &tcp_randtbl[1]; 23508 23509 static int *tcp_random_state = &tcp_randtbl[1]; 23510 static int *tcp_random_end_ptr = &tcp_randtbl[DEG_3 + 1]; 23511 23512 kmutex_t tcp_random_lock; 23513 23514 void 23515 tcp_random_init(void) 23516 { 23517 int i; 23518 hrtime_t hrt; 23519 time_t wallclock; 23520 uint64_t result; 23521 23522 /* 23523 * Use high-res timer and current time for seed. Gethrtime() returns 23524 * a longlong, which may contain resolution down to nanoseconds. 23525 * The current time will either be a 32-bit or a 64-bit quantity. 23526 * XOR the two together in a 64-bit result variable. 23527 * Convert the result to a 32-bit value by multiplying the high-order 23528 * 32-bits by the low-order 32-bits. 23529 */ 23530 23531 hrt = gethrtime(); 23532 (void) drv_getparm(TIME, &wallclock); 23533 result = (uint64_t)wallclock ^ (uint64_t)hrt; 23534 mutex_enter(&tcp_random_lock); 23535 tcp_random_state[0] = ((result >> 32) & 0xffffffff) * 23536 (result & 0xffffffff); 23537 23538 for (i = 1; i < DEG_3; i++) 23539 tcp_random_state[i] = 1103515245 * tcp_random_state[i - 1] 23540 + 12345; 23541 tcp_random_fptr = &tcp_random_state[SEP_3]; 23542 tcp_random_rptr = &tcp_random_state[0]; 23543 mutex_exit(&tcp_random_lock); 23544 for (i = 0; i < 10 * DEG_3; i++) 23545 (void) tcp_random(); 23546 } 23547 23548 /* 23549 * tcp_random: Return a random number in the range [1 - (128K + 1)]. 23550 * This range is selected to be approximately centered on TCP_ISS / 2, 23551 * and easy to compute. We get this value by generating a 32-bit random 23552 * number, selecting out the high-order 17 bits, and then adding one so 23553 * that we never return zero. 23554 */ 23555 int 23556 tcp_random(void) 23557 { 23558 int i; 23559 23560 mutex_enter(&tcp_random_lock); 23561 *tcp_random_fptr += *tcp_random_rptr; 23562 23563 /* 23564 * The high-order bits are more random than the low-order bits, 23565 * so we select out the high-order 17 bits and add one so that 23566 * we never return zero. 
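 *
 * Small worked example of the extraction below (the raw value is
 * arbitrary): if the additive step leaves 0xdeadbeef in
 * *tcp_random_fptr, then ((0xdeadbeef >> 15) & 0x1ffff) is 0x1bd5b,
 * so the function returns 0x1bd5c (114,012).  The mask keeps the
 * result in [0, 0x1ffff] regardless of how the shift treats the sign
 * bit, and the + 1 keeps the return value nonzero.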
23567 */ 23568 i = ((*tcp_random_fptr >> 15) & 0x1ffff) + 1; 23569 if (++tcp_random_fptr >= tcp_random_end_ptr) { 23570 tcp_random_fptr = tcp_random_state; 23571 ++tcp_random_rptr; 23572 } else if (++tcp_random_rptr >= tcp_random_end_ptr) 23573 tcp_random_rptr = tcp_random_state; 23574 23575 mutex_exit(&tcp_random_lock); 23576 return (i); 23577 } 23578 23579 /* 23580 * XXX This will go away when TPI is extended to send 23581 * info reqs to sockfs/timod ..... 23582 * Given a queue, set the max packet size for the write 23583 * side of the queue below stream head. This value is 23584 * cached on the stream head. 23585 * Returns 1 on success, 0 otherwise. 23586 */ 23587 static int 23588 setmaxps(queue_t *q, int maxpsz) 23589 { 23590 struct stdata *stp; 23591 queue_t *wq; 23592 stp = STREAM(q); 23593 23594 /* 23595 * At this point change of a queue parameter is not allowed 23596 * when a multiplexor is sitting on top. 23597 */ 23598 if (stp->sd_flag & STPLEX) 23599 return (0); 23600 23601 claimstr(stp->sd_wrq); 23602 wq = stp->sd_wrq->q_next; 23603 ASSERT(wq != NULL); 23604 (void) strqset(wq, QMAXPSZ, 0, maxpsz); 23605 releasestr(stp->sd_wrq); 23606 return (1); 23607 } 23608 23609 static int 23610 tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, 23611 int *t_errorp, int *sys_errorp) 23612 { 23613 int error; 23614 int is_absreq_failure; 23615 t_scalar_t *opt_lenp; 23616 t_scalar_t opt_offset; 23617 int prim_type; 23618 struct T_conn_req *tcreqp; 23619 struct T_conn_res *tcresp; 23620 cred_t *cr; 23621 23622 cr = DB_CREDDEF(mp, tcp->tcp_cred); 23623 23624 prim_type = ((union T_primitives *)mp->b_rptr)->type; 23625 ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || 23626 prim_type == T_CONN_RES); 23627 23628 switch (prim_type) { 23629 case T_CONN_REQ: 23630 tcreqp = (struct T_conn_req *)mp->b_rptr; 23631 opt_offset = tcreqp->OPT_offset; 23632 opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; 23633 break; 23634 case O_T_CONN_RES: 23635 case T_CONN_RES: 23636 tcresp = (struct T_conn_res *)mp->b_rptr; 23637 opt_offset = tcresp->OPT_offset; 23638 opt_lenp = (t_scalar_t *)&tcresp->OPT_length; 23639 break; 23640 } 23641 23642 *t_errorp = 0; 23643 *sys_errorp = 0; 23644 *do_disconnectp = 0; 23645 23646 error = tpi_optcom_buf(tcp->tcp_wq, mp, opt_lenp, 23647 opt_offset, cr, &tcp_opt_obj, 23648 NULL, &is_absreq_failure); 23649 23650 switch (error) { 23651 case 0: /* no error */ 23652 ASSERT(is_absreq_failure == 0); 23653 return (0); 23654 case ENOPROTOOPT: 23655 *t_errorp = TBADOPT; 23656 break; 23657 case EACCES: 23658 *t_errorp = TACCES; 23659 break; 23660 default: 23661 *t_errorp = TSYSERR; *sys_errorp = error; 23662 break; 23663 } 23664 if (is_absreq_failure != 0) { 23665 /* 23666 * The connection request should get the local ack 23667 * T_OK_ACK and then a T_DISCON_IND. 23668 */ 23669 *do_disconnectp = 1; 23670 } 23671 return (-1); 23672 } 23673 23674 /* 23675 * Split this function out so that if the secret changes, I'm okay. 23676 * 23677 * Initialize the tcp_iss_cookie and tcp_iss_key. 23678 */ 23679 23680 #define PASSWD_SIZE 16 /* MUST be multiple of 4 */ 23681 23682 static void 23683 tcp_iss_key_init(uint8_t *phrase, int len) 23684 { 23685 struct { 23686 int32_t current_time; 23687 uint32_t randnum; 23688 uint16_t pad; 23689 uint8_t ether[6]; 23690 uint8_t passwd[PASSWD_SIZE]; 23691 } tcp_iss_cookie; 23692 time_t t; 23693 23694 /* 23695 * Start with the current absolute time. 
23696 */ 23697 (void) drv_getparm(TIME, &t); 23698 tcp_iss_cookie.current_time = t; 23699 23700 /* 23701 * XXX - Need a more random number per RFC 1750, not this crap. 23702 * OTOH, if what follows is pretty random, then I'm in better shape. 23703 */ 23704 tcp_iss_cookie.randnum = (uint32_t)(gethrtime() + tcp_random()); 23705 tcp_iss_cookie.pad = 0x365c; /* Picked from HMAC pad values. */ 23706 23707 /* 23708 * The cpu_type_info is pretty non-random. Ugggh. It does serve 23709 * as a good template. 23710 */ 23711 bcopy(&cpu_list->cpu_type_info, &tcp_iss_cookie.passwd, 23712 min(PASSWD_SIZE, sizeof (cpu_list->cpu_type_info))); 23713 23714 /* 23715 * The pass-phrase. Normally this is supplied by user-called NDD. 23716 */ 23717 bcopy(phrase, &tcp_iss_cookie.passwd, min(PASSWD_SIZE, len)); 23718 23719 /* 23720 * See 4010593 if this section becomes a problem again, 23721 * but the local ethernet address is useful here. 23722 */ 23723 (void) localetheraddr(NULL, 23724 (struct ether_addr *)&tcp_iss_cookie.ether); 23725 23726 /* 23727 * Hash 'em all together. The MD5Final is called per-connection. 23728 */ 23729 mutex_enter(&tcp_iss_key_lock); 23730 MD5Init(&tcp_iss_key); 23731 MD5Update(&tcp_iss_key, (uchar_t *)&tcp_iss_cookie, 23732 sizeof (tcp_iss_cookie)); 23733 mutex_exit(&tcp_iss_key_lock); 23734 } 23735 23736 /* 23737 * Set the RFC 1948 pass phrase 23738 */ 23739 /* ARGSUSED */ 23740 static int 23741 tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, 23742 cred_t *cr) 23743 { 23744 /* 23745 * Basically, value contains a new pass phrase. Pass it along! 23746 */ 23747 tcp_iss_key_init((uint8_t *)value, strlen(value)); 23748 return (0); 23749 } 23750 23751 /* ARGSUSED */ 23752 static int 23753 tcp_sack_info_constructor(void *buf, void *cdrarg, int kmflags) 23754 { 23755 bzero(buf, sizeof (tcp_sack_info_t)); 23756 return (0); 23757 } 23758 23759 /* ARGSUSED */ 23760 static int 23761 tcp_iphc_constructor(void *buf, void *cdrarg, int kmflags) 23762 { 23763 bzero(buf, TCP_MAX_COMBINED_HEADER_LENGTH); 23764 return (0); 23765 } 23766 23767 void 23768 tcp_ddi_init(void) 23769 { 23770 int i; 23771 23772 /* Initialize locks */ 23773 rw_init(&tcp_hsp_lock, NULL, RW_DEFAULT, NULL); 23774 mutex_init(&tcp_g_q_lock, NULL, MUTEX_DEFAULT, NULL); 23775 mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); 23776 mutex_init(&tcp_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); 23777 mutex_init(&tcp_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); 23778 rw_init(&tcp_reserved_port_lock, NULL, RW_DEFAULT, NULL); 23779 23780 for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { 23781 mutex_init(&tcp_bind_fanout[i].tf_lock, NULL, 23782 MUTEX_DEFAULT, NULL); 23783 } 23784 23785 for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { 23786 mutex_init(&tcp_acceptor_fanout[i].tf_lock, NULL, 23787 MUTEX_DEFAULT, NULL); 23788 } 23789 23790 /* TCP's IPsec code calls the packet dropper. */ 23791 ip_drop_register(&tcp_dropper, "TCP IPsec policy enforcement"); 23792 23793 if (!tcp_g_nd) { 23794 if (!tcp_param_register(tcp_param_arr, A_CNT(tcp_param_arr))) { 23795 nd_free(&tcp_g_nd); 23796 } 23797 } 23798 23799 /* 23800 * Note: To really walk the device tree you need the devinfo 23801 * pointer to your device which is only available after probe/attach. 
23802 * The following is safe only because it uses ddi_root_node() 23803 */ 23804 tcp_max_optsize = optcom_max_optsize(tcp_opt_obj.odb_opt_des_arr, 23805 tcp_opt_obj.odb_opt_arr_cnt); 23806 23807 tcp_timercache = kmem_cache_create("tcp_timercache", 23808 sizeof (tcp_timer_t) + sizeof (mblk_t), 0, 23809 NULL, NULL, NULL, NULL, NULL, 0); 23810 23811 tcp_sack_info_cache = kmem_cache_create("tcp_sack_info_cache", 23812 sizeof (tcp_sack_info_t), 0, 23813 tcp_sack_info_constructor, NULL, NULL, NULL, NULL, 0); 23814 23815 tcp_iphc_cache = kmem_cache_create("tcp_iphc_cache", 23816 TCP_MAX_COMBINED_HEADER_LENGTH, 0, 23817 tcp_iphc_constructor, NULL, NULL, NULL, NULL, 0); 23818 23819 tcp_squeue_wput_proc = tcp_squeue_switch(tcp_squeue_wput); 23820 tcp_squeue_close_proc = tcp_squeue_switch(tcp_squeue_close); 23821 23822 ip_squeue_init(tcp_squeue_add); 23823 23824 /* Initialize the random number generator */ 23825 tcp_random_init(); 23826 23827 /* 23828 * Initialize RFC 1948 secret values. This will probably be reset once 23829 * by the boot scripts. 23830 * 23831 * Use NULL name, as the name is caught by the new lockstats. 23832 * 23833 * Initialize with some random, non-guessable string, like the global 23834 * T_INFO_ACK. 23835 */ 23836 23837 tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack, 23838 sizeof (tcp_g_t_info_ack)); 23839 23840 if ((tcp_kstat = kstat_create(TCP_MOD_NAME, 0, "tcpstat", 23841 "net", KSTAT_TYPE_NAMED, 23842 sizeof (tcp_statistics) / sizeof (kstat_named_t), 23843 KSTAT_FLAG_VIRTUAL)) != NULL) { 23844 tcp_kstat->ks_data = &tcp_statistics; 23845 kstat_install(tcp_kstat); 23846 } 23847 23848 tcp_kstat_init(); 23849 } 23850 23851 void 23852 tcp_ddi_destroy(void) 23853 { 23854 int i; 23855 23856 nd_free(&tcp_g_nd); 23857 23858 for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { 23859 mutex_destroy(&tcp_bind_fanout[i].tf_lock); 23860 } 23861 23862 for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { 23863 mutex_destroy(&tcp_acceptor_fanout[i].tf_lock); 23864 } 23865 23866 mutex_destroy(&tcp_iss_key_lock); 23867 rw_destroy(&tcp_hsp_lock); 23868 mutex_destroy(&tcp_g_q_lock); 23869 mutex_destroy(&tcp_random_lock); 23870 mutex_destroy(&tcp_epriv_port_lock); 23871 rw_destroy(&tcp_reserved_port_lock); 23872 23873 ip_drop_unregister(&tcp_dropper); 23874 23875 kmem_cache_destroy(tcp_timercache); 23876 kmem_cache_destroy(tcp_sack_info_cache); 23877 kmem_cache_destroy(tcp_iphc_cache); 23878 23879 tcp_kstat_fini(); 23880 } 23881 23882 /* 23883 * Generate ISS, taking into account NDD changes may happen halfway through. 23884 * (If the iss is not zero, set it.) 
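 *
 * In outline, the function below computes (tcp_strong_iss selects the
 * flavour):
 *
 *	always:      iss  = tcp_iss_incr_extra (bumped by ISS_INCR / 2)
 *	strong == 2: iss += XOR-folded MD5(secret key, ports, addresses),
 *	             then falls through to the case 1 increment
 *	             (RFC 1948 style)
 *	strong == 1: iss += (gethrtime() >> ISS_NSEC_SHT) + tcp_random()
 *	otherwise:   iss += gethrestime_sec() * ISS_INCR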
23885 */ 23886 23887 static void 23888 tcp_iss_init(tcp_t *tcp) 23889 { 23890 MD5_CTX context; 23891 struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; 23892 uint32_t answer[4]; 23893 23894 tcp_iss_incr_extra += (ISS_INCR >> 1); 23895 tcp->tcp_iss = tcp_iss_incr_extra; 23896 switch (tcp_strong_iss) { 23897 case 2: 23898 mutex_enter(&tcp_iss_key_lock); 23899 context = tcp_iss_key; 23900 mutex_exit(&tcp_iss_key_lock); 23901 arg.ports = tcp->tcp_ports; 23902 if (tcp->tcp_ipversion == IPV4_VERSION) { 23903 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, 23904 &arg.src); 23905 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_dst, 23906 &arg.dst); 23907 } else { 23908 arg.src = tcp->tcp_ip6h->ip6_src; 23909 arg.dst = tcp->tcp_ip6h->ip6_dst; 23910 } 23911 MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); 23912 MD5Final((uchar_t *)answer, &context); 23913 tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3]; 23914 /* 23915 * Now that we've hashed into a unique per-connection sequence 23916 * space, add a random increment per strong_iss == 1. So I 23917 * guess we'll have to... 23918 */ 23919 /* FALLTHRU */ 23920 case 1: 23921 tcp->tcp_iss += (gethrtime() >> ISS_NSEC_SHT) + tcp_random(); 23922 break; 23923 default: 23924 tcp->tcp_iss += (uint32_t)gethrestime_sec() * ISS_INCR; 23925 break; 23926 } 23927 tcp->tcp_valid_bits = TCP_ISS_VALID; 23928 tcp->tcp_fss = tcp->tcp_iss - 1; 23929 tcp->tcp_suna = tcp->tcp_iss; 23930 tcp->tcp_snxt = tcp->tcp_iss + 1; 23931 tcp->tcp_rexmit_nxt = tcp->tcp_snxt; 23932 tcp->tcp_csuna = tcp->tcp_snxt; 23933 } 23934 23935 /* 23936 * Exported routine for extracting active tcp connection status. 23937 * 23938 * This is used by the Solaris Cluster Networking software to 23939 * gather a list of connections that need to be forwarded to 23940 * specific nodes in the cluster when configuration changes occur. 23941 * 23942 * The callback is invoked for each tcp_t structure. Returning 23943 * non-zero from the callback routine terminates the search. 23944 */ 23945 int 23946 cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg) 23947 { 23948 tcp_t *tcp; 23949 cl_tcp_info_t cl_tcpi; 23950 connf_t *connfp; 23951 conn_t *connp; 23952 int i; 23953 23954 ASSERT(callback != NULL); 23955 23956 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 23957 23958 connfp = &ipcl_globalhash_fanout[i]; 23959 connp = NULL; 23960 23961 while ((connp = 23962 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 23963 23964 tcp = connp->conn_tcp; 23965 cl_tcpi.cl_tcpi_version = CL_TCPI_V1; 23966 cl_tcpi.cl_tcpi_ipversion = tcp->tcp_ipversion; 23967 cl_tcpi.cl_tcpi_state = tcp->tcp_state; 23968 cl_tcpi.cl_tcpi_lport = tcp->tcp_lport; 23969 cl_tcpi.cl_tcpi_fport = tcp->tcp_fport; 23970 /* 23971 * The macros tcp_laddr and tcp_faddr give the IPv4 23972 * addresses. They are copied implicitly below as 23973 * mapped addresses. 23974 */ 23975 cl_tcpi.cl_tcpi_laddr_v6 = tcp->tcp_ip_src_v6; 23976 if (tcp->tcp_ipversion == IPV4_VERSION) { 23977 cl_tcpi.cl_tcpi_faddr = 23978 tcp->tcp_ipha->ipha_dst; 23979 } else { 23980 cl_tcpi.cl_tcpi_faddr_v6 = 23981 tcp->tcp_ip6h->ip6_dst; 23982 } 23983 23984 /* 23985 * If the callback returns non-zero 23986 * we terminate the traversal. 23987 */ 23988 if ((*callback)(&cl_tcpi, arg) != 0) { 23989 CONN_DEC_REF(tcp->tcp_connp); 23990 return (1); 23991 } 23992 } 23993 } 23994 23995 return (0); 23996 } 23997 23998 /* 23999 * Macros used for accessing the different types of sockaddr 24000 * structures inside a tcp_ioc_abort_conn_t. 
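 *
 * A minimal usage sketch (illustrative only; 'acp', 'laddr', 'lport' and
 * 'laddr6' are assumed locals, and ss_family is assumed already validated):
 *
 *	if (acp->ac_local.ss_family == AF_INET) {
 *		ipaddr_t laddr = TCP_AC_V4LOCAL(acp);
 *		in_port_t lport = ntohs(TCP_AC_V4LPORT(acp));
 *		...
 *	} else {
 *		in6_addr_t *laddr6 = &TCP_AC_V6LOCAL(acp);
 *		in_port_t lport = ntohs(TCP_AC_V6LPORT(acp));
 *		...
 *	}
 *
 * tcp_ioctl_abort_dump() below uses a close variant of this pattern.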
24001 */ 24002 #define TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local) 24003 #define TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote) 24004 #define TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr) 24005 #define TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr) 24006 #define TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port) 24007 #define TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port) 24008 #define TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local) 24009 #define TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote) 24010 #define TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr) 24011 #define TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr) 24012 #define TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port) 24013 #define TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port) 24014 24015 /* 24016 * Return the correct error code to mimic the behavior 24017 * of a connection reset. 24018 */ 24019 #define TCP_AC_GET_ERRCODE(state, err) { \ 24020 switch ((state)) { \ 24021 case TCPS_SYN_SENT: \ 24022 case TCPS_SYN_RCVD: \ 24023 (err) = ECONNREFUSED; \ 24024 break; \ 24025 case TCPS_ESTABLISHED: \ 24026 case TCPS_FIN_WAIT_1: \ 24027 case TCPS_FIN_WAIT_2: \ 24028 case TCPS_CLOSE_WAIT: \ 24029 (err) = ECONNRESET; \ 24030 break; \ 24031 case TCPS_CLOSING: \ 24032 case TCPS_LAST_ACK: \ 24033 case TCPS_TIME_WAIT: \ 24034 (err) = 0; \ 24035 break; \ 24036 default: \ 24037 (err) = ENXIO; \ 24038 } \ 24039 } 24040 24041 /* 24042 * Check if a tcp structure matches the info in acp. 24043 */ 24044 #define TCP_AC_ADDR_MATCH(acp, tcp) \ 24045 (((acp)->ac_local.ss_family == AF_INET) ? \ 24046 ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ 24047 TCP_AC_V4LOCAL((acp)) == (tcp)->tcp_ip_src) && \ 24048 (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ 24049 TCP_AC_V4REMOTE((acp)) == (tcp)->tcp_remote) && \ 24050 (TCP_AC_V4LPORT((acp)) == 0 || \ 24051 TCP_AC_V4LPORT((acp)) == (tcp)->tcp_lport) && \ 24052 (TCP_AC_V4RPORT((acp)) == 0 || \ 24053 TCP_AC_V4RPORT((acp)) == (tcp)->tcp_fport) && \ 24054 (acp)->ac_start <= (tcp)->tcp_state && \ 24055 (acp)->ac_end >= (tcp)->tcp_state) : \ 24056 ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ 24057 IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ 24058 &(tcp)->tcp_ip_src_v6)) && \ 24059 (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ 24060 IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ 24061 &(tcp)->tcp_remote_v6)) && \ 24062 (TCP_AC_V6LPORT((acp)) == 0 || \ 24063 TCP_AC_V6LPORT((acp)) == (tcp)->tcp_lport) && \ 24064 (TCP_AC_V6RPORT((acp)) == 0 || \ 24065 TCP_AC_V6RPORT((acp)) == (tcp)->tcp_fport) && \ 24066 (acp)->ac_start <= (tcp)->tcp_state && \ 24067 (acp)->ac_end >= (tcp)->tcp_state)) 24068 24069 #define TCP_AC_MATCH(acp, tcp) \ 24070 (((acp)->ac_zoneid == ALL_ZONES || \ 24071 (acp)->ac_zoneid == tcp->tcp_connp->conn_zoneid) ? \ 24072 TCP_AC_ADDR_MATCH(acp, tcp) : 0) 24073 24074 /* 24075 * Build a message containing a tcp_ioc_abort_conn_t structure 24076 * which is filled in with information from acp and tp. 
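 *
 * The resulting M_CTL mblk is laid out as follows (a sketch of what the
 * code below produces):
 *
 *	b_rptr ->  uint32_t		TCP_IOC_ABORT_CONN
 *		   tcp_ioc_abort_conn_t	ac_start/ac_end/ac_zoneid copied from
 *					acp; addresses and ports taken from
 *					tp, so no wildcards remain
 *	b_wptr ->  b_rptr + sizeof (uint32_t) + sizeof (tcp_ioc_abort_conn_t)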
24077 */ 24078 static mblk_t * 24079 tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) 24080 { 24081 mblk_t *mp; 24082 tcp_ioc_abort_conn_t *tacp; 24083 24084 mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO); 24085 if (mp == NULL) 24086 return (NULL); 24087 24088 mp->b_datap->db_type = M_CTL; 24089 24090 *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; 24091 tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + 24092 sizeof (uint32_t)); 24093 24094 tacp->ac_start = acp->ac_start; 24095 tacp->ac_end = acp->ac_end; 24096 tacp->ac_zoneid = acp->ac_zoneid; 24097 24098 if (acp->ac_local.ss_family == AF_INET) { 24099 tacp->ac_local.ss_family = AF_INET; 24100 tacp->ac_remote.ss_family = AF_INET; 24101 TCP_AC_V4LOCAL(tacp) = tp->tcp_ip_src; 24102 TCP_AC_V4REMOTE(tacp) = tp->tcp_remote; 24103 TCP_AC_V4LPORT(tacp) = tp->tcp_lport; 24104 TCP_AC_V4RPORT(tacp) = tp->tcp_fport; 24105 } else { 24106 tacp->ac_local.ss_family = AF_INET6; 24107 tacp->ac_remote.ss_family = AF_INET6; 24108 TCP_AC_V6LOCAL(tacp) = tp->tcp_ip_src_v6; 24109 TCP_AC_V6REMOTE(tacp) = tp->tcp_remote_v6; 24110 TCP_AC_V6LPORT(tacp) = tp->tcp_lport; 24111 TCP_AC_V6RPORT(tacp) = tp->tcp_fport; 24112 } 24113 mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); 24114 return (mp); 24115 } 24116 24117 /* 24118 * Print a tcp_ioc_abort_conn_t structure. 24119 */ 24120 static void 24121 tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) 24122 { 24123 char lbuf[128]; 24124 char rbuf[128]; 24125 sa_family_t af; 24126 in_port_t lport, rport; 24127 ushort_t logflags; 24128 24129 af = acp->ac_local.ss_family; 24130 24131 if (af == AF_INET) { 24132 (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp), 24133 lbuf, 128); 24134 (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp), 24135 rbuf, 128); 24136 lport = ntohs(TCP_AC_V4LPORT(acp)); 24137 rport = ntohs(TCP_AC_V4RPORT(acp)); 24138 } else { 24139 (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp), 24140 lbuf, 128); 24141 (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp), 24142 rbuf, 128); 24143 lport = ntohs(TCP_AC_V6LPORT(acp)); 24144 rport = ntohs(TCP_AC_V6RPORT(acp)); 24145 } 24146 24147 logflags = SL_TRACE | SL_NOTE; 24148 /* 24149 * Don't print this message to the console if the operation was done 24150 * to a non-global zone. 24151 */ 24152 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 24153 logflags |= SL_CONSOLE; 24154 (void) strlog(TCP_MOD_ID, 0, 1, logflags, 24155 "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " 24156 "start = %d, end = %d\n", lbuf, lport, rbuf, rport, 24157 acp->ac_start, acp->ac_end); 24158 } 24159 24160 /* 24161 * Called inside tcp_rput when a message built using 24162 * tcp_ioctl_abort_build_msg is put into a queue. 24163 * Note that when we get here there is no wildcard in acp any more. 24164 */ 24165 static void 24166 tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp) 24167 { 24168 tcp_ioc_abort_conn_t *acp; 24169 24170 acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); 24171 if (tcp->tcp_state <= acp->ac_end) { 24172 /* 24173 * If we get here, we are already on the correct 24174 * squeue. 
This ioctl follows the following path 24175 * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn 24176 * ->tcp_ioctl_abort->squeue_fill (if on a 24177 * different squeue) 24178 */ 24179 int errcode; 24180 24181 TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode); 24182 (void) tcp_clean_death(tcp, errcode, 26); 24183 } 24184 freemsg(mp); 24185 } 24186 24187 /* 24188 * Abort all matching connections on a hash chain. 24189 */ 24190 static int 24191 tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, 24192 boolean_t exact) 24193 { 24194 int nmatch, err = 0; 24195 tcp_t *tcp; 24196 MBLKP mp, last, listhead = NULL; 24197 conn_t *tconnp; 24198 connf_t *connfp = &ipcl_conn_fanout[index]; 24199 24200 startover: 24201 nmatch = 0; 24202 24203 mutex_enter(&connfp->connf_lock); 24204 for (tconnp = connfp->connf_head; tconnp != NULL; 24205 tconnp = tconnp->conn_next) { 24206 tcp = tconnp->conn_tcp; 24207 if (TCP_AC_MATCH(acp, tcp)) { 24208 CONN_INC_REF(tcp->tcp_connp); 24209 mp = tcp_ioctl_abort_build_msg(acp, tcp); 24210 if (mp == NULL) { 24211 err = ENOMEM; 24212 CONN_DEC_REF(tcp->tcp_connp); 24213 break; 24214 } 24215 mp->b_prev = (mblk_t *)tcp; 24216 24217 if (listhead == NULL) { 24218 listhead = mp; 24219 last = mp; 24220 } else { 24221 last->b_next = mp; 24222 last = mp; 24223 } 24224 nmatch++; 24225 if (exact) 24226 break; 24227 } 24228 24229 /* Avoid holding lock for too long. */ 24230 if (nmatch >= 500) 24231 break; 24232 } 24233 mutex_exit(&connfp->connf_lock); 24234 24235 /* Pass mp into the correct tcp */ 24236 while ((mp = listhead) != NULL) { 24237 listhead = listhead->b_next; 24238 tcp = (tcp_t *)mp->b_prev; 24239 mp->b_next = mp->b_prev = NULL; 24240 squeue_fill(tcp->tcp_connp->conn_sqp, mp, 24241 tcp_input, tcp->tcp_connp, SQTAG_TCP_ABORT_BUCKET); 24242 } 24243 24244 *count += nmatch; 24245 if (nmatch >= 500 && err == 0) 24246 goto startover; 24247 return (err); 24248 } 24249 24250 /* 24251 * Abort all connections that matches the attributes specified in acp. 24252 */ 24253 static int 24254 tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp) 24255 { 24256 sa_family_t af; 24257 uint32_t ports; 24258 uint16_t *pports; 24259 int err = 0, count = 0; 24260 boolean_t exact = B_FALSE; /* set when there is no wildcard */ 24261 int index = -1; 24262 ushort_t logflags; 24263 24264 af = acp->ac_local.ss_family; 24265 24266 if (af == AF_INET) { 24267 if (TCP_AC_V4REMOTE(acp) != INADDR_ANY && 24268 TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) { 24269 pports = (uint16_t *)&ports; 24270 pports[1] = TCP_AC_V4LPORT(acp); 24271 pports[0] = TCP_AC_V4RPORT(acp); 24272 exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY); 24273 } 24274 } else { 24275 if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) && 24276 TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) { 24277 pports = (uint16_t *)&ports; 24278 pports[1] = TCP_AC_V6LPORT(acp); 24279 pports[0] = TCP_AC_V6RPORT(acp); 24280 exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp)); 24281 } 24282 } 24283 24284 /* 24285 * For cases where remote addr, local port, and remote port are non- 24286 * wildcards, tcp_ioctl_abort_bucket will only be called once. 
24287 */ 24288 if (index != -1) { 24289 err = tcp_ioctl_abort_bucket(acp, index, 24290 &count, exact); 24291 } else { 24292 /* 24293 * loop through all entries for wildcard case 24294 */ 24295 for (index = 0; index < ipcl_conn_fanout_size; index++) { 24296 err = tcp_ioctl_abort_bucket(acp, index, 24297 &count, exact); 24298 if (err != 0) 24299 break; 24300 } 24301 } 24302 24303 logflags = SL_TRACE | SL_NOTE; 24304 /* 24305 * Don't print this message to the console if the operation was done 24306 * to a non-global zone. 24307 */ 24308 if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) 24309 logflags |= SL_CONSOLE; 24310 (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " 24311 "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); 24312 if (err == 0 && count == 0) 24313 err = ENOENT; 24314 return (err); 24315 } 24316 24317 /* 24318 * Process the TCP_IOC_ABORT_CONN ioctl request. 24319 */ 24320 static void 24321 tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) 24322 { 24323 int err; 24324 IOCP iocp; 24325 MBLKP mp1; 24326 sa_family_t laf, raf; 24327 tcp_ioc_abort_conn_t *acp; 24328 zone_t *zptr; 24329 zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid; 24330 24331 iocp = (IOCP)mp->b_rptr; 24332 24333 if ((mp1 = mp->b_cont) == NULL || 24334 iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) { 24335 err = EINVAL; 24336 goto out; 24337 } 24338 24339 /* check permissions */ 24340 if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { 24341 err = EPERM; 24342 goto out; 24343 } 24344 24345 if (mp1->b_cont != NULL) { 24346 freemsg(mp1->b_cont); 24347 mp1->b_cont = NULL; 24348 } 24349 24350 acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr; 24351 laf = acp->ac_local.ss_family; 24352 raf = acp->ac_remote.ss_family; 24353 24354 /* check that a zone with the supplied zoneid exists */ 24355 if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) { 24356 zptr = zone_find_by_id(zoneid); 24357 if (zptr != NULL) { 24358 zone_rele(zptr); 24359 } else { 24360 err = EINVAL; 24361 goto out; 24362 } 24363 } 24364 24365 if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || 24366 acp->ac_start > acp->ac_end || laf != raf || 24367 (laf != AF_INET && laf != AF_INET6)) { 24368 err = EINVAL; 24369 goto out; 24370 } 24371 24372 tcp_ioctl_abort_dump(acp); 24373 err = tcp_ioctl_abort(acp); 24374 24375 out: 24376 if (mp1 != NULL) { 24377 freemsg(mp1); 24378 mp->b_cont = NULL; 24379 } 24380 24381 if (err != 0) 24382 miocnak(q, mp, 0, err); 24383 else 24384 miocack(q, mp, 0, 0); 24385 } 24386 24387 /* 24388 * tcp_time_wait_processing() handles processing of incoming packets when 24389 * the tcp is in the TIME_WAIT state. 24390 * A TIME_WAIT tcp that has an associated open TCP stream is never put 24391 * on the time wait list. 24392 */ 24393 void 24394 tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, 24395 uint32_t seg_ack, int seg_len, tcph_t *tcph) 24396 { 24397 int32_t bytes_acked; 24398 int32_t gap; 24399 int32_t rgap; 24400 tcp_opt_t tcpopt; 24401 uint_t flags; 24402 uint32_t new_swnd = 0; 24403 conn_t *connp; 24404 24405 BUMP_LOCAL(tcp->tcp_ibsegs); 24406 TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT); 24407 24408 flags = (unsigned int)tcph->th_flags[0] & 0xFF; 24409 new_swnd = BE16_TO_U16(tcph->th_win) << 24410 ((tcph->th_flags[0] & TH_SYN) ? 
0 : tcp->tcp_snd_ws); 24411 if (tcp->tcp_snd_ts_ok) { 24412 if (!tcp_paws_check(tcp, tcph, &tcpopt)) { 24413 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 24414 tcp->tcp_rnxt, TH_ACK); 24415 goto done; 24416 } 24417 } 24418 gap = seg_seq - tcp->tcp_rnxt; 24419 rgap = tcp->tcp_rwnd - (gap + seg_len); 24420 if (gap < 0) { 24421 BUMP_MIB(&tcp_mib, tcpInDataDupSegs); 24422 UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, 24423 (seg_len > -gap ? -gap : seg_len)); 24424 seg_len += gap; 24425 if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { 24426 if (flags & TH_RST) { 24427 goto done; 24428 } 24429 if ((flags & TH_FIN) && seg_len == -1) { 24430 /* 24431 * When TCP receives a duplicate FIN in 24432 * TIME_WAIT state, restart the 2 MSL timer. 24433 * See page 73 in RFC 793. Make sure this TCP 24434 * is already on the TIME_WAIT list. If not, 24435 * just restart the timer. 24436 */ 24437 if (TCP_IS_DETACHED(tcp)) { 24438 tcp_time_wait_remove(tcp, NULL); 24439 tcp_time_wait_append(tcp); 24440 TCP_DBGSTAT(tcp_rput_time_wait); 24441 } else { 24442 ASSERT(tcp != NULL); 24443 TCP_TIMER_RESTART(tcp, 24444 tcp_time_wait_interval); 24445 } 24446 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 24447 tcp->tcp_rnxt, TH_ACK); 24448 goto done; 24449 } 24450 flags |= TH_ACK_NEEDED; 24451 seg_len = 0; 24452 goto process_ack; 24453 } 24454 24455 /* Fix seg_seq, and chew the gap off the front. */ 24456 seg_seq = tcp->tcp_rnxt; 24457 } 24458 24459 if ((flags & TH_SYN) && gap > 0 && rgap < 0) { 24460 /* 24461 * Make sure that when we accept the connection, pick 24462 * an ISS greater than (tcp_snxt + ISS_INCR/2) for the 24463 * old connection. 24464 * 24465 * The next ISS generated is equal to tcp_iss_incr_extra 24466 * + ISS_INCR/2 + other components depending on the 24467 * value of tcp_strong_iss. We pre-calculate the new 24468 * ISS here and compare with tcp_snxt to determine if 24469 * we need to make adjustment to tcp_iss_incr_extra. 24470 * 24471 * The above calculation is ugly and is a 24472 * waste of CPU cycles... 24473 */ 24474 uint32_t new_iss = tcp_iss_incr_extra; 24475 int32_t adj; 24476 24477 switch (tcp_strong_iss) { 24478 case 2: { 24479 /* Add time and MD5 components. */ 24480 uint32_t answer[4]; 24481 struct { 24482 uint32_t ports; 24483 in6_addr_t src; 24484 in6_addr_t dst; 24485 } arg; 24486 MD5_CTX context; 24487 24488 mutex_enter(&tcp_iss_key_lock); 24489 context = tcp_iss_key; 24490 mutex_exit(&tcp_iss_key_lock); 24491 arg.ports = tcp->tcp_ports; 24492 /* We use MAPPED addresses in tcp_iss_init */ 24493 arg.src = tcp->tcp_ip_src_v6; 24494 if (tcp->tcp_ipversion == IPV4_VERSION) { 24495 IN6_IPADDR_TO_V4MAPPED( 24496 tcp->tcp_ipha->ipha_dst, 24497 &arg.dst); 24498 } else { 24499 arg.dst = 24500 tcp->tcp_ip6h->ip6_dst; 24501 } 24502 MD5Update(&context, (uchar_t *)&arg, 24503 sizeof (arg)); 24504 MD5Final((uchar_t *)answer, &context); 24505 answer[0] ^= answer[1] ^ answer[2] ^ answer[3]; 24506 new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0]; 24507 break; 24508 } 24509 case 1: 24510 /* Add time component and min random (i.e. 1). */ 24511 new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1; 24512 break; 24513 default: 24514 /* Add only time component. */ 24515 new_iss += (uint32_t)gethrestime_sec() * ISS_INCR; 24516 break; 24517 } 24518 if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { 24519 /* 24520 * New ISS not guaranteed to be ISS_INCR/2 24521 * ahead of the current tcp_snxt, so add the 24522 * difference to tcp_iss_incr_extra. 
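 *
 * For example (numbers are illustrative only): if tcp_snxt is
 * 5000 and the new_iss just computed is 4200, adj is 800;
 * adding 800 to tcp_iss_incr_extra ensures that the ISS
 * eventually chosen by tcp_iss_init() for the new incarnation
 * comes out ahead of the old tcp_snxt, as required above.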
24523 */ 24524 tcp_iss_incr_extra += adj; 24525 } 24526 /* 24527 * If tcp_clean_death() can not perform the task now, 24528 * drop the SYN packet and let the other side re-xmit. 24529 * Otherwise pass the SYN packet back in, since the 24530 * old tcp state has been cleaned up or freed. 24531 */ 24532 if (tcp_clean_death(tcp, 0, 27) == -1) 24533 goto done; 24534 /* 24535 * We will come back to tcp_rput_data 24536 * on the global queue. Packets destined 24537 * for the global queue will be checked 24538 * with global policy. But the policy for 24539 * this packet has already been checked as 24540 * this was destined for the detached 24541 * connection. We need to bypass policy 24542 * check this time by attaching a dummy 24543 * ipsec_in with ipsec_in_dont_check set. 24544 */ 24545 if ((connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid)) != 24546 NULL) { 24547 TCP_STAT(tcp_time_wait_syn_success); 24548 tcp_reinput(connp, mp, tcp->tcp_connp->conn_sqp); 24549 return; 24550 } 24551 goto done; 24552 } 24553 24554 /* 24555 * rgap is the amount of stuff received out of window. A negative 24556 * value is the amount out of window. 24557 */ 24558 if (rgap < 0) { 24559 BUMP_MIB(&tcp_mib, tcpInDataPastWinSegs); 24560 UPDATE_MIB(&tcp_mib, tcpInDataPastWinBytes, -rgap); 24561 /* Fix seg_len and make sure there is something left. */ 24562 seg_len += rgap; 24563 if (seg_len <= 0) { 24564 if (flags & TH_RST) { 24565 goto done; 24566 } 24567 flags |= TH_ACK_NEEDED; 24568 seg_len = 0; 24569 goto process_ack; 24570 } 24571 } 24572 /* 24573 * Check whether we can update tcp_ts_recent. This test is 24574 * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP 24575 * Extensions for High Performance: An Update", Internet Draft. 24576 */ 24577 if (tcp->tcp_snd_ts_ok && 24578 TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && 24579 SEQ_LEQ(seg_seq, tcp->tcp_rack)) { 24580 tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; 24581 tcp->tcp_last_rcv_lbolt = lbolt64; 24582 } 24583 24584 if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { 24585 /* Always ack out of order packets */ 24586 flags |= TH_ACK_NEEDED; 24587 seg_len = 0; 24588 } else if (seg_len > 0) { 24589 BUMP_MIB(&tcp_mib, tcpInClosed); 24590 BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); 24591 UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, seg_len); 24592 } 24593 if (flags & TH_RST) { 24594 (void) tcp_clean_death(tcp, 0, 28); 24595 goto done; 24596 } 24597 if (flags & TH_SYN) { 24598 tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, 24599 TH_RST|TH_ACK); 24600 /* 24601 * Do not delete the TCP structure if it is in 24602 * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 24603 */ 24604 goto done; 24605 } 24606 process_ack: 24607 if (flags & TH_ACK) { 24608 bytes_acked = (int)(seg_ack - tcp->tcp_suna); 24609 if (bytes_acked <= 0) { 24610 if (bytes_acked == 0 && seg_len == 0 && 24611 new_swnd == tcp->tcp_swnd) 24612 BUMP_MIB(&tcp_mib, tcpInDupAck); 24613 } else { 24614 /* Acks something not sent */ 24615 flags |= TH_ACK_NEEDED; 24616 } 24617 } 24618 if (flags & TH_ACK_NEEDED) { 24619 /* 24620 * Time to send an ack for some reason. 24621 */ 24622 tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, 24623 tcp->tcp_rnxt, TH_ACK); 24624 } 24625 done: 24626 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 24627 DB_CKSUMSTART(mp) = 0; 24628 mp->b_datap->db_struioflag &= ~STRUIO_EAGER; 24629 TCP_STAT(tcp_time_wait_syn_fail); 24630 } 24631 freemsg(mp); 24632 } 24633 24634 /* 24635 * Allocate a T_SVR4_OPTMGMT_REQ. 
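 * The mblk built by tcp_setsockopt_mp() is laid out as follows (a sketch of
 * what the code below produces):
 *
 *	struct T_optmgmt_req	PRIM_type = T_SVR4_OPTMGMT_REQ,
 *				MGMT_flags = T_NEGOTIATE,
 *				OPT_length/OPT_offset covering what follows
 *	struct opthdr		level, name and len of the single option
 *	option data		optlen bytes copied from 'opt'
 *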
24636 * The caller needs to increment tcp_drop_opt_ack_cnt when sending these so 24637 * that tcp_rput_other can drop the acks. 24638 */ 24639 static mblk_t * 24640 tcp_setsockopt_mp(int level, int cmd, char *opt, int optlen) 24641 { 24642 mblk_t *mp; 24643 struct T_optmgmt_req *tor; 24644 struct opthdr *oh; 24645 uint_t size; 24646 char *optptr; 24647 24648 size = sizeof (*tor) + sizeof (*oh) + optlen; 24649 mp = allocb(size, BPRI_MED); 24650 if (mp == NULL) 24651 return (NULL); 24652 24653 mp->b_wptr += size; 24654 mp->b_datap->db_type = M_PROTO; 24655 tor = (struct T_optmgmt_req *)mp->b_rptr; 24656 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 24657 tor->MGMT_flags = T_NEGOTIATE; 24658 tor->OPT_length = sizeof (*oh) + optlen; 24659 tor->OPT_offset = (t_scalar_t)sizeof (*tor); 24660 24661 oh = (struct opthdr *)&tor[1]; 24662 oh->level = level; 24663 oh->name = cmd; 24664 oh->len = optlen; 24665 if (optlen != 0) { 24666 optptr = (char *)&oh[1]; 24667 bcopy(opt, optptr, optlen); 24668 } 24669 return (mp); 24670 } 24671 24672 /* 24673 * TCP Timers Implementation. 24674 */ 24675 timeout_id_t 24676 tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim) 24677 { 24678 mblk_t *mp; 24679 tcp_timer_t *tcpt; 24680 tcp_t *tcp = connp->conn_tcp; 24681 24682 ASSERT(connp->conn_sqp != NULL); 24683 24684 TCP_DBGSTAT(tcp_timeout_calls); 24685 24686 if (tcp->tcp_timercache == NULL) { 24687 mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); 24688 } else { 24689 TCP_DBGSTAT(tcp_timeout_cached_alloc); 24690 mp = tcp->tcp_timercache; 24691 tcp->tcp_timercache = mp->b_next; 24692 mp->b_next = NULL; 24693 ASSERT(mp->b_wptr == NULL); 24694 } 24695 24696 CONN_INC_REF(connp); 24697 tcpt = (tcp_timer_t *)mp->b_rptr; 24698 tcpt->connp = connp; 24699 tcpt->tcpt_proc = f; 24700 tcpt->tcpt_tid = timeout(tcp_timer_callback, mp, tim); 24701 return ((timeout_id_t)mp); 24702 } 24703 24704 static void 24705 tcp_timer_callback(void *arg) 24706 { 24707 mblk_t *mp = (mblk_t *)arg; 24708 tcp_timer_t *tcpt; 24709 conn_t *connp; 24710 24711 tcpt = (tcp_timer_t *)mp->b_rptr; 24712 connp = tcpt->connp; 24713 squeue_fill(connp->conn_sqp, mp, 24714 tcp_timer_handler, connp, SQTAG_TCP_TIMER); 24715 } 24716 24717 static void 24718 tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) 24719 { 24720 tcp_timer_t *tcpt; 24721 conn_t *connp = (conn_t *)arg; 24722 tcp_t *tcp = connp->conn_tcp; 24723 24724 tcpt = (tcp_timer_t *)mp->b_rptr; 24725 ASSERT(connp == tcpt->connp); 24726 ASSERT((squeue_t *)arg2 == connp->conn_sqp); 24727 24728 /* 24729 * If the TCP has reached the closed state, don't proceed any 24730 * further. This TCP logically does not exist on the system. 24731 * tcpt_proc could for example access queues, that have already 24732 * been qprocoff'ed off. Also see comments at the start of tcp_input 24733 */ 24734 if (tcp->tcp_state != TCPS_CLOSED) { 24735 (*tcpt->tcpt_proc)(connp); 24736 } else { 24737 tcp->tcp_timer_tid = 0; 24738 } 24739 tcp_timer_free(connp->conn_tcp, mp); 24740 } 24741 24742 /* 24743 * There is potential race with untimeout and the handler firing at the same 24744 * time. The mblock may be freed by the handler while we are trying to use 24745 * it. But since both should execute on the same squeue, this race should not 24746 * occur. 
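 *
 * A minimal usage sketch (illustrative only; my_expire is a made-up callback
 * name and tid/tim are assumed locals, not code from this file):
 *
 *	static void
 *	my_expire(void *arg)
 *	{
 *		conn_t *connp = (conn_t *)arg;
 *
 *		... executes on connp->conn_sqp via tcp_timer_handler() ...
 *	}
 *
 *	timeout_id_t tid = tcp_timeout(connp, my_expire, tim);
 *	...
 *	if (tcp_timeout_cancel(connp, tid) >= 0) {
 *		... the timer did not fire; the reference taken by
 *		tcp_timeout() has been released ...
 *	}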
24747 */ 24748 clock_t 24749 tcp_timeout_cancel(conn_t *connp, timeout_id_t id) 24750 { 24751 mblk_t *mp = (mblk_t *)id; 24752 tcp_timer_t *tcpt; 24753 clock_t delta; 24754 24755 TCP_DBGSTAT(tcp_timeout_cancel_reqs); 24756 24757 if (mp == NULL) 24758 return (-1); 24759 24760 tcpt = (tcp_timer_t *)mp->b_rptr; 24761 ASSERT(tcpt->connp == connp); 24762 24763 delta = untimeout(tcpt->tcpt_tid); 24764 24765 if (delta >= 0) { 24766 TCP_DBGSTAT(tcp_timeout_canceled); 24767 tcp_timer_free(connp->conn_tcp, mp); 24768 CONN_DEC_REF(connp); 24769 } 24770 24771 return (delta); 24772 } 24773 24774 /* 24775 * Allocate space for the timer event. The allocation looks like mblk, but it is 24776 * not a proper mblk. To avoid confusion we set b_wptr to NULL. 24777 * 24778 * Dealing with failures: If we can't allocate from the timer cache we try 24779 * allocating from dblock caches using allocb_tryhard(). In this case b_wptr 24780 * points to b_rptr. 24781 * If we can't allocate anything using allocb_tryhard(), we perform a last 24782 * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and 24783 * save the actual allocation size in b_datap. 24784 */ 24785 mblk_t * 24786 tcp_timermp_alloc(int kmflags) 24787 { 24788 mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache, 24789 kmflags & ~KM_PANIC); 24790 24791 if (mp != NULL) { 24792 mp->b_next = mp->b_prev = NULL; 24793 mp->b_rptr = (uchar_t *)(&mp[1]); 24794 mp->b_wptr = NULL; 24795 mp->b_datap = NULL; 24796 mp->b_queue = NULL; 24797 } else if (kmflags & KM_PANIC) { 24798 /* 24799 * Failed to allocate memory for the timer. Try allocating from 24800 * dblock caches. 24801 */ 24802 TCP_STAT(tcp_timermp_allocfail); 24803 mp = allocb_tryhard(sizeof (tcp_timer_t)); 24804 if (mp == NULL) { 24805 size_t size = 0; 24806 /* 24807 * Memory is really low. Try tryhard allocation. 24808 */ 24809 TCP_STAT(tcp_timermp_allocdblfail); 24810 mp = kmem_alloc_tryhard(sizeof (mblk_t) + 24811 sizeof (tcp_timer_t), &size, kmflags); 24812 mp->b_rptr = (uchar_t *)(&mp[1]); 24813 mp->b_next = mp->b_prev = NULL; 24814 mp->b_wptr = (uchar_t *)-1; 24815 mp->b_datap = (dblk_t *)size; 24816 mp->b_queue = NULL; 24817 } 24818 ASSERT(mp->b_wptr != NULL); 24819 } 24820 TCP_DBGSTAT(tcp_timermp_alloced); 24821 24822 return (mp); 24823 } 24824 24825 /* 24826 * Free per-tcp timer cache. 24827 * It can only contain entries from tcp_timercache. 24828 */ 24829 void 24830 tcp_timermp_free(tcp_t *tcp) 24831 { 24832 mblk_t *mp; 24833 24834 while ((mp = tcp->tcp_timercache) != NULL) { 24835 ASSERT(mp->b_wptr == NULL); 24836 tcp->tcp_timercache = tcp->tcp_timercache->b_next; 24837 kmem_cache_free(tcp_timercache, mp); 24838 } 24839 } 24840 24841 /* 24842 * Free timer event. Put it on the per-tcp timer cache if there is not too many 24843 * events there already (currently at most two events are cached). 24844 * If the event is not allocated from the timer cache, free it right away. 24845 */ 24846 static void 24847 tcp_timer_free(tcp_t *tcp, mblk_t *mp) 24848 { 24849 mblk_t *mp1 = tcp->tcp_timercache; 24850 24851 if (mp->b_wptr != NULL) { 24852 /* 24853 * This allocation is not from a timer cache, free it right 24854 * away. 
24855 */ 24856 if (mp->b_wptr != (uchar_t *)-1) 24857 freeb(mp); 24858 else 24859 kmem_free(mp, (size_t)mp->b_datap); 24860 } else if (mp1 == NULL || mp1->b_next == NULL) { 24861 /* Cache this timer block for future allocations */ 24862 mp->b_rptr = (uchar_t *)(&mp[1]); 24863 mp->b_next = mp1; 24864 tcp->tcp_timercache = mp; 24865 } else { 24866 kmem_cache_free(tcp_timercache, mp); 24867 TCP_DBGSTAT(tcp_timermp_freed); 24868 } 24869 } 24870 24871 /* 24872 * End of TCP Timers implementation. 24873 */ 24874 24875 /* 24876 * tcp_{set,clr}qfull() functions are used to either set or clear QFULL 24877 * on the specified backing STREAMS q. Note, the caller may make the 24878 * decision to call based on the tcp_t.tcp_flow_stopped value which 24879 * when check outside the q's lock is only an advisory check ... 24880 */ 24881 24882 void 24883 tcp_setqfull(tcp_t *tcp) 24884 { 24885 queue_t *q = tcp->tcp_wq; 24886 24887 if (!(q->q_flag & QFULL)) { 24888 mutex_enter(QLOCK(q)); 24889 if (!(q->q_flag & QFULL)) { 24890 /* still need to set QFULL */ 24891 q->q_flag |= QFULL; 24892 tcp->tcp_flow_stopped = B_TRUE; 24893 mutex_exit(QLOCK(q)); 24894 TCP_STAT(tcp_flwctl_on); 24895 } else { 24896 mutex_exit(QLOCK(q)); 24897 } 24898 } 24899 } 24900 24901 void 24902 tcp_clrqfull(tcp_t *tcp) 24903 { 24904 queue_t *q = tcp->tcp_wq; 24905 24906 if (q->q_flag & QFULL) { 24907 mutex_enter(QLOCK(q)); 24908 if (q->q_flag & QFULL) { 24909 q->q_flag &= ~QFULL; 24910 tcp->tcp_flow_stopped = B_FALSE; 24911 mutex_exit(QLOCK(q)); 24912 if (q->q_flag & QWANTW) 24913 qbackenable(q, 0); 24914 } else { 24915 mutex_exit(QLOCK(q)); 24916 } 24917 } 24918 } 24919 24920 /* 24921 * TCP Kstats implementation 24922 */ 24923 static void 24924 tcp_kstat_init(void) 24925 { 24926 tcp_named_kstat_t template = { 24927 { "rtoAlgorithm", KSTAT_DATA_INT32, 0 }, 24928 { "rtoMin", KSTAT_DATA_INT32, 0 }, 24929 { "rtoMax", KSTAT_DATA_INT32, 0 }, 24930 { "maxConn", KSTAT_DATA_INT32, 0 }, 24931 { "activeOpens", KSTAT_DATA_UINT32, 0 }, 24932 { "passiveOpens", KSTAT_DATA_UINT32, 0 }, 24933 { "attemptFails", KSTAT_DATA_UINT32, 0 }, 24934 { "estabResets", KSTAT_DATA_UINT32, 0 }, 24935 { "currEstab", KSTAT_DATA_UINT32, 0 }, 24936 { "inSegs", KSTAT_DATA_UINT32, 0 }, 24937 { "outSegs", KSTAT_DATA_UINT32, 0 }, 24938 { "retransSegs", KSTAT_DATA_UINT32, 0 }, 24939 { "connTableSize", KSTAT_DATA_INT32, 0 }, 24940 { "outRsts", KSTAT_DATA_UINT32, 0 }, 24941 { "outDataSegs", KSTAT_DATA_UINT32, 0 }, 24942 { "outDataBytes", KSTAT_DATA_UINT32, 0 }, 24943 { "retransBytes", KSTAT_DATA_UINT32, 0 }, 24944 { "outAck", KSTAT_DATA_UINT32, 0 }, 24945 { "outAckDelayed", KSTAT_DATA_UINT32, 0 }, 24946 { "outUrg", KSTAT_DATA_UINT32, 0 }, 24947 { "outWinUpdate", KSTAT_DATA_UINT32, 0 }, 24948 { "outWinProbe", KSTAT_DATA_UINT32, 0 }, 24949 { "outControl", KSTAT_DATA_UINT32, 0 }, 24950 { "outFastRetrans", KSTAT_DATA_UINT32, 0 }, 24951 { "inAckSegs", KSTAT_DATA_UINT32, 0 }, 24952 { "inAckBytes", KSTAT_DATA_UINT32, 0 }, 24953 { "inDupAck", KSTAT_DATA_UINT32, 0 }, 24954 { "inAckUnsent", KSTAT_DATA_UINT32, 0 }, 24955 { "inDataInorderSegs", KSTAT_DATA_UINT32, 0 }, 24956 { "inDataInorderBytes", KSTAT_DATA_UINT32, 0 }, 24957 { "inDataUnorderSegs", KSTAT_DATA_UINT32, 0 }, 24958 { "inDataUnorderBytes", KSTAT_DATA_UINT32, 0 }, 24959 { "inDataDupSegs", KSTAT_DATA_UINT32, 0 }, 24960 { "inDataDupBytes", KSTAT_DATA_UINT32, 0 }, 24961 { "inDataPartDupSegs", KSTAT_DATA_UINT32, 0 }, 24962 { "inDataPartDupBytes", KSTAT_DATA_UINT32, 0 }, 24963 { "inDataPastWinSegs", KSTAT_DATA_UINT32, 0 }, 24964 { 
"inDataPastWinBytes", KSTAT_DATA_UINT32, 0 }, 24965 { "inWinProbe", KSTAT_DATA_UINT32, 0 }, 24966 { "inWinUpdate", KSTAT_DATA_UINT32, 0 }, 24967 { "inClosed", KSTAT_DATA_UINT32, 0 }, 24968 { "rttUpdate", KSTAT_DATA_UINT32, 0 }, 24969 { "rttNoUpdate", KSTAT_DATA_UINT32, 0 }, 24970 { "timRetrans", KSTAT_DATA_UINT32, 0 }, 24971 { "timRetransDrop", KSTAT_DATA_UINT32, 0 }, 24972 { "timKeepalive", KSTAT_DATA_UINT32, 0 }, 24973 { "timKeepaliveProbe", KSTAT_DATA_UINT32, 0 }, 24974 { "timKeepaliveDrop", KSTAT_DATA_UINT32, 0 }, 24975 { "listenDrop", KSTAT_DATA_UINT32, 0 }, 24976 { "listenDropQ0", KSTAT_DATA_UINT32, 0 }, 24977 { "halfOpenDrop", KSTAT_DATA_UINT32, 0 }, 24978 { "outSackRetransSegs", KSTAT_DATA_UINT32, 0 }, 24979 { "connTableSize6", KSTAT_DATA_INT32, 0 } 24980 }; 24981 24982 tcp_mibkp = kstat_create(TCP_MOD_NAME, 0, TCP_MOD_NAME, 24983 "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0); 24984 24985 if (tcp_mibkp == NULL) 24986 return; 24987 24988 template.rtoAlgorithm.value.ui32 = 4; 24989 template.rtoMin.value.ui32 = tcp_rexmit_interval_min; 24990 template.rtoMax.value.ui32 = tcp_rexmit_interval_max; 24991 template.maxConn.value.i32 = -1; 24992 24993 bcopy(&template, tcp_mibkp->ks_data, sizeof (template)); 24994 24995 tcp_mibkp->ks_update = tcp_kstat_update; 24996 24997 kstat_install(tcp_mibkp); 24998 } 24999 25000 static void 25001 tcp_kstat_fini(void) 25002 { 25003 25004 if (tcp_mibkp != NULL) { 25005 kstat_delete(tcp_mibkp); 25006 tcp_mibkp = NULL; 25007 } 25008 } 25009 25010 static int 25011 tcp_kstat_update(kstat_t *kp, int rw) 25012 { 25013 tcp_named_kstat_t *tcpkp; 25014 tcp_t *tcp; 25015 connf_t *connfp; 25016 conn_t *connp; 25017 int i; 25018 25019 if (!kp || !kp->ks_data) 25020 return (EIO); 25021 25022 if (rw == KSTAT_WRITE) 25023 return (EACCES); 25024 25025 tcpkp = (tcp_named_kstat_t *)kp->ks_data; 25026 25027 tcpkp->currEstab.value.ui32 = 0; 25028 25029 for (i = 0; i < CONN_G_HASH_SIZE; i++) { 25030 connfp = &ipcl_globalhash_fanout[i]; 25031 connp = NULL; 25032 while ((connp = 25033 ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { 25034 tcp = connp->conn_tcp; 25035 switch (tcp_snmp_state(tcp)) { 25036 case MIB2_TCP_established: 25037 case MIB2_TCP_closeWait: 25038 tcpkp->currEstab.value.ui32++; 25039 break; 25040 } 25041 } 25042 } 25043 25044 tcpkp->activeOpens.value.ui32 = tcp_mib.tcpActiveOpens; 25045 tcpkp->passiveOpens.value.ui32 = tcp_mib.tcpPassiveOpens; 25046 tcpkp->attemptFails.value.ui32 = tcp_mib.tcpAttemptFails; 25047 tcpkp->estabResets.value.ui32 = tcp_mib.tcpEstabResets; 25048 tcpkp->inSegs.value.ui32 = tcp_mib.tcpInSegs; 25049 tcpkp->outSegs.value.ui32 = tcp_mib.tcpOutSegs; 25050 tcpkp->retransSegs.value.ui32 = tcp_mib.tcpRetransSegs; 25051 tcpkp->connTableSize.value.i32 = tcp_mib.tcpConnTableSize; 25052 tcpkp->outRsts.value.ui32 = tcp_mib.tcpOutRsts; 25053 tcpkp->outDataSegs.value.ui32 = tcp_mib.tcpOutDataSegs; 25054 tcpkp->outDataBytes.value.ui32 = tcp_mib.tcpOutDataBytes; 25055 tcpkp->retransBytes.value.ui32 = tcp_mib.tcpRetransBytes; 25056 tcpkp->outAck.value.ui32 = tcp_mib.tcpOutAck; 25057 tcpkp->outAckDelayed.value.ui32 = tcp_mib.tcpOutAckDelayed; 25058 tcpkp->outUrg.value.ui32 = tcp_mib.tcpOutUrg; 25059 tcpkp->outWinUpdate.value.ui32 = tcp_mib.tcpOutWinUpdate; 25060 tcpkp->outWinProbe.value.ui32 = tcp_mib.tcpOutWinProbe; 25061 tcpkp->outControl.value.ui32 = tcp_mib.tcpOutControl; 25062 tcpkp->outFastRetrans.value.ui32 = tcp_mib.tcpOutFastRetrans; 25063 tcpkp->inAckSegs.value.ui32 = tcp_mib.tcpInAckSegs; 25064 
tcpkp->inAckBytes.value.ui32 = tcp_mib.tcpInAckBytes; 25065 tcpkp->inDupAck.value.ui32 = tcp_mib.tcpInDupAck; 25066 tcpkp->inAckUnsent.value.ui32 = tcp_mib.tcpInAckUnsent; 25067 tcpkp->inDataInorderSegs.value.ui32 = tcp_mib.tcpInDataInorderSegs; 25068 tcpkp->inDataInorderBytes.value.ui32 = tcp_mib.tcpInDataInorderBytes; 25069 tcpkp->inDataUnorderSegs.value.ui32 = tcp_mib.tcpInDataUnorderSegs; 25070 tcpkp->inDataUnorderBytes.value.ui32 = tcp_mib.tcpInDataUnorderBytes; 25071 tcpkp->inDataDupSegs.value.ui32 = tcp_mib.tcpInDataDupSegs; 25072 tcpkp->inDataDupBytes.value.ui32 = tcp_mib.tcpInDataDupBytes; 25073 tcpkp->inDataPartDupSegs.value.ui32 = tcp_mib.tcpInDataPartDupSegs; 25074 tcpkp->inDataPartDupBytes.value.ui32 = tcp_mib.tcpInDataPartDupBytes; 25075 tcpkp->inDataPastWinSegs.value.ui32 = tcp_mib.tcpInDataPastWinSegs; 25076 tcpkp->inDataPastWinBytes.value.ui32 = tcp_mib.tcpInDataPastWinBytes; 25077 tcpkp->inWinProbe.value.ui32 = tcp_mib.tcpInWinProbe; 25078 tcpkp->inWinUpdate.value.ui32 = tcp_mib.tcpInWinUpdate; 25079 tcpkp->inClosed.value.ui32 = tcp_mib.tcpInClosed; 25080 tcpkp->rttNoUpdate.value.ui32 = tcp_mib.tcpRttNoUpdate; 25081 tcpkp->rttUpdate.value.ui32 = tcp_mib.tcpRttUpdate; 25082 tcpkp->timRetrans.value.ui32 = tcp_mib.tcpTimRetrans; 25083 tcpkp->timRetransDrop.value.ui32 = tcp_mib.tcpTimRetransDrop; 25084 tcpkp->timKeepalive.value.ui32 = tcp_mib.tcpTimKeepalive; 25085 tcpkp->timKeepaliveProbe.value.ui32 = tcp_mib.tcpTimKeepaliveProbe; 25086 tcpkp->timKeepaliveDrop.value.ui32 = tcp_mib.tcpTimKeepaliveDrop; 25087 tcpkp->listenDrop.value.ui32 = tcp_mib.tcpListenDrop; 25088 tcpkp->listenDropQ0.value.ui32 = tcp_mib.tcpListenDropQ0; 25089 tcpkp->halfOpenDrop.value.ui32 = tcp_mib.tcpHalfOpenDrop; 25090 tcpkp->outSackRetransSegs.value.ui32 = tcp_mib.tcpOutSackRetransSegs; 25091 tcpkp->connTableSize6.value.i32 = tcp_mib.tcp6ConnTableSize; 25092 25093 return (0); 25094 } 25095 25096 void 25097 tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) 25098 { 25099 uint16_t hdr_len; 25100 ipha_t *ipha; 25101 uint8_t *nexthdrp; 25102 tcph_t *tcph; 25103 25104 /* Already has an eager */ 25105 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { 25106 TCP_STAT(tcp_reinput_syn); 25107 squeue_enter(connp->conn_sqp, mp, connp->conn_recv, 25108 connp, SQTAG_TCP_REINPUT_EAGER); 25109 return; 25110 } 25111 25112 switch (IPH_HDR_VERSION(mp->b_rptr)) { 25113 case IPV4_VERSION: 25114 ipha = (ipha_t *)mp->b_rptr; 25115 hdr_len = IPH_HDR_LENGTH(ipha); 25116 break; 25117 case IPV6_VERSION: 25118 if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, 25119 &hdr_len, &nexthdrp)) { 25120 CONN_DEC_REF(connp); 25121 freemsg(mp); 25122 return; 25123 } 25124 break; 25125 } 25126 25127 tcph = (tcph_t *)&mp->b_rptr[hdr_len]; 25128 if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { 25129 mp->b_datap->db_struioflag |= STRUIO_EAGER; 25130 DB_CKSUMSTART(mp) = (intptr_t)sqp; 25131 } 25132 25133 squeue_fill(connp->conn_sqp, mp, connp->conn_recv, connp, 25134 SQTAG_TCP_REINPUT); 25135 } 25136 25137 static squeue_func_t 25138 tcp_squeue_switch(int val) 25139 { 25140 squeue_func_t rval = squeue_fill; 25141 25142 switch (val) { 25143 case 1: 25144 rval = squeue_enter_nodrain; 25145 break; 25146 case 2: 25147 rval = squeue_enter; 25148 break; 25149 default: 25150 break; 25151 } 25152 return (rval); 25153 } 25154 25155 static void 25156 tcp_squeue_add(squeue_t *sqp) 25157 { 25158 tcp_squeue_priv_t *tcp_time_wait = kmem_zalloc( 25159 sizeof (tcp_squeue_priv_t), KM_SLEEP); 25160 25161 *squeue_getprivate(sqp, 
SQPRIVATE_TCP) = (intptr_t)tcp_time_wait; 25162 tcp_time_wait->tcp_time_wait_tid = timeout(tcp_time_wait_collector, 25163 sqp, TCP_TIME_WAIT_DELAY); 25164 if (tcp_free_list_max_cnt == 0) { 25165 int tcp_ncpus = ((boot_max_ncpus == -1) ? 25166 max_ncpus : boot_max_ncpus); 25167 25168 /* 25169 * Limit number of entries to 1% of available memory / tcp_ncpus 25170 */ 25171 tcp_free_list_max_cnt = (freemem * PAGESIZE) / 25172 (tcp_ncpus * sizeof (tcp_t) * 100); 25173 } 25174 tcp_time_wait->tcp_free_list_cnt = 0; 25175 } 25176
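/*
 * Sizing note for tcp_squeue_add() above (the figures are an illustrative
 * example, not measurements): when tcp_free_list_max_cnt is left at 0 the
 * per-squeue free list limit is derived as
 *
 *	tcp_free_list_max_cnt = (freemem * PAGESIZE) /
 *	    (tcp_ncpus * sizeof (tcp_t) * 100)
 *
 * i.e. roughly 1% of free memory split across the CPUs and expressed in
 * tcp_t-sized entries.  With, say, 4 GB free, 8 CPUs and a tcp_t of about
 * 2 KB, this works out to roughly 4 GB / (8 * 2 KB * 100), or about 2600
 * cached tcp_t's per squeue.
 */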