1ff550d0eSmasputra /* 2ff550d0eSmasputra * CDDL HEADER START 3ff550d0eSmasputra * 4ff550d0eSmasputra * The contents of this file are subject to the terms of the 5c203fc81Skrishna * Common Development and Distribution License (the "License"). 6c203fc81Skrishna * You may not use this file except in compliance with the License. 7ff550d0eSmasputra * 8ff550d0eSmasputra * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9ff550d0eSmasputra * or http://www.opensolaris.org/os/licensing. 10ff550d0eSmasputra * See the License for the specific language governing permissions 11ff550d0eSmasputra * and limitations under the License. 12ff550d0eSmasputra * 13ff550d0eSmasputra * When distributing Covered Code, include this CDDL HEADER in each 14ff550d0eSmasputra * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15ff550d0eSmasputra * If applicable, add the following below this CDDL HEADER, with the 16ff550d0eSmasputra * fields enclosed by brackets "[]" replaced with your own identifying 17ff550d0eSmasputra * information: Portions Copyright [yyyy] [name of copyright owner] 18ff550d0eSmasputra * 19ff550d0eSmasputra * CDDL HEADER END 20ff550d0eSmasputra */ 21ff550d0eSmasputra /* 2266cd0f60SKacheong Poon * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
23ff550d0eSmasputra  */
24ff550d0eSmasputra 
25ff550d0eSmasputra #include <sys/types.h>
26ff550d0eSmasputra #include <sys/stream.h>
27ff550d0eSmasputra #include <sys/strsun.h>
28ff550d0eSmasputra #include <sys/strsubr.h>
29ff550d0eSmasputra #include <sys/debug.h>
30381a2a9aSdr146992 #include <sys/sdt.h>
31ff550d0eSmasputra #include <sys/cmn_err.h>
32ff550d0eSmasputra #include <sys/tihdr.h>
33ff550d0eSmasputra 
34ff550d0eSmasputra #include <inet/common.h>
35fc80c0dfSnordmark #include <inet/optcom.h>
36ff550d0eSmasputra #include <inet/ip.h>
37e11c3f44Smeem #include <inet/ip_if.h>
38ff550d0eSmasputra #include <inet/ip_impl.h>
39ff550d0eSmasputra #include <inet/tcp.h>
40ff550d0eSmasputra #include <inet/tcp_impl.h>
41ff550d0eSmasputra #include <inet/ipsec_impl.h>
42ff550d0eSmasputra #include <inet/ipclassifier.h>
43ff550d0eSmasputra #include <inet/ipp_common.h>
4491762968SBrian Ruthven #include <inet/ip_if.h>
45ff550d0eSmasputra 
46ff550d0eSmasputra /*
47ff550d0eSmasputra  * This file implements TCP fusion - a protocol-less data path for TCP
48ff550d0eSmasputra  * loopback connections. The fusion of two local TCP endpoints occurs
49ff550d0eSmasputra  * at connection establishment time. Various conditions (see details
50ff550d0eSmasputra  * in tcp_fuse()) need to be met for fusion to be successful. If it
51ff550d0eSmasputra  * fails, we fall back to the regular TCP data path; if it succeeds,
52ff550d0eSmasputra  * both endpoints proceed to use tcp_fuse_output() as the transmit path.
53ff550d0eSmasputra  * tcp_fuse_output() enqueues application data directly onto the peer's
547b8f5432SAnders Persson  * receive queue; no protocol processing is involved.
55ff550d0eSmasputra  *
56e0968231Svi117747  * Synchronization is handled by squeue and the mutex tcp_non_sq_lock.
57ff550d0eSmasputra  * One of the requirements for fusion to succeed is that both endpoints
58ff550d0eSmasputra  * need to be using the same squeue. 
This ensures that neither side
597b8f5432SAnders Persson  * can disappear while the other side is still sending data. Flow
607b8f5432SAnders Persson  * control information is manipulated outside the squeue, so the
617b8f5432SAnders Persson  * tcp_non_sq_lock must be held when touching tcp_flow_stopped.
62ff550d0eSmasputra  */
63ff550d0eSmasputra 
64ff550d0eSmasputra /*
65ff550d0eSmasputra  * Setting this to false means we disable fusion altogether and
66ff550d0eSmasputra  * loopback connections would go through the protocol paths.
67ff550d0eSmasputra  */
68ff550d0eSmasputra boolean_t do_tcp_fusion = B_TRUE;
69ff550d0eSmasputra 
70ff550d0eSmasputra /*
71ff550d0eSmasputra  * This routine gets called by the eager tcp upon changing state from
72ff550d0eSmasputra  * SYN_RCVD to ESTABLISHED. It fuses a direct path between itself
73ff550d0eSmasputra  * and the active connect tcp such that the regular tcp processing
74ff550d0eSmasputra  * may be bypassed under allowable circumstances. Because the fusion
75ff550d0eSmasputra  * requires both endpoints to be in the same squeue, it does not work
76ff550d0eSmasputra  * for simultaneous active connects because there is no easy way to
77ff550d0eSmasputra  * switch from one squeue to another once the connection is created.
78ff550d0eSmasputra  * This is different from the eager tcp case where we assign it the
79ff550d0eSmasputra  * same squeue as the one given to the active connect tcp during open.
80ff550d0eSmasputra */ 81ff550d0eSmasputra void 82bd670b35SErik Nordmark tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcpha_t *tcpha) 83ff550d0eSmasputra { 84ff550d0eSmasputra conn_t *peer_connp, *connp = tcp->tcp_connp; 85ff550d0eSmasputra tcp_t *peer_tcp; 86f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 87f4b3ec61Sdh155122 netstack_t *ns; 88f4b3ec61Sdh155122 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; 89ff550d0eSmasputra 90ff550d0eSmasputra ASSERT(!tcp->tcp_fused); 91ff550d0eSmasputra ASSERT(tcp->tcp_loopback); 92ff550d0eSmasputra ASSERT(tcp->tcp_loopback_peer == NULL); 93ff550d0eSmasputra /* 94bd670b35SErik Nordmark * We need to inherit conn_rcvbuf of the listener tcp, 9579c0745dSRao Shoaib * but we can't really use tcp_listener since we get here after 96bd670b35SErik Nordmark * sending up T_CONN_IND and tcp_tli_accept() may be called 9779c0745dSRao Shoaib * independently, at which point tcp_listener is cleared; 9879c0745dSRao Shoaib * this is why we use tcp_saved_listener. The listener itself 9979c0745dSRao Shoaib * is guaranteed to be around until tcp_accept_finish() is called 10079c0745dSRao Shoaib * on this eager -- this won't happen until we're done since we're 10179c0745dSRao Shoaib * inside the eager's perimeter now. 102ff550d0eSmasputra */ 103bd670b35SErik Nordmark ASSERT(tcp->tcp_saved_listener != NULL); 104ff550d0eSmasputra /* 105ff550d0eSmasputra * Lookup peer endpoint; search for the remote endpoint having 106ff550d0eSmasputra * the reversed address-port quadruplet in ESTABLISHED state, 107ff550d0eSmasputra * which is guaranteed to be unique in the system. Zone check 108ff550d0eSmasputra * is applied accordingly for loopback address, but not for 109ff550d0eSmasputra * local address since we want fusion to happen across Zones. 
110ff550d0eSmasputra */ 111bd670b35SErik Nordmark if (connp->conn_ipversion == IPV4_VERSION) { 112ff550d0eSmasputra peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp, 113bd670b35SErik Nordmark (ipha_t *)iphdr, tcpha, ipst); 114ff550d0eSmasputra } else { 115ff550d0eSmasputra peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp, 116bd670b35SErik Nordmark (ip6_t *)iphdr, tcpha, ipst); 117ff550d0eSmasputra } 118ff550d0eSmasputra 119ff550d0eSmasputra /* 120ff550d0eSmasputra * We can only proceed if peer exists, resides in the same squeue 1214da9f95bSAnders Persson * as our conn and is not raw-socket. We also restrict fusion to 1224da9f95bSAnders Persson * endpoints of the same type (STREAMS or non-STREAMS). The squeue 1234da9f95bSAnders Persson * assignment of this eager tcp was done earlier at the time of SYN 1244da9f95bSAnders Persson * processing in ip_fanout_tcp{_v6}. Note that similar squeues by 1254da9f95bSAnders Persson * itself doesn't guarantee a safe condition to fuse, hence we perform 126ff550d0eSmasputra * additional tests below. 
127ff550d0eSmasputra */ 128ff550d0eSmasputra ASSERT(peer_connp == NULL || peer_connp != connp); 129ff550d0eSmasputra if (peer_connp == NULL || peer_connp->conn_sqp != connp->conn_sqp || 1304da9f95bSAnders Persson !IPCL_IS_TCP(peer_connp) || 1314da9f95bSAnders Persson IPCL_IS_NONSTR(connp) != IPCL_IS_NONSTR(peer_connp)) { 132ff550d0eSmasputra if (peer_connp != NULL) { 133f4b3ec61Sdh155122 TCP_STAT(tcps, tcp_fusion_unqualified); 134ff550d0eSmasputra CONN_DEC_REF(peer_connp); 135ff550d0eSmasputra } 136ff550d0eSmasputra return; 137ff550d0eSmasputra } 138ff550d0eSmasputra peer_tcp = peer_connp->conn_tcp; /* active connect tcp */ 139ff550d0eSmasputra 140ff550d0eSmasputra ASSERT(peer_tcp != NULL && peer_tcp != tcp && !peer_tcp->tcp_fused); 141ee6f0546SErik Nordmark ASSERT(peer_tcp->tcp_loopback_peer == NULL); 142ff550d0eSmasputra ASSERT(peer_connp->conn_sqp == connp->conn_sqp); 143ff550d0eSmasputra 144ff550d0eSmasputra /* 145ee6f0546SErik Nordmark * Due to IRE changes the peer and us might not agree on tcp_loopback. 146ee6f0546SErik Nordmark * We bail in that case. 147ee6f0546SErik Nordmark */ 148ee6f0546SErik Nordmark if (!peer_tcp->tcp_loopback) { 149ee6f0546SErik Nordmark TCP_STAT(tcps, tcp_fusion_unqualified); 150ee6f0546SErik Nordmark CONN_DEC_REF(peer_connp); 151ee6f0546SErik Nordmark return; 152ee6f0546SErik Nordmark } 153ee6f0546SErik Nordmark /* 154ff550d0eSmasputra * Fuse the endpoints; we perform further checks against both 155ff550d0eSmasputra * tcp endpoints to ensure that a fusion is allowed to happen. 
156ff550d0eSmasputra */ 157f4b3ec61Sdh155122 ns = tcps->tcps_netstack; 158f4b3ec61Sdh155122 ipst = ns->netstack_ip; 159f4b3ec61Sdh155122 160ff550d0eSmasputra if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable && 161*dd49f125SAnders Persson tcp->tcp_xmit_head == NULL && peer_tcp->tcp_xmit_head == NULL) { 162ff550d0eSmasputra mblk_t *mp; 163bd670b35SErik Nordmark queue_t *peer_rq = peer_connp->conn_rq; 164ff550d0eSmasputra 1650f1702c5SYu Xiangning ASSERT(!TCP_IS_DETACHED(peer_tcp)); 166bd670b35SErik Nordmark ASSERT(tcp->tcp_fused_sigurg_mp == NULL); 167bd670b35SErik Nordmark ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL); 168ff550d0eSmasputra 169ff550d0eSmasputra /* 170ff550d0eSmasputra * We need to drain data on both endpoints during unfuse. 171ff550d0eSmasputra * If we need to send up SIGURG at the time of draining, 172ff550d0eSmasputra * we want to be sure that an mblk is readily available. 173ff550d0eSmasputra * This is why we pre-allocate the M_PCSIG mblks for both 174ff550d0eSmasputra * endpoints which will only be used during/after unfuse. 17501765833SAnders Persson * The mblk might already exist if we are doing a re-fuse. 
176ff550d0eSmasputra */ 1770f1702c5SYu Xiangning if (!IPCL_IS_NONSTR(tcp->tcp_connp)) { 1784da9f95bSAnders Persson ASSERT(!IPCL_IS_NONSTR(peer_tcp->tcp_connp)); 1794da9f95bSAnders Persson 18001765833SAnders Persson if (tcp->tcp_fused_sigurg_mp == NULL) { 181ff550d0eSmasputra if ((mp = allocb(1, BPRI_HI)) == NULL) 182ff550d0eSmasputra goto failed; 183ff550d0eSmasputra tcp->tcp_fused_sigurg_mp = mp; 18401765833SAnders Persson } 185ff550d0eSmasputra 18601765833SAnders Persson if (peer_tcp->tcp_fused_sigurg_mp == NULL) { 187ff550d0eSmasputra if ((mp = allocb(1, BPRI_HI)) == NULL) 188ff550d0eSmasputra goto failed; 189ff550d0eSmasputra peer_tcp->tcp_fused_sigurg_mp = mp; 19001765833SAnders Persson } 191ff550d0eSmasputra 1924da9f95bSAnders Persson if ((mp = allocb(sizeof (struct stroptions), 1934da9f95bSAnders Persson BPRI_HI)) == NULL) 194ff550d0eSmasputra goto failed; 1950f1702c5SYu Xiangning } 196ff550d0eSmasputra 197ff550d0eSmasputra /* Fuse both endpoints */ 198ff550d0eSmasputra peer_tcp->tcp_loopback_peer = tcp; 199ff550d0eSmasputra tcp->tcp_loopback_peer = peer_tcp; 200ff550d0eSmasputra peer_tcp->tcp_fused = tcp->tcp_fused = B_TRUE; 201ff550d0eSmasputra 202ff550d0eSmasputra /* 203ff550d0eSmasputra * We never use regular tcp paths in fusion and should 204ff550d0eSmasputra * therefore clear tcp_unsent on both endpoints. Having 205ff550d0eSmasputra * them set to non-zero values means asking for trouble 206ff550d0eSmasputra * especially after unfuse, where we may end up sending 207ff550d0eSmasputra * through regular tcp paths which expect xmit_list and 208ff550d0eSmasputra * friends to be correctly setup. 209ff550d0eSmasputra */ 210ff550d0eSmasputra peer_tcp->tcp_unsent = tcp->tcp_unsent = 0; 211ff550d0eSmasputra 212ff550d0eSmasputra tcp_timers_stop(tcp); 213ff550d0eSmasputra tcp_timers_stop(peer_tcp); 214ff550d0eSmasputra 21579c0745dSRao Shoaib /* 21679c0745dSRao Shoaib * Set receive buffer and max packet size for the 21779c0745dSRao Shoaib * active open tcp. 
21879c0745dSRao Shoaib * eager's values will be set in tcp_accept_finish. 21979c0745dSRao Shoaib */ 220bd670b35SErik Nordmark (void) tcp_rwnd_set(peer_tcp, peer_tcp->tcp_connp->conn_rcvbuf); 221ff550d0eSmasputra 222ff550d0eSmasputra /* 22379c0745dSRao Shoaib * Set the write offset value to zero since we won't 22479c0745dSRao Shoaib * be needing any room for TCP/IP headers. 225ff550d0eSmasputra */ 2260f1702c5SYu Xiangning if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) { 2270f1702c5SYu Xiangning struct stroptions *stropt; 2280f1702c5SYu Xiangning 229ff550d0eSmasputra DB_TYPE(mp) = M_SETOPTS; 230ff550d0eSmasputra mp->b_wptr += sizeof (*stropt); 231ff550d0eSmasputra 232ff550d0eSmasputra stropt = (struct stroptions *)mp->b_rptr; 2333e95bd4aSAnders Persson stropt->so_flags = SO_WROFF | SO_MAXBLK; 234ff550d0eSmasputra stropt->so_wroff = 0; 2353e95bd4aSAnders Persson stropt->so_maxblk = INFPSZ; 236ff550d0eSmasputra 237ff550d0eSmasputra /* Send the options up */ 238ff550d0eSmasputra putnext(peer_rq, mp); 2390f1702c5SYu Xiangning } else { 2400f1702c5SYu Xiangning struct sock_proto_props sopp; 2410f1702c5SYu Xiangning 2420f1702c5SYu Xiangning /* The peer is a non-STREAMS end point */ 2430f1702c5SYu Xiangning ASSERT(IPCL_IS_TCP(peer_connp)); 2440f1702c5SYu Xiangning 2453e95bd4aSAnders Persson sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_MAXBLK; 2460f1702c5SYu Xiangning sopp.sopp_wroff = 0; 2473e95bd4aSAnders Persson sopp.sopp_maxblk = INFPSZ; 2480f1702c5SYu Xiangning (*peer_connp->conn_upcalls->su_set_proto_props) 2490f1702c5SYu Xiangning (peer_connp->conn_upper_handle, &sopp); 2500f1702c5SYu Xiangning } 2512f9e7e9bSAnders Persson } else { 252f4b3ec61Sdh155122 TCP_STAT(tcps, tcp_fusion_unqualified); 253ff550d0eSmasputra } 254ff550d0eSmasputra CONN_DEC_REF(peer_connp); 255ff550d0eSmasputra return; 256ff550d0eSmasputra 257ff550d0eSmasputra failed: 258ff550d0eSmasputra if (tcp->tcp_fused_sigurg_mp != NULL) { 259ff550d0eSmasputra freeb(tcp->tcp_fused_sigurg_mp); 260ff550d0eSmasputra 
tcp->tcp_fused_sigurg_mp = NULL; 261ff550d0eSmasputra } 262ff550d0eSmasputra if (peer_tcp->tcp_fused_sigurg_mp != NULL) { 263ff550d0eSmasputra freeb(peer_tcp->tcp_fused_sigurg_mp); 264ff550d0eSmasputra peer_tcp->tcp_fused_sigurg_mp = NULL; 265ff550d0eSmasputra } 266ff550d0eSmasputra CONN_DEC_REF(peer_connp); 267ff550d0eSmasputra } 268ff550d0eSmasputra 269ff550d0eSmasputra /* 270ff550d0eSmasputra * Unfuse a previously-fused pair of tcp loopback endpoints. 271ff550d0eSmasputra */ 272ff550d0eSmasputra void 273ff550d0eSmasputra tcp_unfuse(tcp_t *tcp) 274ff550d0eSmasputra { 275ff550d0eSmasputra tcp_t *peer_tcp = tcp->tcp_loopback_peer; 2767b8f5432SAnders Persson tcp_stack_t *tcps = tcp->tcp_tcps; 277ff550d0eSmasputra 278ff550d0eSmasputra ASSERT(tcp->tcp_fused && peer_tcp != NULL); 279ff550d0eSmasputra ASSERT(peer_tcp->tcp_fused && peer_tcp->tcp_loopback_peer == tcp); 280ff550d0eSmasputra ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); 281ff550d0eSmasputra ASSERT(tcp->tcp_unsent == 0 && peer_tcp->tcp_unsent == 0); 282ff550d0eSmasputra 283ff550d0eSmasputra /* 2847b8f5432SAnders Persson * Cancel any pending push timers. 285ff550d0eSmasputra */ 2867b8f5432SAnders Persson if (tcp->tcp_push_tid != 0) { 2877b8f5432SAnders Persson (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); 2887b8f5432SAnders Persson tcp->tcp_push_tid = 0; 2897b8f5432SAnders Persson } 2907b8f5432SAnders Persson if (peer_tcp->tcp_push_tid != 0) { 2917b8f5432SAnders Persson (void) TCP_TIMER_CANCEL(peer_tcp, peer_tcp->tcp_push_tid); 2927b8f5432SAnders Persson peer_tcp->tcp_push_tid = 0; 2937b8f5432SAnders Persson } 2947b8f5432SAnders Persson 2957b8f5432SAnders Persson /* 2967b8f5432SAnders Persson * Drain any pending data; Note that in case of a detached tcp, the 2977b8f5432SAnders Persson * draining will happen later after the tcp is unfused. For non- 2987b8f5432SAnders Persson * urgent data, this can be handled by the regular tcp_rcv_drain(). 
2997b8f5432SAnders Persson * If we have urgent data sitting in the receive list, we will 3007b8f5432SAnders Persson * need to send up a SIGURG signal first before draining the data. 3017b8f5432SAnders Persson * All of these will be handled by the code in tcp_fuse_rcv_drain() 3027b8f5432SAnders Persson * when called from tcp_rcv_drain(). 3037b8f5432SAnders Persson */ 3047b8f5432SAnders Persson if (!TCP_IS_DETACHED(tcp)) { 305bd670b35SErik Nordmark (void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp, 3067b8f5432SAnders Persson &tcp->tcp_fused_sigurg_mp); 3077b8f5432SAnders Persson } 3087b8f5432SAnders Persson if (!TCP_IS_DETACHED(peer_tcp)) { 309bd670b35SErik Nordmark (void) tcp_fuse_rcv_drain(peer_tcp->tcp_connp->conn_rq, 310bd670b35SErik Nordmark peer_tcp, &peer_tcp->tcp_fused_sigurg_mp); 3117b8f5432SAnders Persson } 3127b8f5432SAnders Persson 3137b8f5432SAnders Persson /* Lift up any flow-control conditions */ 3147b8f5432SAnders Persson mutex_enter(&tcp->tcp_non_sq_lock); 3157b8f5432SAnders Persson if (tcp->tcp_flow_stopped) { 3167b8f5432SAnders Persson tcp_clrqfull(tcp); 3177b8f5432SAnders Persson TCP_STAT(tcps, tcp_fusion_backenabled); 3187b8f5432SAnders Persson } 3197b8f5432SAnders Persson mutex_exit(&tcp->tcp_non_sq_lock); 3207b8f5432SAnders Persson 3217b8f5432SAnders Persson mutex_enter(&peer_tcp->tcp_non_sq_lock); 3227b8f5432SAnders Persson if (peer_tcp->tcp_flow_stopped) { 3237b8f5432SAnders Persson tcp_clrqfull(peer_tcp); 3247b8f5432SAnders Persson TCP_STAT(tcps, tcp_fusion_backenabled); 3257b8f5432SAnders Persson } 3267b8f5432SAnders Persson mutex_exit(&peer_tcp->tcp_non_sq_lock); 327ff550d0eSmasputra 328ff550d0eSmasputra /* 329bd670b35SErik Nordmark * Update tha_seq and tha_ack in the header template 330ff550d0eSmasputra */ 331bd670b35SErik Nordmark tcp->tcp_tcpha->tha_seq = htonl(tcp->tcp_snxt); 332bd670b35SErik Nordmark tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); 333bd670b35SErik Nordmark peer_tcp->tcp_tcpha->tha_seq = htonl(peer_tcp->tcp_snxt); 
334bd670b35SErik Nordmark peer_tcp->tcp_tcpha->tha_ack = htonl(peer_tcp->tcp_rnxt); 335ff550d0eSmasputra 336ff550d0eSmasputra /* Unfuse the endpoints */ 337ff550d0eSmasputra peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE; 338ff550d0eSmasputra peer_tcp->tcp_loopback_peer = tcp->tcp_loopback_peer = NULL; 3390f1702c5SYu Xiangning } 340ff550d0eSmasputra 341ff550d0eSmasputra /* 3424da9f95bSAnders Persson * Fusion output routine used to handle urgent data sent by STREAMS based 3434da9f95bSAnders Persson * endpoints. This routine is called by tcp_fuse_output() for handling 3444da9f95bSAnders Persson * non-M_DATA mblks. 345ff550d0eSmasputra */ 346ff550d0eSmasputra void 347ff550d0eSmasputra tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp) 348ff550d0eSmasputra { 349ff550d0eSmasputra mblk_t *mp1; 350ff550d0eSmasputra struct T_exdata_ind *tei; 351ff550d0eSmasputra tcp_t *peer_tcp = tcp->tcp_loopback_peer; 352ff550d0eSmasputra mblk_t *head, *prev_head = NULL; 353f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 354ff550d0eSmasputra 355ff550d0eSmasputra ASSERT(tcp->tcp_fused); 356ff550d0eSmasputra ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); 3577b8f5432SAnders Persson ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); 358ff550d0eSmasputra ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); 359ff550d0eSmasputra ASSERT(mp->b_cont != NULL && DB_TYPE(mp->b_cont) == M_DATA); 360ff550d0eSmasputra ASSERT(MBLKL(mp) >= sizeof (*tei) && MBLKL(mp->b_cont) > 0); 361ff550d0eSmasputra 362ff550d0eSmasputra /* 363ff550d0eSmasputra * Urgent data arrives in the form of T_EXDATA_REQ from above. 364ff550d0eSmasputra * Each occurence denotes a new urgent pointer. For each new 365ff550d0eSmasputra * urgent pointer we signal (SIGURG) the receiving app to indicate 366ff550d0eSmasputra * that it needs to go into urgent mode. This is similar to the 367ff550d0eSmasputra * urgent data handling in the regular tcp. 
We don't need to keep 368ff550d0eSmasputra * track of where the urgent pointer is, because each T_EXDATA_REQ 369ff550d0eSmasputra * "advances" the urgent pointer for us. 370ff550d0eSmasputra * 371ff550d0eSmasputra * The actual urgent data carried by T_EXDATA_REQ is then prepended 372ff550d0eSmasputra * by a T_EXDATA_IND before being enqueued behind any existing data 373ff550d0eSmasputra * destined for the receiving app. There is only a single urgent 374ff550d0eSmasputra * pointer (out-of-band mark) for a given tcp. If the new urgent 375ff550d0eSmasputra * data arrives before the receiving app reads some existing urgent 376ff550d0eSmasputra * data, the previous marker is lost. This behavior is emulated 377ff550d0eSmasputra * accordingly below, by removing any existing T_EXDATA_IND messages 378ff550d0eSmasputra * and essentially converting old urgent data into non-urgent. 379ff550d0eSmasputra */ 380ff550d0eSmasputra ASSERT(tcp->tcp_valid_bits & TCP_URG_VALID); 381ff550d0eSmasputra /* Let sender get out of urgent mode */ 382ff550d0eSmasputra tcp->tcp_valid_bits &= ~TCP_URG_VALID; 383ff550d0eSmasputra 384ff550d0eSmasputra /* 385ff550d0eSmasputra * This flag indicates that a signal needs to be sent up. 386ff550d0eSmasputra * This flag will only get cleared once SIGURG is delivered and 387ff550d0eSmasputra * is not affected by the tcp_fused flag -- delivery will still 388ff550d0eSmasputra * happen even after an endpoint is unfused, to handle the case 389ff550d0eSmasputra * where the sending endpoint immediately closes/unfuses after 390ff550d0eSmasputra * sending urgent data and the accept is not yet finished. 
391ff550d0eSmasputra */ 392ff550d0eSmasputra peer_tcp->tcp_fused_sigurg = B_TRUE; 393ff550d0eSmasputra 394ff550d0eSmasputra /* Reuse T_EXDATA_REQ mblk for T_EXDATA_IND */ 395ff550d0eSmasputra DB_TYPE(mp) = M_PROTO; 396ff550d0eSmasputra tei = (struct T_exdata_ind *)mp->b_rptr; 397ff550d0eSmasputra tei->PRIM_type = T_EXDATA_IND; 398ff550d0eSmasputra tei->MORE_flag = 0; 399ff550d0eSmasputra mp->b_wptr = (uchar_t *)&tei[1]; 400ff550d0eSmasputra 401f4b3ec61Sdh155122 TCP_STAT(tcps, tcp_fusion_urg); 402721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutUrg); 403ff550d0eSmasputra 404ff550d0eSmasputra head = peer_tcp->tcp_rcv_list; 405ff550d0eSmasputra while (head != NULL) { 406ff550d0eSmasputra /* 407ff550d0eSmasputra * Remove existing T_EXDATA_IND, keep the data which follows 408ff550d0eSmasputra * it and relink our list. Note that we don't modify the 409ff550d0eSmasputra * tcp_rcv_last_tail since it never points to T_EXDATA_IND. 410ff550d0eSmasputra */ 411ff550d0eSmasputra if (DB_TYPE(head) != M_DATA) { 412ff550d0eSmasputra mp1 = head; 413ff550d0eSmasputra 414ff550d0eSmasputra ASSERT(DB_TYPE(mp1->b_cont) == M_DATA); 415ff550d0eSmasputra head = mp1->b_cont; 416ff550d0eSmasputra mp1->b_cont = NULL; 417ff550d0eSmasputra head->b_next = mp1->b_next; 418ff550d0eSmasputra mp1->b_next = NULL; 419ff550d0eSmasputra if (prev_head != NULL) 420ff550d0eSmasputra prev_head->b_next = head; 421ff550d0eSmasputra if (peer_tcp->tcp_rcv_list == mp1) 422ff550d0eSmasputra peer_tcp->tcp_rcv_list = head; 423ff550d0eSmasputra if (peer_tcp->tcp_rcv_last_head == mp1) 424ff550d0eSmasputra peer_tcp->tcp_rcv_last_head = head; 425ff550d0eSmasputra freeb(mp1); 426ff550d0eSmasputra } 427ff550d0eSmasputra prev_head = head; 428ff550d0eSmasputra head = head->b_next; 429ff550d0eSmasputra } 430ff550d0eSmasputra } 431ff550d0eSmasputra 432ff550d0eSmasputra /* 433ff550d0eSmasputra * Fusion output routine, called by tcp_output() and tcp_wput_proto(). 
434e0968231Svi117747 * If we are modifying any member that can be changed outside the squeue, 435e0968231Svi117747 * like tcp_flow_stopped, we need to take tcp_non_sq_lock. 436ff550d0eSmasputra */ 437ff550d0eSmasputra boolean_t 438ff550d0eSmasputra tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) 439ff550d0eSmasputra { 440bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 441ff550d0eSmasputra tcp_t *peer_tcp = tcp->tcp_loopback_peer; 442bd670b35SErik Nordmark conn_t *peer_connp = peer_tcp->tcp_connp; 4432c5134dbSudpa boolean_t flow_stopped, peer_data_queued = B_FALSE; 444ff550d0eSmasputra boolean_t urgent = (DB_TYPE(mp) != M_DATA); 4459910327fSanders boolean_t push = B_TRUE; 446381a2a9aSdr146992 mblk_t *mp1 = mp; 447381a2a9aSdr146992 uint_t ip_hdr_len; 448381a2a9aSdr146992 uint32_t recv_size = send_size; 449f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 450f4b3ec61Sdh155122 netstack_t *ns = tcps->tcps_netstack; 451f4b3ec61Sdh155122 ip_stack_t *ipst = ns->netstack_ip; 452bd670b35SErik Nordmark ipsec_stack_t *ipss = ns->netstack_ipsec; 453bd670b35SErik Nordmark iaflags_t ixaflags = connp->conn_ixa->ixa_flags; 454bd670b35SErik Nordmark boolean_t do_ipsec, hooks_out, hooks_in, ipobs_enabled; 455ff550d0eSmasputra 456ff550d0eSmasputra ASSERT(tcp->tcp_fused); 457ff550d0eSmasputra ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); 458bd670b35SErik Nordmark ASSERT(connp->conn_sqp == peer_connp->conn_sqp); 459ff550d0eSmasputra ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO || 460ff550d0eSmasputra DB_TYPE(mp) == M_PCPROTO); 461ff550d0eSmasputra 462ff550d0eSmasputra if (send_size == 0) { 463ff550d0eSmasputra freemsg(mp); 464ff550d0eSmasputra return (B_TRUE); 465ff550d0eSmasputra } 466ff550d0eSmasputra 467ff550d0eSmasputra /* 468ff550d0eSmasputra * Handle urgent data; we either send up SIGURG to the peer now 469ff550d0eSmasputra * or do it later when we drain, in case the peer is detached 470ff550d0eSmasputra * or if we're short of 
memory for M_PCSIG mblk. 471ff550d0eSmasputra */ 472ff550d0eSmasputra if (urgent) { 473ff550d0eSmasputra tcp_fuse_output_urg(tcp, mp); 474381a2a9aSdr146992 475381a2a9aSdr146992 mp1 = mp->b_cont; 476381a2a9aSdr146992 } 477381a2a9aSdr146992 478381a2a9aSdr146992 /* 479bd670b35SErik Nordmark * Check that we are still using an IRE_LOCAL or IRE_LOOPBACK before 480bd670b35SErik Nordmark * further processes. 481381a2a9aSdr146992 */ 482bd670b35SErik Nordmark if (!ip_output_verify_local(connp->conn_ixa)) 483bd670b35SErik Nordmark goto unfuse; 484bd670b35SErik Nordmark 485bd670b35SErik Nordmark /* 486bd670b35SErik Nordmark * Build IP and TCP header in case we have something that needs the 487bd670b35SErik Nordmark * headers. Those cases are: 488bd670b35SErik Nordmark * 1. IPsec 489bd670b35SErik Nordmark * 2. IPobs 490bd670b35SErik Nordmark * 3. FW_HOOKS 491bd670b35SErik Nordmark * 492bd670b35SErik Nordmark * If tcp_xmit_mp() fails to dupb() the message, unfuse the connection 493bd670b35SErik Nordmark * and back to regular path. 
494bd670b35SErik Nordmark */ 495bd670b35SErik Nordmark if (ixaflags & IXAF_IS_IPV4) { 496bd670b35SErik Nordmark do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) || 497bd670b35SErik Nordmark CONN_INBOUND_POLICY_PRESENT(peer_connp, ipss); 498bd670b35SErik Nordmark 499bd670b35SErik Nordmark hooks_out = HOOKS4_INTERESTED_LOOPBACK_OUT(ipst); 500bd670b35SErik Nordmark hooks_in = HOOKS4_INTERESTED_LOOPBACK_IN(ipst); 501bd670b35SErik Nordmark ipobs_enabled = (ipst->ips_ip4_observe.he_interested != 0); 502bd670b35SErik Nordmark } else { 503bd670b35SErik Nordmark do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) || 504bd670b35SErik Nordmark CONN_INBOUND_POLICY_PRESENT_V6(peer_connp, ipss); 505bd670b35SErik Nordmark 506bd670b35SErik Nordmark hooks_out = HOOKS6_INTERESTED_LOOPBACK_OUT(ipst); 507bd670b35SErik Nordmark hooks_in = HOOKS6_INTERESTED_LOOPBACK_IN(ipst); 508bd670b35SErik Nordmark ipobs_enabled = (ipst->ips_ip6_observe.he_interested != 0); 509bd670b35SErik Nordmark } 510bd670b35SErik Nordmark 511bd670b35SErik Nordmark /* We do logical 'or' for efficiency */ 512bd670b35SErik Nordmark if (ipobs_enabled | do_ipsec | hooks_in | hooks_out) { 513381a2a9aSdr146992 if ((mp1 = tcp_xmit_mp(tcp, mp1, tcp->tcp_mss, NULL, NULL, 514381a2a9aSdr146992 tcp->tcp_snxt, B_TRUE, NULL, B_FALSE)) == NULL) 515381a2a9aSdr146992 /* If tcp_xmit_mp fails, use regular path */ 516381a2a9aSdr146992 goto unfuse; 517381a2a9aSdr146992 51891762968SBrian Ruthven /* 519bd670b35SErik Nordmark * Leave all IP relevant processes to ip_output_process_local(), 520bd670b35SErik Nordmark * which handles IPsec, IPobs, and FW_HOOKS. 52191762968SBrian Ruthven */ 522bd670b35SErik Nordmark mp1 = ip_output_process_local(mp1, connp->conn_ixa, hooks_out, 523bd670b35SErik Nordmark hooks_in, do_ipsec ? peer_connp : NULL); 52491762968SBrian Ruthven 525bd670b35SErik Nordmark /* If the message is dropped for any reason. 
*/ 526381a2a9aSdr146992 if (mp1 == NULL) 527381a2a9aSdr146992 goto unfuse; 528381a2a9aSdr146992 52991762968SBrian Ruthven /* 530bd670b35SErik Nordmark * Data length might have been changed by FW_HOOKS. 531bd670b35SErik Nordmark * We assume that the first mblk contains the TCP/IP headers. 53291762968SBrian Ruthven */ 533bd670b35SErik Nordmark if (hooks_in || hooks_out) { 534bd670b35SErik Nordmark tcpha_t *tcpha; 535bd670b35SErik Nordmark 536bd670b35SErik Nordmark ip_hdr_len = (ixaflags & IXAF_IS_IPV4) ? 537bd670b35SErik Nordmark IPH_HDR_LENGTH((ipha_t *)mp1->b_rptr) : 538bd670b35SErik Nordmark ip_hdr_length_v6(mp1, (ip6_t *)mp1->b_rptr); 539bd670b35SErik Nordmark 540bd670b35SErik Nordmark tcpha = (tcpha_t *)&mp1->b_rptr[ip_hdr_len]; 541bd670b35SErik Nordmark ASSERT((uchar_t *)tcpha + sizeof (tcpha_t) <= 542bd670b35SErik Nordmark mp1->b_wptr); 543bd670b35SErik Nordmark recv_size += htonl(tcpha->tha_seq) - tcp->tcp_snxt; 544bd670b35SErik Nordmark 54591762968SBrian Ruthven } 546381a2a9aSdr146992 547381a2a9aSdr146992 /* 548381a2a9aSdr146992 * The message duplicated by tcp_xmit_mp is freed. 549381a2a9aSdr146992 * Note: the original message passed in remains unchanged. 550381a2a9aSdr146992 */ 551381a2a9aSdr146992 freemsg(mp1); 552ff550d0eSmasputra } 553ff550d0eSmasputra 554ff550d0eSmasputra /* 555ff550d0eSmasputra * Enqueue data into the peer's receive list; we may or may not 556ff550d0eSmasputra * drain the contents depending on the conditions below. 5577cae9885SAnders Persson * 5587b8f5432SAnders Persson * For non-STREAMS sockets we normally queue data directly in the 5597b8f5432SAnders Persson * socket by calling the su_recv upcall. However, if the peer is 5607b8f5432SAnders Persson * detached we use tcp_rcv_enqueue() instead. Queued data will be 5617b8f5432SAnders Persson * drained when the accept completes (in tcp_accept_finish()). 
562ff550d0eSmasputra */ 563bd670b35SErik Nordmark if (IPCL_IS_NONSTR(peer_connp) && 5647b8f5432SAnders Persson !TCP_IS_DETACHED(peer_tcp)) { 5650f1702c5SYu Xiangning int error; 5660f1702c5SYu Xiangning int flags = 0; 5670f1702c5SYu Xiangning 5680f1702c5SYu Xiangning if ((tcp->tcp_valid_bits & TCP_URG_VALID) && 5690f1702c5SYu Xiangning (tcp->tcp_urg == tcp->tcp_snxt)) { 5700f1702c5SYu Xiangning flags = MSG_OOB; 571bd670b35SErik Nordmark (*peer_connp->conn_upcalls->su_signal_oob) 572bd670b35SErik Nordmark (peer_connp->conn_upper_handle, 0); 5730f1702c5SYu Xiangning tcp->tcp_valid_bits &= ~TCP_URG_VALID; 5740f1702c5SYu Xiangning } 575bd670b35SErik Nordmark if ((*peer_connp->conn_upcalls->su_recv)( 576bd670b35SErik Nordmark peer_connp->conn_upper_handle, mp, recv_size, 577f3124163SAnders Persson flags, &error, &push) < 0) { 57841174437SAnders Persson ASSERT(error != EOPNOTSUPP); 579f3124163SAnders Persson peer_data_queued = B_TRUE; 580f3124163SAnders Persson } 5810f1702c5SYu Xiangning } else { 582bd670b35SErik Nordmark if (IPCL_IS_NONSTR(peer_connp) && 5830f1702c5SYu Xiangning (tcp->tcp_valid_bits & TCP_URG_VALID) && 5840f1702c5SYu Xiangning (tcp->tcp_urg == tcp->tcp_snxt)) { 5850f1702c5SYu Xiangning /* 5860f1702c5SYu Xiangning * Can not deal with urgent pointers 5870f1702c5SYu Xiangning * that arrive before the connection has been 5880f1702c5SYu Xiangning * accept()ed. 
5890f1702c5SYu Xiangning */ 5900f1702c5SYu Xiangning tcp->tcp_valid_bits &= ~TCP_URG_VALID; 5910f1702c5SYu Xiangning freemsg(mp); 5920f1702c5SYu Xiangning return (B_TRUE); 5930f1702c5SYu Xiangning } 5940f1702c5SYu Xiangning 595bd670b35SErik Nordmark tcp_rcv_enqueue(peer_tcp, mp, recv_size, 596bd670b35SErik Nordmark tcp->tcp_connp->conn_cred); 597ff550d0eSmasputra 598ff550d0eSmasputra /* In case it wrapped around and also to keep it constant */ 599381a2a9aSdr146992 peer_tcp->tcp_rwnd += recv_size; 6007b8f5432SAnders Persson } 601ff550d0eSmasputra 602ff550d0eSmasputra /* 603ff550d0eSmasputra * Exercise flow-control when needed; we will get back-enabled 6047b8f5432SAnders Persson * in either tcp_accept_finish(), tcp_unfuse(), or when data is 6057b8f5432SAnders Persson * consumed. If peer endpoint is detached, we emulate streams flow 6067b8f5432SAnders Persson * control by checking the peer's queue size and high water mark; 6077b8f5432SAnders Persson * otherwise we simply use canputnext() to decide if we need to stop 6087b8f5432SAnders Persson * our flow. 609ff550d0eSmasputra * 610e0968231Svi117747 * Since we are accessing our tcp_flow_stopped and might modify it, 6117b8f5432SAnders Persson * we need to take tcp->tcp_non_sq_lock. 
612e0968231Svi117747 */ 613e0968231Svi117747 mutex_enter(&tcp->tcp_non_sq_lock); 614ff550d0eSmasputra flow_stopped = tcp->tcp_flow_stopped; 6157b8f5432SAnders Persson if ((TCP_IS_DETACHED(peer_tcp) && 616bd670b35SErik Nordmark (peer_tcp->tcp_rcv_cnt >= peer_connp->conn_rcvbuf)) || 6177b8f5432SAnders Persson (!TCP_IS_DETACHED(peer_tcp) && 618bd670b35SErik Nordmark !IPCL_IS_NONSTR(peer_connp) && !canputnext(peer_connp->conn_rq))) { 6192c5134dbSudpa peer_data_queued = B_TRUE; 6202c5134dbSudpa } 6212c5134dbSudpa 6222c5134dbSudpa if (!flow_stopped && (peer_data_queued || 623bd670b35SErik Nordmark (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf))) { 624ff550d0eSmasputra tcp_setqfull(tcp); 625ff550d0eSmasputra flow_stopped = B_TRUE; 626f4b3ec61Sdh155122 TCP_STAT(tcps, tcp_fusion_flowctl); 6277b8f5432SAnders Persson DTRACE_PROBE3(tcp__fuse__output__flowctl, tcp_t *, tcp, 6287b8f5432SAnders Persson uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt); 6292c5134dbSudpa } else if (flow_stopped && !peer_data_queued && 630bd670b35SErik Nordmark (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat)) { 631ff550d0eSmasputra tcp_clrqfull(tcp); 6328c0bf406Sja97890 TCP_STAT(tcps, tcp_fusion_backenabled); 633a2036d4dSmeem flow_stopped = B_FALSE; 634ff550d0eSmasputra } 635e0968231Svi117747 mutex_exit(&tcp->tcp_non_sq_lock); 6368c0bf406Sja97890 637f4b3ec61Sdh155122 ipst->ips_loopback_packets++; 638ff550d0eSmasputra tcp->tcp_last_sent_len = send_size; 639ff550d0eSmasputra 640ff550d0eSmasputra /* Need to adjust the following SNMP MIB-related variables */ 641ff550d0eSmasputra tcp->tcp_snxt += send_size; 642ff550d0eSmasputra tcp->tcp_suna = tcp->tcp_snxt; 643381a2a9aSdr146992 peer_tcp->tcp_rnxt += recv_size; 6449cd928feSAlan Maguire peer_tcp->tcp_last_recv_len = recv_size; 645ff550d0eSmasputra peer_tcp->tcp_rack = peer_tcp->tcp_rnxt; 646ff550d0eSmasputra 647721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpOutDataSegs); 648721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, send_size); 
649ff550d0eSmasputra 650721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpHCInSegs); 651721fffe3SKacheong Poon TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); 652721fffe3SKacheong Poon TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, send_size); 653ff550d0eSmasputra 654ff550d0eSmasputra BUMP_LOCAL(tcp->tcp_obsegs); 655ff550d0eSmasputra BUMP_LOCAL(peer_tcp->tcp_ibsegs); 656ff550d0eSmasputra 6579cd928feSAlan Maguire DTRACE_TCP5(send, void, NULL, ip_xmit_attr_t *, connp->conn_ixa, 6589cd928feSAlan Maguire __dtrace_tcp_void_ip_t *, NULL, tcp_t *, tcp, 6599cd928feSAlan Maguire __dtrace_tcp_tcph_t *, NULL); 6609cd928feSAlan Maguire DTRACE_TCP5(receive, void, NULL, ip_xmit_attr_t *, 6619cd928feSAlan Maguire peer_connp->conn_ixa, __dtrace_tcp_void_ip_t *, NULL, 6629cd928feSAlan Maguire tcp_t *, peer_tcp, __dtrace_tcp_tcph_t *, NULL); 663ff550d0eSmasputra 6644da9f95bSAnders Persson if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp) && 6654da9f95bSAnders Persson !TCP_IS_DETACHED(peer_tcp)) { 666ff550d0eSmasputra /* 667ff550d0eSmasputra * Drain the peer's receive queue it has urgent data or if 6687b8f5432SAnders Persson * we're not flow-controlled. 669ff550d0eSmasputra */ 6707b8f5432SAnders Persson if (urgent || !flow_stopped) { 6714da9f95bSAnders Persson ASSERT(peer_tcp->tcp_rcv_list != NULL); 67281d28f7bSmeem /* 67381d28f7bSmeem * For TLI-based streams, a thread in tcp_accept_swap() 67481d28f7bSmeem * can race with us. That thread will ensure that the 675bd670b35SErik Nordmark * correct peer_connp->conn_rq is globally visible 676bd670b35SErik Nordmark * before peer_tcp->tcp_detached is visible as clear, 677bd670b35SErik Nordmark * but we must also ensure that the load of conn_rq 678bd670b35SErik Nordmark * cannot be reordered to be before the tcp_detached 679bd670b35SErik Nordmark * check. 
			 */
			membar_consumer();
			(void) tcp_fuse_rcv_drain(peer_connp->conn_rq, peer_tcp,
			    NULL);
		}
	}
	return (B_TRUE);
unfuse:
	tcp_unfuse(tcp);
	return (B_FALSE);
}

/*
 * This routine gets called to deliver data upstream on a fused or
 * previously fused tcp loopback endpoint; the latter happens only
 * when there is a pending SIGURG signal plus urgent data that can't
 * be sent upstream in the past.
 *
 * Returns B_TRUE if the caller is done (data delivered or deferred via
 * the push timer), B_FALSE if the endpoint is no longer fused and the
 * regular tcp_rcv_drain() path must finish the job.
 */
boolean_t
tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp)
{
	mblk_t *mp;
	conn_t *connp = tcp->tcp_connp;

#ifdef DEBUG
	uint_t cnt = 0;
#endif
	tcp_stack_t *tcps = tcp->tcp_tcps;
	tcp_t *peer_tcp = tcp->tcp_loopback_peer;

	ASSERT(tcp->tcp_loopback);
	ASSERT(tcp->tcp_fused || tcp->tcp_fused_sigurg);
	ASSERT(!tcp->tcp_fused || tcp->tcp_loopback_peer != NULL);
	ASSERT(IPCL_IS_NONSTR(connp) || sigurg_mpp != NULL || tcp->tcp_fused);

	/* No need for the push timer now, in case it was scheduled */
	if (tcp->tcp_push_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
		tcp->tcp_push_tid = 0;
	}
	/*
	 * If there's urgent data sitting in receive list and we didn't
	 * get a chance to send up a SIGURG signal, make sure we send
	 * it first before draining in order to ensure that SIOCATMARK
	 * works properly.
	 */
	if (tcp->tcp_fused_sigurg) {
		/* Urgent-data signaling is a STREAMS-only path. */
		ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp));

		tcp->tcp_fused_sigurg = B_FALSE;
		/*
		 * sigurg_mpp is normally NULL, i.e. when we're still
		 * fused and didn't get here because of tcp_unfuse().
		 * In this case try hard to allocate the M_PCSIG mblk.
		 */
		if (sigurg_mpp == NULL &&
		    (mp = allocb(1, BPRI_HI)) == NULL &&
		    (mp = allocb_tryhard(1)) == NULL) {
			/* Alloc failed; try again next time */
			tcp->tcp_push_tid = TCP_TIMER(tcp,
			    tcp_push_timer, tcps->tcps_push_timer_interval);
			return (B_TRUE);
		} else if (sigurg_mpp != NULL) {
			/*
			 * Use the supplied M_PCSIG mblk; it means we're
			 * either unfused or in the process of unfusing,
			 * and the drain must happen now.
			 */
			mp = *sigurg_mpp;
			*sigurg_mpp = NULL;
		}
		ASSERT(mp != NULL);

		/* Send up the signal */
		DB_TYPE(mp) = M_PCSIG;
		*mp->b_wptr++ = (uchar_t)SIGURG;
		putnext(q, mp);

		/*
		 * Let the regular tcp_rcv_drain() path handle
		 * draining the data if we're no longer fused.
		 */
		if (!tcp->tcp_fused)
			return (B_FALSE);
	}

	/* Drain the data */
	while ((mp = tcp->tcp_rcv_list) != NULL) {
		tcp->tcp_rcv_list = mp->b_next;
		mp->b_next = NULL;
#ifdef DEBUG
		cnt += msgdsize(mp);
#endif
		ASSERT(!IPCL_IS_NONSTR(connp));
		putnext(q, mp);
		TCP_STAT(tcps, tcp_fusion_putnext);
	}

#ifdef DEBUG
	/* The per-mblk tally must match the cached receive-queue count. */
	ASSERT(cnt == tcp->tcp_rcv_cnt);
#endif
	/* Queue is now empty; reopen the receive window to the full buffer. */
	tcp->tcp_rcv_last_head = NULL;
	tcp->tcp_rcv_last_tail = NULL;
	tcp->tcp_rcv_cnt = 0;
	tcp->tcp_rwnd = tcp->tcp_connp->conn_rcvbuf;

	/*
	 * We just made room; back-enable the peer if it was flow-stopped
	 * and its unsent bytes have dropped to its low water mark.
	 */
	mutex_enter(&peer_tcp->tcp_non_sq_lock);
	if (peer_tcp->tcp_flow_stopped && (TCP_UNSENT_BYTES(peer_tcp) <=
	    peer_tcp->tcp_connp->conn_sndlowat)) {
		tcp_clrqfull(peer_tcp);
		TCP_STAT(tcps, tcp_fusion_backenabled);
	}
	mutex_exit(&peer_tcp->tcp_non_sq_lock);

	return (B_TRUE);
}

/*
 * Calculate the size of receive buffer for a fused tcp endpoint.
799ff550d0eSmasputra */ 800ff550d0eSmasputra size_t 801ff550d0eSmasputra tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd) 802ff550d0eSmasputra { 803f4b3ec61Sdh155122 tcp_stack_t *tcps = tcp->tcp_tcps; 80493fcb0b9SKacheong Poon uint32_t max_win; 805f4b3ec61Sdh155122 806ff550d0eSmasputra ASSERT(tcp->tcp_fused); 807ff550d0eSmasputra 808ff550d0eSmasputra /* Ensure that value is within the maximum upper bound */ 809f4b3ec61Sdh155122 if (rwnd > tcps->tcps_max_buf) 810f4b3ec61Sdh155122 rwnd = tcps->tcps_max_buf; 811ff550d0eSmasputra /* 812ff550d0eSmasputra * Round up to system page size in case SO_RCVBUF is modified 813ff550d0eSmasputra * after SO_SNDBUF; the latter is also similarly rounded up. 814ff550d0eSmasputra */ 815ff550d0eSmasputra rwnd = P2ROUNDUP_TYPED(rwnd, PAGESIZE, size_t); 81693fcb0b9SKacheong Poon max_win = TCP_MAXWIN << tcp->tcp_rcv_ws; 81793fcb0b9SKacheong Poon if (rwnd > max_win) { 81893fcb0b9SKacheong Poon rwnd = max_win - (max_win % tcp->tcp_mss); 81993fcb0b9SKacheong Poon if (rwnd < tcp->tcp_mss) 82093fcb0b9SKacheong Poon rwnd = max_win; 82193fcb0b9SKacheong Poon } 82279c0745dSRao Shoaib 82379c0745dSRao Shoaib /* 82479c0745dSRao Shoaib * Record high water mark, this is used for flow-control 82579c0745dSRao Shoaib * purposes in tcp_fuse_output(). 82679c0745dSRao Shoaib */ 827bd670b35SErik Nordmark tcp->tcp_connp->conn_rcvbuf = rwnd; 828bd670b35SErik Nordmark tcp->tcp_rwnd = rwnd; 829ff550d0eSmasputra return (rwnd); 830ff550d0eSmasputra } 831ff550d0eSmasputra 832ff550d0eSmasputra /* 833ff550d0eSmasputra * Calculate the maximum outstanding unread data block for a fused tcp endpoint. 
834ff550d0eSmasputra */ 835ff550d0eSmasputra int 83679c0745dSRao Shoaib tcp_fuse_maxpsz(tcp_t *tcp) 837ff550d0eSmasputra { 838ff550d0eSmasputra tcp_t *peer_tcp = tcp->tcp_loopback_peer; 839bd670b35SErik Nordmark conn_t *connp = tcp->tcp_connp; 840bd670b35SErik Nordmark uint_t sndbuf = connp->conn_sndbuf; 841ff550d0eSmasputra uint_t maxpsz = sndbuf; 842ff550d0eSmasputra 843ff550d0eSmasputra ASSERT(tcp->tcp_fused); 844ff550d0eSmasputra ASSERT(peer_tcp != NULL); 845bd670b35SErik Nordmark ASSERT(peer_tcp->tcp_connp->conn_rcvbuf != 0); 846ff550d0eSmasputra /* 847ff550d0eSmasputra * In the fused loopback case, we want the stream head to split 848ff550d0eSmasputra * up larger writes into smaller chunks for a more accurate flow- 849ff550d0eSmasputra * control accounting. Our maxpsz is half of the sender's send 850ff550d0eSmasputra * buffer or the receiver's receive buffer, whichever is smaller. 851ff550d0eSmasputra * We round up the buffer to system page size due to the lack of 852ff550d0eSmasputra * TCP MSS concept in Fusion. 853ff550d0eSmasputra */ 854bd670b35SErik Nordmark if (maxpsz > peer_tcp->tcp_connp->conn_rcvbuf) 855bd670b35SErik Nordmark maxpsz = peer_tcp->tcp_connp->conn_rcvbuf; 856ff550d0eSmasputra maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1; 857ff550d0eSmasputra 858ff550d0eSmasputra return (maxpsz); 859ff550d0eSmasputra } 860f3124163SAnders Persson 861f3124163SAnders Persson /* 862f3124163SAnders Persson * Called to release flow control. 
863f3124163SAnders Persson */ 864f3124163SAnders Persson void 865f3124163SAnders Persson tcp_fuse_backenable(tcp_t *tcp) 866f3124163SAnders Persson { 867f3124163SAnders Persson tcp_t *peer_tcp = tcp->tcp_loopback_peer; 868f3124163SAnders Persson 869f3124163SAnders Persson ASSERT(tcp->tcp_fused); 870f3124163SAnders Persson ASSERT(peer_tcp != NULL && peer_tcp->tcp_fused); 871f3124163SAnders Persson ASSERT(peer_tcp->tcp_loopback_peer == tcp); 872f3124163SAnders Persson ASSERT(!TCP_IS_DETACHED(tcp)); 873f3124163SAnders Persson ASSERT(tcp->tcp_connp->conn_sqp == 874f3124163SAnders Persson peer_tcp->tcp_connp->conn_sqp); 875f3124163SAnders Persson 876f3124163SAnders Persson if (tcp->tcp_rcv_list != NULL) 877bd670b35SErik Nordmark (void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp, NULL); 878f3124163SAnders Persson 879f3124163SAnders Persson mutex_enter(&peer_tcp->tcp_non_sq_lock); 880f3124163SAnders Persson if (peer_tcp->tcp_flow_stopped && 881f3124163SAnders Persson (TCP_UNSENT_BYTES(peer_tcp) <= 882bd670b35SErik Nordmark peer_tcp->tcp_connp->conn_sndlowat)) { 883f3124163SAnders Persson tcp_clrqfull(peer_tcp); 884f3124163SAnders Persson } 885f3124163SAnders Persson mutex_exit(&peer_tcp->tcp_non_sq_lock); 886f3124163SAnders Persson 887f3124163SAnders Persson TCP_STAT(tcp->tcp_tcps, tcp_fusion_backenabled); 888f3124163SAnders Persson } 889