/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/* This file contains all TCP TLI/TPI related functions */

/*
 * NOTE(review): the operands of the #include directives below are missing
 * in this copy of the file (they look stripped by an extraction step) and
 * must be restored from the original source before this can compile.
 */
#include
#include
#include
#include
#include
#include
#define	_SUN_TPI_VERSION 2
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

static void	tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
static int	tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *,
		    int *);

/*
 * Switch the endpoint to pure TPI operation: compute its acceptor id
 * (the read queue pointer under _ILP32, conn_dev otherwise), register it
 * in the acceptor hash, clear tcp_issocket and bump the tcp_sock_fallback
 * statistic.
 */
void
tcp_use_pure_tpi(tcp_t *tcp)
{
	conn_t		*connp = tcp->tcp_connp;

#ifdef	_ILP32
	tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
#else
	tcp->tcp_acceptor_id = connp->conn_dev;
#endif
	/*
	 * Insert this socket into the acceptor hash.
	 * We might need it for T_CONN_RES message
	 */
	tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);

	tcp->tcp_issocket = B_FALSE;
	TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
}

/*
 * Shorthand to generate and send TPI error acks to our client.
 * mp is handed to mi_tpi_err_ack_alloc(); the ack is sent up the read
 * queue only when that call returns a message.
 */
void
tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
{
	if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
		putnext(tcp->tcp_connp->conn_rq, mp);
}

/*
 * Shorthand to generate and send TPI error acks to our client.
 * Unlike tcp_err_ack(), the caller names the failing primitive explicitly
 * (stored in ERROR_prim).  Nothing is sent if tpi_ack_alloc() fails.
 */
void
tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
    int t_error, int sys_error)
{
	struct T_error_ack	*teackp;

	if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
	    M_PCPROTO, T_ERROR_ACK)) != NULL) {
		teackp = (struct T_error_ack *)mp->b_rptr;
		teackp->ERROR_prim = primitive;
		teackp->TLI_error = t_error;
		teackp->UNIX_error = sys_error;
		putnext(tcp->tcp_connp->conn_rq, mp);
	}
}

/*
 * TCP routine to get the values of options.
 * Queue-based wrapper: resolves the conn_t from q and defers to
 * tcp_opt_get().
 */
int
tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
{
	return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr));
}

/*
 * TCP routine to set the values of options.
 * Queue-based wrapper: resolves the conn_t from q and passes every other
 * argument through to tcp_opt_set() unchanged.
 */
/* ARGSUSED */
int
tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
	conn_t	*connp = Q_TO_CONN(q);

	return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp,
	    outlenp, outvalp, thisdg_attrs, cr));
}

/*
 * Run the option buffer of a T_CONN_REQ, O_T_CONN_RES or T_CONN_RES
 * message through tpi_optcom_buf().
 *
 * Returns 0 on success and -1 on failure.  When tpi_optcom_buf() fails,
 * *t_errorp / *sys_errorp carry the TPI / UNIX error to report and
 * *do_disconnectp is set to 1 if the failure was flagged as an
 * absolute-requirement failure.
 */
static int
tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
    int *t_errorp, int *sys_errorp)
{
	int error;
	int is_absreq_failure;
	t_scalar_t *opt_lenp;
	t_scalar_t opt_offset;
	int prim_type;
	struct T_conn_req *tcreqp;
	struct T_conn_res *tcresp;
	cred_t *cr;

	/*
	 * All Solaris components should pass a db_credp
	 * for this TPI message, hence we ASSERT.
	 * But in case there is some other M_PROTO that looks
	 * like a TPI message sent by some other kernel
	 * component, we check and return an error.
*/ cr = msg_getcred(mp, NULL); ASSERT(cr != NULL); if (cr == NULL) return (-1); prim_type = ((union T_primitives *)mp->b_rptr)->type; ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || prim_type == T_CONN_RES); switch (prim_type) { case T_CONN_REQ: tcreqp = (struct T_conn_req *)mp->b_rptr; opt_offset = tcreqp->OPT_offset; opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; break; case O_T_CONN_RES: case T_CONN_RES: tcresp = (struct T_conn_res *)mp->b_rptr; opt_offset = tcresp->OPT_offset; opt_lenp = (t_scalar_t *)&tcresp->OPT_length; break; } *t_errorp = 0; *sys_errorp = 0; *do_disconnectp = 0; error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, opt_offset, cr, &tcp_opt_obj, NULL, &is_absreq_failure); switch (error) { case 0: /* no error */ ASSERT(is_absreq_failure == 0); return (0); case ENOPROTOOPT: *t_errorp = TBADOPT; break; case EACCES: *t_errorp = TACCES; break; default: *t_errorp = TSYSERR; *sys_errorp = error; break; } if (is_absreq_failure != 0) { /* * The connection request should get the local ack * T_OK_ACK and then a T_DISCON_IND. */ *do_disconnectp = 1; } return (-1); } void tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) { int error; conn_t *connp = tcp->tcp_connp; struct sockaddr *sa; mblk_t *mp1; struct T_bind_req *tbr; int backlog; socklen_t len; sin_t *sin; sin6_t *sin6; cred_t *cr; /* * All Solaris components should pass a db_credp * for this TPI message, hence we ASSERT. * But in case there is some other M_PROTO that looks * like a TPI message sent by some other kernel * component, we check and return an error. 
*/ cr = msg_getcred(mp, NULL); ASSERT(cr != NULL); if (cr == NULL) { tcp_err_ack(tcp, mp, TSYSERR, EINVAL); return; } ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_tpi_bind: bad req, len %u", (uint_t)(mp->b_wptr - mp->b_rptr)); } tcp_err_ack(tcp, mp, TPROTO, 0); return; } /* Make sure the largest address fits */ mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); if (mp1 == NULL) { tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); return; } mp = mp1; tbr = (struct T_bind_req *)mp->b_rptr; backlog = tbr->CONIND_number; len = tbr->ADDR_length; switch (len) { case 0: /* request for a generic port */ tbr->ADDR_offset = sizeof (struct T_bind_req); if (connp->conn_family == AF_INET) { tbr->ADDR_length = sizeof (sin_t); sin = (sin_t *)&tbr[1]; *sin = sin_null; sin->sin_family = AF_INET; sa = (struct sockaddr *)sin; len = sizeof (sin_t); mp->b_wptr = (uchar_t *)&sin[1]; } else { ASSERT(connp->conn_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); sin6 = (sin6_t *)&tbr[1]; *sin6 = sin6_null; sin6->sin6_family = AF_INET6; sa = (struct sockaddr *)sin6; len = sizeof (sin6_t); mp->b_wptr = (uchar_t *)&sin6[1]; } break; case sizeof (sin_t): /* Complete IPv4 address */ sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, sizeof (sin_t)); break; case sizeof (sin6_t): /* Complete IPv6 address */ sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, sizeof (sin6_t)); break; default: if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_tpi_bind: bad address length, %d", tbr->ADDR_length); } tcp_err_ack(tcp, mp, TBADADDR, 0); return; } if (backlog > 0) { error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp), tbr->PRIM_type != O_T_BIND_REQ); } else { error = tcp_do_bind(connp, sa, len, DB_CRED(mp), tbr->PRIM_type != O_T_BIND_REQ); } done: if (error > 0) { tcp_err_ack(tcp, 
mp, TSYSERR, error); } else if (error < 0) { tcp_err_ack(tcp, mp, -error, 0); } else { /* * Update port information as sockfs/tpi needs it for checking */ if (connp->conn_family == AF_INET) { sin = (sin_t *)sa; sin->sin_port = connp->conn_lport; } else { sin6 = (sin6_t *)sa; sin6->sin6_port = connp->conn_lport; } mp->b_datap->db_type = M_PCPROTO; tbr->PRIM_type = T_BIND_ACK; putnext(connp->conn_rq, mp); } } /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ void tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) { conn_t *connp = tcp->tcp_connp; int error; error = tcp_do_unbind(connp); if (error > 0) { tcp_err_ack(tcp, mp, TSYSERR, error); } else if (error < 0) { tcp_err_ack(tcp, mp, -error, 0); } else { /* Send M_FLUSH according to TPI */ (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); mp = mi_tpi_ok_ack_alloc(mp); if (mp != NULL) putnext(connp->conn_rq, mp); } } /* ARGSUSED */ int tcp_tpi_close(queue_t *q, int flags, cred_t *credp __unused) { conn_t *connp; ASSERT(WR(q)->q_next == NULL); if (flags & SO_FALLBACK) { /* * stream is being closed while in fallback * simply free the resources that were allocated */ inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); qprocsoff(q); goto done; } connp = Q_TO_CONN(q); /* * We are being closed as /dev/tcp or /dev/tcp6. */ tcp_close_common(connp, flags); qprocsoff(q); inet_minor_free(connp->conn_minor_arena, connp->conn_dev); /* * Drop IP's reference on the conn. This is the last reference * on the connp if the state was less than established. If the * connection has gone into timewait state, then we will have * one ref for the TCP and one more ref (total of two) for the * classifier connected hash list (a timewait connections stays * in connected hash till closed). * * We can't assert the references because there might be other * transient reference places because of some walkers or queued * packets in squeue for the timewait state. 
*/ CONN_DEC_REF(connp); done: q->q_ptr = WR(q)->q_ptr = NULL; return (0); } /* ARGSUSED */ int tcp_tpi_close_accept(queue_t *q, int flags __unused, cred_t *credp __unused) { vmem_t *minor_arena; dev_t conn_dev; extern struct qinit tcp_acceptor_winit; ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); /* * We had opened an acceptor STREAM for sockfs which is * now being closed due to some error. */ qprocsoff(q); minor_arena = (vmem_t *)WR(q)->q_ptr; conn_dev = (dev_t)RD(q)->q_ptr; ASSERT(minor_arena != NULL); ASSERT(conn_dev != 0); inet_minor_free(minor_arena, conn_dev); q->q_ptr = WR(q)->q_ptr = NULL; return (0); } /* * Put a connection confirmation message upstream built from the * address/flowid information with the conn and iph. Report our success or * failure. */ boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira) { sin_t sin; sin6_t sin6; mblk_t *mp; char *optp = NULL; int optlen = 0; conn_t *connp = tcp->tcp_connp; if (defermp != NULL) *defermp = NULL; if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { /* * Return in T_CONN_CON results of option negotiation through * the T_CONN_REQ. Note: If there is an real end-to-end option * negotiation, then what is received from remote end needs * to be taken into account but there is no such thing (yet?) * in our TCP/IP. * Note: We do not use mi_offset_param() here as * tcp_opts_conn_req contents do not directly come from * an application and are either generated in kernel or * from user input that was already verified. 
*/ mp = tcp->tcp_conn.tcp_opts_conn_req; optp = (char *)(mp->b_rptr + ((struct T_conn_req *)mp->b_rptr)->OPT_offset); optlen = (int) ((struct T_conn_req *)mp->b_rptr)->OPT_length; } if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { /* packet is IPv4 */ if (connp->conn_family == AF_INET) { sin = sin_null; sin.sin_addr.s_addr = connp->conn_faddr_v4; sin.sin_port = connp->conn_fport; sin.sin_family = AF_INET; mp = mi_tpi_conn_con(NULL, (char *)&sin, (int)sizeof (sin_t), optp, optlen); } else { sin6 = sin6_null; sin6.sin6_addr = connp->conn_faddr_v6; sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; mp = mi_tpi_conn_con(NULL, (char *)&sin6, (int)sizeof (sin6_t), optp, optlen); } } else { ip6_t *ip6h = (ip6_t *)iphdr; ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); ASSERT(connp->conn_family == AF_INET6); sin6 = sin6_null; sin6.sin6_addr = connp->conn_faddr_v6; sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; mp = mi_tpi_conn_con(NULL, (char *)&sin6, (int)sizeof (sin6_t), optp, optlen); } if (!mp) return (B_FALSE); mblk_copycred(mp, idmp); if (defermp == NULL) { conn_t *connp = tcp->tcp_connp; if (IPCL_IS_NONSTR(connp)) { (*connp->conn_upcalls->su_connected) (connp->conn_upper_handle, tcp->tcp_connid, ira->ira_cred, ira->ira_cpid); freemsg(mp); } else { if (ira->ira_cred != NULL) { /* So that getpeerucred works for TPI sockfs */ mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); } putnext(connp->conn_rq, mp); } } else { *defermp = mp; } if (tcp->tcp_conn.tcp_opts_conn_req != NULL) tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); return (B_TRUE); } /* * Successful connect request processing begins when our client passes * a T_CONN_REQ message into tcp_wput(), which performs function calls into * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream). * * After various error checks are completed, tcp_tpi_connect() lays * the target address and port into the composite header template. 
* Then we ask IP for information, including a source address if we didn't * already have one. Finally we prepare to send the SYN packet, and then * send up the T_OK_ACK reply message. */ void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) { sin_t *sin; struct T_conn_req *tcr; struct sockaddr *sa; socklen_t len; int error; cred_t *cr; pid_t cpid; conn_t *connp = tcp->tcp_connp; queue_t *q = connp->conn_wq; /* * All Solaris components should pass a db_credp * for this TPI message, hence we ASSERT. * But in case there is some other M_PROTO that looks * like a TPI message sent by some other kernel * component, we check and return an error. */ cr = msg_getcred(mp, &cpid); ASSERT(cr != NULL); if (cr == NULL) { tcp_err_ack(tcp, mp, TSYSERR, EINVAL); return; } tcr = (struct T_conn_req *)mp->b_rptr; ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { tcp_err_ack(tcp, mp, TPROTO, 0); return; } /* * Pre-allocate the T_ordrel_ind mblk so that at close time, we * will always have that to send up. Otherwise, we need to do * special handling in case the allocation fails at that time. * If the end point is TPI, the tcp_t can be reused and the * tcp_ordrel_mp may be allocated already. */ if (tcp->tcp_ordrel_mp == NULL) { if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) { tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); return; } } /* * Determine packet type based on type of address passed in * the request should contain an IPv4 or IPv6 address. * Make sure that address family matches the type of * family of the address passed down. */ switch (tcr->DEST_length) { default: tcp_err_ack(tcp, mp, TBADADDR, 0); return; case (sizeof (sin_t) - sizeof (sin->sin_zero)): { /* * XXX: The check for valid DEST_length was not there * in earlier releases and some buggy * TLI apps (e.g Sybase) got away with not feeding * in sin_zero part of address. * We allow that bug to keep those buggy apps humming. 
* Test suites require the check on DEST_length. * We construct a new mblk with valid DEST_length * free the original so the rest of the code does * not have to keep track of this special shorter * length address case. */ mblk_t *nmp; struct T_conn_req *ntcr; sin_t *nsin; nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + tcr->OPT_length, BPRI_HI); if (nmp == NULL) { tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); return; } ntcr = (struct T_conn_req *)nmp->b_rptr; bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ ntcr->PRIM_type = T_CONN_REQ; ntcr->DEST_length = sizeof (sin_t); ntcr->DEST_offset = sizeof (struct T_conn_req); nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); *nsin = sin_null; /* Get pointer to shorter address to copy from original mp */ sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, tcr->DEST_length); /* extract DEST_length worth of sin_t */ if (sin == NULL || !OK_32PTR((char *)sin)) { freemsg(nmp); tcp_err_ack(tcp, mp, TSYSERR, EINVAL); return; } nsin->sin_family = sin->sin_family; nsin->sin_port = sin->sin_port; nsin->sin_addr = sin->sin_addr; /* Note:nsin->sin_zero zero-fill with sin_null assign above */ nmp->b_wptr = (uchar_t *)&nsin[1]; if (tcr->OPT_length != 0) { ntcr->OPT_length = tcr->OPT_length; ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; bcopy((uchar_t *)tcr + tcr->OPT_offset, (uchar_t *)ntcr + ntcr->OPT_offset, tcr->OPT_length); nmp->b_wptr += tcr->OPT_length; } freemsg(mp); /* original mp freed */ mp = nmp; /* re-initialize original variables */ tcr = ntcr; } /* FALLTHRU */ case sizeof (sin_t): sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, sizeof (sin_t)); len = sizeof (sin_t); break; case sizeof (sin6_t): sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, sizeof (sin6_t)); len = sizeof (sin6_t); break; } error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { tcp_err_ack(tcp, mp, TSYSERR, error); return; } /* * TODO: If someone in TCPS_TIME_WAIT has this dst/port we * 
should key on their sequence number and cut them loose. */ /* * If options passed in, feed it for verification and handling */ if (tcr->OPT_length != 0) { mblk_t *ok_mp; mblk_t *discon_mp; mblk_t *conn_opts_mp; int t_error, sys_error, do_disconnect; conn_opts_mp = NULL; if (tcp_conprim_opt_process(tcp, mp, &do_disconnect, &t_error, &sys_error) < 0) { if (do_disconnect) { ASSERT(t_error == 0 && sys_error == 0); discon_mp = mi_tpi_discon_ind(NULL, ECONNREFUSED, 0); if (!discon_mp) { tcp_err_ack_prim(tcp, mp, T_CONN_REQ, TSYSERR, ENOMEM); return; } ok_mp = mi_tpi_ok_ack_alloc(mp); if (!ok_mp) { tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, TSYSERR, ENOMEM); return; } qreply(q, ok_mp); qreply(q, discon_mp); /* no flush! */ } else { ASSERT(t_error != 0); tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, sys_error); } return; } /* * Success in setting options, the mp option buffer represented * by OPT_length/offset has been potentially modified and * contains results of option processing. We copy it in * another mp to save it for potentially influencing returning * it in T_CONN_CONN. */ if (tcr->OPT_length != 0) { /* there are resulting options */ conn_opts_mp = copyb(mp); if (!conn_opts_mp) { tcp_err_ack_prim(tcp, mp, T_CONN_REQ, TSYSERR, ENOMEM); return; } ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; /* * Note: * These resulting option negotiation can include any * end-to-end negotiation options but there no such * thing (yet?) in our TCP/IP. 
*/ } } /* call the non-TPI version */ error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid); if (error < 0) { mp = mi_tpi_err_ack_alloc(mp, -error, 0); } else if (error > 0) { mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); } else { mp = mi_tpi_ok_ack_alloc(mp); } /* * Note: Code below is the "failure" case */ /* return error ack and blow away saved option results if any */ connect_failed: if (mp != NULL) putnext(connp->conn_rq, mp); else { tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, TSYSERR, ENOMEM); } } /* Return the TPI/TLI equivalent of our current tcp_state */ static int tcp_tpistate(tcp_t *tcp) { switch (tcp->tcp_state) { case TCPS_IDLE: return (TS_UNBND); case TCPS_LISTEN: /* * Return whether there are outstanding T_CONN_IND waiting * for the matching T_CONN_RES. Therefore don't count q0. */ if (tcp->tcp_conn_req_cnt_q > 0) return (TS_WRES_CIND); else return (TS_IDLE); case TCPS_BOUND: return (TS_IDLE); case TCPS_SYN_SENT: return (TS_WCON_CREQ); case TCPS_SYN_RCVD: /* * Note: assumption: this has to the active open SYN_RCVD. * The passive instance is detached in SYN_RCVD stage of * incoming connection processing so we cannot get request * for T_info_ack on it. */ return (TS_WACK_CRES); case TCPS_ESTABLISHED: return (TS_DATA_XFER); case TCPS_CLOSE_WAIT: return (TS_WREQ_ORDREL); case TCPS_FIN_WAIT_1: return (TS_WIND_ORDREL); case TCPS_FIN_WAIT_2: return (TS_WIND_ORDREL); case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: case TCPS_CLOSED: /* * Following TS_WACK_DREQ7 is a rendition of "not * yet TS_IDLE" TPI state. There is no best match to any * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we * choose a value chosen that will map to TLI/XTI level * state of TSTATECHNG (state is process of changing) which * captures what this dummy state represents. 
*/ return (TS_WACK_DREQ7); default: cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", tcp->tcp_state, tcp_display(tcp, NULL, DISP_PORT_ONLY)); return (TS_UNBND); } } static void tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; extern struct T_info_ack tcp_g_t_info_ack; extern struct T_info_ack tcp_g_t_info_ack_v6; if (connp->conn_family == AF_INET6) *tia = tcp_g_t_info_ack_v6; else *tia = tcp_g_t_info_ack; tia->CURRENT_state = tcp_tpistate(tcp); tia->OPT_size = tcp_max_optsize; if (tcp->tcp_mss == 0) { /* Not yet set - tcp_open does not set mss */ if (connp->conn_ipversion == IPV4_VERSION) tia->TIDU_size = tcps->tcps_mss_def_ipv4; else tia->TIDU_size = tcps->tcps_mss_def_ipv6; } else { tia->TIDU_size = tcp->tcp_mss; } /* TODO: Default ETSDU is 1. Is that correct for tcp? */ } void tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap, t_uscalar_t cap_bits1) { tcap->CAP_bits1 = 0; if (cap_bits1 & TC1_INFO) { tcp_copy_info(&tcap->INFO_ack, tcp); tcap->CAP_bits1 |= TC1_INFO; } if (cap_bits1 & TC1_ACCEPTOR_ID) { tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; } } /* * This routine responds to T_CAPABILITY_REQ messages. It is called by * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from * tcp_g_t_info_ack. The current state of the stream is copied from * tcp_state. */ void tcp_capability_req(tcp_t *tcp, mblk_t *mp) { t_uscalar_t cap_bits1; struct T_capability_ack *tcap; if (MBLKL(mp) < sizeof (struct T_capability_req)) { freemsg(mp); return; } cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), mp->b_datap->db_type, T_CAPABILITY_ACK); if (mp == NULL) return; tcap = (struct T_capability_ack *)mp->b_rptr; tcp_do_capability_ack(tcp, tcap, cap_bits1); putnext(tcp->tcp_connp->conn_rq, mp); } /* * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. 
* Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
 * The current state of the stream is copied from tcp_state.
 *
 * mp is consumed: it is handed to tpi_ack_alloc() to become the
 * T_INFO_ACK.
 */
void
tcp_info_req(tcp_t *tcp, mblk_t *mp)
{
	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
	    T_INFO_ACK);
	if (mp == NULL) {
		/*
		 * There is no message left to convert, so build the error
		 * ack from scratch.  Passing the NULL mp to tcp_err_ack()
		 * gave it nothing to work with.
		 */
		tcp_err_ack_prim(tcp, NULL, T_INFO_REQ, TSYSERR, ENOMEM);
		return;
	}
	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
	putnext(tcp->tcp_connp->conn_rq, mp);
}

/*
 * Respond to the TPI addr request: reply with a T_ADDR_ACK carrying the
 * local address once the endpoint is at least TCPS_BOUND, followed by the
 * remote address once it is at least TCPS_SYN_RCVD.
 */
void
tcp_addr_req(tcp_t *tcp, mblk_t *mp)
{
	struct sockaddr *sa;
	mblk_t	*ackmp;
	struct T_addr_ack *taa;
	conn_t	*connp = tcp->tcp_connp;
	uint_t	addrlen;

	/* Make it large enough for worst case */
	ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
	    2 * sizeof (sin6_t), 1);
	if (ackmp == NULL) {
		tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
		return;
	}

	taa = (struct T_addr_ack *)ackmp->b_rptr;

	bzero(taa, sizeof (struct T_addr_ack));
	ackmp->b_wptr = (uchar_t *)&taa[1];

	taa->PRIM_type = T_ADDR_ACK;
	ackmp->b_datap->db_type = M_PCPROTO;

	if (connp->conn_family == AF_INET)
		addrlen = sizeof (sin_t);
	else
		addrlen = sizeof (sin6_t);

	/*
	 * Note: Following code assumes 32 bit alignment of basic
	 * data structures like sin_t and struct T_addr_ack.
	 */
	if (tcp->tcp_state >= TCPS_BOUND) {
		/*
		 * Fill in local address first
		 */
		taa->LOCADDR_offset = sizeof (*taa);
		taa->LOCADDR_length = addrlen;
		sa = (struct sockaddr *)&taa[1];
		(void) conn_getsockname(connp, sa, &addrlen);
		ackmp->b_wptr += addrlen;
	}
	if (tcp->tcp_state >= TCPS_SYN_RCVD) {
		/*
		 * Fill in Remote address
		 */
		taa->REMADDR_length = addrlen;
		/* assumed 32-bit alignment */
		taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
		sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
		(void) conn_getpeername(connp, sa, &addrlen);
		ackmp->b_wptr += addrlen;
	}
	ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
	putnext(tcp->tcp_connp->conn_rq, ackmp);
}

/*
 * Swap information between the eager and acceptor for a TLI/XTI client.
* The sockfs accept is done on the acceptor stream and control goes
 * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
 * called. In either case, both the eager and listener are in their own
 * perimeter (squeue) and the code has to deal with potential race.
 *
 * See the block comment on top of tcp_accept() and tcp_tli_accept().
 *
 * On return the eager's conn_t owns the acceptor's queue pair, minor
 * number, credential and zone attributes; the acceptor is marked detached
 * and one reference on its conn_t has been dropped.
 */
static void
tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
{
	conn_t	*econnp, *aconnp;

	ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
	ASSERT(!TCP_IS_SOCKET(acceptor));
	ASSERT(!TCP_IS_SOCKET(eager));
	ASSERT(!TCP_IS_SOCKET(listener));

	/*
	 * Trusted Extensions may need to use a security label that is
	 * different from the acceptor's label on MLP and MAC-Exempt
	 * sockets. If this is the case, the required security label
	 * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
	 * acceptor stream refer to econnp we automatically get that label.
	 */

	acceptor->tcp_detached = B_TRUE;
	/*
	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
	 * the acceptor id.
	 */
	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;

	/* remove eager from listen list... */
	mutex_enter(&listener->tcp_eager_lock);
	tcp_eager_unlink(eager);
	ASSERT(eager->tcp_eager_next_q == NULL &&
	    eager->tcp_eager_last_q == NULL);
	ASSERT(eager->tcp_eager_next_q0 == NULL &&
	    eager->tcp_eager_prev_q0 == NULL);
	mutex_exit(&listener->tcp_eager_lock);

	/* Point the eager at the acceptor's queues, and the queues back */
	econnp = eager->tcp_connp;
	aconnp = acceptor->tcp_connp;
	econnp->conn_rq = aconnp->conn_rq;
	econnp->conn_wq = aconnp->conn_wq;
	econnp->conn_rq->q_ptr = econnp;
	econnp->conn_wq->q_ptr = econnp;

	/*
	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
	 * which might be a different squeue from our peer TCP instance.
	 * For TCP Fusion, the peer expects that whenever tcp_detached is
	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
	 * above reach global visibility prior to the clearing of tcp_detached.
	 */
	membar_producer();
	eager->tcp_detached = B_FALSE;

	ASSERT(eager->tcp_ack_tid == 0);

	/* The eager takes over the acceptor's minor number */
	econnp->conn_dev = aconnp->conn_dev;
	econnp->conn_minor_arena = aconnp->conn_minor_arena;

	ASSERT(econnp->conn_minor_arena != NULL);
	/* Move (not copy) the acceptor's credential to the eager */
	if (econnp->conn_cred != NULL)
		crfree(econnp->conn_cred);
	econnp->conn_cred = aconnp->conn_cred;
	ASSERT(!(econnp->conn_ixa->ixa_free_flags & IXA_FREE_CRED));
	econnp->conn_ixa->ixa_cred = econnp->conn_cred;
	aconnp->conn_cred = NULL;
	econnp->conn_cpid = aconnp->conn_cpid;
	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);

	econnp->conn_zoneid = aconnp->conn_zoneid;
	econnp->conn_allzones = aconnp->conn_allzones;
	econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;

	econnp->conn_mac_mode = aconnp->conn_mac_mode;
	econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
	aconnp->conn_mac_mode = CONN_MAC_DEFAULT;

	/* Do the IPC initialization */
	CONN_INC_REF(econnp);

	/* Done with old IPC. Drop its ref on its connp */
	CONN_DEC_REF(aconnp);
}

/*
 * This runs at the tail end of accept processing on the squeue of the
 * new connection.
 *
 * arg is the eager's conn_t.  mp is a single mblk that is reused either
 * as the T_DISCON_IND (failed accept) or as the M_SETOPTS sent upstream.
 */
/* ARGSUSED */
static void
tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
	conn_t	*connp = (conn_t *)arg;
	tcp_t	*tcp = connp->conn_tcp;
	queue_t	*q = connp->conn_rq;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	struct stroptions *stropt;
	struct sock_proto_props sopp;

	/* Should never be called for non-STREAMS sockets */
	ASSERT(!IPCL_IS_NONSTR(connp));

	/* We should just receive a single mblk that fits a T_discon_ind */
	ASSERT(mp->b_cont == NULL);

	/*
	 * Drop the eager's ref on the listener, that was placed when
	 * this eager began life in tcp_input_listener.
*/
	CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);

	tcp->tcp_detached = B_FALSE;

	if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
		/*
		 * Someone blew off the eager before we could finish
		 * the accept.
		 *
		 * The only reason eager exists is because we put in
		 * a ref on it when conn ind went up. We need to send
		 * a disconnect indication up while the last reference
		 * on the eager will be dropped by the squeue when we
		 * return.
		 */
		ASSERT(tcp->tcp_listener == NULL);
		if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
			struct	T_discon_ind	*tdi;

			(void) putnextctl1(q, M_FLUSH, FLUSHRW);
			/*
			 * Let us reuse the incoming mblk to avoid
			 * memory allocation failure problems. We know
			 * that the size of the incoming mblk i.e.
			 * stroptions is greater than sizeof
			 * T_discon_ind.
			 */
			ASSERT(DB_REF(mp) == 1);
			ASSERT(MBLKSIZE(mp) >=
			    sizeof (struct T_discon_ind));

			DB_TYPE(mp) = M_PROTO;
			((union T_primitives *)mp->b_rptr)->type =
			    T_DISCON_IND;
			tdi = (struct T_discon_ind *)mp->b_rptr;
			if (tcp->tcp_issocket) {
				tdi->DISCON_reason = ECONNREFUSED;
				tdi->SEQ_number = 0;
			} else {
				tdi->DISCON_reason = ENOPROTOOPT;
				tdi->SEQ_number =
				    tcp->tcp_conn_req_seqnum;
			}
			mp->b_wptr = mp->b_rptr +
			    sizeof (struct T_discon_ind);
			putnext(q, mp);
		}
		/*
		 * NOTE(review): when neither flag above is set, mp is
		 * neither sent nor freed on this path -- confirm whether
		 * the caller still owns it or this leaks.
		 */
		tcp->tcp_hard_binding = B_FALSE;
		return;
	}

	/*
	 * This is the first time we run on the correct
	 * queue after tcp_accept. So fix all the q parameters
	 * here.
	 *
	 * Let us reuse the incoming mblk to avoid
	 * memory allocation failure problems. We know
	 * that the size of the incoming mblk is at least
	 * stroptions
	 */
	tcp_get_proto_props(tcp, &sopp);

	ASSERT(DB_REF(mp) == 1);
	ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));

	DB_TYPE(mp) = M_SETOPTS;
	stropt = (struct stroptions *)mp->b_rptr;
	mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
	/* (redundant re-assignment; b_rptr has not changed) */
	stropt = (struct stroptions *)mp->b_rptr;
	ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK));
	stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
	stropt->so_hiwat = sopp.sopp_rxhiwat;
	stropt->so_wroff = sopp.sopp_wroff;
	stropt->so_maxblk = sopp.sopp_maxblk;

	/* Send the options up */
	putnext(q, mp);

	/*
	 * Pass up any data and/or a fin that has been received.
	 *
	 * Adjust receive window in case it had decreased
	 * (because there is data <=> tcp_rcv_list != NULL)
	 * while the connection was detached. Note that
	 * in case the eager was flow-controlled, w/o this
	 * code, the rwnd may never open up again!
	 */
	if (tcp->tcp_rcv_list != NULL) {
		/* We drain directly in case of fused tcp loopback */

		if (!tcp->tcp_fused && canputnext(q)) {
			tcp->tcp_rwnd = connp->conn_rcvbuf;
			if (tcp->tcp_state >= TCPS_ESTABLISHED &&
			    tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
				tcp_xmit_ctl(NULL,
				    tcp, (tcp->tcp_swnd == 0) ?
				    tcp->tcp_suna : tcp->tcp_snxt,
				    tcp->tcp_rnxt, TH_ACK);
			}
		}

		(void) tcp_rcv_drain(tcp);

		/*
		 * For fused tcp loopback, back-enable peer endpoint
		 * if it's currently flow-controlled.
		 */
		if (tcp->tcp_fused) {
			tcp_t *peer_tcp = tcp->tcp_loopback_peer;

			ASSERT(peer_tcp != NULL);
			ASSERT(peer_tcp->tcp_fused);

			mutex_enter(&peer_tcp->tcp_non_sq_lock);
			if (peer_tcp->tcp_flow_stopped) {
				tcp_clrqfull(peer_tcp);
				TCP_STAT(tcps, tcp_fusion_backenabled);
			}
			mutex_exit(&peer_tcp->tcp_non_sq_lock);
		}
	}
	ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
	/* A FIN arrived while detached: deliver the pre-allocated ordrel */
	if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
		tcp->tcp_ordrel_done = B_TRUE;
		mp = tcp->tcp_ordrel_mp;
		tcp->tcp_ordrel_mp = NULL;
		putnext(q, mp);
	}
	tcp->tcp_hard_binding = B_FALSE;

	if (connp->conn_keepalive) {
		tcp->tcp_ka_last_intrvl = 0;
		tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
		    tcp->tcp_ka_interval);
	}

	/*
	 * At this point, eager is fully established and will
	 * have the following references -
	 *
	 * 2 references for connection to exist (1 for TCP and 1 for IP).
	 * 1 reference for the squeue which will be dropped by the squeue as
	 *	soon as this function returns.
	 * There will be 1 additional reference for being in classifier
	 *	hash list provided something bad hasn't happened.
	 */
	ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
	    (connp->conn_fanout == NULL && connp->conn_ref >= 3));
}

/*
 * Pull a deferred connection indication off of the listener. The caller
 * must verify that there is a deferred conn ind under eager_lock before
 * calling this function.
 *
 * The chosen eager is unlinked from q0, appended to the listener's q and
 * its saved T_conn_ind message is returned.
 */
static mblk_t *
tcp_get_def_conn_ind(tcp_t *listener)
{
	tcp_t *tail;
	tcp_t *tcp;
	mblk_t *conn_ind;

	ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
	ASSERT(listener->tcp_eager_prev_q0->tcp_conn_def_q0);

	tcp = listener->tcp_eager_prev_q0;
	/*
	 * listener->tcp_eager_prev_q0 points to the TAIL of the
	 * deferred T_conn_ind queue. We need to get to the head
	 * of the queue in order to send up T_conn_ind the same
	 * order as how the 3WHS is completed.
	 */
	while (tcp != listener) {
		if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
			break;
		else
			tcp = tcp->tcp_eager_prev_q0;
	}

	/* Take ownership of the saved indication */
	conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
	tcp->tcp_conn.tcp_eager_conn_ind = NULL;
	/* Move from q0 to q */
	ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
	listener->tcp_conn_req_cnt_q0--;
	listener->tcp_conn_req_cnt_q++;
	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
	    tcp->tcp_eager_prev_q0;
	tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
	    tcp->tcp_eager_next_q0;
	tcp->tcp_eager_prev_q0 = NULL;
	tcp->tcp_eager_next_q0 = NULL;
	tcp->tcp_conn_def_q0 = B_FALSE;

	/* Make sure the tcp isn't in the list of droppables */
	ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
	    tcp->tcp_eager_prev_drop_q0 == NULL);

	/*
	 * Insert at end of the queue because sockfs sends
	 * down T_CONN_RES in chronological order. Leaving
	 * the older conn indications at front of the queue
	 * helps reducing search time.
	 */
	tail = listener->tcp_eager_last_q;
	if (tail != NULL) {
		tail->tcp_eager_next_q = tcp;
	} else {
		listener->tcp_eager_next_q = tcp;
	}
	listener->tcp_eager_last_q = tcp;
	tcp->tcp_eager_next_q = NULL;

	return (conn_ind);
}

/*
 * Reply to a client's T_CONN_RES TPI message. This function
 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
 * on the acceptor STREAM and processed in tcp_accept_common().
 * Read the block comment on top of tcp_input_listener().
 */
void
tcp_tli_accept(tcp_t *listener, mblk_t *mp)
{
	tcp_t		*acceptor;
	tcp_t		*eager;
	struct T_conn_res	*tcr;
	t_uscalar_t	acceptor_id;
	t_scalar_t	seqnum;
	mblk_t	*discon_mp = NULL;
	mblk_t	*ok_mp;
	mblk_t	*mp1;
	tcp_stack_t	*tcps = listener->tcp_tcps;
	conn_t		*econnp;

	/* Reject a message too short to hold a T_conn_res */
	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
		tcp_err_ack(listener, mp, TPROTO, 0);
		return;
	}
	tcr = (struct T_conn_res *)mp->b_rptr;

	/*
	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
	 * read side queue of the streams device underneath us i.e. the
	 * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
	 * look it up in the queue_hash.
Under LP64 it sends down the * minor_t of the accepting endpoint. * * Once the acceptor/eager are modified (in tcp_accept_swap) the * fanout hash lock is held. * This prevents any thread from entering the acceptor queue from * below (since it has not been hard bound yet i.e. any inbound * packets will arrive on the listener conn_t and * go through the classifier). * The CONN_INC_REF will prevent the acceptor from closing. * * XXX It is still possible for a tli application to send down data * on the accepting stream while another thread calls t_accept. * This should not be a problem for well-behaved applications since * the T_OK_ACK is sent after the queue swapping is completed. * * If the accepting fd is the same as the listening fd, avoid * queue hash lookup since that will return an eager listener in a * already established state. */ acceptor_id = tcr->ACCEPTOR_id; mutex_enter(&listener->tcp_eager_lock); if (listener->tcp_acceptor_id == acceptor_id) { eager = listener->tcp_eager_next_q; /* only count how many T_CONN_INDs so don't count q0 */ if ((listener->tcp_conn_req_cnt_q != 1) || (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { mutex_exit(&listener->tcp_eager_lock); tcp_err_ack(listener, mp, TBADF, 0); return; } if (listener->tcp_conn_req_cnt_q0 != 0) { /* Throw away all the eagers on q0. */ tcp_eager_cleanup(listener, 1); } if (listener->tcp_syn_defense) { listener->tcp_syn_defense = B_FALSE; if (listener->tcp_ip_addr_cache != NULL) { kmem_free(listener->tcp_ip_addr_cache, IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); listener->tcp_ip_addr_cache = NULL; } } /* * Transfer tcp_conn_req_max to the eager so that when * a disconnect occurs we can revert the endpoint to the * listen state. */ eager->tcp_conn_req_max = listener->tcp_conn_req_max; ASSERT(listener->tcp_conn_req_cnt_q0 == 0); /* * Get a reference on the acceptor just like the * tcp_acceptor_hash_lookup below. 
*/ acceptor = listener; CONN_INC_REF(acceptor->tcp_connp); } else { acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); if (acceptor == NULL) { if (listener->tcp_connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_accept: did not find acceptor 0x%x\n", acceptor_id); } mutex_exit(&listener->tcp_eager_lock); tcp_err_ack(listener, mp, TPROVMISMATCH, 0); return; } /* * Verify acceptor state. The acceptable states for an acceptor * include TCPS_IDLE and TCPS_BOUND. */ switch (acceptor->tcp_state) { case TCPS_IDLE: /* FALLTHRU */ case TCPS_BOUND: break; default: CONN_DEC_REF(acceptor->tcp_connp); mutex_exit(&listener->tcp_eager_lock); tcp_err_ack(listener, mp, TOUTSTATE, 0); return; } } /* The listener must be in TCPS_LISTEN */ if (listener->tcp_state != TCPS_LISTEN) { CONN_DEC_REF(acceptor->tcp_connp); mutex_exit(&listener->tcp_eager_lock); tcp_err_ack(listener, mp, TOUTSTATE, 0); return; } /* * Rendezvous with an eager connection request packet hanging off * 'tcp' that has the 'seqnum' tag. We tagged the detached open * tcp structure when the connection packet arrived in * tcp_input_listener(). */ seqnum = tcr->SEQ_number; eager = listener; do { eager = eager->tcp_eager_next_q; if (eager == NULL) { CONN_DEC_REF(acceptor->tcp_connp); mutex_exit(&listener->tcp_eager_lock); tcp_err_ack(listener, mp, TBADSEQ, 0); return; } } while (eager->tcp_conn_req_seqnum != seqnum); mutex_exit(&listener->tcp_eager_lock); /* * At this point, both acceptor and listener have 2 ref * that they begin with. Acceptor has one additional ref * we placed in lookup while listener has 3 additional * ref for being behind the squeue (tcp_accept() is * done on listener's squeue); being in classifier hash; * and eager's ref on listener. 
*/ ASSERT(listener->tcp_connp->conn_ref >= 5); ASSERT(acceptor->tcp_connp->conn_ref >= 3); /* * The eager at this point is set in its own squeue and * could easily have been killed (tcp_accept_finish will * deal with that) because of a TH_RST so we can only * ASSERT for a single ref. */ ASSERT(eager->tcp_connp->conn_ref >= 1); /* * Pre allocate the discon_ind mblk also. tcp_accept_finish will * use it if something failed. */ discon_mp = allocb(MAX(sizeof (struct T_discon_ind), sizeof (struct stroptions)), BPRI_HI); if (discon_mp == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); tcp_err_ack(listener, mp, TSYSERR, ENOMEM); return; } econnp = eager->tcp_connp; /* Hold a copy of mp, in case reallocb fails */ if ((mp1 = copymsg(mp)) == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); freemsg(discon_mp); tcp_err_ack(listener, mp, TSYSERR, ENOMEM); return; } tcr = (struct T_conn_res *)mp1->b_rptr; /* * This is an expanded version of mi_tpi_ok_ack_alloc() * which allocates a larger mblk and appends the new * local address to the ok_ack. The address is copied by * soaccept() for getsockname(). */ { int extra; extra = (econnp->conn_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t); /* * Try to re-use mp, if possible. Otherwise, allocate * an mblk and return it as ok_mp. In any case, mp * is no longer usable upon return. 
*/ if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); freemsg(discon_mp); /* Original mp has been freed by now, so use mp1 */ tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); return; } mp = NULL; /* We should never use mp after this point */ switch (extra) { case sizeof (sin_t): { sin_t *sin = (sin_t *)ok_mp->b_wptr; ok_mp->b_wptr += extra; sin->sin_family = AF_INET; sin->sin_port = econnp->conn_lport; sin->sin_addr.s_addr = econnp->conn_laddr_v4; break; } case sizeof (sin6_t): { sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; ok_mp->b_wptr += extra; sin6->sin6_family = AF_INET6; sin6->sin6_port = econnp->conn_lport; sin6->sin6_addr = econnp->conn_laddr_v6; sin6->sin6_flowinfo = econnp->conn_flowinfo; if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { sin6->sin6_scope_id = econnp->conn_ixa->ixa_scopeid; } else { sin6->sin6_scope_id = 0; } sin6->__sin6_src_id = 0; break; } default: break; } ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); } /* * If there are no options we know that the T_CONN_RES will * succeed. However, we can't send the T_OK_ACK upstream until * the tcp_accept_swap is done since it would be dangerous to * let the application start using the new fd prior to the swap. */ tcp_accept_swap(listener, acceptor, eager); /* * tcp_accept_swap unlinks eager from listener but does not drop * the eager's reference on the listener. */ ASSERT(eager->tcp_listener == NULL); ASSERT(listener->tcp_connp->conn_ref >= 5); /* * The eager is now associated with its own queue. Insert in * the hash so that the connection can be reused for a future * T_CONN_RES. */ tcp_acceptor_hash_insert(acceptor_id, eager); /* * We now do the processing of options with T_CONN_RES. 
* We delay till now since we wanted to have queue to pass to * option processing routines that points back to the right * instance structure which does not happen until after * tcp_accept_swap(). * * Note: * The sanity of the logic here assumes that whatever options * are appropriate to inherit from listner=>eager are done * before this point, and whatever were to be overridden (or not) * in transfer logic from eager=>acceptor in tcp_accept_swap(). * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it * before its ACCEPTOR_id comes down in T_CONN_RES ] * This may not be true at this point in time but can be fixed * independently. This option processing code starts with * the instantiated acceptor instance and the final queue at * this point. */ if (tcr->OPT_length != 0) { /* Options to process */ int t_error = 0; int sys_error = 0; int do_disconnect = 0; if (tcp_conprim_opt_process(eager, mp1, &do_disconnect, &t_error, &sys_error) < 0) { eager->tcp_accept_error = 1; if (do_disconnect) { /* * An option failed which does not allow * connection to be accepted. * * We allow T_CONN_RES to succeed and * put a T_DISCON_IND on the eager queue. */ ASSERT(t_error == 0 && sys_error == 0); eager->tcp_send_discon_ind = 1; } else { ASSERT(t_error != 0); freemsg(ok_mp); /* * Original mp was either freed or set * to ok_mp above, so use mp1 instead. */ tcp_err_ack(listener, mp1, t_error, sys_error); goto finish; } } /* * Most likely success in setting options (except if * eager->tcp_send_discon_ind set). * mp1 option buffer represented by OPT_length/offset * potentially modified and contains results of setting * options at this point */ } /* We no longer need mp1, since all options processing has passed */ freemsg(mp1); putnext(listener->tcp_connp->conn_rq, ok_mp); mutex_enter(&listener->tcp_eager_lock); if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { mblk_t *conn_ind; /* * This path should not be executed if listener and * acceptor streams are the same. 
*/ ASSERT(listener != acceptor); conn_ind = tcp_get_def_conn_ind(listener); mutex_exit(&listener->tcp_eager_lock); putnext(listener->tcp_connp->conn_rq, conn_ind); } else { mutex_exit(&listener->tcp_eager_lock); } /* * Done with the acceptor - free it * * Note: from this point on, no access to listener should be made * as listener can be equal to acceptor. */ finish: ASSERT(acceptor->tcp_detached); acceptor->tcp_connp->conn_rq = NULL; ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); acceptor->tcp_connp->conn_wq = NULL; (void) tcp_clean_death(acceptor, 0); CONN_DEC_REF(acceptor->tcp_connp); /* * We pass discon_mp to tcp_accept_finish to get on the right squeue. * * It will update the setting for sockfs/stream head and also take * care of any data that arrived before accept() wad called. * In case we already received a FIN then tcp_accept_finish will send up * the ordrel. It will also send up a window update if the window * has opened up. */ /* * XXX: we currently have a problem if XTI application closes the * acceptor stream in between. This problem exists in on10-gate also * and is well know but nothing can be done short of major rewrite * to fix it. Now it is possible to take care of it by assigning TLI/XTI * eager same squeue as listener (we can distinguish non socket * listeners at the time of handling a SYN in tcp_input_listener) * and do most of the work that tcp_accept_finish does here itself * and then get behind the acceptor squeue to access the acceptor * queue. */ /* * We already have a ref on tcp so no need to do one before squeue_enter */ SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH); } /* * This is the STREAMS entry point for T_CONN_RES coming down on * Acceptor STREAM when sockfs listener does accept processing. * Read the block comment on top of tcp_input_listener(). 
*/ void tcp_tpi_accept(queue_t *q, mblk_t *mp) { queue_t *rq = RD(q); struct T_conn_res *conn_res; tcp_t *eager; tcp_t *listener; struct T_ok_ack *ok; t_scalar_t PRIM_type; mblk_t *discon_mp; conn_t *econnp; cred_t *cr; ASSERT(DB_TYPE(mp) == M_PROTO); /* * All Solaris components should pass a db_credp * for this TPI message, hence we ASSERT. * But in case there is some other M_PROTO that looks * like a TPI message sent by some other kernel * component, we check and return an error. */ cr = msg_getcred(mp, NULL); ASSERT(cr != NULL); if (cr == NULL) { mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); if (mp != NULL) putnext(rq, mp); return; } conn_res = (struct T_conn_res *)mp->b_rptr; ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); if (mp != NULL) putnext(rq, mp); return; } switch (conn_res->PRIM_type) { case O_T_CONN_RES: case T_CONN_RES: /* * We pass up an err ack if allocb fails. This will * cause sockfs to issue a T_DISCON_REQ which will cause * tcp_eager_blowoff to be called. sockfs will then call * rq->q_qinfo->qi_qclose to cleanup the acceptor stream. * we need to do the allocb up here because we have to * make sure rq->q_qinfo->qi_qclose still points to the * correct function (tcp_tpi_close_accept) in case allocb * fails. 
*/ bcopy(mp->b_rptr + conn_res->OPT_offset, &eager, conn_res->OPT_length); PRIM_type = conn_res->PRIM_type; mp->b_datap->db_type = M_PCPROTO; mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); ok = (struct T_ok_ack *)mp->b_rptr; ok->PRIM_type = T_OK_ACK; ok->CORRECT_prim = PRIM_type; econnp = eager->tcp_connp; econnp->conn_dev = (dev_t)RD(q)->q_ptr; econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); econnp->conn_rq = rq; econnp->conn_wq = q; rq->q_ptr = econnp; rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ q->q_ptr = econnp; q->q_qinfo = &tcp_winit; listener = eager->tcp_listener; /* * Pre allocate the discon_ind mblk also. tcp_accept_finish will * use it if something failed. */ discon_mp = allocb(MAX(sizeof (struct T_discon_ind), sizeof (struct stroptions)), BPRI_HI); if (discon_mp == NULL) { mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); if (mp != NULL) putnext(rq, mp); return; } eager->tcp_issocket = B_TRUE; ASSERT(econnp->conn_netstack == listener->tcp_connp->conn_netstack); ASSERT(eager->tcp_tcps == listener->tcp_tcps); /* Put the ref for IP */ CONN_INC_REF(econnp); /* * We should have minimum of 3 references on the conn * at this point. One each for TCP and IP and one for * the T_conn_ind that was sent up when the 3-way handshake * completed. In the normal case we would also have another * reference (making a total of 4) for the conn being in the * classifier hash list. However the eager could have received * an RST subsequently and tcp_closei_local could have removed * the eager from the classifier hash list, hence we can't * assert that reference. 
*/ ASSERT(econnp->conn_ref >= 3); mutex_enter(&listener->tcp_eager_lock); if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { mblk_t *conn_ind = tcp_get_def_conn_ind(listener); /* Need to get inside the listener perimeter */ CONN_INC_REF(listener->tcp_connp); SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, conn_ind, tcp_send_pending, listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING); } tcp_eager_unlink(eager); mutex_exit(&listener->tcp_eager_lock); /* * At this point, the eager is detached from the listener * but we still have an extra refs on eager (apart from the * usual tcp references). The ref was placed in tcp_input_data * before sending the conn_ind in tcp_send_conn_ind. * The ref will be dropped in tcp_accept_finish(). */ SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish, econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); /* * Send the new local address also up to sockfs. There * should already be enough space in the mp that came * down from soaccept(). 
*/ if (econnp->conn_family == AF_INET) { sin_t *sin; ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= (sizeof (struct T_ok_ack) + sizeof (sin_t))); sin = (sin_t *)mp->b_wptr; mp->b_wptr += sizeof (sin_t); sin->sin_family = AF_INET; sin->sin_port = econnp->conn_lport; sin->sin_addr.s_addr = econnp->conn_laddr_v4; } else { sin6_t *sin6; ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= sizeof (struct T_ok_ack) + sizeof (sin6_t)); sin6 = (sin6_t *)mp->b_wptr; mp->b_wptr += sizeof (sin6_t); sin6->sin6_family = AF_INET6; sin6->sin6_port = econnp->conn_lport; sin6->sin6_addr = econnp->conn_laddr_v6; if (econnp->conn_ipversion == IPV4_VERSION) sin6->sin6_flowinfo = 0; else sin6->sin6_flowinfo = econnp->conn_flowinfo; if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { sin6->sin6_scope_id = econnp->conn_ixa->ixa_scopeid; } else { sin6->sin6_scope_id = 0; } sin6->__sin6_src_id = 0; } putnext(rq, mp); return; default: mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); if (mp != NULL) putnext(rq, mp); return; } } /* * The function called through squeue to get behind listener's perimeter to * send a deferred conn_ind. */ /* ARGSUSED */ void tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *lconnp = (conn_t *)arg; tcp_t *listener = lconnp->conn_tcp; struct T_conn_ind *conn_ind; tcp_t *tcp; conn_ind = (struct T_conn_ind *)mp->b_rptr; bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, conn_ind->OPT_length); if (listener->tcp_state != TCPS_LISTEN) { /* * If listener has closed, it would have caused a * a cleanup/blowoff to happen for the eager, so * we don't need to do anything more. */ freemsg(mp); return; } putnext(lconnp->conn_rq, mp); } /* * Sends the T_CONN_IND to the listener. The caller calls this * functions via squeue to get inside the listener's perimeter * once the 3 way hand shake is done a T_CONN_IND needs to be * sent. 
As an optimization, the caller can call this directly
 * if listener's perimeter is same as eager's.
 *
 * arg  - the listener's conn_t.
 * mp   - the T_CONN_IND; its option area holds a pointer to the eager.
 * arg2 - unused.
 *
 * If the listener's q has room, the eager is moved from q0 to the tail
 * of q and the T_CONN_IND is sent upstream after tcp_eager_lock is
 * dropped. Otherwise the eager is marked deferred, moved to the tail of
 * q0, and the T_CONN_IND is parked in tcp_conn.tcp_eager_conn_ind.
 * If the listener is no longer in TCPS_LISTEN the message is freed.
 */
/* ARGSUSED */
void
tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
{
	conn_t	*lconnp = (conn_t *)arg;
	tcp_t	*listener = lconnp->conn_tcp;
	tcp_t	*tcp;
	struct T_conn_ind	*conn_ind;
	ipaddr_t	*addr_cache;
	boolean_t	need_send_conn_ind = B_FALSE;
	tcp_stack_t	*tcps = listener->tcp_tcps;

	/* retrieve the eager */
	conn_ind = (struct T_conn_ind *)mp->b_rptr;
	ASSERT(conn_ind->OPT_offset != 0 &&
	    conn_ind->OPT_length == sizeof (intptr_t));
	bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
	    conn_ind->OPT_length);

	/*
	 * TLI/XTI applications will get confused by
	 * sending eager as an option since it violates
	 * the option semantics. So remove the eager as
	 * option since TLI/XTI app doesn't need it anyway.
	 */
	if (!TCP_IS_SOCKET(listener)) {
		conn_ind->OPT_length = 0;
		conn_ind->OPT_offset = 0;
	}
	if (listener->tcp_state != TCPS_LISTEN) {
		/*
		 * If listener has closed, it would have caused a
		 * cleanup/blowoff to happen for the eager. We
		 * just need to return.
		 */
		freemsg(mp);
		return;
	}


	/*
	 * if the conn_req_q is full defer passing up the
	 * T_CONN_IND until space is available after t_accept()
	 * processing
	 */
	mutex_enter(&listener->tcp_eager_lock);

	/*
	 * Take the eager out, if it is in the list of droppable eagers
	 * as we are here because the 3W handshake is over.
	 */
	MAKE_UNDROPPABLE(tcp);

	if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
		tcp_t *tail;

		/*
		 * The eager already has an extra ref put in tcp_input_data
		 * so that it stays till accept comes back even though it
		 * might get into TCPS_CLOSED as a result of a TH_RST etc.
		 */
		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
		listener->tcp_conn_req_cnt_q0--;
		listener->tcp_conn_req_cnt_q++;

		/* Move from SYN_RCVD to ESTABLISHED list  */
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_prev_q0 = NULL;
		tcp->tcp_eager_next_q0 = NULL;

		/*
		 * Insert at end of the queue because sockfs
		 * sends down T_CONN_RES in chronological
		 * order. Leaving the older conn indications
		 * at front of the queue helps reducing search
		 * time.
		 */
		tail = listener->tcp_eager_last_q;
		if (tail != NULL)
			tail->tcp_eager_next_q = tcp;
		else
			listener->tcp_eager_next_q = tcp;
		listener->tcp_eager_last_q = tcp;
		tcp->tcp_eager_next_q = NULL;
		/*
		 * Delay sending up the T_conn_ind until we are
		 * done with the eager. Once we have sent up
		 * the T_conn_ind, the accept can potentially complete
		 * any time and release the refhold we have on the eager.
		 */
		need_send_conn_ind = B_TRUE;
	} else {
		/*
		 * Defer connection on q0 and set deferred
		 * connection bit true
		 */
		tcp->tcp_conn_def_q0 = B_TRUE;

		/* take tcp out of q0 ... */
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;

		/* ... and place it at the end of q0 */
		tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
		tcp->tcp_eager_next_q0 = listener;
		listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
		listener->tcp_eager_prev_q0 = tcp;

		/* Park the T_CONN_IND for tcp_get_def_conn_ind(). */
		tcp->tcp_conn.tcp_eager_conn_ind = mp;
	}

	/* we have timed out before */
	if (tcp->tcp_syn_rcvd_timeout != 0) {
		tcp->tcp_syn_rcvd_timeout = 0;
		listener->tcp_syn_rcvd_timeout--;
		if (listener->tcp_syn_defense &&
		    listener->tcp_syn_rcvd_timeout <=
		    (tcps->tcps_conn_req_max_q0 >> 5) &&
		    10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
		    listener->tcp_last_rcv_lbolt)) {
			/*
			 * Turn off the defense mode if we
			 * believe the SYN attack is over.
			 */
			listener->tcp_syn_defense = B_FALSE;
			if (listener->tcp_ip_addr_cache) {
				kmem_free((void *)listener->tcp_ip_addr_cache,
				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
				listener->tcp_ip_addr_cache = NULL;
			}
		}
	}
	addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
	if (addr_cache != NULL) {
		/*
		 * We have finished a 3-way handshake with this
		 * remote host. This proves the IP addr is good.
		 * Cache it!
		 */
		addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
		    tcp->tcp_connp->conn_faddr_v4;
	}
	mutex_exit(&listener->tcp_eager_lock);
	/* Sent only after the lock is dropped; see the comment above. */
	if (need_send_conn_ind)
		putnext(lconnp->conn_rq, mp);
}