xref: /illumos-gate/usr/src/uts/common/inet/sctp/sctp_conn.c (revision 6be61d4ea129a94bdfe33533b2bc265d4447f05c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/systm.h>
28 #include <sys/stream.h>
29 #include <sys/cmn_err.h>
30 #include <sys/kmem.h>
31 #define	_SUN_TPI_VERSION 2
32 #include <sys/tihdr.h>
33 #include <sys/stropts.h>
34 #include <sys/strsubr.h>
35 #include <sys/socket.h>
36 #include <sys/tsol/tndb.h>
37 
38 #include <netinet/in.h>
39 #include <netinet/ip6.h>
40 
41 #include <inet/common.h>
42 #include <inet/ip.h>
43 #include <inet/ip6.h>
44 #include <inet/ipclassifier.h>
45 #include <inet/ipsec_impl.h>
46 
47 #include "sctp_impl.h"
48 #include "sctp_addr.h"
49 
50 /*
51  * Common accept code.  Called by sctp_conn_request.
52  * cr_pkt is the INIT / INIT ACK packet.
53  */
54 static int
55 sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt,
56     uint_t ip_hdr_len, sctp_init_chunk_t *iack)
57 {
58 
59 	sctp_hdr_t		*sctph;
60 	sctp_chunk_hdr_t	*ich;
61 	sctp_init_chunk_t	*init;
62 	int			err;
63 	uint_t			sctp_options;
64 	conn_t			*aconnp;
65 	conn_t			*lconnp;
66 	sctp_stack_t		*sctps = listener->sctp_sctps;
67 
68 	sctph = (sctp_hdr_t *)(cr_pkt->b_rptr + ip_hdr_len);
69 	ASSERT(OK_32PTR(sctph));
70 
71 	aconnp = acceptor->sctp_connp;
72 	lconnp = listener->sctp_connp;
73 	aconnp->conn_lport = lconnp->conn_lport;
74 	aconnp->conn_fport = sctph->sh_sport;
75 
76 	ich = (sctp_chunk_hdr_t *)(iack + 1);
77 	init = (sctp_init_chunk_t *)(ich + 1);
78 
79 	/* acceptor isn't in any fanouts yet, so don't need to hold locks */
80 	ASSERT(acceptor->sctp_faddrs == NULL);
81 	err = sctp_get_addrparams(acceptor, listener, cr_pkt, ich,
82 	    &sctp_options);
83 	if (err != 0)
84 		return (err);
85 
86 	if ((err = sctp_set_hdraddrs(acceptor)) != 0)
87 		return (err);
88 
89 	if ((err = sctp_build_hdrs(acceptor, KM_NOSLEEP)) != 0)
90 		return (err);
91 
92 	if ((sctp_options & SCTP_PRSCTP_OPTION) &&
93 	    listener->sctp_prsctp_aware && sctps->sctps_prsctp_enabled) {
94 		acceptor->sctp_prsctp_aware = B_TRUE;
95 	} else {
96 		acceptor->sctp_prsctp_aware = B_FALSE;
97 	}
98 
99 	/* Get  initial TSNs */
100 	acceptor->sctp_ltsn = ntohl(iack->sic_inittsn);
101 	acceptor->sctp_recovery_tsn = acceptor->sctp_lastack_rxd =
102 	    acceptor->sctp_ltsn - 1;
103 	acceptor->sctp_adv_pap = acceptor->sctp_lastack_rxd;
104 	/* Serial numbers are initialized to the same value as the TSNs */
105 	acceptor->sctp_lcsn = acceptor->sctp_ltsn;
106 
107 	if (!sctp_initialize_params(acceptor, init, iack))
108 		return (ENOMEM);
109 
110 	/*
111 	 * Copy sctp_secret from the listener in case we need to validate
112 	 * a possibly delayed cookie.
113 	 */
114 	bcopy(listener->sctp_secret, acceptor->sctp_secret, SCTP_SECRET_LEN);
115 	bcopy(listener->sctp_old_secret, acceptor->sctp_old_secret,
116 	    SCTP_SECRET_LEN);
117 	acceptor->sctp_last_secret_update = ddi_get_lbolt64();
118 
119 	/*
120 	 * After acceptor is inserted in the hash list, it can be found.
121 	 * So we need to lock it here.
122 	 */
123 	RUN_SCTP(acceptor);
124 
125 	sctp_conn_hash_insert(&sctps->sctps_conn_fanout[
126 	    SCTP_CONN_HASH(sctps, aconnp->conn_ports)], acceptor, 0);
127 	sctp_bind_hash_insert(&sctps->sctps_bind_fanout[
128 	    SCTP_BIND_HASH(ntohs(aconnp->conn_lport))], acceptor, 0);
129 
130 	SCTP_ASSOC_EST(sctps, acceptor);
131 
132 	/*
133 	 * listener->sctp_rwnd should be the default window size or a
134 	 * window size changed via SO_RCVBUF option.
135 	 */
136 	acceptor->sctp_rwnd = listener->sctp_rwnd;
137 	acceptor->sctp_irwnd = acceptor->sctp_rwnd;
138 	acceptor->sctp_pd_point = acceptor->sctp_rwnd;
139 	acceptor->sctp_upcalls = listener->sctp_upcalls;
140 
141 	return (0);
142 }
143 
144 /* Process the COOKIE packet, mp, directed at the listener 'sctp' */
145 sctp_t *
146 sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len,
147     sctp_init_chunk_t *iack, ip_recv_attr_t *ira)
148 {
149 	sctp_t	*eager;
150 	ip6_t	*ip6h;
151 	int	err;
152 	conn_t	*connp, *econnp;
153 	sctp_stack_t	*sctps;
154 	struct sock_proto_props sopp;
155 	cred_t		*cr;
156 	pid_t		cpid;
157 	in6_addr_t	faddr, laddr;
158 	ip_xmit_attr_t	*ixa;
159 	sctp_listen_cnt_t *slc = sctp->sctp_listen_cnt;
160 	boolean_t	slc_set = B_FALSE;
161 
162 	/*
163 	 * No need to check for duplicate as this is the listener
164 	 * and we are holding the lock.  This means that no new
165 	 * connection can be created out of it.  And since the
166 	 * fanout already done cannot find a match, it means that
167 	 * there is no duplicate.
168 	 */
169 	ASSERT(OK_32PTR(mp->b_rptr));
170 
171 	connp = sctp->sctp_connp;
172 	sctps = sctp->sctp_sctps;
173 
174 	/*
175 	 * Enforce the limit set on the number of connections per listener.
176 	 * Note that tlc_cnt starts with 1.  So need to add 1 to tlc_max
177 	 * for comparison.
178 	 */
179 	if (slc != NULL) {
180 		int64_t now;
181 
182 		if (atomic_add_32_nv(&slc->slc_cnt, 1) > slc->slc_max + 1) {
183 			now = ddi_get_lbolt64();
184 			atomic_add_32(&slc->slc_cnt, -1);
185 			SCTP_KSTAT(sctps, sctp_listen_cnt_drop);
186 			slc->slc_drop++;
187 			if (now - slc->slc_report_time >
188 			    MSEC_TO_TICK(SCTP_SLC_REPORT_INTERVAL)) {
189 				zcmn_err(connp->conn_zoneid, CE_WARN,
190 				    "SCTP listener (port %d) association max "
191 				    "(%u) reached: %u attempts dropped total\n",
192 				    ntohs(connp->conn_lport),
193 				    slc->slc_max, slc->slc_drop);
194 				slc->slc_report_time = now;
195 			}
196 			return (NULL);
197 		}
198 		slc_set = B_TRUE;
199 	}
200 
201 	if ((eager = sctp_create_eager(sctp)) == NULL) {
202 		if (slc_set)
203 			atomic_add_32(&slc->slc_cnt, -1);
204 		return (NULL);
205 	}
206 	econnp = eager->sctp_connp;
207 
208 	if (connp->conn_policy != NULL) {
209 		/* Inherit the policy from the listener; use actions from ira */
210 		if (!ip_ipsec_policy_inherit(econnp, connp, ira)) {
211 			sctp_close_eager(eager);
212 			SCTPS_BUMP_MIB(sctps, sctpListenDrop);
213 			return (NULL);
214 		}
215 	}
216 
217 	ip6h = (ip6_t *)mp->b_rptr;
218 	if (ira->ira_flags & IXAF_IS_IPV4) {
219 		ipha_t	*ipha;
220 
221 		ipha = (ipha_t *)ip6h;
222 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &laddr);
223 		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &faddr);
224 	} else {
225 		laddr = ip6h->ip6_dst;
226 		faddr = ip6h->ip6_src;
227 	}
228 
229 	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
230 		/*
231 		 * XXX need to fix the cached policy issue here.
232 		 * We temporarily set the conn_laddr/conn_faddr here so
233 		 * that IPsec can use it for the latched policy
234 		 * selector.  This is obvioursly wrong as SCTP can
235 		 * use different addresses...
236 		 */
237 		econnp->conn_laddr_v6 = laddr;
238 		econnp->conn_faddr_v6 = faddr;
239 		econnp->conn_saddr_v6 = laddr;
240 	}
241 	if (ipsec_conn_cache_policy(econnp,
242 	    (ira->ira_flags & IRAF_IS_IPV4) != 0) != 0) {
243 		sctp_close_eager(eager);
244 		SCTPS_BUMP_MIB(sctps, sctpListenDrop);
245 		return (NULL);
246 	}
247 
248 	/* Save for getpeerucred */
249 	cr = ira->ira_cred;
250 	cpid = ira->ira_cpid;
251 
252 	if (is_system_labeled()) {
253 		ip_xmit_attr_t *ixa = econnp->conn_ixa;
254 
255 		ASSERT(ira->ira_tsl != NULL);
256 
257 		/* Discard any old label */
258 		if (ixa->ixa_free_flags & IXA_FREE_TSL) {
259 			ASSERT(ixa->ixa_tsl != NULL);
260 			label_rele(ixa->ixa_tsl);
261 			ixa->ixa_free_flags &= ~IXA_FREE_TSL;
262 			ixa->ixa_tsl = NULL;
263 		}
264 
265 		if ((connp->conn_mlp_type != mlptSingle ||
266 		    connp->conn_mac_mode != CONN_MAC_DEFAULT) &&
267 		    ira->ira_tsl != NULL) {
268 			/*
269 			 * If this is an MLP connection or a MAC-Exempt
270 			 * connection with an unlabeled node, packets are to be
271 			 * exchanged using the security label of the received
272 			 * Cookie packet instead of the server application's
273 			 * label.
274 			 * tsol_check_dest called from ip_set_destination
275 			 * might later update TSF_UNLABELED by replacing
276 			 * ixa_tsl with a new label.
277 			 */
278 			label_hold(ira->ira_tsl);
279 			ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl);
280 		} else {
281 			ixa->ixa_tsl = crgetlabel(econnp->conn_cred);
282 		}
283 	}
284 
285 	err = sctp_accept_comm(sctp, eager, mp, ip_hdr_len, iack);
286 	if (err != 0) {
287 		sctp_close_eager(eager);
288 		SCTPS_BUMP_MIB(sctps, sctpListenDrop);
289 		return (NULL);
290 	}
291 
292 	ASSERT(eager->sctp_current->sf_ixa != NULL);
293 
294 	ixa = eager->sctp_current->sf_ixa;
295 	if (!(ira->ira_flags & IXAF_IS_IPV4)) {
296 		ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
297 
298 		if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) ||
299 		    IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) {
300 			eager->sctp_linklocal = 1;
301 
302 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
303 			ixa->ixa_scopeid = ifindex;
304 			econnp->conn_incoming_ifindex = ifindex;
305 		}
306 	}
307 
308 	/*
309 	 * On a clustered note send this notification to the clustering
310 	 * subsystem.
311 	 */
312 	if (cl_sctp_connect != NULL) {
313 		uchar_t	*slist;
314 		uchar_t	*flist;
315 		size_t	fsize;
316 		size_t	ssize;
317 
318 		fsize = sizeof (in6_addr_t) * eager->sctp_nfaddrs;
319 		ssize = sizeof (in6_addr_t) * eager->sctp_nsaddrs;
320 		slist = kmem_alloc(ssize, KM_NOSLEEP);
321 		flist = kmem_alloc(fsize, KM_NOSLEEP);
322 		if (slist == NULL || flist == NULL) {
323 			if (slist != NULL)
324 				kmem_free(slist, ssize);
325 			if (flist != NULL)
326 				kmem_free(flist, fsize);
327 			sctp_close_eager(eager);
328 			SCTPS_BUMP_MIB(sctps, sctpListenDrop);
329 			SCTP_KSTAT(sctps, sctp_cl_connect);
330 			return (NULL);
331 		}
332 		/* The clustering module frees these list */
333 		sctp_get_saddr_list(eager, slist, ssize);
334 		sctp_get_faddr_list(eager, flist, fsize);
335 		(*cl_sctp_connect)(econnp->conn_family, slist,
336 		    eager->sctp_nsaddrs, econnp->conn_lport, flist,
337 		    eager->sctp_nfaddrs, econnp->conn_fport, B_FALSE,
338 		    (cl_sctp_handle_t)eager);
339 	}
340 
341 	/* Connection established, so send up the conn_ind */
342 	if ((eager->sctp_ulpd = sctp->sctp_ulp_newconn(sctp->sctp_ulpd,
343 	    (sock_lower_handle_t)eager, NULL, cr, cpid,
344 	    &eager->sctp_upcalls)) == NULL) {
345 		sctp_close_eager(eager);
346 		SCTPS_BUMP_MIB(sctps, sctpListenDrop);
347 		return (NULL);
348 	}
349 	ASSERT(SCTP_IS_DETACHED(eager));
350 	eager->sctp_detached = B_FALSE;
351 	bzero(&sopp, sizeof (sopp));
352 	sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF;
353 	sopp.sopp_maxblk = strmsgsz;
354 	if (econnp->conn_family == AF_INET) {
355 		sopp.sopp_wroff = sctps->sctps_wroff_xtra +
356 		    sizeof (sctp_data_hdr_t) + sctp->sctp_hdr_len;
357 	} else {
358 		sopp.sopp_wroff = sctps->sctps_wroff_xtra +
359 		    sizeof (sctp_data_hdr_t) + sctp->sctp_hdr6_len;
360 	}
361 	eager->sctp_ulp_prop(eager->sctp_ulpd, &sopp);
362 	return (eager);
363 }
364 
365 /*
366  * Connect to a peer - this function inserts the sctp in the
367  * bind and conn fanouts, sends the INIT, and replies to the client
368  * with an OK ack.
369  */
370 int
371 sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen,
372     cred_t *cr, pid_t pid)
373 {
374 	sin_t		*sin;
375 	sin6_t		*sin6;
376 	in6_addr_t	dstaddr;
377 	in_port_t	dstport;
378 	mblk_t		*initmp;
379 	sctp_tf_t	*tbf;
380 	sctp_t		*lsctp;
381 	char		buf[INET6_ADDRSTRLEN];
382 	int		sleep = sctp->sctp_cansleep ? KM_SLEEP : KM_NOSLEEP;
383 	int		err;
384 	sctp_faddr_t	*cur_fp;
385 	sctp_stack_t	*sctps = sctp->sctp_sctps;
386 	conn_t		*connp = sctp->sctp_connp;
387 	uint_t		scope_id = 0;
388 	ip_xmit_attr_t	*ixa;
389 
390 	/*
391 	 * Determine packet type based on type of address passed in
392 	 * the request should contain an IPv4 or IPv6 address.
393 	 * Make sure that address family matches the type of
394 	 * family of the address passed down.
395 	 */
396 	if (addrlen < sizeof (sin_t)) {
397 		return (EINVAL);
398 	}
399 	switch (dst->sa_family) {
400 	case AF_INET:
401 		sin = (sin_t *)dst;
402 
403 		/* Check for attempt to connect to non-unicast */
404 		if (CLASSD(sin->sin_addr.s_addr) ||
405 		    (sin->sin_addr.s_addr == INADDR_BROADCAST)) {
406 			ip0dbg(("sctp_connect: non-unicast\n"));
407 			return (EINVAL);
408 		}
409 		if (connp->conn_ipv6_v6only)
410 			return (EAFNOSUPPORT);
411 
412 		/* convert to v6 mapped */
413 		/* Check for attempt to connect to INADDR_ANY */
414 		if (sin->sin_addr.s_addr == INADDR_ANY)  {
415 			struct in_addr v4_addr;
416 			/*
417 			 * SunOS 4.x and 4.3 BSD allow an application
418 			 * to connect a TCP socket to INADDR_ANY.
419 			 * When they do this, the kernel picks the
420 			 * address of one interface and uses it
421 			 * instead.  The kernel usually ends up
422 			 * picking the address of the loopback
423 			 * interface.  This is an undocumented feature.
424 			 * However, we provide the same thing here
425 			 * in case any TCP apps that use this feature
426 			 * are being ported to SCTP...
427 			 */
428 			v4_addr.s_addr = htonl(INADDR_LOOPBACK);
429 			IN6_INADDR_TO_V4MAPPED(&v4_addr, &dstaddr);
430 		} else {
431 			IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &dstaddr);
432 		}
433 		dstport = sin->sin_port;
434 		break;
435 	case AF_INET6:
436 		sin6 = (sin6_t *)dst;
437 		/* Check for attempt to connect to non-unicast. */
438 		if ((addrlen < sizeof (sin6_t)) ||
439 		    IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
440 			ip0dbg(("sctp_connect: non-unicast\n"));
441 			return (EINVAL);
442 		}
443 		if (connp->conn_ipv6_v6only &&
444 		    IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
445 			return (EAFNOSUPPORT);
446 		}
447 		/* check for attempt to connect to unspec */
448 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
449 			dstaddr = ipv6_loopback;
450 		} else {
451 			dstaddr = sin6->sin6_addr;
452 			if (IN6_IS_ADDR_LINKLOCAL(&dstaddr)) {
453 				sctp->sctp_linklocal = 1;
454 				scope_id = sin6->sin6_scope_id;
455 			}
456 		}
457 		dstport = sin6->sin6_port;
458 		connp->conn_flowinfo = sin6->sin6_flowinfo;
459 		break;
460 	default:
461 		dprint(1, ("sctp_connect: unknown family %d\n",
462 		    dst->sa_family));
463 		return (EAFNOSUPPORT);
464 	}
465 
466 	(void) inet_ntop(AF_INET6, &dstaddr, buf, sizeof (buf));
467 	dprint(1, ("sctp_connect: attempting connect to %s...\n", buf));
468 
469 	RUN_SCTP(sctp);
470 
471 	if (connp->conn_family != dst->sa_family ||
472 	    (connp->conn_state_flags & CONN_CLOSING)) {
473 		WAKE_SCTP(sctp);
474 		return (EINVAL);
475 	}
476 
477 	/* We update our cred/cpid based on the caller of connect */
478 	if (connp->conn_cred != cr) {
479 		crhold(cr);
480 		crfree(connp->conn_cred);
481 		connp->conn_cred = cr;
482 	}
483 	connp->conn_cpid = pid;
484 
485 	/* Cache things in conn_ixa without any refhold */
486 	ixa = connp->conn_ixa;
487 	ASSERT(!(ixa->ixa_free_flags & IXA_FREE_CRED));
488 	ixa->ixa_cred = cr;
489 	ixa->ixa_cpid = pid;
490 	if (is_system_labeled()) {
491 		/* We need to restart with a label based on the cred */
492 		ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred);
493 	}
494 
495 	switch (sctp->sctp_state) {
496 	case SCTPS_IDLE: {
497 		struct sockaddr_storage	ss;
498 
499 		/*
500 		 * We support a quick connect capability here, allowing
501 		 * clients to transition directly from IDLE to COOKIE_WAIT.
502 		 * sctp_bindi will pick an unused port, insert the connection
503 		 * in the bind hash and transition to BOUND state. SCTP
504 		 * picks and uses what it considers the optimal local address
505 		 * set (just like specifiying INADDR_ANY to bind()).
506 		 */
507 		dprint(1, ("sctp_connect: idle, attempting bind...\n"));
508 		ASSERT(sctp->sctp_nsaddrs == 0);
509 
510 		bzero(&ss, sizeof (ss));
511 		ss.ss_family = connp->conn_family;
512 		WAKE_SCTP(sctp);
513 		if ((err = sctp_bind(sctp, (struct sockaddr *)&ss,
514 		    sizeof (ss))) != 0) {
515 			return (err);
516 		}
517 		RUN_SCTP(sctp);
518 		/* FALLTHRU */
519 	}
520 
521 	case SCTPS_BOUND:
522 		ASSERT(sctp->sctp_nsaddrs > 0);
523 
524 		/* do the connect */
525 		/* XXX check for attempt to connect to self */
526 		connp->conn_fport = dstport;
527 
528 		/*
529 		 * Don't allow this connection to completely duplicate
530 		 * an existing connection.
531 		 *
532 		 * Ensure that the duplicate check and insertion is atomic.
533 		 */
534 		sctp_conn_hash_remove(sctp);
535 		tbf = &sctps->sctps_conn_fanout[SCTP_CONN_HASH(sctps,
536 		    connp->conn_ports)];
537 		mutex_enter(&tbf->tf_lock);
538 		lsctp = sctp_lookup(sctp, &dstaddr, tbf, &connp->conn_ports,
539 		    SCTPS_COOKIE_WAIT);
540 		if (lsctp != NULL) {
541 			/* found a duplicate connection */
542 			mutex_exit(&tbf->tf_lock);
543 			SCTP_REFRELE(lsctp);
544 			WAKE_SCTP(sctp);
545 			return (EADDRINUSE);
546 		}
547 
548 		/*
549 		 * OK; set up the peer addr (this may grow after we get
550 		 * the INIT ACK from the peer with additional addresses).
551 		 */
552 		if ((err = sctp_add_faddr(sctp, &dstaddr, sleep,
553 		    B_FALSE)) != 0) {
554 			mutex_exit(&tbf->tf_lock);
555 			WAKE_SCTP(sctp);
556 			return (err);
557 		}
558 		cur_fp = sctp->sctp_faddrs;
559 		ASSERT(cur_fp->sf_ixa != NULL);
560 
561 		/* No valid src addr, return. */
562 		if (cur_fp->sf_state == SCTP_FADDRS_UNREACH) {
563 			mutex_exit(&tbf->tf_lock);
564 			WAKE_SCTP(sctp);
565 			return (EADDRNOTAVAIL);
566 		}
567 
568 		sctp->sctp_primary = cur_fp;
569 		sctp->sctp_current = cur_fp;
570 		sctp->sctp_mss = cur_fp->sf_pmss;
571 		sctp_conn_hash_insert(tbf, sctp, 1);
572 		mutex_exit(&tbf->tf_lock);
573 
574 		ixa = cur_fp->sf_ixa;
575 		ASSERT(ixa->ixa_cred != NULL);
576 
577 		if (scope_id != 0) {
578 			ixa->ixa_flags |= IXAF_SCOPEID_SET;
579 			ixa->ixa_scopeid = scope_id;
580 		} else {
581 			ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
582 		}
583 
584 		/* initialize composite headers */
585 		if ((err = sctp_set_hdraddrs(sctp)) != 0) {
586 			sctp_conn_hash_remove(sctp);
587 			WAKE_SCTP(sctp);
588 			return (err);
589 		}
590 
591 		if ((err = sctp_build_hdrs(sctp, KM_SLEEP)) != 0) {
592 			sctp_conn_hash_remove(sctp);
593 			WAKE_SCTP(sctp);
594 			return (err);
595 		}
596 
597 		/*
598 		 * Turn off the don't fragment bit on the (only) faddr,
599 		 * so that if one of the messages exchanged during the
600 		 * initialization sequence exceeds the path mtu, it
601 		 * at least has a chance to get there. SCTP does no
602 		 * fragmentation of initialization messages.  The DF bit
603 		 * will be turned on again in sctp_send_cookie_echo()
604 		 * (but the cookie echo will still be sent with the df bit
605 		 * off).
606 		 */
607 		cur_fp->sf_df = B_FALSE;
608 
609 		/* Mark this address as alive */
610 		cur_fp->sf_state = SCTP_FADDRS_ALIVE;
611 
612 		/* Send the INIT to the peer */
613 		SCTP_FADDR_TIMER_RESTART(sctp, cur_fp, cur_fp->sf_rto);
614 		sctp->sctp_state = SCTPS_COOKIE_WAIT;
615 		/*
616 		 * sctp_init_mp() could result in modifying the source
617 		 * address list, so take the hash lock.
618 		 */
619 		mutex_enter(&tbf->tf_lock);
620 		initmp = sctp_init_mp(sctp, cur_fp);
621 		if (initmp == NULL) {
622 			mutex_exit(&tbf->tf_lock);
623 			/*
624 			 * It may happen that all the source addresses
625 			 * (loopback/link local) are removed.  In that case,
626 			 * faile the connect.
627 			 */
628 			if (sctp->sctp_nsaddrs == 0) {
629 				sctp_conn_hash_remove(sctp);
630 				SCTP_FADDR_TIMER_STOP(cur_fp);
631 				WAKE_SCTP(sctp);
632 				return (EADDRNOTAVAIL);
633 			}
634 
635 			/* Otherwise, let the retransmission timer retry */
636 			WAKE_SCTP(sctp);
637 			goto notify_ulp;
638 		}
639 		mutex_exit(&tbf->tf_lock);
640 
641 		/*
642 		 * On a clustered note send this notification to the clustering
643 		 * subsystem.
644 		 */
645 		if (cl_sctp_connect != NULL) {
646 			uchar_t		*slist;
647 			uchar_t		*flist;
648 			size_t		ssize;
649 			size_t		fsize;
650 
651 			fsize = sizeof (in6_addr_t) * sctp->sctp_nfaddrs;
652 			ssize = sizeof (in6_addr_t) * sctp->sctp_nsaddrs;
653 			slist = kmem_alloc(ssize, KM_SLEEP);
654 			flist = kmem_alloc(fsize, KM_SLEEP);
655 			/* The clustering module frees the lists */
656 			sctp_get_saddr_list(sctp, slist, ssize);
657 			sctp_get_faddr_list(sctp, flist, fsize);
658 			(*cl_sctp_connect)(connp->conn_family, slist,
659 			    sctp->sctp_nsaddrs, connp->conn_lport,
660 			    flist, sctp->sctp_nfaddrs, connp->conn_fport,
661 			    B_TRUE, (cl_sctp_handle_t)sctp);
662 		}
663 		ASSERT(ixa->ixa_cred != NULL);
664 		ASSERT(ixa->ixa_ire != NULL);
665 
666 		(void) conn_ip_output(initmp, ixa);
667 		BUMP_LOCAL(sctp->sctp_opkts);
668 		WAKE_SCTP(sctp);
669 
670 notify_ulp:
671 		sctp_set_ulp_prop(sctp);
672 
673 		return (0);
674 	default:
675 		ip0dbg(("sctp_connect: invalid state. %d\n", sctp->sctp_state));
676 		WAKE_SCTP(sctp);
677 		return (EINVAL);
678 	}
679 }
680