xref: /titanic_50/usr/src/uts/common/inet/tcp/tcp_bind.c (revision a307732568c3d861c38b0342ae32434226d10e94)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/stream.h>
28 #include <sys/strsun.h>
29 #include <sys/strsubr.h>
30 #include <sys/stropts.h>
31 #include <sys/strlog.h>
32 #define	_SUN_TPI_VERSION 2
33 #include <sys/tihdr.h>
34 #include <sys/suntpi.h>
35 #include <sys/xti_inet.h>
36 #include <sys/policy.h>
37 #include <sys/squeue_impl.h>
38 #include <sys/squeue.h>
39 #include <sys/tsol/tnet.h>
40 
41 #include <rpc/pmap_prot.h>
42 
43 #include <inet/common.h>
44 #include <inet/ip.h>
45 #include <inet/tcp.h>
46 #include <inet/tcp_impl.h>
47 #include <inet/proto_set.h>
48 #include <inet/ipsec_impl.h>
49 
50 /* Setable in /etc/system */
51 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
52 static uint32_t tcp_random_anon_port = 1;
53 
54 static int	tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
55 		    cred_t *cr);
56 static in_port_t	tcp_get_next_priv_port(const tcp_t *);
57 
58 /*
59  * Hash list insertion routine for tcp_t structures. Each hash bucket
60  * contains a list of tcp_t entries, and each entry is bound to a unique
61  * port. If there are multiple tcp_t's that are bound to the same port, then
62  * one of them will be linked into the hash bucket list, and the rest will
63  * hang off of that one entry. For each port, entries bound to a specific IP
64  * address will be inserted before those those bound to INADDR_ANY.
65  */
66 void
67 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
68 {
69 	tcp_t	**tcpp;
70 	tcp_t	*tcpnext;
71 	tcp_t	*tcphash;
72 	conn_t	*connp = tcp->tcp_connp;
73 	conn_t	*connext;
74 
75 	if (tcp->tcp_ptpbhn != NULL) {
76 		ASSERT(!caller_holds_lock);
77 		tcp_bind_hash_remove(tcp);
78 	}
79 	tcpp = &tbf->tf_tcp;
80 	if (!caller_holds_lock) {
81 		mutex_enter(&tbf->tf_lock);
82 	} else {
83 		ASSERT(MUTEX_HELD(&tbf->tf_lock));
84 	}
85 	tcphash = tcpp[0];
86 	tcpnext = NULL;
87 	if (tcphash != NULL) {
88 		/* Look for an entry using the same port */
89 		while ((tcphash = tcpp[0]) != NULL &&
90 		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
91 			tcpp = &(tcphash->tcp_bind_hash);
92 
93 		/* The port was not found, just add to the end */
94 		if (tcphash == NULL)
95 			goto insert;
96 
97 		/*
98 		 * OK, there already exists an entry bound to the
99 		 * same port.
100 		 *
101 		 * If the new tcp bound to the INADDR_ANY address
102 		 * and the first one in the list is not bound to
103 		 * INADDR_ANY we skip all entries until we find the
104 		 * first one bound to INADDR_ANY.
105 		 * This makes sure that applications binding to a
106 		 * specific address get preference over those binding to
107 		 * INADDR_ANY.
108 		 */
109 		tcpnext = tcphash;
110 		connext = tcpnext->tcp_connp;
111 		tcphash = NULL;
112 		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
113 		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
114 			while ((tcpnext = tcpp[0]) != NULL) {
115 				connext = tcpnext->tcp_connp;
116 				if (!V6_OR_V4_INADDR_ANY(
117 				    connext->conn_bound_addr_v6))
118 					tcpp = &(tcpnext->tcp_bind_hash_port);
119 				else
120 					break;
121 			}
122 			if (tcpnext != NULL) {
123 				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
124 				tcphash = tcpnext->tcp_bind_hash;
125 				if (tcphash != NULL) {
126 					tcphash->tcp_ptpbhn =
127 					    &(tcp->tcp_bind_hash);
128 					tcpnext->tcp_bind_hash = NULL;
129 				}
130 			}
131 		} else {
132 			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
133 			tcphash = tcpnext->tcp_bind_hash;
134 			if (tcphash != NULL) {
135 				tcphash->tcp_ptpbhn =
136 				    &(tcp->tcp_bind_hash);
137 				tcpnext->tcp_bind_hash = NULL;
138 			}
139 		}
140 	}
141 insert:
142 	tcp->tcp_bind_hash_port = tcpnext;
143 	tcp->tcp_bind_hash = tcphash;
144 	tcp->tcp_ptpbhn = tcpp;
145 	tcpp[0] = tcp;
146 	if (!caller_holds_lock)
147 		mutex_exit(&tbf->tf_lock);
148 }
149 
150 /*
151  * Hash list removal routine for tcp_t structures.
152  */
153 void
154 tcp_bind_hash_remove(tcp_t *tcp)
155 {
156 	tcp_t	*tcpnext;
157 	kmutex_t *lockp;
158 	tcp_stack_t	*tcps = tcp->tcp_tcps;
159 	conn_t		*connp = tcp->tcp_connp;
160 
161 	if (tcp->tcp_ptpbhn == NULL)
162 		return;
163 
164 	/*
165 	 * Extract the lock pointer in case there are concurrent
166 	 * hash_remove's for this instance.
167 	 */
168 	ASSERT(connp->conn_lport != 0);
169 	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
170 	    connp->conn_lport)].tf_lock;
171 
172 	ASSERT(lockp != NULL);
173 	mutex_enter(lockp);
174 	if (tcp->tcp_ptpbhn) {
175 		tcpnext = tcp->tcp_bind_hash_port;
176 		if (tcpnext != NULL) {
177 			tcp->tcp_bind_hash_port = NULL;
178 			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
179 			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
180 			if (tcpnext->tcp_bind_hash != NULL) {
181 				tcpnext->tcp_bind_hash->tcp_ptpbhn =
182 				    &(tcpnext->tcp_bind_hash);
183 				tcp->tcp_bind_hash = NULL;
184 			}
185 		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
186 			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
187 			tcp->tcp_bind_hash = NULL;
188 		}
189 		*tcp->tcp_ptpbhn = tcpnext;
190 		tcp->tcp_ptpbhn = NULL;
191 	}
192 	mutex_exit(lockp);
193 }
194 
195 /*
196  * Don't let port fall into the privileged range.
197  * Since the extra privileged ports can be arbitrary we also
198  * ensure that we exclude those from consideration.
199  * tcp_g_epriv_ports is not sorted thus we loop over it until
200  * there are no changes.
201  *
202  * Note: No locks are held when inspecting tcp_g_*epriv_ports
203  * but instead the code relies on:
204  * - the fact that the address of the array and its size never changes
205  * - the atomic assignment of the elements of the array
206  *
207  * Returns 0 if there are no more ports available.
208  *
209  * TS note: skip multilevel ports.
210  */
211 in_port_t
212 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
213 {
214 	int i;
215 	boolean_t restart = B_FALSE;
216 	tcp_stack_t *tcps = tcp->tcp_tcps;
217 
218 	if (random && tcp_random_anon_port != 0) {
219 		(void) random_get_pseudo_bytes((uint8_t *)&port,
220 		    sizeof (in_port_t));
221 		/*
222 		 * Unless changed by a sys admin, the smallest anon port
223 		 * is 32768 and the largest anon port is 65535.  It is
224 		 * very likely (50%) for the random port to be smaller
225 		 * than the smallest anon port.  When that happens,
226 		 * add port % (anon port range) to the smallest anon
227 		 * port to get the random port.  It should fall into the
228 		 * valid anon port range.
229 		 */
230 		if (port < tcps->tcps_smallest_anon_port) {
231 			port = tcps->tcps_smallest_anon_port +
232 			    port % (tcps->tcps_largest_anon_port -
233 			    tcps->tcps_smallest_anon_port);
234 		}
235 	}
236 
237 retry:
238 	if (port < tcps->tcps_smallest_anon_port)
239 		port = (in_port_t)tcps->tcps_smallest_anon_port;
240 
241 	if (port > tcps->tcps_largest_anon_port) {
242 		if (restart)
243 			return (0);
244 		restart = B_TRUE;
245 		port = (in_port_t)tcps->tcps_smallest_anon_port;
246 	}
247 
248 	if (port < tcps->tcps_smallest_nonpriv_port)
249 		port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
250 
251 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
252 		if (port == tcps->tcps_g_epriv_ports[i]) {
253 			port++;
254 			/*
255 			 * Make sure whether the port is in the
256 			 * valid range.
257 			 */
258 			goto retry;
259 		}
260 	}
261 	if (is_system_labeled() &&
262 	    (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
263 	    IPPROTO_TCP, B_TRUE)) != 0) {
264 		port = i;
265 		goto retry;
266 	}
267 	return (port);
268 }
269 
270 /*
271  * Return the next anonymous port in the privileged port range for
272  * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
273  * downwards.  This is the same behavior as documented in the userland
274  * library call rresvport(3N).
275  *
276  * TS note: skip multilevel ports.
277  */
278 static in_port_t
279 tcp_get_next_priv_port(const tcp_t *tcp)
280 {
281 	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
282 	in_port_t nextport;
283 	boolean_t restart = B_FALSE;
284 	tcp_stack_t *tcps = tcp->tcp_tcps;
285 retry:
286 	if (next_priv_port < tcps->tcps_min_anonpriv_port ||
287 	    next_priv_port >= IPPORT_RESERVED) {
288 		next_priv_port = IPPORT_RESERVED - 1;
289 		if (restart)
290 			return (0);
291 		restart = B_TRUE;
292 	}
293 	if (is_system_labeled() &&
294 	    (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
295 	    next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
296 		next_priv_port = nextport;
297 		goto retry;
298 	}
299 	return (next_priv_port--);
300 }
301 
302 static int
303 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
304     boolean_t bind_to_req_port_only, cred_t *cr)
305 {
306 	in_port_t	mlp_port;
307 	mlp_type_t 	addrtype, mlptype;
308 	boolean_t	user_specified;
309 	in_port_t	allocated_port;
310 	in_port_t	requested_port = *requested_port_ptr;
311 	conn_t		*connp = tcp->tcp_connp;
312 	zone_t		*zone;
313 	tcp_stack_t	*tcps = tcp->tcp_tcps;
314 	in6_addr_t	v6addr = connp->conn_laddr_v6;
315 
316 	/*
317 	 * XXX It's up to the caller to specify bind_to_req_port_only or not.
318 	 */
319 	ASSERT(cr != NULL);
320 
321 	/*
322 	 * Get a valid port (within the anonymous range and should not
323 	 * be a privileged one) to use if the user has not given a port.
324 	 * If multiple threads are here, they may all start with
325 	 * with the same initial port. But, it should be fine as long as
326 	 * tcp_bindi will ensure that no two threads will be assigned
327 	 * the same port.
328 	 *
329 	 * NOTE: XXX If a privileged process asks for an anonymous port, we
330 	 * still check for ports only in the range > tcp_smallest_non_priv_port,
331 	 * unless TCP_ANONPRIVBIND option is set.
332 	 */
333 	mlptype = mlptSingle;
334 	mlp_port = requested_port;
335 	if (requested_port == 0) {
336 		requested_port = connp->conn_anon_priv_bind ?
337 		    tcp_get_next_priv_port(tcp) :
338 		    tcp_update_next_port(tcps->tcps_next_port_to_try,
339 		    tcp, B_TRUE);
340 		if (requested_port == 0) {
341 			return (-TNOADDR);
342 		}
343 		user_specified = B_FALSE;
344 
345 		/*
346 		 * If the user went through one of the RPC interfaces to create
347 		 * this socket and RPC is MLP in this zone, then give him an
348 		 * anonymous MLP.
349 		 */
350 		if (connp->conn_anon_mlp && is_system_labeled()) {
351 			zone = crgetzone(cr);
352 			addrtype = tsol_mlp_addr_type(
353 			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
354 			    IPV6_VERSION, &v6addr,
355 			    tcps->tcps_netstack->netstack_ip);
356 			if (addrtype == mlptSingle) {
357 				return (-TNOADDR);
358 			}
359 			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
360 			    PMAPPORT, addrtype);
361 			mlp_port = PMAPPORT;
362 		}
363 	} else {
364 		int i;
365 		boolean_t priv = B_FALSE;
366 
367 		/*
368 		 * If the requested_port is in the well-known privileged range,
369 		 * verify that the stream was opened by a privileged user.
370 		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
371 		 * but instead the code relies on:
372 		 * - the fact that the address of the array and its size never
373 		 *   changes
374 		 * - the atomic assignment of the elements of the array
375 		 */
376 		if (requested_port < tcps->tcps_smallest_nonpriv_port) {
377 			priv = B_TRUE;
378 		} else {
379 			for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
380 				if (requested_port ==
381 				    tcps->tcps_g_epriv_ports[i]) {
382 					priv = B_TRUE;
383 					break;
384 				}
385 			}
386 		}
387 		if (priv) {
388 			if (secpolicy_net_privaddr(cr, requested_port,
389 			    IPPROTO_TCP) != 0) {
390 				if (connp->conn_debug) {
391 					(void) strlog(TCP_MOD_ID, 0, 1,
392 					    SL_ERROR|SL_TRACE,
393 					    "tcp_bind: no priv for port %d",
394 					    requested_port);
395 				}
396 				return (-TACCES);
397 			}
398 		}
399 		user_specified = B_TRUE;
400 
401 		connp = tcp->tcp_connp;
402 		if (is_system_labeled()) {
403 			zone = crgetzone(cr);
404 			addrtype = tsol_mlp_addr_type(
405 			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
406 			    IPV6_VERSION, &v6addr,
407 			    tcps->tcps_netstack->netstack_ip);
408 			if (addrtype == mlptSingle) {
409 				return (-TNOADDR);
410 			}
411 			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
412 			    requested_port, addrtype);
413 		}
414 	}
415 
416 	if (mlptype != mlptSingle) {
417 		if (secpolicy_net_bindmlp(cr) != 0) {
418 			if (connp->conn_debug) {
419 				(void) strlog(TCP_MOD_ID, 0, 1,
420 				    SL_ERROR|SL_TRACE,
421 				    "tcp_bind: no priv for multilevel port %d",
422 				    requested_port);
423 			}
424 			return (-TACCES);
425 		}
426 
427 		/*
428 		 * If we're specifically binding a shared IP address and the
429 		 * port is MLP on shared addresses, then check to see if this
430 		 * zone actually owns the MLP.  Reject if not.
431 		 */
432 		if (mlptype == mlptShared && addrtype == mlptShared) {
433 			/*
434 			 * No need to handle exclusive-stack zones since
435 			 * ALL_ZONES only applies to the shared stack.
436 			 */
437 			zoneid_t mlpzone;
438 
439 			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
440 			    htons(mlp_port));
441 			if (connp->conn_zoneid != mlpzone) {
442 				if (connp->conn_debug) {
443 					(void) strlog(TCP_MOD_ID, 0, 1,
444 					    SL_ERROR|SL_TRACE,
445 					    "tcp_bind: attempt to bind port "
446 					    "%d on shared addr in zone %d "
447 					    "(should be %d)",
448 					    mlp_port, connp->conn_zoneid,
449 					    mlpzone);
450 				}
451 				return (-TACCES);
452 			}
453 		}
454 
455 		if (!user_specified) {
456 			int err;
457 			err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
458 			    requested_port, B_TRUE);
459 			if (err != 0) {
460 				if (connp->conn_debug) {
461 					(void) strlog(TCP_MOD_ID, 0, 1,
462 					    SL_ERROR|SL_TRACE,
463 					    "tcp_bind: cannot establish anon "
464 					    "MLP for port %d",
465 					    requested_port);
466 				}
467 				return (err);
468 			}
469 			connp->conn_anon_port = B_TRUE;
470 		}
471 		connp->conn_mlp_type = mlptype;
472 	}
473 
474 	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
475 	    connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
476 	    user_specified);
477 
478 	if (allocated_port == 0) {
479 		connp->conn_mlp_type = mlptSingle;
480 		if (connp->conn_anon_port) {
481 			connp->conn_anon_port = B_FALSE;
482 			(void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
483 			    requested_port, B_FALSE);
484 		}
485 		if (bind_to_req_port_only) {
486 			if (connp->conn_debug) {
487 				(void) strlog(TCP_MOD_ID, 0, 1,
488 				    SL_ERROR|SL_TRACE,
489 				    "tcp_bind: requested addr busy");
490 			}
491 			return (-TADDRBUSY);
492 		} else {
493 			/* If we are out of ports, fail the bind. */
494 			if (connp->conn_debug) {
495 				(void) strlog(TCP_MOD_ID, 0, 1,
496 				    SL_ERROR|SL_TRACE,
497 				    "tcp_bind: out of ports?");
498 			}
499 			return (-TNOADDR);
500 		}
501 	}
502 
503 	/* Pass the allocated port back */
504 	*requested_port_ptr = allocated_port;
505 	return (0);
506 }
507 
508 /*
509  * Check the address and check/pick a local port number.
510  */
511 int
512 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
513     boolean_t bind_to_req_port_only)
514 {
515 	tcp_t	*tcp = connp->conn_tcp;
516 	sin_t	*sin;
517 	sin6_t  *sin6;
518 	in_port_t	requested_port;
519 	ipaddr_t	v4addr;
520 	in6_addr_t	v6addr;
521 	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
522 	zoneid_t	zoneid = IPCL_ZONEID(connp);
523 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
524 	uint_t		scopeid = 0;
525 	int		error = 0;
526 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
527 
528 	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
529 
530 	if (tcp->tcp_state == TCPS_BOUND) {
531 		return (0);
532 	} else if (tcp->tcp_state > TCPS_BOUND) {
533 		if (connp->conn_debug) {
534 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
535 			    "tcp_bind: bad state, %d", tcp->tcp_state);
536 		}
537 		return (-TOUTSTATE);
538 	}
539 
540 	ASSERT(sa != NULL && len != 0);
541 
542 	if (!OK_32PTR((char *)sa)) {
543 		if (connp->conn_debug) {
544 			(void) strlog(TCP_MOD_ID, 0, 1,
545 			    SL_ERROR|SL_TRACE,
546 			    "tcp_bind: bad address parameter, "
547 			    "address %p, len %d",
548 			    (void *)sa, len);
549 		}
550 		return (-TPROTO);
551 	}
552 
553 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
554 	if (error != 0) {
555 		return (error);
556 	}
557 
558 	switch (len) {
559 	case sizeof (sin_t):	/* Complete IPv4 address */
560 		sin = (sin_t *)sa;
561 		requested_port = ntohs(sin->sin_port);
562 		v4addr = sin->sin_addr.s_addr;
563 		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
564 		if (v4addr != INADDR_ANY) {
565 			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
566 			    B_FALSE);
567 		}
568 		break;
569 
570 	case sizeof (sin6_t): /* Complete IPv6 address */
571 		sin6 = (sin6_t *)sa;
572 		v6addr = sin6->sin6_addr;
573 		requested_port = ntohs(sin6->sin6_port);
574 		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
575 			if (connp->conn_ipv6_v6only)
576 				return (EADDRNOTAVAIL);
577 
578 			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
579 			if (v4addr != INADDR_ANY) {
580 				laddr_type = ip_laddr_verify_v4(v4addr,
581 				    zoneid, ipst, B_FALSE);
582 			}
583 		} else {
584 			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
585 				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
586 					scopeid = sin6->sin6_scope_id;
587 				laddr_type = ip_laddr_verify_v6(&v6addr,
588 				    zoneid, ipst, B_FALSE, scopeid);
589 			}
590 		}
591 		break;
592 
593 	default:
594 		if (connp->conn_debug) {
595 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
596 			    "tcp_bind: bad address length, %d", len);
597 		}
598 		return (EAFNOSUPPORT);
599 		/* return (-TBADADDR); */
600 	}
601 
602 	/* Is the local address a valid unicast address? */
603 	if (laddr_type == IPVL_BAD)
604 		return (EADDRNOTAVAIL);
605 
606 	connp->conn_bound_addr_v6 = v6addr;
607 	if (scopeid != 0) {
608 		ixa->ixa_flags |= IXAF_SCOPEID_SET;
609 		ixa->ixa_scopeid = scopeid;
610 		connp->conn_incoming_ifindex = scopeid;
611 	} else {
612 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
613 		connp->conn_incoming_ifindex = connp->conn_bound_if;
614 	}
615 
616 	connp->conn_laddr_v6 = v6addr;
617 	connp->conn_saddr_v6 = v6addr;
618 
619 	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
620 
621 	error = tcp_bind_select_lport(tcp, &requested_port,
622 	    bind_to_req_port_only, cr);
623 	if (error != 0) {
624 		connp->conn_laddr_v6 = ipv6_all_zeros;
625 		connp->conn_saddr_v6 = ipv6_all_zeros;
626 		connp->conn_bound_addr_v6 = ipv6_all_zeros;
627 	}
628 	return (error);
629 }
630 
631 /*
632  * If the "bind_to_req_port_only" parameter is set, if the requested port
633  * number is available, return it, If not return 0
634  *
635  * If "bind_to_req_port_only" parameter is not set and
636  * If the requested port number is available, return it.  If not, return
637  * the first anonymous port we happen across.  If no anonymous ports are
638  * available, return 0. addr is the requested local address, if any.
639  *
640  * In either case, when succeeding update the tcp_t to record the port number
641  * and insert it in the bind hash table.
642  *
643  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
644  * without setting SO_REUSEADDR. This is needed so that they
645  * can be viewed as two independent transport protocols.
646  */
647 in_port_t
648 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
649     int reuseaddr, boolean_t quick_connect,
650     boolean_t bind_to_req_port_only, boolean_t user_specified)
651 {
652 	/* number of times we have run around the loop */
653 	int count = 0;
654 	/* maximum number of times to run around the loop */
655 	int loopmax;
656 	conn_t *connp = tcp->tcp_connp;
657 	tcp_stack_t	*tcps = tcp->tcp_tcps;
658 
659 	/*
660 	 * Lookup for free addresses is done in a loop and "loopmax"
661 	 * influences how long we spin in the loop
662 	 */
663 	if (bind_to_req_port_only) {
664 		/*
665 		 * If the requested port is busy, don't bother to look
666 		 * for a new one. Setting loop maximum count to 1 has
667 		 * that effect.
668 		 */
669 		loopmax = 1;
670 	} else {
671 		/*
672 		 * If the requested port is busy, look for a free one
673 		 * in the anonymous port range.
674 		 * Set loopmax appropriately so that one does not look
675 		 * forever in the case all of the anonymous ports are in use.
676 		 */
677 		if (connp->conn_anon_priv_bind) {
678 			/*
679 			 * loopmax =
680 			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
681 			 */
682 			loopmax = IPPORT_RESERVED -
683 			    tcps->tcps_min_anonpriv_port;
684 		} else {
685 			loopmax = (tcps->tcps_largest_anon_port -
686 			    tcps->tcps_smallest_anon_port + 1);
687 		}
688 	}
689 	do {
690 		uint16_t	lport;
691 		tf_t		*tbf;
692 		tcp_t		*ltcp;
693 		conn_t		*lconnp;
694 
695 		lport = htons(port);
696 
697 		/*
698 		 * Ensure that the tcp_t is not currently in the bind hash.
699 		 * Hold the lock on the hash bucket to ensure that
700 		 * the duplicate check plus the insertion is an atomic
701 		 * operation.
702 		 *
703 		 * This function does an inline lookup on the bind hash list
704 		 * Make sure that we access only members of tcp_t
705 		 * and that we don't look at tcp_tcp, since we are not
706 		 * doing a CONN_INC_REF.
707 		 */
708 		tcp_bind_hash_remove(tcp);
709 		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
710 		mutex_enter(&tbf->tf_lock);
711 		for (ltcp = tbf->tf_tcp; ltcp != NULL;
712 		    ltcp = ltcp->tcp_bind_hash) {
713 			if (lport == ltcp->tcp_connp->conn_lport)
714 				break;
715 		}
716 
717 		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
718 			boolean_t not_socket;
719 			boolean_t exclbind;
720 
721 			lconnp = ltcp->tcp_connp;
722 
723 			/*
724 			 * On a labeled system, we must treat bindings to ports
725 			 * on shared IP addresses by sockets with MAC exemption
726 			 * privilege as being in all zones, as there's
727 			 * otherwise no way to identify the right receiver.
728 			 */
729 			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
730 				continue;
731 
732 			/*
733 			 * If TCP_EXCLBIND is set for either the bound or
734 			 * binding endpoint, the semantics of bind
735 			 * is changed according to the following.
736 			 *
737 			 * spec = specified address (v4 or v6)
738 			 * unspec = unspecified address (v4 or v6)
739 			 * A = specified addresses are different for endpoints
740 			 *
741 			 * bound	bind to		allowed
742 			 * -------------------------------------
743 			 * unspec	unspec		no
744 			 * unspec	spec		no
745 			 * spec		unspec		no
746 			 * spec		spec		yes if A
747 			 *
748 			 * For labeled systems, SO_MAC_EXEMPT behaves the same
749 			 * as TCP_EXCLBIND, except that zoneid is ignored.
750 			 *
751 			 * Note:
752 			 *
753 			 * 1. Because of TLI semantics, an endpoint can go
754 			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
755 			 * TCPS_BOUND, depending on whether it is originally
756 			 * a listener or not.  That is why we need to check
757 			 * for states greater than or equal to TCPS_BOUND
758 			 * here.
759 			 *
760 			 * 2. Ideally, we should only check for state equals
761 			 * to TCPS_LISTEN. And the following check should be
762 			 * added.
763 			 *
764 			 * if (ltcp->tcp_state == TCPS_LISTEN ||
765 			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
766 			 *		...
767 			 * }
768 			 *
769 			 * The semantics will be changed to this.  If the
770 			 * endpoint on the list is in state not equal to
771 			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
772 			 * set, let the bind succeed.
773 			 *
774 			 * Because of (1), we cannot do that for TLI
775 			 * endpoints.  But we can do that for socket endpoints.
776 			 * If in future, we can change this going back
777 			 * semantics, we can use the above check for TLI also.
778 			 */
779 			not_socket = !(TCP_IS_SOCKET(ltcp) &&
780 			    TCP_IS_SOCKET(tcp));
781 			exclbind = lconnp->conn_exclbind ||
782 			    connp->conn_exclbind;
783 
784 			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
785 			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
786 			    (exclbind && (not_socket ||
787 			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
788 				if (V6_OR_V4_INADDR_ANY(
789 				    lconnp->conn_bound_addr_v6) ||
790 				    V6_OR_V4_INADDR_ANY(*laddr) ||
791 				    IN6_ARE_ADDR_EQUAL(laddr,
792 				    &lconnp->conn_bound_addr_v6)) {
793 					break;
794 				}
795 				continue;
796 			}
797 
798 			/*
799 			 * Check ipversion to allow IPv4 and IPv6 sockets to
800 			 * have disjoint port number spaces, if *_EXCLBIND
801 			 * is not set and only if the application binds to a
802 			 * specific port. We use the same autoassigned port
803 			 * number space for IPv4 and IPv6 sockets.
804 			 */
805 			if (connp->conn_ipversion != lconnp->conn_ipversion &&
806 			    bind_to_req_port_only)
807 				continue;
808 
809 			/*
810 			 * Ideally, we should make sure that the source
811 			 * address, remote address, and remote port in the
812 			 * four tuple for this tcp-connection is unique.
813 			 * However, trying to find out the local source
814 			 * address would require too much code duplication
815 			 * with IP, since IP needs needs to have that code
816 			 * to support userland TCP implementations.
817 			 */
818 			if (quick_connect &&
819 			    (ltcp->tcp_state > TCPS_LISTEN) &&
820 			    ((connp->conn_fport != lconnp->conn_fport) ||
821 			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
822 			    &lconnp->conn_faddr_v6)))
823 				continue;
824 
825 			if (!reuseaddr) {
826 				/*
827 				 * No socket option SO_REUSEADDR.
828 				 * If existing port is bound to
829 				 * a non-wildcard IP address
830 				 * and the requesting stream is
831 				 * bound to a distinct
832 				 * different IP addresses
833 				 * (non-wildcard, also), keep
834 				 * going.
835 				 */
836 				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
837 				    !V6_OR_V4_INADDR_ANY(
838 				    lconnp->conn_bound_addr_v6) &&
839 				    !IN6_ARE_ADDR_EQUAL(laddr,
840 				    &lconnp->conn_bound_addr_v6))
841 					continue;
842 				if (ltcp->tcp_state >= TCPS_BOUND) {
843 					/*
844 					 * This port is being used and
845 					 * its state is >= TCPS_BOUND,
846 					 * so we can't bind to it.
847 					 */
848 					break;
849 				}
850 			} else {
851 				/*
852 				 * socket option SO_REUSEADDR is set on the
853 				 * binding tcp_t.
854 				 *
855 				 * If two streams are bound to
856 				 * same IP address or both addr
857 				 * and bound source are wildcards
858 				 * (INADDR_ANY), we want to stop
859 				 * searching.
860 				 * We have found a match of IP source
861 				 * address and source port, which is
862 				 * refused regardless of the
863 				 * SO_REUSEADDR setting, so we break.
864 				 */
865 				if (IN6_ARE_ADDR_EQUAL(laddr,
866 				    &lconnp->conn_bound_addr_v6) &&
867 				    (ltcp->tcp_state == TCPS_LISTEN ||
868 				    ltcp->tcp_state == TCPS_BOUND))
869 					break;
870 			}
871 		}
872 		if (ltcp != NULL) {
873 			/* The port number is busy */
874 			mutex_exit(&tbf->tf_lock);
875 		} else {
876 			/*
877 			 * This port is ours. Insert in fanout and mark as
878 			 * bound to prevent others from getting the port
879 			 * number.
880 			 */
881 			tcp->tcp_state = TCPS_BOUND;
882 			DTRACE_TCP6(state__change, void, NULL,
883 			    ip_xmit_attr_t *, connp->conn_ixa,
884 			    void, NULL, tcp_t *, tcp, void, NULL,
885 			    int32_t, TCPS_IDLE);
886 
887 			connp->conn_lport = htons(port);
888 
889 			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
890 			    connp->conn_lport)] == tbf);
891 			tcp_bind_hash_insert(tbf, tcp, 1);
892 
893 			mutex_exit(&tbf->tf_lock);
894 
895 			/*
896 			 * We don't want tcp_next_port_to_try to "inherit"
897 			 * a port number supplied by the user in a bind.
898 			 */
899 			if (user_specified)
900 				return (port);
901 
902 			/*
903 			 * This is the only place where tcp_next_port_to_try
904 			 * is updated. After the update, it may or may not
905 			 * be in the valid range.
906 			 */
907 			if (!connp->conn_anon_priv_bind)
908 				tcps->tcps_next_port_to_try = port + 1;
909 			return (port);
910 		}
911 
912 		if (connp->conn_anon_priv_bind) {
913 			port = tcp_get_next_priv_port(tcp);
914 		} else {
915 			if (count == 0 && user_specified) {
916 				/*
917 				 * We may have to return an anonymous port. So
918 				 * get one to start with.
919 				 */
920 				port =
921 				    tcp_update_next_port(
922 				    tcps->tcps_next_port_to_try,
923 				    tcp, B_TRUE);
924 				user_specified = B_FALSE;
925 			} else {
926 				port = tcp_update_next_port(port + 1, tcp,
927 				    B_FALSE);
928 			}
929 		}
930 		if (port == 0)
931 			break;
932 
933 		/*
934 		 * Don't let this loop run forever in the case where
935 		 * all of the anonymous ports are in use.
936 		 */
937 	} while (++count < loopmax);
938 	return (0);
939 }
940