xref: /titanic_44/usr/src/uts/common/inet/tcp/tcp_bind.c (revision 452bd827089206a0c637b3944aa91806f17304d7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2013, Nexenta Systems, Inc. All rights reserved.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/strsubr.h>
31 #include <sys/stropts.h>
32 #include <sys/strlog.h>
33 #define	_SUN_TPI_VERSION 2
34 #include <sys/tihdr.h>
35 #include <sys/suntpi.h>
36 #include <sys/xti_inet.h>
37 #include <sys/policy.h>
38 #include <sys/squeue_impl.h>
39 #include <sys/squeue.h>
40 #include <sys/tsol/tnet.h>
41 
42 #include <rpc/pmap_prot.h>
43 
44 #include <inet/common.h>
45 #include <inet/ip.h>
46 #include <inet/tcp.h>
47 #include <inet/tcp_impl.h>
48 #include <inet/proto_set.h>
49 #include <inet/ipsec_impl.h>
50 
51 /* Setable in /etc/system */
52 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
53 static uint32_t tcp_random_anon_port = 1;
54 
55 static int	tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
56 		    cred_t *cr);
57 static in_port_t	tcp_get_next_priv_port(const tcp_t *);
58 
59 /*
60  * Hash list insertion routine for tcp_t structures. Each hash bucket
61  * contains a list of tcp_t entries, and each entry is bound to a unique
62  * port. If there are multiple tcp_t's that are bound to the same port, then
63  * one of them will be linked into the hash bucket list, and the rest will
64  * hang off of that one entry. For each port, entries bound to a specific IP
 * address will be inserted before those bound to INADDR_ANY.
66  */
void
tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
{
	tcp_t	**tcpp;
	tcp_t	*tcpnext;
	tcp_t	*tcphash;
	conn_t	*connp = tcp->tcp_connp;
	conn_t	*connext;

	/* If this tcp_t is already hashed, unlink it from its bucket first. */
	if (tcp->tcp_ptpbhn != NULL) {
		ASSERT(!caller_holds_lock);
		tcp_bind_hash_remove(tcp);
	}
	tcpp = &tbf->tf_tcp;
	if (!caller_holds_lock) {
		mutex_enter(&tbf->tf_lock);
	} else {
		ASSERT(MUTEX_HELD(&tbf->tf_lock));
	}
	tcphash = tcpp[0];
	tcpnext = NULL;
	if (tcphash != NULL) {
		/*
		 * Look for an entry using the same port.  The top-level
		 * bucket list (linked via tcp_bind_hash) has one entry per
		 * distinct port; same-port entries hang off that entry via
		 * tcp_bind_hash_port.
		 */
		while ((tcphash = tcpp[0]) != NULL &&
		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
			tcpp = &(tcphash->tcp_bind_hash);

		/* The port was not found, just add to the end */
		if (tcphash == NULL)
			goto insert;

		/*
		 * OK, there already exists an entry bound to the
		 * same port.
		 *
		 * If the new tcp bound to the INADDR_ANY address
		 * and the first one in the list is not bound to
		 * INADDR_ANY we skip all entries until we find the
		 * first one bound to INADDR_ANY.
		 * This makes sure that applications binding to a
		 * specific address get preference over those binding to
		 * INADDR_ANY.
		 */
		tcpnext = tcphash;
		connext = tcpnext->tcp_connp;
		tcphash = NULL;
		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
			/*
			 * Walk the same-port chain past every
			 * specific-address entry; stop at the first
			 * INADDR_ANY entry (or the chain's tail).
			 */
			while ((tcpnext = tcpp[0]) != NULL) {
				connext = tcpnext->tcp_connp;
				if (!V6_OR_V4_INADDR_ANY(
				    connext->conn_bound_addr_v6))
					tcpp = &(tcpnext->tcp_bind_hash_port);
				else
					break;
			}
			if (tcpnext != NULL) {
				/*
				 * Insert ahead of tcpnext; the new entry
				 * takes over tcpnext's cross-port link
				 * (tcp_bind_hash), which only the chain
				 * head may carry.
				 */
				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
				tcphash = tcpnext->tcp_bind_hash;
				if (tcphash != NULL) {
					tcphash->tcp_ptpbhn =
					    &(tcp->tcp_bind_hash);
					tcpnext->tcp_bind_hash = NULL;
				}
			}
		} else {
			/*
			 * New entry becomes the head of the same-port
			 * chain, inheriting the old head's cross-port link.
			 */
			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
			tcphash = tcpnext->tcp_bind_hash;
			if (tcphash != NULL) {
				tcphash->tcp_ptpbhn =
				    &(tcp->tcp_bind_hash);
				tcpnext->tcp_bind_hash = NULL;
			}
		}
	}
insert:
	tcp->tcp_bind_hash_port = tcpnext;
	tcp->tcp_bind_hash = tcphash;
	tcp->tcp_ptpbhn = tcpp;
	tcpp[0] = tcp;
	if (!caller_holds_lock)
		mutex_exit(&tbf->tf_lock);
}
150 
151 /*
152  * Hash list removal routine for tcp_t structures.
153  */
void
tcp_bind_hash_remove(tcp_t *tcp)
{
	tcp_t	*tcpnext;
	kmutex_t *lockp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;

	/* Not currently hashed; nothing to do. */
	if (tcp->tcp_ptpbhn == NULL)
		return;

	/*
	 * Extract the lock pointer in case there are concurrent
	 * hash_remove's for this instance.
	 */
	ASSERT(connp->conn_lport != 0);
	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
	    connp->conn_lport)].tf_lock;

	ASSERT(lockp != NULL);
	mutex_enter(lockp);
	/* Re-check under the lock; a concurrent remove may have beaten us. */
	if (tcp->tcp_ptpbhn) {
		tcpnext = tcp->tcp_bind_hash_port;
		if (tcpnext != NULL) {
			/*
			 * Another entry is bound to the same port: promote
			 * it into our slot and hand it the cross-port link
			 * (tcp_bind_hash) that only the chain head carries.
			 */
			tcp->tcp_bind_hash_port = NULL;
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
			if (tcpnext->tcp_bind_hash != NULL) {
				tcpnext->tcp_bind_hash->tcp_ptpbhn =
				    &(tcpnext->tcp_bind_hash);
				tcp->tcp_bind_hash = NULL;
			}
		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
			/*
			 * We were the only entry for this port; splice the
			 * next-port entry into our place in the bucket list.
			 */
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcp->tcp_bind_hash = NULL;
		}
		*tcp->tcp_ptpbhn = tcpnext;
		tcp->tcp_ptpbhn = NULL;
	}
	mutex_exit(lockp);
}
195 
196 /*
197  * Don't let port fall into the privileged range.
198  * Since the extra privileged ports can be arbitrary we also
199  * ensure that we exclude those from consideration.
200  * tcp_g_epriv_ports is not sorted thus we loop over it until
201  * there are no changes.
202  *
203  * Note: No locks are held when inspecting tcp_g_*epriv_ports
204  * but instead the code relies on:
205  * - the fact that the address of the array and its size never changes
206  * - the atomic assignment of the elements of the array
207  *
208  * Returns 0 if there are no more ports available.
209  *
210  * TS note: skip multilevel ports.
211  */
212 in_port_t
213 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
214 {
215 	int i;
216 	boolean_t restart = B_FALSE;
217 	tcp_stack_t *tcps = tcp->tcp_tcps;
218 
219 	if (random && tcp_random_anon_port != 0) {
220 		(void) random_get_pseudo_bytes((uint8_t *)&port,
221 		    sizeof (in_port_t));
222 		/*
223 		 * Unless changed by a sys admin, the smallest anon port
224 		 * is 32768 and the largest anon port is 65535.  It is
225 		 * very likely (50%) for the random port to be smaller
226 		 * than the smallest anon port.  When that happens,
227 		 * add port % (anon port range) to the smallest anon
228 		 * port to get the random port.  It should fall into the
229 		 * valid anon port range.
230 		 */
231 		if ((port < tcps->tcps_smallest_anon_port) ||
232 		    (port > tcps->tcps_largest_anon_port)) {
233 			port = tcps->tcps_smallest_anon_port +
234 			    port % (tcps->tcps_largest_anon_port -
235 			    tcps->tcps_smallest_anon_port);
236 		}
237 	}
238 
239 retry:
240 	if (port < tcps->tcps_smallest_anon_port)
241 		port = (in_port_t)tcps->tcps_smallest_anon_port;
242 
243 	if (port > tcps->tcps_largest_anon_port) {
244 		if (restart)
245 			return (0);
246 		restart = B_TRUE;
247 		port = (in_port_t)tcps->tcps_smallest_anon_port;
248 	}
249 
250 	if (port < tcps->tcps_smallest_nonpriv_port)
251 		port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
252 
253 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
254 		if (port == tcps->tcps_g_epriv_ports[i]) {
255 			port++;
256 			/*
257 			 * Make sure whether the port is in the
258 			 * valid range.
259 			 */
260 			goto retry;
261 		}
262 	}
263 	if (is_system_labeled() &&
264 	    (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
265 	    IPPROTO_TCP, B_TRUE)) != 0) {
266 		port = i;
267 		goto retry;
268 	}
269 	return (port);
270 }
271 
272 /*
273  * Return the next anonymous port in the privileged port range for
274  * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
275  * downwards.  This is the same behavior as documented in the userland
276  * library call rresvport(3N).
277  *
278  * TS note: skip multilevel ports.
279  */
280 static in_port_t
281 tcp_get_next_priv_port(const tcp_t *tcp)
282 {
283 	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
284 	in_port_t nextport;
285 	boolean_t restart = B_FALSE;
286 	tcp_stack_t *tcps = tcp->tcp_tcps;
287 retry:
288 	if (next_priv_port < tcps->tcps_min_anonpriv_port ||
289 	    next_priv_port >= IPPORT_RESERVED) {
290 		next_priv_port = IPPORT_RESERVED - 1;
291 		if (restart)
292 			return (0);
293 		restart = B_TRUE;
294 	}
295 	if (is_system_labeled() &&
296 	    (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
297 	    next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
298 		next_priv_port = nextport;
299 		goto retry;
300 	}
301 	return (next_priv_port--);
302 }
303 
/*
 * Choose and validate the local port for a bind.  On success, writes the
 * allocated port (host byte order) back through requested_port_ptr and
 * returns 0; on failure returns a negative TPI error (-TNOADDR, -TACCES,
 * -TADDRBUSY) or a tsol error code.  Handles privileged-port permission
 * checks and, on labeled systems, MLP (multilevel port) policy.
 */
static int
tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
    boolean_t bind_to_req_port_only, cred_t *cr)
{
	in_port_t	mlp_port;
	mlp_type_t 	addrtype, mlptype;
	boolean_t	user_specified;
	in_port_t	allocated_port;
	in_port_t	requested_port = *requested_port_ptr;
	conn_t		*connp = tcp->tcp_connp;
	zone_t		*zone;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	in6_addr_t	v6addr = connp->conn_laddr_v6;

	/*
	 * XXX It's up to the caller to specify bind_to_req_port_only or not.
	 */
	ASSERT(cr != NULL);

	/*
	 * Get a valid port (within the anonymous range and should not
	 * be a privileged one) to use if the user has not given a port.
	 * If multiple threads are here, they may all start with
	 * the same initial port. But, it should be fine as long as
	 * tcp_bindi will ensure that no two threads will be assigned
	 * the same port.
	 *
	 * NOTE: XXX If a privileged process asks for an anonymous port, we
	 * still check for ports only in the range > tcp_smallest_non_priv_port,
	 * unless TCP_ANONPRIVBIND option is set.
	 */
	mlptype = mlptSingle;
	mlp_port = requested_port;
	if (requested_port == 0) {
		/* No port given: pick one (privileged or anonymous range). */
		requested_port = connp->conn_anon_priv_bind ?
		    tcp_get_next_priv_port(tcp) :
		    tcp_update_next_port(tcps->tcps_next_port_to_try,
		    tcp, B_TRUE);
		if (requested_port == 0) {
			return (-TNOADDR);
		}
		user_specified = B_FALSE;

		/*
		 * If the user went through one of the RPC interfaces to create
		 * this socket and RPC is MLP in this zone, then give him an
		 * anonymous MLP.
		 */
		if (connp->conn_anon_mlp && is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    PMAPPORT, addrtype);
			mlp_port = PMAPPORT;
		}
	} else {
		int i;
		boolean_t priv = B_FALSE;

		/*
		 * If the requested_port is in the well-known privileged range,
		 * verify that the stream was opened by a privileged user.
		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
		 * but instead the code relies on:
		 * - the fact that the address of the array and its size never
		 *   changes
		 * - the atomic assignment of the elements of the array
		 */
		if (requested_port < tcps->tcps_smallest_nonpriv_port) {
			priv = B_TRUE;
		} else {
			/* Extra-privileged ports configured by the admin. */
			for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
				if (requested_port ==
				    tcps->tcps_g_epriv_ports[i]) {
					priv = B_TRUE;
					break;
				}
			}
		}
		if (priv) {
			if (secpolicy_net_privaddr(cr, requested_port,
			    IPPROTO_TCP) != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: no priv for port %d",
					    requested_port);
				}
				return (-TACCES);
			}
		}
		user_specified = B_TRUE;

		/* NOTE(review): redundant; connp was already set above. */
		connp = tcp->tcp_connp;
		if (is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    requested_port, addrtype);
		}
	}

	if (mlptype != mlptSingle) {
		/* Binding a multilevel port requires privilege. */
		if (secpolicy_net_bindmlp(cr) != 0) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: no priv for multilevel port %d",
				    requested_port);
			}
			return (-TACCES);
		}

		/*
		 * If we're specifically binding a shared IP address and the
		 * port is MLP on shared addresses, then check to see if this
		 * zone actually owns the MLP.  Reject if not.
		 */
		if (mlptype == mlptShared && addrtype == mlptShared) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid_t mlpzone;

			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
			    htons(mlp_port));
			if (connp->conn_zoneid != mlpzone) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: attempt to bind port "
					    "%d on shared addr in zone %d "
					    "(should be %d)",
					    mlp_port, connp->conn_zoneid,
					    mlpzone);
				}
				return (-TACCES);
			}
		}

		if (!user_specified) {
			/* Register the anonymous MLP before using it. */
			int err;
			err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_TRUE);
			if (err != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: cannot establish anon "
					    "MLP for port %d",
					    requested_port);
				}
				return (err);
			}
			connp->conn_anon_port = B_TRUE;
		}
		connp->conn_mlp_type = mlptype;
	}

	/* Reserve the port (or another anonymous one) in the bind fanout. */
	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
	    connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
	    user_specified);

	if (allocated_port == 0) {
		/* Failed: undo any MLP state established above. */
		connp->conn_mlp_type = mlptSingle;
		if (connp->conn_anon_port) {
			connp->conn_anon_port = B_FALSE;
			(void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_FALSE);
		}
		if (bind_to_req_port_only) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: requested addr busy");
			}
			return (-TADDRBUSY);
		} else {
			/* If we are out of ports, fail the bind. */
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: out of ports?");
			}
			return (-TNOADDR);
		}
	}

	/* Pass the allocated port back */
	*requested_port_ptr = allocated_port;
	return (0);
}
509 
510 /*
511  * Check the address and check/pick a local port number.
512  */
int
tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
    boolean_t bind_to_req_port_only)
{
	tcp_t	*tcp = connp->conn_tcp;
	sin_t	*sin;
	sin6_t  *sin6;
	in_port_t	requested_port;
	ipaddr_t	v4addr;
	in6_addr_t	v6addr;
	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint_t		scopeid = 0;
	int		error = 0;
	ip_xmit_attr_t	*ixa = connp->conn_ixa;

	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);

	/* Rebinding an already-bound endpoint is a no-op; later states fail. */
	if (tcp->tcp_state == TCPS_BOUND) {
		return (0);
	} else if (tcp->tcp_state > TCPS_BOUND) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad state, %d", tcp->tcp_state);
		}
		return (-TOUTSTATE);
	}

	ASSERT(sa != NULL && len != 0);

	/* The sockaddr must be naturally aligned for the casts below. */
	if (!OK_32PTR((char *)sa)) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1,
			    SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address parameter, "
			    "address %p, len %d",
			    (void *)sa, len);
		}
		return (-TPROTO);
	}

	error = proto_verify_ip_addr(connp->conn_family, sa, len);
	if (error != 0) {
		return (error);
	}

	/* Dispatch on sockaddr size: IPv4 (sin_t) vs. IPv6 (sin6_t). */
	switch (len) {
	case sizeof (sin_t):	/* Complete IPv4 address */
		sin = (sin_t *)sa;
		requested_port = ntohs(sin->sin_port);
		v4addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
		if (v4addr != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
			    B_FALSE);
		}
		break;

	case sizeof (sin6_t): /* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		v6addr = sin6->sin6_addr;
		requested_port = ntohs(sin6->sin6_port);
		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
			/* V4-mapped binds are disallowed on v6-only sockets. */
			if (connp->conn_ipv6_v6only)
				return (EADDRNOTAVAIL);

			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
			if (v4addr != INADDR_ANY) {
				laddr_type = ip_laddr_verify_v4(v4addr,
				    zoneid, ipst, B_FALSE);
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
				/* Link-local needs the scope id to verify. */
				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
					scopeid = sin6->sin6_scope_id;
				laddr_type = ip_laddr_verify_v6(&v6addr,
				    zoneid, ipst, B_FALSE, scopeid);
			}
		}
		break;

	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address length, %d", len);
		}
		return (EAFNOSUPPORT);
		/* return (-TBADADDR); */
	}

	/* Is the local address a valid unicast address? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	connp->conn_bound_addr_v6 = v6addr;
	if (scopeid != 0) {
		/* Pin transmits and receives to the link-local scope. */
		ixa->ixa_flags |= IXAF_SCOPEID_SET;
		ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	connp->conn_laddr_v6 = v6addr;
	connp->conn_saddr_v6 = v6addr;

	/* Port 0 means "pick one", so the "only" constraint is moot. */
	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;

	error = tcp_bind_select_lport(tcp, &requested_port,
	    bind_to_req_port_only, cr);
	if (error != 0) {
		/* Roll back the address assignments made above. */
		connp->conn_laddr_v6 = ipv6_all_zeros;
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_bound_addr_v6 = ipv6_all_zeros;
	}
	return (error);
}
632 
633 /*
634  * If the "bind_to_req_port_only" parameter is set, if the requested port
 * number is available, return it. If not, return 0.
636  *
637  * If "bind_to_req_port_only" parameter is not set and
638  * If the requested port number is available, return it.  If not, return
639  * the first anonymous port we happen across.  If no anonymous ports are
640  * available, return 0. addr is the requested local address, if any.
641  *
642  * In either case, when succeeding update the tcp_t to record the port number
643  * and insert it in the bind hash table.
644  *
645  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
646  * without setting SO_REUSEADDR. This is needed so that they
647  * can be viewed as two independent transport protocols.
648  */
in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect,
    boolean_t bind_to_req_port_only, boolean_t user_specified)
{
	/* number of times we have run around the loop */
	int count = 0;
	/* maximum number of times to run around the loop */
	int loopmax;
	conn_t *connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	/*
	 * Lookup for free addresses is done in a loop and "loopmax"
	 * influences how long we spin in the loop
	 */
	if (bind_to_req_port_only) {
		/*
		 * If the requested port is busy, don't bother to look
		 * for a new one. Setting loop maximum count to 1 has
		 * that effect.
		 */
		loopmax = 1;
	} else {
		/*
		 * If the requested port is busy, look for a free one
		 * in the anonymous port range.
		 * Set loopmax appropriately so that one does not look
		 * forever in the case all of the anonymous ports are in use.
		 */
		if (connp->conn_anon_priv_bind) {
			/*
			 * loopmax =
			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
			 */
			loopmax = IPPORT_RESERVED -
			    tcps->tcps_min_anonpriv_port;
		} else {
			loopmax = (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port + 1);
		}
	}
	do {
		uint16_t	lport;
		tf_t		*tbf;
		tcp_t		*ltcp;
		conn_t		*lconnp;

		lport = htons(port);

		/*
		 * Ensure that the tcp_t is not currently in the bind hash.
		 * Hold the lock on the hash bucket to ensure that
		 * the duplicate check plus the insertion is an atomic
		 * operation.
		 *
		 * This function does an inline lookup on the bind hash list
		 * Make sure that we access only members of tcp_t
		 * and that we don't look at tcp_tcp, since we are not
		 * doing a CONN_INC_REF.
		 */
		tcp_bind_hash_remove(tcp);
		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
		mutex_enter(&tbf->tf_lock);
		/* Locate the chain of endpoints already bound to this port. */
		for (ltcp = tbf->tf_tcp; ltcp != NULL;
		    ltcp = ltcp->tcp_bind_hash) {
			if (lport == ltcp->tcp_connp->conn_lport)
				break;
		}

		/* Scan the same-port chain for a conflicting endpoint. */
		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
			boolean_t not_socket;
			boolean_t exclbind;

			lconnp = ltcp->tcp_connp;

			/*
			 * On a labeled system, we must treat bindings to ports
			 * on shared IP addresses by sockets with MAC exemption
			 * privilege as being in all zones, as there's
			 * otherwise no way to identify the right receiver.
			 */
			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
				continue;

			/*
			 * If TCP_EXCLBIND is set for either the bound or
			 * binding endpoint, the semantics of bind
			 * is changed according to the following.
			 *
			 * spec = specified address (v4 or v6)
			 * unspec = unspecified address (v4 or v6)
			 * A = specified addresses are different for endpoints
			 *
			 * bound	bind to		allowed
			 * -------------------------------------
			 * unspec	unspec		no
			 * unspec	spec		no
			 * spec		unspec		no
			 * spec		spec		yes if A
			 *
			 * For labeled systems, SO_MAC_EXEMPT behaves the same
			 * as TCP_EXCLBIND, except that zoneid is ignored.
			 *
			 * Note:
			 *
			 * 1. Because of TLI semantics, an endpoint can go
			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
			 * TCPS_BOUND, depending on whether it is originally
			 * a listener or not.  That is why we need to check
			 * for states greater than or equal to TCPS_BOUND
			 * here.
			 *
			 * 2. Ideally, we should only check for state equals
			 * to TCPS_LISTEN. And the following check should be
			 * added.
			 *
			 * if (ltcp->tcp_state == TCPS_LISTEN ||
			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
			 *		...
			 * }
			 *
			 * The semantics will be changed to this.  If the
			 * endpoint on the list is in state not equal to
			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
			 * set, let the bind succeed.
			 *
			 * Because of (1), we cannot do that for TLI
			 * endpoints.  But we can do that for socket endpoints.
			 * If in future, we can change this going back
			 * semantics, we can use the above check for TLI also.
			 */
			not_socket = !(TCP_IS_SOCKET(ltcp) &&
			    TCP_IS_SOCKET(tcp));
			exclbind = lconnp->conn_exclbind ||
			    connp->conn_exclbind;

			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (exclbind && (not_socket ||
			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
				if (V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) ||
				    V6_OR_V4_INADDR_ANY(*laddr) ||
				    IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6)) {
					break;
				}
				continue;
			}

			/*
			 * Check ipversion to allow IPv4 and IPv6 sockets to
			 * have disjoint port number spaces, if *_EXCLBIND
			 * is not set and only if the application binds to a
			 * specific port. We use the same autoassigned port
			 * number space for IPv4 and IPv6 sockets.
			 */
			if (connp->conn_ipversion != lconnp->conn_ipversion &&
			    bind_to_req_port_only)
				continue;

			/*
			 * Ideally, we should make sure that the source
			 * address, remote address, and remote port in the
			 * four tuple for this tcp-connection is unique.
			 * However, trying to find out the local source
			 * address would require too much code duplication
			 * with IP, since IP needs to have that code
			 * to support userland TCP implementations.
			 */
			if (quick_connect &&
			    (ltcp->tcp_state > TCPS_LISTEN) &&
			    ((connp->conn_fport != lconnp->conn_fport) ||
			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
			    &lconnp->conn_faddr_v6)))
				continue;

			if (!reuseaddr) {
				/*
				 * No socket option SO_REUSEADDR.
				 * If existing port is bound to
				 * a non-wildcard IP address
				 * and the requesting stream is
				 * bound to a distinct
				 * different IP addresses
				 * (non-wildcard, also), keep
				 * going.
				 */
				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
				    !V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) &&
				    !IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6))
					continue;
				if (ltcp->tcp_state >= TCPS_BOUND) {
					/*
					 * This port is being used and
					 * its state is >= TCPS_BOUND,
					 * so we can't bind to it.
					 */
					break;
				}
			} else {
				/*
				 * socket option SO_REUSEADDR is set on the
				 * binding tcp_t.
				 *
				 * If two streams are bound to
				 * same IP address or both addr
				 * and bound source are wildcards
				 * (INADDR_ANY), we want to stop
				 * searching.
				 * We have found a match of IP source
				 * address and source port, which is
				 * refused regardless of the
				 * SO_REUSEADDR setting, so we break.
				 */
				if (IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6) &&
				    (ltcp->tcp_state == TCPS_LISTEN ||
				    ltcp->tcp_state == TCPS_BOUND))
					break;
			}
		}
		if (ltcp != NULL) {
			/* The port number is busy */
			mutex_exit(&tbf->tf_lock);
		} else {
			/*
			 * This port is ours. Insert in fanout and mark as
			 * bound to prevent others from getting the port
			 * number.
			 */
			tcp->tcp_state = TCPS_BOUND;
			DTRACE_TCP6(state__change, void, NULL,
			    ip_xmit_attr_t *, connp->conn_ixa,
			    void, NULL, tcp_t *, tcp, void, NULL,
			    int32_t, TCPS_IDLE);

			connp->conn_lport = htons(port);

			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
			    connp->conn_lport)] == tbf);
			tcp_bind_hash_insert(tbf, tcp, 1);

			mutex_exit(&tbf->tf_lock);

			/*
			 * We don't want tcp_next_port_to_try to "inherit"
			 * a port number supplied by the user in a bind.
			 */
			if (user_specified)
				return (port);

			/*
			 * This is the only place where tcp_next_port_to_try
			 * is updated. After the update, it may or may not
			 * be in the valid range.
			 */
			if (!connp->conn_anon_priv_bind)
				tcps->tcps_next_port_to_try = port + 1;
			return (port);
		}

		/* Busy: advance to the next candidate port and retry. */
		if (connp->conn_anon_priv_bind) {
			port = tcp_get_next_priv_port(tcp);
		} else {
			if (count == 0 && user_specified) {
				/*
				 * We may have to return an anonymous port. So
				 * get one to start with.
				 */
				port =
				    tcp_update_next_port(
				    tcps->tcps_next_port_to_try,
				    tcp, B_TRUE);
				user_specified = B_FALSE;
			} else {
				port = tcp_update_next_port(port + 1, tcp,
				    B_FALSE);
			}
		}
		if (port == 0)
			break;

		/*
		 * Don't let this loop run forever in the case where
		 * all of the anonymous ports are in use.
		 */
	} while (++count < loopmax);
	return (0);
}
942