xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_bind.c (revision e86372a01d2d16a5dd4a64e144ed978ba17fe7dd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/stream.h>
30 #include <sys/strsun.h>
31 #include <sys/strsubr.h>
32 #include <sys/stropts.h>
33 #include <sys/strlog.h>
34 #define	_SUN_TPI_VERSION 2
35 #include <sys/tihdr.h>
36 #include <sys/suntpi.h>
37 #include <sys/xti_inet.h>
38 #include <sys/policy.h>
39 #include <sys/squeue_impl.h>
40 #include <sys/squeue.h>
41 #include <sys/tsol/tnet.h>
42 
43 #include <rpc/pmap_prot.h>
44 
45 #include <inet/common.h>
46 #include <inet/ip.h>
47 #include <inet/tcp.h>
48 #include <inet/tcp_impl.h>
49 #include <inet/proto_set.h>
50 #include <inet/ipsec_impl.h>
51 
52 /* Setable in /etc/system */
53 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
54 static uint32_t tcp_random_anon_port = 1;
55 
56 static int	tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
57 		    cred_t *cr);
58 static in_port_t	tcp_get_next_priv_port(const tcp_t *);
59 
60 /*
61  * Hash list insertion routine for tcp_t structures. Each hash bucket
62  * contains a list of tcp_t entries, and each entry is bound to a unique
63  * port. If there are multiple tcp_t's that are bound to the same port, then
64  * one of them will be linked into the hash bucket list, and the rest will
65  * hang off of that one entry. For each port, entries bound to a specific IP
66  * address will be inserted before those those bound to INADDR_ANY.
67  */
68 void
69 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
70 {
71 	tcp_t	**tcpp;
72 	tcp_t	*tcpnext;
73 	tcp_t	*tcphash;
74 	conn_t	*connp = tcp->tcp_connp;
75 	conn_t	*connext;
76 
77 	if (tcp->tcp_ptpbhn != NULL) {
78 		ASSERT(!caller_holds_lock);
79 		tcp_bind_hash_remove(tcp);
80 	}
81 	tcpp = &tbf->tf_tcp;
82 	if (!caller_holds_lock) {
83 		mutex_enter(&tbf->tf_lock);
84 	} else {
85 		ASSERT(MUTEX_HELD(&tbf->tf_lock));
86 	}
87 	tcphash = tcpp[0];
88 	tcpnext = NULL;
89 	if (tcphash != NULL) {
90 		/* Look for an entry using the same port */
91 		while ((tcphash = tcpp[0]) != NULL &&
92 		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
93 			tcpp = &(tcphash->tcp_bind_hash);
94 
95 		/* The port was not found, just add to the end */
96 		if (tcphash == NULL)
97 			goto insert;
98 
99 		/*
100 		 * OK, there already exists an entry bound to the
101 		 * same port.
102 		 *
103 		 * If the new tcp bound to the INADDR_ANY address
104 		 * and the first one in the list is not bound to
105 		 * INADDR_ANY we skip all entries until we find the
106 		 * first one bound to INADDR_ANY.
107 		 * This makes sure that applications binding to a
108 		 * specific address get preference over those binding to
109 		 * INADDR_ANY.
110 		 */
111 		tcpnext = tcphash;
112 		connext = tcpnext->tcp_connp;
113 		tcphash = NULL;
114 		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
115 		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
116 			while ((tcpnext = tcpp[0]) != NULL) {
117 				connext = tcpnext->tcp_connp;
118 				if (!V6_OR_V4_INADDR_ANY(
119 				    connext->conn_bound_addr_v6))
120 					tcpp = &(tcpnext->tcp_bind_hash_port);
121 				else
122 					break;
123 			}
124 			if (tcpnext != NULL) {
125 				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
126 				tcphash = tcpnext->tcp_bind_hash;
127 				if (tcphash != NULL) {
128 					tcphash->tcp_ptpbhn =
129 					    &(tcp->tcp_bind_hash);
130 					tcpnext->tcp_bind_hash = NULL;
131 				}
132 			}
133 		} else {
134 			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
135 			tcphash = tcpnext->tcp_bind_hash;
136 			if (tcphash != NULL) {
137 				tcphash->tcp_ptpbhn =
138 				    &(tcp->tcp_bind_hash);
139 				tcpnext->tcp_bind_hash = NULL;
140 			}
141 		}
142 	}
143 insert:
144 	tcp->tcp_bind_hash_port = tcpnext;
145 	tcp->tcp_bind_hash = tcphash;
146 	tcp->tcp_ptpbhn = tcpp;
147 	tcpp[0] = tcp;
148 	if (!caller_holds_lock)
149 		mutex_exit(&tbf->tf_lock);
150 }
151 
152 /*
153  * Hash list removal routine for tcp_t structures.
154  */
155 void
156 tcp_bind_hash_remove(tcp_t *tcp)
157 {
158 	tcp_t	*tcpnext;
159 	kmutex_t *lockp;
160 	tcp_stack_t	*tcps = tcp->tcp_tcps;
161 	conn_t		*connp = tcp->tcp_connp;
162 
163 	if (tcp->tcp_ptpbhn == NULL)
164 		return;
165 
166 	/*
167 	 * Extract the lock pointer in case there are concurrent
168 	 * hash_remove's for this instance.
169 	 */
170 	ASSERT(connp->conn_lport != 0);
171 	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
172 	    connp->conn_lport)].tf_lock;
173 
174 	ASSERT(lockp != NULL);
175 	mutex_enter(lockp);
176 	if (tcp->tcp_ptpbhn) {
177 		tcpnext = tcp->tcp_bind_hash_port;
178 		if (tcpnext != NULL) {
179 			tcp->tcp_bind_hash_port = NULL;
180 			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
181 			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
182 			if (tcpnext->tcp_bind_hash != NULL) {
183 				tcpnext->tcp_bind_hash->tcp_ptpbhn =
184 				    &(tcpnext->tcp_bind_hash);
185 				tcp->tcp_bind_hash = NULL;
186 			}
187 		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
188 			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
189 			tcp->tcp_bind_hash = NULL;
190 		}
191 		*tcp->tcp_ptpbhn = tcpnext;
192 		tcp->tcp_ptpbhn = NULL;
193 	}
194 	mutex_exit(lockp);
195 }
196 
197 /*
198  * Don't let port fall into the privileged range.
199  * Since the extra privileged ports can be arbitrary we also
200  * ensure that we exclude those from consideration.
201  * tcp_g_epriv_ports is not sorted thus we loop over it until
202  * there are no changes.
203  *
204  * Note: No locks are held when inspecting tcp_g_*epriv_ports
205  * but instead the code relies on:
206  * - the fact that the address of the array and its size never changes
207  * - the atomic assignment of the elements of the array
208  *
209  * Returns 0 if there are no more ports available.
210  *
211  * TS note: skip multilevel ports.
212  */
213 in_port_t
214 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
215 {
216 	int i, bump;
217 	boolean_t restart = B_FALSE;
218 	tcp_stack_t *tcps = tcp->tcp_tcps;
219 
220 	if (random && tcp_random_anon_port != 0) {
221 		(void) random_get_pseudo_bytes((uint8_t *)&port,
222 		    sizeof (in_port_t));
223 		/*
224 		 * Unless changed by a sys admin, the smallest anon port
225 		 * is 32768 and the largest anon port is 65535.  It is
226 		 * very likely (50%) for the random port to be smaller
227 		 * than the smallest anon port.  When that happens,
228 		 * add port % (anon port range) to the smallest anon
229 		 * port to get the random port.  It should fall into the
230 		 * valid anon port range.
231 		 */
232 		if ((port < tcps->tcps_smallest_anon_port) ||
233 		    (port > tcps->tcps_largest_anon_port)) {
234 			if (tcps->tcps_smallest_anon_port ==
235 			    tcps->tcps_largest_anon_port) {
236 				bump = 0;
237 			} else {
238 				bump = port % (tcps->tcps_largest_anon_port -
239 				    tcps->tcps_smallest_anon_port);
240 			}
241 			port = tcps->tcps_smallest_anon_port + bump;
242 		}
243 	}
244 
245 retry:
246 	if (port < tcps->tcps_smallest_anon_port)
247 		port = (in_port_t)tcps->tcps_smallest_anon_port;
248 
249 	if (port > tcps->tcps_largest_anon_port) {
250 		if (restart)
251 			return (0);
252 		restart = B_TRUE;
253 		port = (in_port_t)tcps->tcps_smallest_anon_port;
254 	}
255 
256 	if (port < tcps->tcps_smallest_nonpriv_port)
257 		port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
258 
259 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
260 		if (port == tcps->tcps_g_epriv_ports[i]) {
261 			port++;
262 			/*
263 			 * Make sure whether the port is in the
264 			 * valid range.
265 			 */
266 			goto retry;
267 		}
268 	}
269 	if (is_system_labeled() &&
270 	    (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
271 	    IPPROTO_TCP, B_TRUE)) != 0) {
272 		port = i;
273 		goto retry;
274 	}
275 	return (port);
276 }
277 
278 /*
279  * Return the next anonymous port in the privileged port range for
280  * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
281  * downwards.  This is the same behavior as documented in the userland
282  * library call rresvport(3N).
283  *
284  * TS note: skip multilevel ports.
285  */
286 static in_port_t
287 tcp_get_next_priv_port(const tcp_t *tcp)
288 {
289 	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
290 	in_port_t nextport;
291 	boolean_t restart = B_FALSE;
292 	tcp_stack_t *tcps = tcp->tcp_tcps;
293 retry:
294 	if (next_priv_port < tcps->tcps_min_anonpriv_port ||
295 	    next_priv_port >= IPPORT_RESERVED) {
296 		next_priv_port = IPPORT_RESERVED - 1;
297 		if (restart)
298 			return (0);
299 		restart = B_TRUE;
300 	}
301 	if (is_system_labeled() &&
302 	    (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
303 	    next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
304 		next_priv_port = nextport;
305 		goto retry;
306 	}
307 	return (next_priv_port--);
308 }
309 
310 static int
311 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
312     boolean_t bind_to_req_port_only, cred_t *cr)
313 {
314 	in_port_t	mlp_port;
315 	mlp_type_t	addrtype, mlptype;
316 	boolean_t	user_specified;
317 	in_port_t	allocated_port;
318 	in_port_t	requested_port = *requested_port_ptr;
319 	conn_t		*connp = tcp->tcp_connp;
320 	zone_t		*zone;
321 	tcp_stack_t	*tcps = tcp->tcp_tcps;
322 	in6_addr_t	v6addr = connp->conn_laddr_v6;
323 
324 	zone = NULL;
325 	/*
326 	 * XXX It's up to the caller to specify bind_to_req_port_only or not.
327 	 */
328 	ASSERT(cr != NULL);
329 
330 	/*
331 	 * Get a valid port (within the anonymous range and should not
332 	 * be a privileged one) to use if the user has not given a port.
333 	 * If multiple threads are here, they may all start with
334 	 * with the same initial port. But, it should be fine as long as
335 	 * tcp_bindi will ensure that no two threads will be assigned
336 	 * the same port.
337 	 *
338 	 * NOTE: XXX If a privileged process asks for an anonymous port, we
339 	 * still check for ports only in the range > tcp_smallest_non_priv_port,
340 	 * unless TCP_ANONPRIVBIND option is set.
341 	 */
342 	mlptype = mlptSingle;
343 	mlp_port = requested_port;
344 	if (requested_port == 0) {
345 		requested_port = connp->conn_anon_priv_bind ?
346 		    tcp_get_next_priv_port(tcp) :
347 		    tcp_update_next_port(tcps->tcps_next_port_to_try,
348 		    tcp, B_TRUE);
349 		if (requested_port == 0) {
350 			return (-TNOADDR);
351 		}
352 		user_specified = B_FALSE;
353 
354 		/*
355 		 * If the user went through one of the RPC interfaces to create
356 		 * this socket and RPC is MLP in this zone, then give them an
357 		 * anonymous MLP.
358 		 */
359 		if (connp->conn_anon_mlp && is_system_labeled()) {
360 			zone = crgetzone(cr);
361 			addrtype = tsol_mlp_addr_type(
362 			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
363 			    IPV6_VERSION, &v6addr,
364 			    tcps->tcps_netstack->netstack_ip);
365 			if (addrtype == mlptSingle) {
366 				return (-TNOADDR);
367 			}
368 			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
369 			    PMAPPORT, addrtype);
370 			mlp_port = PMAPPORT;
371 		}
372 	} else {
373 		int i;
374 		boolean_t priv = B_FALSE;
375 
376 		/*
377 		 * If the requested_port is in the well-known privileged range,
378 		 * verify that the stream was opened by a privileged user.
379 		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
380 		 * but instead the code relies on:
381 		 * - the fact that the address of the array and its size never
382 		 *   changes
383 		 * - the atomic assignment of the elements of the array
384 		 */
385 		if (requested_port < tcps->tcps_smallest_nonpriv_port) {
386 			priv = B_TRUE;
387 		} else {
388 			for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
389 				if (requested_port ==
390 				    tcps->tcps_g_epriv_ports[i]) {
391 					priv = B_TRUE;
392 					break;
393 				}
394 			}
395 		}
396 		if (priv) {
397 			if (secpolicy_net_privaddr(cr, requested_port,
398 			    IPPROTO_TCP) != 0) {
399 				if (connp->conn_debug) {
400 					(void) strlog(TCP_MOD_ID, 0, 1,
401 					    SL_ERROR|SL_TRACE,
402 					    "tcp_bind: no priv for port %d",
403 					    requested_port);
404 				}
405 				return (-TACCES);
406 			}
407 		}
408 		user_specified = B_TRUE;
409 
410 		connp = tcp->tcp_connp;
411 		if (is_system_labeled()) {
412 			zone = crgetzone(cr);
413 			addrtype = tsol_mlp_addr_type(
414 			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
415 			    IPV6_VERSION, &v6addr,
416 			    tcps->tcps_netstack->netstack_ip);
417 			if (addrtype == mlptSingle) {
418 				return (-TNOADDR);
419 			}
420 			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
421 			    requested_port, addrtype);
422 		}
423 	}
424 
425 	if (mlptype != mlptSingle) {
426 		if (secpolicy_net_bindmlp(cr) != 0) {
427 			if (connp->conn_debug) {
428 				(void) strlog(TCP_MOD_ID, 0, 1,
429 				    SL_ERROR|SL_TRACE,
430 				    "tcp_bind: no priv for multilevel port %d",
431 				    requested_port);
432 			}
433 			return (-TACCES);
434 		}
435 
436 		/*
437 		 * If we're specifically binding a shared IP address and the
438 		 * port is MLP on shared addresses, then check to see if this
439 		 * zone actually owns the MLP.  Reject if not.
440 		 */
441 		if (mlptype == mlptShared && addrtype == mlptShared) {
442 			/*
443 			 * No need to handle exclusive-stack zones since
444 			 * ALL_ZONES only applies to the shared stack.
445 			 */
446 			zoneid_t mlpzone;
447 
448 			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
449 			    htons(mlp_port));
450 			if (connp->conn_zoneid != mlpzone) {
451 				if (connp->conn_debug) {
452 					(void) strlog(TCP_MOD_ID, 0, 1,
453 					    SL_ERROR|SL_TRACE,
454 					    "tcp_bind: attempt to bind port "
455 					    "%d on shared addr in zone %d "
456 					    "(should be %d)",
457 					    mlp_port, connp->conn_zoneid,
458 					    mlpzone);
459 				}
460 				return (-TACCES);
461 			}
462 		}
463 
464 		if (!user_specified) {
465 			int err;
466 			err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
467 			    requested_port, B_TRUE);
468 			if (err != 0) {
469 				if (connp->conn_debug) {
470 					(void) strlog(TCP_MOD_ID, 0, 1,
471 					    SL_ERROR|SL_TRACE,
472 					    "tcp_bind: cannot establish anon "
473 					    "MLP for port %d",
474 					    requested_port);
475 				}
476 				return (err);
477 			}
478 			connp->conn_anon_port = B_TRUE;
479 		}
480 		connp->conn_mlp_type = mlptype;
481 	}
482 
483 	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
484 	    connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
485 	    user_specified);
486 
487 	if (allocated_port == 0) {
488 		connp->conn_mlp_type = mlptSingle;
489 		if (connp->conn_anon_port) {
490 			connp->conn_anon_port = B_FALSE;
491 			(void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
492 			    requested_port, B_FALSE);
493 		}
494 		if (bind_to_req_port_only) {
495 			if (connp->conn_debug) {
496 				(void) strlog(TCP_MOD_ID, 0, 1,
497 				    SL_ERROR|SL_TRACE,
498 				    "tcp_bind: requested addr busy");
499 			}
500 			return (-TADDRBUSY);
501 		} else {
502 			/* If we are out of ports, fail the bind. */
503 			if (connp->conn_debug) {
504 				(void) strlog(TCP_MOD_ID, 0, 1,
505 				    SL_ERROR|SL_TRACE,
506 				    "tcp_bind: out of ports?");
507 			}
508 			return (-TNOADDR);
509 		}
510 	}
511 
512 	/* Pass the allocated port back */
513 	*requested_port_ptr = allocated_port;
514 	return (0);
515 }
516 
517 /*
518  * Check the address and check/pick a local port number.
519  */
520 int
521 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
522     boolean_t bind_to_req_port_only)
523 {
524 	tcp_t	*tcp = connp->conn_tcp;
525 	sin_t	*sin;
526 	sin6_t  *sin6;
527 	in_port_t	requested_port;
528 	ipaddr_t	v4addr;
529 	in6_addr_t	v6addr;
530 	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
531 	zoneid_t	zoneid = IPCL_ZONEID(connp);
532 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
533 	uint_t		scopeid = 0;
534 	int		error = 0;
535 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
536 
537 	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
538 
539 	if (tcp->tcp_state == TCPS_BOUND) {
540 		return (0);
541 	} else if (tcp->tcp_state > TCPS_BOUND) {
542 		if (connp->conn_debug) {
543 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
544 			    "tcp_bind: bad state, %d", tcp->tcp_state);
545 		}
546 		return (-TOUTSTATE);
547 	}
548 
549 	ASSERT(sa != NULL && len != 0);
550 
551 	if (!OK_32PTR((char *)sa)) {
552 		if (connp->conn_debug) {
553 			(void) strlog(TCP_MOD_ID, 0, 1,
554 			    SL_ERROR|SL_TRACE,
555 			    "tcp_bind: bad address parameter, "
556 			    "address %p, len %d",
557 			    (void *)sa, len);
558 		}
559 		return (-TPROTO);
560 	}
561 
562 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
563 	if (error != 0) {
564 		return (error);
565 	}
566 
567 	switch (len) {
568 	case sizeof (sin_t):	/* Complete IPv4 address */
569 		sin = (sin_t *)sa;
570 		requested_port = ntohs(sin->sin_port);
571 		v4addr = sin->sin_addr.s_addr;
572 		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
573 		if (v4addr != INADDR_ANY) {
574 			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
575 			    B_FALSE);
576 		}
577 		break;
578 
579 	case sizeof (sin6_t): /* Complete IPv6 address */
580 		sin6 = (sin6_t *)sa;
581 		v6addr = sin6->sin6_addr;
582 		requested_port = ntohs(sin6->sin6_port);
583 		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
584 			if (connp->conn_ipv6_v6only)
585 				return (EADDRNOTAVAIL);
586 
587 			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
588 			if (v4addr != INADDR_ANY) {
589 				laddr_type = ip_laddr_verify_v4(v4addr,
590 				    zoneid, ipst, B_FALSE);
591 			}
592 		} else {
593 			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
594 				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
595 					scopeid = sin6->sin6_scope_id;
596 				laddr_type = ip_laddr_verify_v6(&v6addr,
597 				    zoneid, ipst, B_FALSE, scopeid);
598 			}
599 		}
600 		break;
601 
602 	default:
603 		if (connp->conn_debug) {
604 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
605 			    "tcp_bind: bad address length, %d", len);
606 		}
607 		return (EAFNOSUPPORT);
608 		/* return (-TBADADDR); */
609 	}
610 
611 	/* Is the local address a valid unicast address? */
612 	if (laddr_type == IPVL_BAD)
613 		return (EADDRNOTAVAIL);
614 
615 	connp->conn_bound_addr_v6 = v6addr;
616 	if (scopeid != 0) {
617 		ixa->ixa_flags |= IXAF_SCOPEID_SET;
618 		ixa->ixa_scopeid = scopeid;
619 		connp->conn_incoming_ifindex = scopeid;
620 	} else {
621 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
622 		connp->conn_incoming_ifindex = connp->conn_bound_if;
623 	}
624 
625 	connp->conn_laddr_v6 = v6addr;
626 	connp->conn_saddr_v6 = v6addr;
627 
628 	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
629 
630 	error = tcp_bind_select_lport(tcp, &requested_port,
631 	    bind_to_req_port_only, cr);
632 	if (error != 0) {
633 		connp->conn_laddr_v6 = ipv6_all_zeros;
634 		connp->conn_saddr_v6 = ipv6_all_zeros;
635 		connp->conn_bound_addr_v6 = ipv6_all_zeros;
636 	}
637 	return (error);
638 }
639 
640 /*
641  * If the "bind_to_req_port_only" parameter is set, if the requested port
642  * number is available, return it, If not return 0
643  *
644  * If "bind_to_req_port_only" parameter is not set and
645  * If the requested port number is available, return it.  If not, return
646  * the first anonymous port we happen across.  If no anonymous ports are
647  * available, return 0. addr is the requested local address, if any.
648  *
649  * In either case, when succeeding update the tcp_t to record the port number
650  * and insert it in the bind hash table.
651  *
652  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
653  * without setting SO_REUSEADDR. This is needed so that they
654  * can be viewed as two independent transport protocols.
655  */
656 in_port_t
657 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
658     int reuseaddr, boolean_t quick_connect,
659     boolean_t bind_to_req_port_only, boolean_t user_specified)
660 {
661 	/* number of times we have run around the loop */
662 	int count = 0;
663 	/* maximum number of times to run around the loop */
664 	int loopmax;
665 	conn_t *connp = tcp->tcp_connp;
666 	tcp_stack_t	*tcps = tcp->tcp_tcps;
667 
668 	/*
669 	 * Lookup for free addresses is done in a loop and "loopmax"
670 	 * influences how long we spin in the loop
671 	 */
672 	if (bind_to_req_port_only) {
673 		/*
674 		 * If the requested port is busy, don't bother to look
675 		 * for a new one. Setting loop maximum count to 1 has
676 		 * that effect.
677 		 */
678 		loopmax = 1;
679 	} else {
680 		/*
681 		 * If the requested port is busy, look for a free one
682 		 * in the anonymous port range.
683 		 * Set loopmax appropriately so that one does not look
684 		 * forever in the case all of the anonymous ports are in use.
685 		 */
686 		if (connp->conn_anon_priv_bind) {
687 			/*
688 			 * loopmax =
689 			 *	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
690 			 */
691 			loopmax = IPPORT_RESERVED -
692 			    tcps->tcps_min_anonpriv_port;
693 		} else {
694 			loopmax = (tcps->tcps_largest_anon_port -
695 			    tcps->tcps_smallest_anon_port + 1);
696 		}
697 	}
698 	do {
699 		uint16_t	lport;
700 		tf_t		*tbf;
701 		tcp_t		*ltcp;
702 		conn_t		*lconnp;
703 
704 		lport = htons(port);
705 
706 		/*
707 		 * Ensure that the tcp_t is not currently in the bind hash.
708 		 * Hold the lock on the hash bucket to ensure that
709 		 * the duplicate check plus the insertion is an atomic
710 		 * operation.
711 		 *
712 		 * This function does an inline lookup on the bind hash list
713 		 * Make sure that we access only members of tcp_t
714 		 * and that we don't look at tcp_tcp, since we are not
715 		 * doing a CONN_INC_REF.
716 		 */
717 		tcp_bind_hash_remove(tcp);
718 		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
719 		mutex_enter(&tbf->tf_lock);
720 		for (ltcp = tbf->tf_tcp; ltcp != NULL;
721 		    ltcp = ltcp->tcp_bind_hash) {
722 			if (lport == ltcp->tcp_connp->conn_lport)
723 				break;
724 		}
725 
726 		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
727 			boolean_t not_socket;
728 			boolean_t exclbind;
729 
730 			lconnp = ltcp->tcp_connp;
731 
732 			/*
733 			 * On a labeled system, we must treat bindings to ports
734 			 * on shared IP addresses by sockets with MAC exemption
735 			 * privilege as being in all zones, as there's
736 			 * otherwise no way to identify the right receiver.
737 			 */
738 			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
739 				continue;
740 
741 			/*
742 			 * If TCP_EXCLBIND is set for either the bound or
743 			 * binding endpoint, the semantics of bind
744 			 * is changed according to the following.
745 			 *
746 			 * spec = specified address (v4 or v6)
747 			 * unspec = unspecified address (v4 or v6)
748 			 * A = specified addresses are different for endpoints
749 			 *
750 			 * bound	bind to		allowed
751 			 * -------------------------------------
752 			 * unspec	unspec		no
753 			 * unspec	spec		no
754 			 * spec		unspec		no
755 			 * spec		spec		yes if A
756 			 *
757 			 * For labeled systems, SO_MAC_EXEMPT behaves the same
758 			 * as TCP_EXCLBIND, except that zoneid is ignored.
759 			 *
760 			 * Note:
761 			 *
762 			 * 1. Because of TLI semantics, an endpoint can go
763 			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
764 			 * TCPS_BOUND, depending on whether it is originally
765 			 * a listener or not.  That is why we need to check
766 			 * for states greater than or equal to TCPS_BOUND
767 			 * here.
768 			 *
769 			 * 2. Ideally, we should only check for state equals
770 			 * to TCPS_LISTEN. And the following check should be
771 			 * added.
772 			 *
773 			 * if (ltcp->tcp_state == TCPS_LISTEN ||
774 			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
775 			 *		...
776 			 * }
777 			 *
778 			 * The semantics will be changed to this.  If the
779 			 * endpoint on the list is in state not equal to
780 			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
781 			 * set, let the bind succeed.
782 			 *
783 			 * Because of (1), we cannot do that for TLI
784 			 * endpoints.  But we can do that for socket endpoints.
785 			 * If in future, we can change this going back
786 			 * semantics, we can use the above check for TLI also.
787 			 */
788 			not_socket = !(TCP_IS_SOCKET(ltcp) &&
789 			    TCP_IS_SOCKET(tcp));
790 			exclbind = lconnp->conn_exclbind ||
791 			    connp->conn_exclbind;
792 
793 			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
794 			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
795 			    (exclbind && (not_socket ||
796 			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
797 				if (V6_OR_V4_INADDR_ANY(
798 				    lconnp->conn_bound_addr_v6) ||
799 				    V6_OR_V4_INADDR_ANY(*laddr) ||
800 				    IN6_ARE_ADDR_EQUAL(laddr,
801 				    &lconnp->conn_bound_addr_v6)) {
802 					break;
803 				}
804 				continue;
805 			}
806 
807 			/*
808 			 * Check ipversion to allow IPv4 and IPv6 sockets to
809 			 * have disjoint port number spaces, if *_EXCLBIND
810 			 * is not set and only if the application binds to a
811 			 * specific port. We use the same autoassigned port
812 			 * number space for IPv4 and IPv6 sockets.
813 			 */
814 			if (connp->conn_ipversion != lconnp->conn_ipversion &&
815 			    bind_to_req_port_only)
816 				continue;
817 
818 			/*
819 			 * Ideally, we should make sure that the source
820 			 * address, remote address, and remote port in the
821 			 * four tuple for this tcp-connection is unique.
822 			 * However, trying to find out the local source
823 			 * address would require too much code duplication
824 			 * with IP, since IP needs needs to have that code
825 			 * to support userland TCP implementations.
826 			 */
827 			if (quick_connect &&
828 			    (ltcp->tcp_state > TCPS_LISTEN) &&
829 			    ((connp->conn_fport != lconnp->conn_fport) ||
830 			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
831 			    &lconnp->conn_faddr_v6)))
832 				continue;
833 
834 			if (!reuseaddr) {
835 				/*
836 				 * No socket option SO_REUSEADDR.
837 				 * If existing port is bound to
838 				 * a non-wildcard IP address
839 				 * and the requesting stream is
840 				 * bound to a distinct
841 				 * different IP addresses
842 				 * (non-wildcard, also), keep
843 				 * going.
844 				 */
845 				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
846 				    !V6_OR_V4_INADDR_ANY(
847 				    lconnp->conn_bound_addr_v6) &&
848 				    !IN6_ARE_ADDR_EQUAL(laddr,
849 				    &lconnp->conn_bound_addr_v6))
850 					continue;
851 				if (ltcp->tcp_state >= TCPS_BOUND) {
852 					/*
853 					 * This port is being used and
854 					 * its state is >= TCPS_BOUND,
855 					 * so we can't bind to it.
856 					 */
857 					break;
858 				}
859 			} else {
860 				/*
861 				 * socket option SO_REUSEADDR is set on the
862 				 * binding tcp_t.
863 				 *
864 				 * If two streams are bound to
865 				 * same IP address or both addr
866 				 * and bound source are wildcards
867 				 * (INADDR_ANY), we want to stop
868 				 * searching.
869 				 * We have found a match of IP source
870 				 * address and source port, which is
871 				 * refused regardless of the
872 				 * SO_REUSEADDR setting, so we break.
873 				 */
874 				if (IN6_ARE_ADDR_EQUAL(laddr,
875 				    &lconnp->conn_bound_addr_v6) &&
876 				    (ltcp->tcp_state == TCPS_LISTEN ||
877 				    ltcp->tcp_state == TCPS_BOUND))
878 					break;
879 			}
880 		}
881 		if (ltcp != NULL) {
882 			/* The port number is busy */
883 			mutex_exit(&tbf->tf_lock);
884 		} else {
885 			/*
886 			 * This port is ours. Insert in fanout and mark as
887 			 * bound to prevent others from getting the port
888 			 * number.
889 			 */
890 			tcp->tcp_state = TCPS_BOUND;
891 			DTRACE_TCP6(state__change, void, NULL,
892 			    ip_xmit_attr_t *, connp->conn_ixa,
893 			    void, NULL, tcp_t *, tcp, void, NULL,
894 			    int32_t, TCPS_IDLE);
895 
896 			connp->conn_lport = htons(port);
897 
898 			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
899 			    connp->conn_lport)] == tbf);
900 			tcp_bind_hash_insert(tbf, tcp, 1);
901 
902 			mutex_exit(&tbf->tf_lock);
903 
904 			/*
905 			 * We don't want tcp_next_port_to_try to "inherit"
906 			 * a port number supplied by the user in a bind.
907 			 */
908 			if (user_specified)
909 				return (port);
910 
911 			/*
912 			 * This is the only place where tcp_next_port_to_try
913 			 * is updated. After the update, it may or may not
914 			 * be in the valid range.
915 			 */
916 			if (!connp->conn_anon_priv_bind)
917 				tcps->tcps_next_port_to_try = port + 1;
918 			return (port);
919 		}
920 
921 		if (connp->conn_anon_priv_bind) {
922 			port = tcp_get_next_priv_port(tcp);
923 		} else {
924 			if (count == 0 && user_specified) {
925 				/*
926 				 * We may have to return an anonymous port. So
927 				 * get one to start with.
928 				 */
929 				port =
930 				    tcp_update_next_port(
931 				    tcps->tcps_next_port_to_try,
932 				    tcp, B_TRUE);
933 				user_specified = B_FALSE;
934 			} else {
935 				port = tcp_update_next_port(port + 1, tcp,
936 				    B_FALSE);
937 			}
938 		}
939 		if (port == 0)
940 			break;
941 
942 		/*
943 		 * Don't let this loop run forever in the case where
944 		 * all of the anonymous ports are in use.
945 		 */
946 	} while (++count < loopmax);
947 	return (0);
948 }
949