xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_bind.c (revision bf002425f517afdc1d8b6a9602e59910eeee05aa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/strsubr.h>
31 #include <sys/stropts.h>
32 #include <sys/strlog.h>
33 #define	_SUN_TPI_VERSION 2
34 #include <sys/tihdr.h>
35 #include <sys/suntpi.h>
36 #include <sys/xti_inet.h>
37 #include <sys/policy.h>
38 #include <sys/squeue_impl.h>
39 #include <sys/squeue.h>
40 #include <sys/tsol/tnet.h>
41 
42 #include <rpc/pmap_prot.h>
43 
44 #include <inet/common.h>
45 #include <inet/ip.h>
46 #include <inet/tcp.h>
47 #include <inet/tcp_impl.h>
48 #include <inet/proto_set.h>
49 #include <inet/ipsec_impl.h>
50 
/*
 * Settable in /etc/system.
 * If set to 0, pick ephemeral port sequentially; otherwise randomly.
 * Consulted by tcp_update_next_port() only when its caller asks for a
 * random starting port.
 */
static uint32_t tcp_random_anon_port = 1;

/* Forward declarations of helpers that are private to this file. */
static int	tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
		    cred_t *cr);
static in_port_t	tcp_get_next_priv_port(const tcp_t *);
58 
59 /*
60  * Hash list insertion routine for tcp_t structures. Each hash bucket
61  * contains a list of tcp_t entries, and each entry is bound to a unique
62  * port. If there are multiple tcp_t's that are bound to the same port, then
63  * one of them will be linked into the hash bucket list, and the rest will
64  * hang off of that one entry. For each port, entries bound to a specific IP
65  * address will be inserted before those those bound to INADDR_ANY.
66  */
67 void
68 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
69 {
70 	tcp_t	**tcpp;
71 	tcp_t	*tcpnext;
72 	tcp_t	*tcphash;
73 	conn_t	*connp = tcp->tcp_connp;
74 	conn_t	*connext;
75 
76 	if (tcp->tcp_ptpbhn != NULL) {
77 		ASSERT(!caller_holds_lock);
78 		tcp_bind_hash_remove(tcp);
79 	}
80 	tcpp = &tbf->tf_tcp;
81 	if (!caller_holds_lock) {
82 		mutex_enter(&tbf->tf_lock);
83 	} else {
84 		ASSERT(MUTEX_HELD(&tbf->tf_lock));
85 	}
86 	tcphash = tcpp[0];
87 	tcpnext = NULL;
88 	if (tcphash != NULL) {
89 		/* Look for an entry using the same port */
90 		while ((tcphash = tcpp[0]) != NULL &&
91 		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
92 			tcpp = &(tcphash->tcp_bind_hash);
93 
94 		/* The port was not found, just add to the end */
95 		if (tcphash == NULL)
96 			goto insert;
97 
98 		/*
99 		 * OK, there already exists an entry bound to the
100 		 * same port.
101 		 *
102 		 * If the new tcp bound to the INADDR_ANY address
103 		 * and the first one in the list is not bound to
104 		 * INADDR_ANY we skip all entries until we find the
105 		 * first one bound to INADDR_ANY.
106 		 * This makes sure that applications binding to a
107 		 * specific address get preference over those binding to
108 		 * INADDR_ANY.
109 		 */
110 		tcpnext = tcphash;
111 		connext = tcpnext->tcp_connp;
112 		tcphash = NULL;
113 		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
114 		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
115 			while ((tcpnext = tcpp[0]) != NULL) {
116 				connext = tcpnext->tcp_connp;
117 				if (!V6_OR_V4_INADDR_ANY(
118 				    connext->conn_bound_addr_v6))
119 					tcpp = &(tcpnext->tcp_bind_hash_port);
120 				else
121 					break;
122 			}
123 			if (tcpnext != NULL) {
124 				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
125 				tcphash = tcpnext->tcp_bind_hash;
126 				if (tcphash != NULL) {
127 					tcphash->tcp_ptpbhn =
128 					    &(tcp->tcp_bind_hash);
129 					tcpnext->tcp_bind_hash = NULL;
130 				}
131 			}
132 		} else {
133 			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
134 			tcphash = tcpnext->tcp_bind_hash;
135 			if (tcphash != NULL) {
136 				tcphash->tcp_ptpbhn =
137 				    &(tcp->tcp_bind_hash);
138 				tcpnext->tcp_bind_hash = NULL;
139 			}
140 		}
141 	}
142 insert:
143 	tcp->tcp_bind_hash_port = tcpnext;
144 	tcp->tcp_bind_hash = tcphash;
145 	tcp->tcp_ptpbhn = tcpp;
146 	tcpp[0] = tcp;
147 	if (!caller_holds_lock)
148 		mutex_exit(&tbf->tf_lock);
149 }
150 
151 /*
152  * Hash list removal routine for tcp_t structures.
153  */
154 void
155 tcp_bind_hash_remove(tcp_t *tcp)
156 {
157 	tcp_t	*tcpnext;
158 	kmutex_t *lockp;
159 	tcp_stack_t	*tcps = tcp->tcp_tcps;
160 	conn_t		*connp = tcp->tcp_connp;
161 
162 	if (tcp->tcp_ptpbhn == NULL)
163 		return;
164 
165 	/*
166 	 * Extract the lock pointer in case there are concurrent
167 	 * hash_remove's for this instance.
168 	 */
169 	ASSERT(connp->conn_lport != 0);
170 	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
171 	    connp->conn_lport)].tf_lock;
172 
173 	ASSERT(lockp != NULL);
174 	mutex_enter(lockp);
175 	if (tcp->tcp_ptpbhn) {
176 		tcpnext = tcp->tcp_bind_hash_port;
177 		if (tcpnext != NULL) {
178 			tcp->tcp_bind_hash_port = NULL;
179 			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
180 			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
181 			if (tcpnext->tcp_bind_hash != NULL) {
182 				tcpnext->tcp_bind_hash->tcp_ptpbhn =
183 				    &(tcpnext->tcp_bind_hash);
184 				tcp->tcp_bind_hash = NULL;
185 			}
186 		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
187 			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
188 			tcp->tcp_bind_hash = NULL;
189 		}
190 		*tcp->tcp_ptpbhn = tcpnext;
191 		tcp->tcp_ptpbhn = NULL;
192 	}
193 	mutex_exit(lockp);
194 }
195 
196 /*
197  * Don't let port fall into the privileged range.
198  * Since the extra privileged ports can be arbitrary we also
199  * ensure that we exclude those from consideration.
200  * tcp_g_epriv_ports is not sorted thus we loop over it until
201  * there are no changes.
202  *
203  * Note: No locks are held when inspecting tcp_g_*epriv_ports
204  * but instead the code relies on:
205  * - the fact that the address of the array and its size never changes
206  * - the atomic assignment of the elements of the array
207  *
208  * Returns 0 if there are no more ports available.
209  *
210  * TS note: skip multilevel ports.
211  */
212 in_port_t
213 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
214 {
215 	int i;
216 	boolean_t restart = B_FALSE;
217 	tcp_stack_t *tcps = tcp->tcp_tcps;
218 
219 	if (random && tcp_random_anon_port != 0) {
220 		(void) random_get_pseudo_bytes((uint8_t *)&port,
221 		    sizeof (in_port_t));
222 		/*
223 		 * Unless changed by a sys admin, the smallest anon port
224 		 * is 32768 and the largest anon port is 65535.  It is
225 		 * very likely (50%) for the random port to be smaller
226 		 * than the smallest anon port.  When that happens,
227 		 * add port % (anon port range) to the smallest anon
228 		 * port to get the random port.  It should fall into the
229 		 * valid anon port range.
230 		 */
231 		if (port < tcps->tcps_smallest_anon_port) {
232 			port = tcps->tcps_smallest_anon_port +
233 			    port % (tcps->tcps_largest_anon_port -
234 			    tcps->tcps_smallest_anon_port);
235 		}
236 	}
237 
238 retry:
239 	if (port < tcps->tcps_smallest_anon_port)
240 		port = (in_port_t)tcps->tcps_smallest_anon_port;
241 
242 	if (port > tcps->tcps_largest_anon_port) {
243 		if (restart)
244 			return (0);
245 		restart = B_TRUE;
246 		port = (in_port_t)tcps->tcps_smallest_anon_port;
247 	}
248 
249 	if (port < tcps->tcps_smallest_nonpriv_port)
250 		port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
251 
252 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
253 		if (port == tcps->tcps_g_epriv_ports[i]) {
254 			port++;
255 			/*
256 			 * Make sure whether the port is in the
257 			 * valid range.
258 			 */
259 			goto retry;
260 		}
261 	}
262 	if (is_system_labeled() &&
263 	    (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
264 	    IPPROTO_TCP, B_TRUE)) != 0) {
265 		port = i;
266 		goto retry;
267 	}
268 	return (port);
269 }
270 
271 /*
272  * Return the next anonymous port in the privileged port range for
273  * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
274  * downwards.  This is the same behavior as documented in the userland
275  * library call rresvport(3N).
276  *
277  * TS note: skip multilevel ports.
278  */
279 static in_port_t
280 tcp_get_next_priv_port(const tcp_t *tcp)
281 {
282 	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
283 	in_port_t nextport;
284 	boolean_t restart = B_FALSE;
285 	tcp_stack_t *tcps = tcp->tcp_tcps;
286 retry:
287 	if (next_priv_port < tcps->tcps_min_anonpriv_port ||
288 	    next_priv_port >= IPPORT_RESERVED) {
289 		next_priv_port = IPPORT_RESERVED - 1;
290 		if (restart)
291 			return (0);
292 		restart = B_TRUE;
293 	}
294 	if (is_system_labeled() &&
295 	    (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
296 	    next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
297 		next_priv_port = nextport;
298 		goto retry;
299 	}
300 	return (next_priv_port--);
301 }
302 
303 static int
304 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
305     boolean_t bind_to_req_port_only, cred_t *cr)
306 {
307 	in_port_t	mlp_port;
308 	mlp_type_t 	addrtype, mlptype;
309 	boolean_t	user_specified;
310 	in_port_t	allocated_port;
311 	in_port_t	requested_port = *requested_port_ptr;
312 	conn_t		*connp = tcp->tcp_connp;
313 	zone_t		*zone;
314 	tcp_stack_t	*tcps = tcp->tcp_tcps;
315 	in6_addr_t	v6addr = connp->conn_laddr_v6;
316 
317 	/*
318 	 * XXX It's up to the caller to specify bind_to_req_port_only or not.
319 	 */
320 	ASSERT(cr != NULL);
321 
322 	/*
323 	 * Get a valid port (within the anonymous range and should not
324 	 * be a privileged one) to use if the user has not given a port.
325 	 * If multiple threads are here, they may all start with
326 	 * with the same initial port. But, it should be fine as long as
327 	 * tcp_bindi will ensure that no two threads will be assigned
328 	 * the same port.
329 	 *
330 	 * NOTE: XXX If a privileged process asks for an anonymous port, we
331 	 * still check for ports only in the range > tcp_smallest_non_priv_port,
332 	 * unless TCP_ANONPRIVBIND option is set.
333 	 */
334 	mlptype = mlptSingle;
335 	mlp_port = requested_port;
336 	if (requested_port == 0) {
337 		requested_port = connp->conn_anon_priv_bind ?
338 		    tcp_get_next_priv_port(tcp) :
339 		    tcp_update_next_port(tcps->tcps_next_port_to_try,
340 		    tcp, B_TRUE);
341 		if (requested_port == 0) {
342 			return (-TNOADDR);
343 		}
344 		user_specified = B_FALSE;
345 
346 		/*
347 		 * If the user went through one of the RPC interfaces to create
348 		 * this socket and RPC is MLP in this zone, then give him an
349 		 * anonymous MLP.
350 		 */
351 		if (connp->conn_anon_mlp && is_system_labeled()) {
352 			zone = crgetzone(cr);
353 			addrtype = tsol_mlp_addr_type(
354 			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
355 			    IPV6_VERSION, &v6addr,
356 			    tcps->tcps_netstack->netstack_ip);
357 			if (addrtype == mlptSingle) {
358 				return (-TNOADDR);
359 			}
360 			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
361 			    PMAPPORT, addrtype);
362 			mlp_port = PMAPPORT;
363 		}
364 	} else {
365 		int i;
366 		boolean_t priv = B_FALSE;
367 
368 		/*
369 		 * If the requested_port is in the well-known privileged range,
370 		 * verify that the stream was opened by a privileged user.
371 		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
372 		 * but instead the code relies on:
373 		 * - the fact that the address of the array and its size never
374 		 *   changes
375 		 * - the atomic assignment of the elements of the array
376 		 */
377 		if (requested_port < tcps->tcps_smallest_nonpriv_port) {
378 			priv = B_TRUE;
379 		} else {
380 			for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
381 				if (requested_port ==
382 				    tcps->tcps_g_epriv_ports[i]) {
383 					priv = B_TRUE;
384 					break;
385 				}
386 			}
387 		}
388 		if (priv) {
389 			if (secpolicy_net_privaddr(cr, requested_port,
390 			    IPPROTO_TCP) != 0) {
391 				if (connp->conn_debug) {
392 					(void) strlog(TCP_MOD_ID, 0, 1,
393 					    SL_ERROR|SL_TRACE,
394 					    "tcp_bind: no priv for port %d",
395 					    requested_port);
396 				}
397 				return (-TACCES);
398 			}
399 		}
400 		user_specified = B_TRUE;
401 
402 		connp = tcp->tcp_connp;
403 		if (is_system_labeled()) {
404 			zone = crgetzone(cr);
405 			addrtype = tsol_mlp_addr_type(
406 			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
407 			    IPV6_VERSION, &v6addr,
408 			    tcps->tcps_netstack->netstack_ip);
409 			if (addrtype == mlptSingle) {
410 				return (-TNOADDR);
411 			}
412 			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
413 			    requested_port, addrtype);
414 		}
415 	}
416 
417 	if (mlptype != mlptSingle) {
418 		if (secpolicy_net_bindmlp(cr) != 0) {
419 			if (connp->conn_debug) {
420 				(void) strlog(TCP_MOD_ID, 0, 1,
421 				    SL_ERROR|SL_TRACE,
422 				    "tcp_bind: no priv for multilevel port %d",
423 				    requested_port);
424 			}
425 			return (-TACCES);
426 		}
427 
428 		/*
429 		 * If we're specifically binding a shared IP address and the
430 		 * port is MLP on shared addresses, then check to see if this
431 		 * zone actually owns the MLP.  Reject if not.
432 		 */
433 		if (mlptype == mlptShared && addrtype == mlptShared) {
434 			/*
435 			 * No need to handle exclusive-stack zones since
436 			 * ALL_ZONES only applies to the shared stack.
437 			 */
438 			zoneid_t mlpzone;
439 
440 			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
441 			    htons(mlp_port));
442 			if (connp->conn_zoneid != mlpzone) {
443 				if (connp->conn_debug) {
444 					(void) strlog(TCP_MOD_ID, 0, 1,
445 					    SL_ERROR|SL_TRACE,
446 					    "tcp_bind: attempt to bind port "
447 					    "%d on shared addr in zone %d "
448 					    "(should be %d)",
449 					    mlp_port, connp->conn_zoneid,
450 					    mlpzone);
451 				}
452 				return (-TACCES);
453 			}
454 		}
455 
456 		if (!user_specified) {
457 			int err;
458 			err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
459 			    requested_port, B_TRUE);
460 			if (err != 0) {
461 				if (connp->conn_debug) {
462 					(void) strlog(TCP_MOD_ID, 0, 1,
463 					    SL_ERROR|SL_TRACE,
464 					    "tcp_bind: cannot establish anon "
465 					    "MLP for port %d",
466 					    requested_port);
467 				}
468 				return (err);
469 			}
470 			connp->conn_anon_port = B_TRUE;
471 		}
472 		connp->conn_mlp_type = mlptype;
473 	}
474 
475 	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
476 	    connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
477 	    user_specified);
478 
479 	if (allocated_port == 0) {
480 		connp->conn_mlp_type = mlptSingle;
481 		if (connp->conn_anon_port) {
482 			connp->conn_anon_port = B_FALSE;
483 			(void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
484 			    requested_port, B_FALSE);
485 		}
486 		if (bind_to_req_port_only) {
487 			if (connp->conn_debug) {
488 				(void) strlog(TCP_MOD_ID, 0, 1,
489 				    SL_ERROR|SL_TRACE,
490 				    "tcp_bind: requested addr busy");
491 			}
492 			return (-TADDRBUSY);
493 		} else {
494 			/* If we are out of ports, fail the bind. */
495 			if (connp->conn_debug) {
496 				(void) strlog(TCP_MOD_ID, 0, 1,
497 				    SL_ERROR|SL_TRACE,
498 				    "tcp_bind: out of ports?");
499 			}
500 			return (-TNOADDR);
501 		}
502 	}
503 
504 	/* Pass the allocated port back */
505 	*requested_port_ptr = allocated_port;
506 	return (0);
507 }
508 
509 /*
510  * Check the address and check/pick a local port number.
511  */
512 int
513 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
514     boolean_t bind_to_req_port_only)
515 {
516 	tcp_t	*tcp = connp->conn_tcp;
517 	sin_t	*sin;
518 	sin6_t  *sin6;
519 	in_port_t	requested_port;
520 	ipaddr_t	v4addr;
521 	in6_addr_t	v6addr;
522 	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
523 	zoneid_t	zoneid = IPCL_ZONEID(connp);
524 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
525 	uint_t		scopeid = 0;
526 	int		error = 0;
527 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
528 
529 	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
530 
531 	if (tcp->tcp_state == TCPS_BOUND) {
532 		return (0);
533 	} else if (tcp->tcp_state > TCPS_BOUND) {
534 		if (connp->conn_debug) {
535 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
536 			    "tcp_bind: bad state, %d", tcp->tcp_state);
537 		}
538 		return (-TOUTSTATE);
539 	}
540 
541 	ASSERT(sa != NULL && len != 0);
542 
543 	if (!OK_32PTR((char *)sa)) {
544 		if (connp->conn_debug) {
545 			(void) strlog(TCP_MOD_ID, 0, 1,
546 			    SL_ERROR|SL_TRACE,
547 			    "tcp_bind: bad address parameter, "
548 			    "address %p, len %d",
549 			    (void *)sa, len);
550 		}
551 		return (-TPROTO);
552 	}
553 
554 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
555 	if (error != 0) {
556 		return (error);
557 	}
558 
559 	switch (len) {
560 	case sizeof (sin_t):	/* Complete IPv4 address */
561 		sin = (sin_t *)sa;
562 		requested_port = ntohs(sin->sin_port);
563 		v4addr = sin->sin_addr.s_addr;
564 		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
565 		if (v4addr != INADDR_ANY) {
566 			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
567 			    B_FALSE);
568 		}
569 		break;
570 
571 	case sizeof (sin6_t): /* Complete IPv6 address */
572 		sin6 = (sin6_t *)sa;
573 		v6addr = sin6->sin6_addr;
574 		requested_port = ntohs(sin6->sin6_port);
575 		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
576 			if (connp->conn_ipv6_v6only)
577 				return (EADDRNOTAVAIL);
578 
579 			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
580 			if (v4addr != INADDR_ANY) {
581 				laddr_type = ip_laddr_verify_v4(v4addr,
582 				    zoneid, ipst, B_FALSE);
583 			}
584 		} else {
585 			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
586 				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
587 					scopeid = sin6->sin6_scope_id;
588 				laddr_type = ip_laddr_verify_v6(&v6addr,
589 				    zoneid, ipst, B_FALSE, scopeid);
590 			}
591 		}
592 		break;
593 
594 	default:
595 		if (connp->conn_debug) {
596 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
597 			    "tcp_bind: bad address length, %d", len);
598 		}
599 		return (EAFNOSUPPORT);
600 		/* return (-TBADADDR); */
601 	}
602 
603 	/* Is the local address a valid unicast address? */
604 	if (laddr_type == IPVL_BAD)
605 		return (EADDRNOTAVAIL);
606 
607 	connp->conn_bound_addr_v6 = v6addr;
608 	if (scopeid != 0) {
609 		ixa->ixa_flags |= IXAF_SCOPEID_SET;
610 		ixa->ixa_scopeid = scopeid;
611 		connp->conn_incoming_ifindex = scopeid;
612 	} else {
613 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
614 		connp->conn_incoming_ifindex = connp->conn_bound_if;
615 	}
616 
617 	connp->conn_laddr_v6 = v6addr;
618 	connp->conn_saddr_v6 = v6addr;
619 
620 	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
621 
622 	error = tcp_bind_select_lport(tcp, &requested_port,
623 	    bind_to_req_port_only, cr);
624 	if (error != 0) {
625 		connp->conn_laddr_v6 = ipv6_all_zeros;
626 		connp->conn_saddr_v6 = ipv6_all_zeros;
627 		connp->conn_bound_addr_v6 = ipv6_all_zeros;
628 	}
629 	return (error);
630 }
631 
632 /*
633  * If the "bind_to_req_port_only" parameter is set, if the requested port
634  * number is available, return it, If not return 0
635  *
636  * If "bind_to_req_port_only" parameter is not set and
637  * If the requested port number is available, return it.  If not, return
638  * the first anonymous port we happen across.  If no anonymous ports are
639  * available, return 0. addr is the requested local address, if any.
640  *
641  * In either case, when succeeding update the tcp_t to record the port number
642  * and insert it in the bind hash table.
643  *
644  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
645  * without setting SO_REUSEADDR. This is needed so that they
646  * can be viewed as two independent transport protocols.
647  */
648 in_port_t
649 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
650     int reuseaddr, boolean_t quick_connect,
651     boolean_t bind_to_req_port_only, boolean_t user_specified)
652 {
653 	/* number of times we have run around the loop */
654 	int count = 0;
655 	/* maximum number of times to run around the loop */
656 	int loopmax;
657 	conn_t *connp = tcp->tcp_connp;
658 	tcp_stack_t	*tcps = tcp->tcp_tcps;
659 
660 	/*
661 	 * Lookup for free addresses is done in a loop and "loopmax"
662 	 * influences how long we spin in the loop
663 	 */
664 	if (bind_to_req_port_only) {
665 		/*
666 		 * If the requested port is busy, don't bother to look
667 		 * for a new one. Setting loop maximum count to 1 has
668 		 * that effect.
669 		 */
670 		loopmax = 1;
671 	} else {
672 		/*
673 		 * If the requested port is busy, look for a free one
674 		 * in the anonymous port range.
675 		 * Set loopmax appropriately so that one does not look
676 		 * forever in the case all of the anonymous ports are in use.
677 		 */
678 		if (connp->conn_anon_priv_bind) {
679 			/*
680 			 * loopmax =
681 			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
682 			 */
683 			loopmax = IPPORT_RESERVED -
684 			    tcps->tcps_min_anonpriv_port;
685 		} else {
686 			loopmax = (tcps->tcps_largest_anon_port -
687 			    tcps->tcps_smallest_anon_port + 1);
688 		}
689 	}
690 	do {
691 		uint16_t	lport;
692 		tf_t		*tbf;
693 		tcp_t		*ltcp;
694 		conn_t		*lconnp;
695 
696 		lport = htons(port);
697 
698 		/*
699 		 * Ensure that the tcp_t is not currently in the bind hash.
700 		 * Hold the lock on the hash bucket to ensure that
701 		 * the duplicate check plus the insertion is an atomic
702 		 * operation.
703 		 *
704 		 * This function does an inline lookup on the bind hash list
705 		 * Make sure that we access only members of tcp_t
706 		 * and that we don't look at tcp_tcp, since we are not
707 		 * doing a CONN_INC_REF.
708 		 */
709 		tcp_bind_hash_remove(tcp);
710 		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
711 		mutex_enter(&tbf->tf_lock);
712 		for (ltcp = tbf->tf_tcp; ltcp != NULL;
713 		    ltcp = ltcp->tcp_bind_hash) {
714 			if (lport == ltcp->tcp_connp->conn_lport)
715 				break;
716 		}
717 
718 		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
719 			boolean_t not_socket;
720 			boolean_t exclbind;
721 
722 			lconnp = ltcp->tcp_connp;
723 
724 			/*
725 			 * On a labeled system, we must treat bindings to ports
726 			 * on shared IP addresses by sockets with MAC exemption
727 			 * privilege as being in all zones, as there's
728 			 * otherwise no way to identify the right receiver.
729 			 */
730 			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
731 				continue;
732 
733 			/*
734 			 * If TCP_EXCLBIND is set for either the bound or
735 			 * binding endpoint, the semantics of bind
736 			 * is changed according to the following.
737 			 *
738 			 * spec = specified address (v4 or v6)
739 			 * unspec = unspecified address (v4 or v6)
740 			 * A = specified addresses are different for endpoints
741 			 *
742 			 * bound	bind to		allowed
743 			 * -------------------------------------
744 			 * unspec	unspec		no
745 			 * unspec	spec		no
746 			 * spec		unspec		no
747 			 * spec		spec		yes if A
748 			 *
749 			 * For labeled systems, SO_MAC_EXEMPT behaves the same
750 			 * as TCP_EXCLBIND, except that zoneid is ignored.
751 			 *
752 			 * Note:
753 			 *
754 			 * 1. Because of TLI semantics, an endpoint can go
755 			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
756 			 * TCPS_BOUND, depending on whether it is originally
757 			 * a listener or not.  That is why we need to check
758 			 * for states greater than or equal to TCPS_BOUND
759 			 * here.
760 			 *
761 			 * 2. Ideally, we should only check for state equals
762 			 * to TCPS_LISTEN. And the following check should be
763 			 * added.
764 			 *
765 			 * if (ltcp->tcp_state == TCPS_LISTEN ||
766 			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
767 			 *		...
768 			 * }
769 			 *
770 			 * The semantics will be changed to this.  If the
771 			 * endpoint on the list is in state not equal to
772 			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
773 			 * set, let the bind succeed.
774 			 *
775 			 * Because of (1), we cannot do that for TLI
776 			 * endpoints.  But we can do that for socket endpoints.
777 			 * If in future, we can change this going back
778 			 * semantics, we can use the above check for TLI also.
779 			 */
780 			not_socket = !(TCP_IS_SOCKET(ltcp) &&
781 			    TCP_IS_SOCKET(tcp));
782 			exclbind = lconnp->conn_exclbind ||
783 			    connp->conn_exclbind;
784 
785 			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
786 			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
787 			    (exclbind && (not_socket ||
788 			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
789 				if (V6_OR_V4_INADDR_ANY(
790 				    lconnp->conn_bound_addr_v6) ||
791 				    V6_OR_V4_INADDR_ANY(*laddr) ||
792 				    IN6_ARE_ADDR_EQUAL(laddr,
793 				    &lconnp->conn_bound_addr_v6)) {
794 					break;
795 				}
796 				continue;
797 			}
798 
799 			/*
800 			 * Check ipversion to allow IPv4 and IPv6 sockets to
801 			 * have disjoint port number spaces, if *_EXCLBIND
802 			 * is not set and only if the application binds to a
803 			 * specific port. We use the same autoassigned port
804 			 * number space for IPv4 and IPv6 sockets.
805 			 */
806 			if (connp->conn_ipversion != lconnp->conn_ipversion &&
807 			    bind_to_req_port_only)
808 				continue;
809 
810 			/*
811 			 * Ideally, we should make sure that the source
812 			 * address, remote address, and remote port in the
813 			 * four tuple for this tcp-connection is unique.
814 			 * However, trying to find out the local source
815 			 * address would require too much code duplication
816 			 * with IP, since IP needs needs to have that code
817 			 * to support userland TCP implementations.
818 			 */
819 			if (quick_connect &&
820 			    (ltcp->tcp_state > TCPS_LISTEN) &&
821 			    ((connp->conn_fport != lconnp->conn_fport) ||
822 			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
823 			    &lconnp->conn_faddr_v6)))
824 				continue;
825 
826 			if (!reuseaddr) {
827 				/*
828 				 * No socket option SO_REUSEADDR.
829 				 * If existing port is bound to
830 				 * a non-wildcard IP address
831 				 * and the requesting stream is
832 				 * bound to a distinct
833 				 * different IP addresses
834 				 * (non-wildcard, also), keep
835 				 * going.
836 				 */
837 				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
838 				    !V6_OR_V4_INADDR_ANY(
839 				    lconnp->conn_bound_addr_v6) &&
840 				    !IN6_ARE_ADDR_EQUAL(laddr,
841 				    &lconnp->conn_bound_addr_v6))
842 					continue;
843 				if (ltcp->tcp_state >= TCPS_BOUND) {
844 					/*
845 					 * This port is being used and
846 					 * its state is >= TCPS_BOUND,
847 					 * so we can't bind to it.
848 					 */
849 					break;
850 				}
851 			} else {
852 				/*
853 				 * socket option SO_REUSEADDR is set on the
854 				 * binding tcp_t.
855 				 *
856 				 * If two streams are bound to
857 				 * same IP address or both addr
858 				 * and bound source are wildcards
859 				 * (INADDR_ANY), we want to stop
860 				 * searching.
861 				 * We have found a match of IP source
862 				 * address and source port, which is
863 				 * refused regardless of the
864 				 * SO_REUSEADDR setting, so we break.
865 				 */
866 				if (IN6_ARE_ADDR_EQUAL(laddr,
867 				    &lconnp->conn_bound_addr_v6) &&
868 				    (ltcp->tcp_state == TCPS_LISTEN ||
869 				    ltcp->tcp_state == TCPS_BOUND))
870 					break;
871 			}
872 		}
873 		if (ltcp != NULL) {
874 			/* The port number is busy */
875 			mutex_exit(&tbf->tf_lock);
876 		} else {
877 			/*
878 			 * This port is ours. Insert in fanout and mark as
879 			 * bound to prevent others from getting the port
880 			 * number.
881 			 */
882 			tcp->tcp_state = TCPS_BOUND;
883 			connp->conn_lport = htons(port);
884 
885 			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
886 			    connp->conn_lport)] == tbf);
887 			tcp_bind_hash_insert(tbf, tcp, 1);
888 
889 			mutex_exit(&tbf->tf_lock);
890 
891 			/*
892 			 * We don't want tcp_next_port_to_try to "inherit"
893 			 * a port number supplied by the user in a bind.
894 			 */
895 			if (user_specified)
896 				return (port);
897 
898 			/*
899 			 * This is the only place where tcp_next_port_to_try
900 			 * is updated. After the update, it may or may not
901 			 * be in the valid range.
902 			 */
903 			if (!connp->conn_anon_priv_bind)
904 				tcps->tcps_next_port_to_try = port + 1;
905 			return (port);
906 		}
907 
908 		if (connp->conn_anon_priv_bind) {
909 			port = tcp_get_next_priv_port(tcp);
910 		} else {
911 			if (count == 0 && user_specified) {
912 				/*
913 				 * We may have to return an anonymous port. So
914 				 * get one to start with.
915 				 */
916 				port =
917 				    tcp_update_next_port(
918 				    tcps->tcps_next_port_to_try,
919 				    tcp, B_TRUE);
920 				user_specified = B_FALSE;
921 			} else {
922 				port = tcp_update_next_port(port + 1, tcp,
923 				    B_FALSE);
924 			}
925 		}
926 		if (port == 0)
927 			break;
928 
929 		/*
930 		 * Don't let this loop run forever in the case where
931 		 * all of the anonymous ports are in use.
932 		 */
933 	} while (++count < loopmax);
934 	return (0);
935 }
936