xref: /illumos-gate/usr/src/uts/common/inet/tcp/tcp_bind.c (revision 64130b0be265e6f79e86a9f3c515fb40680f25b1)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright 2024 Bill Sommerfeld <sommerfeld@hamachi.org>
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/policy.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <sys/tsol/tnet.h>

#include <rpc/pmap_prot.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/proto_set.h>
#include <inet/ipsec_impl.h>

/* Settable in /etc/system */
/* If set to 0, pick ephemeral ports sequentially; otherwise randomly. */
static uint32_t tcp_random_anon_port = 1;

static int	tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
		    cred_t *cr);
static in_port_t	tcp_get_next_priv_port(const tcp_t *);

/*
 * Hash list insertion routine for tcp_t structures. Each hash bucket
 * contains a list of tcp_t entries, and each entry is bound to a unique
 * port. If there are multiple tcp_t's that are bound to the same port, then
 * one of them will be linked into the hash bucket list, and the rest will
 * hang off of that one entry. For each port, entries bound to a specific IP
 * address will be inserted before those bound to INADDR_ANY.
 */
void
tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
{
	tcp_t	**tcpp;
	tcp_t	*tcpnext;
	tcp_t	*tcphash;
	conn_t	*connp = tcp->tcp_connp;
	conn_t	*connext;

	if (tcp->tcp_ptpbhn != NULL) {
		ASSERT(!caller_holds_lock);
		tcp_bind_hash_remove(tcp);
	}
	tcpp = &tbf->tf_tcp;
	if (!caller_holds_lock) {
		mutex_enter(&tbf->tf_lock);
	} else {
		ASSERT(MUTEX_HELD(&tbf->tf_lock));
	}
	tcphash = tcpp[0];
	tcpnext = NULL;
	if (tcphash != NULL) {
		/* Look for an entry using the same port */
		while ((tcphash = tcpp[0]) != NULL &&
		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
			tcpp = &(tcphash->tcp_bind_hash);

		/* The port was not found, just add to the end */
		if (tcphash == NULL)
			goto insert;

		/*
		 * OK, there already exists an entry bound to the
		 * same port.
		 *
		 * If the new tcp is bound to INADDR_ANY and the first
		 * one in the list is not bound to INADDR_ANY, we skip
		 * all entries until we find the first one bound to
		 * INADDR_ANY.
		 * This makes sure that applications binding to a
		 * specific address get preference over those binding to
		 * INADDR_ANY.
		 */
		tcpnext = tcphash;
		connext = tcpnext->tcp_connp;
		tcphash = NULL;
		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
			while ((tcpnext = tcpp[0]) != NULL) {
				connext = tcpnext->tcp_connp;
				if (!V6_OR_V4_INADDR_ANY(
				    connext->conn_bound_addr_v6))
					tcpp = &(tcpnext->tcp_bind_hash_port);
				else
					break;
			}
			if (tcpnext != NULL) {
				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
				tcphash = tcpnext->tcp_bind_hash;
				if (tcphash != NULL) {
					tcphash->tcp_ptpbhn =
					    &(tcp->tcp_bind_hash);
					tcpnext->tcp_bind_hash = NULL;
				}
			}
		} else {
			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
			tcphash = tcpnext->tcp_bind_hash;
			if (tcphash != NULL) {
				tcphash->tcp_ptpbhn =
				    &(tcp->tcp_bind_hash);
				tcpnext->tcp_bind_hash = NULL;
			}
		}
	}
insert:
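	/*
	 * Splice the new entry in: tcp_bind_hash_port chains it to any
	 * following entries bound to the same port, tcp_bind_hash chains
	 * it to the next per-port group when this entry heads its group,
	 * and tcp_ptpbhn points back at the pointer that references it.
	 */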
	tcp->tcp_bind_hash_port = tcpnext;
	tcp->tcp_bind_hash = tcphash;
	tcp->tcp_ptpbhn = tcpp;
	tcpp[0] = tcp;
	if (!caller_holds_lock)
		mutex_exit(&tbf->tf_lock);
}

/*
 * Hash list removal routine for tcp_t structures.
 */
void
tcp_bind_hash_remove(tcp_t *tcp)
{
	tcp_t	*tcpnext;
	kmutex_t *lockp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;

	if (tcp->tcp_ptpbhn == NULL)
		return;

	/*
	 * Extract the lock pointer in case there are concurrent
	 * hash_remove's for this instance.
	 */
	ASSERT(connp->conn_lport != 0);
	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
	    connp->conn_lport)].tf_lock;

	ASSERT(lockp != NULL);
	mutex_enter(lockp);
	if (tcp->tcp_ptpbhn) {
		tcpnext = tcp->tcp_bind_hash_port;
		if (tcpnext != NULL) {
			tcp->tcp_bind_hash_port = NULL;
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
			if (tcpnext->tcp_bind_hash != NULL) {
				tcpnext->tcp_bind_hash->tcp_ptpbhn =
				    &(tcpnext->tcp_bind_hash);
				tcp->tcp_bind_hash = NULL;
			}
		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcp->tcp_bind_hash = NULL;
		}
		*tcp->tcp_ptpbhn = tcpnext;
		tcp->tcp_ptpbhn = NULL;
	}
	mutex_exit(lockp);
}

/*
 * Don't let the port fall into the privileged range.
 * Since the extra privileged ports can be arbitrary, we also
 * ensure that we exclude those from consideration.
 * tcp_g_epriv_ports is not sorted, thus we loop over it until
 * there are no changes.
 *
 * Note: No locks are held when inspecting tcp_g_*epriv_ports
 * but instead the code relies on:
 * - the fact that the address of the array and its size never changes
 * - the atomic assignment of the elements of the array
 *
 * Returns 0 if there are no more ports available.
 *
 * TS note: skip multilevel ports.
 */
in_port_t
tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
{
	int i, bump;
	boolean_t restart = B_FALSE;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	if (random && tcp_random_anon_port != 0) {
		(void) random_get_pseudo_bytes((uint8_t *)&port,
		    sizeof (in_port_t));
		/*
		 * Unless changed by a sys admin, the smallest anon port
		 * is 32768 and the largest anon port is 65535.  It is
		 * very likely (50%) for the random port to be smaller
		 * than the smallest anon port.  When that happens,
		 * add port % (anon port range) to the smallest anon
		 * port to get the random port.  It should fall into the
		 * valid anon port range.
		 */
		if ((port < tcps->tcps_smallest_anon_port) ||
		    (port > tcps->tcps_largest_anon_port)) {
			if (tcps->tcps_smallest_anon_port ==
			    tcps->tcps_largest_anon_port) {
				bump = 0;
			} else {
				bump = port % (tcps->tcps_largest_anon_port -
				    tcps->tcps_smallest_anon_port);
			}
			port = tcps->tcps_smallest_anon_port + bump;
		}
	}

retry:
	if (port < tcps->tcps_smallest_anon_port)
		port = (in_port_t)tcps->tcps_smallest_anon_port;

	if (port > tcps->tcps_largest_anon_port) {
		if (restart)
			return (0);
		restart = B_TRUE;
		port = (in_port_t)tcps->tcps_smallest_anon_port;
	}

	if (port < tcps->tcps_smallest_nonpriv_port)
		port = (in_port_t)tcps->tcps_smallest_nonpriv_port;

	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
		if (port == tcps->tcps_g_epriv_ports[i]) {
			port++;
			/*
			 * Make sure the port is still in the
			 * valid range.
			 */
			goto retry;
		}
	}
	if (is_system_labeled() &&
	    (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
	    IPPROTO_TCP, B_TRUE)) != 0) {
		port = i;
		goto retry;
	}
	return (port);
}

/*
 * Return the next anonymous port in the privileged port range for
 * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
 * downwards.  This is the same behavior as documented in the userland
 * library call rresvport(3SOCKET).
 *
 * TS note: skip multilevel ports.
 */
static in_port_t
tcp_get_next_priv_port(const tcp_t *tcp)
{
	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
	in_port_t nextport;
	boolean_t restart = B_FALSE;
	tcp_stack_t *tcps = tcp->tcp_tcps;
retry:
	if (next_priv_port < tcps->tcps_min_anonpriv_port ||
	    next_priv_port >= IPPORT_RESERVED) {
		next_priv_port = IPPORT_RESERVED - 1;
		if (restart)
			return (0);
		restart = B_TRUE;
	}
	if (is_system_labeled() &&
	    (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
	    next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
		next_priv_port = nextport;
		goto retry;
	}
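	/* Hand back the current candidate and step down for the next call. */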
	return (next_priv_port--);
}

static int
tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
    boolean_t bind_to_req_port_only, cred_t *cr)
{
	in_port_t	mlp_port;
	mlp_type_t	addrtype, mlptype;
	boolean_t	user_specified;
	in_port_t	allocated_port;
	in_port_t	requested_port = *requested_port_ptr;
	conn_t		*connp = tcp->tcp_connp;
	zone_t		*zone;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	in6_addr_t	v6addr = connp->conn_laddr_v6;

	zone = NULL;
	/*
	 * XXX It's up to the caller to specify bind_to_req_port_only or not.
	 */
	ASSERT(cr != NULL);

	/*
	 * Get a valid port (within the anonymous range and not a
	 * privileged one) to use if the user has not given a port.
	 * If multiple threads are here, they may all start with
	 * the same initial port. But, it should be fine as long as
	 * tcp_bindi will ensure that no two threads will be assigned
	 * the same port.
	 *
	 * NOTE: XXX If a privileged process asks for an anonymous port, we
	 * still check for ports only in the range > tcp_smallest_non_priv_port,
	 * unless the TCP_ANONPRIVBIND option is set.
	 */
	mlptype = mlptSingle;
	mlp_port = requested_port;
	if (requested_port == 0) {
		requested_port = connp->conn_anon_priv_bind ?
		    tcp_get_next_priv_port(tcp) :
		    tcp_update_next_port(tcps->tcps_next_port_to_try,
		    tcp, B_TRUE);
		if (requested_port == 0) {
			return (-TNOADDR);
		}
		user_specified = B_FALSE;

		/*
		 * If the user went through one of the RPC interfaces to create
		 * this socket and RPC is MLP in this zone, then give them an
		 * anonymous MLP.
		 */
		if (connp->conn_anon_mlp && is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    PMAPPORT, addrtype);
			mlp_port = PMAPPORT;
		}
	} else {
		int i;
		boolean_t priv = B_FALSE;

		/*
		 * If the requested_port is in the well-known privileged range,
		 * verify that the stream was opened by a privileged user.
		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
		 * but instead the code relies on:
		 * - the fact that the address of the array and its size never
		 *   changes
		 * - the atomic assignment of the elements of the array
		 */
		if (requested_port < tcps->tcps_smallest_nonpriv_port) {
			priv = B_TRUE;
		} else {
			for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
				if (requested_port ==
				    tcps->tcps_g_epriv_ports[i]) {
					priv = B_TRUE;
					break;
				}
			}
		}
		if (priv) {
			if (secpolicy_net_privaddr(cr, requested_port,
			    IPPROTO_TCP) != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: no priv for port %d",
					    requested_port);
				}
				return (-TACCES);
			}
		}
		user_specified = B_TRUE;

		connp = tcp->tcp_connp;
		if (is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    requested_port, addrtype);
		}
	}

	if (mlptype != mlptSingle) {
		if (secpolicy_net_bindmlp(cr) != 0) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: no priv for multilevel port %d",
				    requested_port);
			}
			return (-TACCES);
		}

		/*
		 * If we're specifically binding a shared IP address and the
		 * port is MLP on shared addresses, then check to see if this
		 * zone actually owns the MLP.  Reject if not.
		 */
		if (mlptype == mlptShared && addrtype == mlptShared) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid_t mlpzone;

			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
			    htons(mlp_port));
			if (connp->conn_zoneid != mlpzone) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: attempt to bind port "
					    "%d on shared addr in zone %d "
					    "(should be %d)",
					    mlp_port, connp->conn_zoneid,
					    mlpzone);
				}
				return (-TACCES);
			}
		}

		if (!user_specified) {
			int err;
			err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_TRUE);
			if (err != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: cannot establish anon "
					    "MLP for port %d",
					    requested_port);
				}
				return (err);
			}
			connp->conn_anon_port = B_TRUE;
		}
		connp->conn_mlp_type = mlptype;
	}

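	/*
	 * All policy checks passed; try to claim the requested port (or an
	 * anonymous substitute) in the bind hash fanout.
	 */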
	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
	    connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
	    user_specified);

	if (allocated_port == 0) {
		connp->conn_mlp_type = mlptSingle;
		if (connp->conn_anon_port) {
			connp->conn_anon_port = B_FALSE;
			(void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_FALSE);
		}
		if (bind_to_req_port_only) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: requested addr busy");
			}
			return (-TADDRBUSY);
		} else {
			/* If we are out of ports, fail the bind. */
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: out of ports?");
			}
			return (-TNOADDR);
		}
	}

	/* Pass the allocated port back */
	*requested_port_ptr = allocated_port;
	return (0);
}

/*
 * Check the address and check/pick a local port number.
 */
int
tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
    boolean_t bind_to_req_port_only)
{
	tcp_t	*tcp = connp->conn_tcp;
	sin_t	*sin;
	sin6_t  *sin6;
	in_port_t	requested_port;
	ipaddr_t	v4addr;
	in6_addr_t	v6addr;
	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint_t		scopeid = 0;
	int		error = 0;
	ip_xmit_attr_t	*ixa = connp->conn_ixa;

	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);

	if (tcp->tcp_state == TCPS_BOUND) {
		return (0);
	} else if (tcp->tcp_state > TCPS_BOUND) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad state, %d", tcp->tcp_state);
		}
		return (-TOUTSTATE);
	}

	ASSERT(sa != NULL && len != 0);

	if (!OK_32PTR((char *)sa)) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1,
			    SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address parameter, "
			    "address %p, len %d",
			    (void *)sa, len);
		}
		return (-TPROTO);
	}

	error = proto_verify_ip_addr(connp->conn_family, sa, len);
	if (error != 0) {
		return (error);
	}

	switch (len) {
	case sizeof (sin_t):	/* Complete IPv4 address */
		sin = (sin_t *)sa;
		requested_port = ntohs(sin->sin_port);
		v4addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
		if (v4addr != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
			    B_FALSE);
		}
		break;

	case sizeof (sin6_t): /* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		v6addr = sin6->sin6_addr;
		requested_port = ntohs(sin6->sin6_port);
		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
			if (connp->conn_ipv6_v6only)
				return (EADDRNOTAVAIL);

			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
			if (v4addr != INADDR_ANY) {
				laddr_type = ip_laddr_verify_v4(v4addr,
				    zoneid, ipst, B_FALSE);
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
					scopeid = sin6->sin6_scope_id;
				laddr_type = ip_laddr_verify_v6(&v6addr,
				    zoneid, ipst, B_FALSE, scopeid);
			}
		}
		break;

	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address length, %d", len);
		}
		return (EAFNOSUPPORT);
		/* return (-TBADADDR); */
	}

	/* Is the local address a valid unicast address? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	connp->conn_bound_addr_v6 = v6addr;
	if (scopeid != 0) {
		ixa->ixa_flags |= IXAF_SCOPEID_SET;
		ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	connp->conn_laddr_v6 = v6addr;
	connp->conn_saddr_v6 = v6addr;

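	/*
	 * A requested port of 0 always means "pick any port", so only honor
	 * bind_to_req_port_only when a specific port was actually requested.
	 */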
	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;

	error = tcp_bind_select_lport(tcp, &requested_port,
	    bind_to_req_port_only, cr);
	if (error != 0) {
		connp->conn_laddr_v6 = ipv6_all_zeros;
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_bound_addr_v6 = ipv6_all_zeros;
	}
	return (error);
}

/*
 * If the "bind_to_req_port_only" parameter is set and the requested port
 * number is available, return it.  If not, return 0.
 *
 * If the "bind_to_req_port_only" parameter is not set and the requested
 * port number is available, return it.  If not, return the first anonymous
 * port we happen across.  If no anonymous ports are available, return 0.
 * addr is the requested local address, if any.
 *
 * In either case, on success update the tcp_t to record the port number
 * and insert it in the bind hash table.
 *
 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 * without setting SO_REUSEADDR. This is needed so that they
 * can be viewed as two independent transport protocols.
 */
in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect,
    boolean_t bind_to_req_port_only, boolean_t user_specified)
{
	/* number of times we have run around the loop */
	int count = 0;
	/* maximum number of times to run around the loop */
	int loopmax;
	conn_t *connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	/*
	 * The lookup for a free port is done in a loop and "loopmax"
	 * influences how long we spin in the loop.
	 */
	if (bind_to_req_port_only) {
		/*
		 * If the requested port is busy, don't bother to look
		 * for a new one. Setting loop maximum count to 1 has
		 * that effect.
		 */
		loopmax = 1;
	} else {
		/*
		 * If the requested port is busy, look for a free one
		 * in the anonymous port range.
		 * Set loopmax appropriately so that one does not look
		 * forever in the case all of the anonymous ports are in use.
		 */
		if (connp->conn_anon_priv_bind) {
			/*
			 * loopmax =
			 *	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
			 */
			loopmax = IPPORT_RESERVED -
			    tcps->tcps_min_anonpriv_port;
		} else {
			loopmax = (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port + 1);
		}
	}
	do {
		uint16_t	lport;
		tf_t		*tbf;
		tcp_t		*ltcp;
		conn_t		*lconnp;

		lport = htons(port);

		/*
		 * Ensure that the tcp_t is not currently in the bind hash.
		 * Hold the lock on the hash bucket to ensure that
		 * the duplicate check plus the insertion is an atomic
		 * operation.
		 *
		 * This function does an inline lookup on the bind hash list.
		 * Make sure that we access only members of tcp_t
		 * and that we don't look at tcp_tcp, since we are not
		 * doing a CONN_INC_REF.
		 */
		tcp_bind_hash_remove(tcp);
		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
		mutex_enter(&tbf->tf_lock);
		for (ltcp = tbf->tf_tcp; ltcp != NULL;
		    ltcp = ltcp->tcp_bind_hash) {
			if (lport == ltcp->tcp_connp->conn_lport)
				break;
		}

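		/*
		 * ltcp now heads the group of endpoints bound to this port,
		 * or is NULL.  Walk that group and decide whether the new
		 * binding may coexist with each existing one.
		 */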
		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
			boolean_t not_socket;
			boolean_t exclbind;

			lconnp = ltcp->tcp_connp;

			/*
			 * On a labeled system, we must treat bindings to ports
			 * on shared IP addresses by sockets with MAC exemption
			 * privilege as being in all zones, as there's
			 * otherwise no way to identify the right receiver.
			 */
			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
				continue;

			/*
			 * allow multiple interface-specific binds to coexist.
			 */
			if (connp->conn_incoming_ifindex !=
			    lconnp->conn_incoming_ifindex) {
				if ((connp->conn_incoming_ifindex != 0) &&
				    (lconnp->conn_incoming_ifindex != 0))
					continue;
			}

			/*
			 * If TCP_EXCLBIND is set for either the bound or
			 * binding endpoint, the semantics of bind
			 * are changed according to the following.
			 *
			 * spec = specified address (v4 or v6)
			 * unspec = unspecified address (v4 or v6)
			 * A = specified addresses are different for endpoints
			 *
			 * bound	bind to		allowed
			 * -------------------------------------
			 * unspec	unspec		no
			 * unspec	spec		no
			 * spec		unspec		no
			 * spec		spec		yes if A
			 *
			 * For labeled systems, SO_MAC_EXEMPT behaves the same
			 * as TCP_EXCLBIND, except that zoneid is ignored.
			 *
			 * Note:
			 *
			 * 1. Because of TLI semantics, an endpoint can go
			 * back from, say TCPS_ESTABLISHED to TCPS_LISTEN or
			 * TCPS_BOUND, depending on whether it is originally
			 * a listener or not.  That is why we need to check
			 * for states greater than or equal to TCPS_BOUND
			 * here.
			 *
			 * 2. Ideally, we should only check for state equal
			 * to TCPS_LISTEN.  And the following check should be
			 * added.
			 *
			 * if (ltcp->tcp_state == TCPS_LISTEN ||
			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
			 *		...
			 * }
			 *
			 * The semantics will be changed to this.  If the
			 * endpoint on the list is in a state not equal to
			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
			 * set, let the bind succeed.
			 *
			 * Because of (1), we cannot do that for TLI
			 * endpoints.  But we can do that for socket endpoints.
			 * If in the future we can change this going-back
			 * semantics, we can use the above check for TLI also.
			 */
			not_socket = !(TCP_IS_SOCKET(ltcp) &&
			    TCP_IS_SOCKET(tcp));
			exclbind = lconnp->conn_exclbind ||
			    connp->conn_exclbind;

			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (exclbind && (not_socket ||
			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
				if (V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) ||
				    V6_OR_V4_INADDR_ANY(*laddr) ||
				    IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6)) {
					break;
				}
				continue;
			}

			/*
			 * Check ipversion to allow IPv4 and IPv6 sockets to
			 * have disjoint port number spaces, if *_EXCLBIND
			 * is not set and only if the application binds to a
			 * specific port. We use the same autoassigned port
			 * number space for IPv4 and IPv6 sockets.
			 */
			if (connp->conn_ipversion != lconnp->conn_ipversion &&
			    bind_to_req_port_only)
				continue;

			/*
			 * Ideally, we should make sure that the source
			 * address, remote address, and remote port in the
			 * four-tuple for this tcp connection is unique.
			 * However, trying to find out the local source
			 * address would require too much code duplication
			 * with IP, since IP needs to have that code
			 * to support userland TCP implementations.
			 */
			if (quick_connect &&
			    (ltcp->tcp_state > TCPS_LISTEN) &&
			    ((connp->conn_fport != lconnp->conn_fport) ||
			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
			    &lconnp->conn_faddr_v6)))
				continue;

			if (!reuseaddr) {
				/*
				 * No socket option SO_REUSEADDR.
				 * If the existing endpoint is bound to
				 * a non-wildcard IP address
				 * and the requesting stream is
				 * bound to a different
				 * (also non-wildcard) IP address,
				 * keep going.
				 */
				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
				    !V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) &&
				    !IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6))
					continue;
				if (ltcp->tcp_state >= TCPS_BOUND) {
					/*
					 * This port is being used and
					 * its state is >= TCPS_BOUND,
					 * so we can't bind to it.
					 */
					break;
				}
			} else {
				/*
				 * socket option SO_REUSEADDR is set on the
				 * binding tcp_t.
				 *
				 * If two streams are bound to the
				 * same IP address or both addr
				 * and bound source are wildcards
				 * (INADDR_ANY), we want to stop
				 * searching.
				 * We have found a match of IP source
				 * address and source port, which is
				 * refused regardless of the
				 * SO_REUSEADDR setting, so we break.
				 */
				if (IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6) &&
				    (ltcp->tcp_state == TCPS_LISTEN ||
				    ltcp->tcp_state == TCPS_BOUND))
					break;
			}
		}
		if (ltcp != NULL) {
			/* The port number is busy */
			mutex_exit(&tbf->tf_lock);
		} else {
			/*
			 * This port is ours. Insert in fanout and mark as
			 * bound to prevent others from getting the port
			 * number.
			 */
			tcp->tcp_state = TCPS_BOUND;
			DTRACE_TCP6(state__change, void, NULL,
			    ip_xmit_attr_t *, connp->conn_ixa,
			    void, NULL, tcp_t *, tcp, void, NULL,
			    int32_t, TCPS_IDLE);

			connp->conn_lport = htons(port);

			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
			    connp->conn_lport)] == tbf);
			tcp_bind_hash_insert(tbf, tcp, 1);

			mutex_exit(&tbf->tf_lock);

			/*
			 * We don't want tcp_next_port_to_try to "inherit"
			 * a port number supplied by the user in a bind.
			 */
			if (user_specified)
				return (port);

			/*
			 * This is the only place where tcp_next_port_to_try
			 * is updated. After the update, it may or may not
			 * be in the valid range.
			 */
			if (!connp->conn_anon_priv_bind)
				tcps->tcps_next_port_to_try = port + 1;
			return (port);
		}

		if (connp->conn_anon_priv_bind) {
			port = tcp_get_next_priv_port(tcp);
		} else {
			if (count == 0 && user_specified) {
				/*
				 * We may have to return an anonymous port. So
				 * get one to start with.
				 */
				port =
				    tcp_update_next_port(
				    tcps->tcps_next_port_to_try,
				    tcp, B_TRUE);
				user_specified = B_FALSE;
			} else {
				port = tcp_update_next_port(port + 1, tcp,
				    B_FALSE);
			}
		}
		if (port == 0)
			break;

		/*
		 * Don't let this loop run forever in the case where
		 * all of the anonymous ports are in use.
		 */
	} while (++count < loopmax);
	return (0);
}
960