xref: /titanic_41/usr/src/uts/common/inet/tcp/tcp_bind.c (revision 0b42f15ac52b077791d4ba079e8c163c592c3fda)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
25  */
26 
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/strsubr.h>
31 #include <sys/stropts.h>
32 #include <sys/strlog.h>
33 #define	_SUN_TPI_VERSION 2
34 #include <sys/tihdr.h>
35 #include <sys/suntpi.h>
36 #include <sys/xti_inet.h>
37 #include <sys/policy.h>
38 #include <sys/squeue_impl.h>
39 #include <sys/squeue.h>
40 #include <sys/tsol/tnet.h>
41 
42 #include <rpc/pmap_prot.h>
43 
44 #include <inet/common.h>
45 #include <inet/ip.h>
46 #include <inet/tcp.h>
47 #include <inet/tcp_impl.h>
48 #include <inet/proto_set.h>
49 #include <inet/ipsec_impl.h>
50 
51 /* Settable in /etc/system */
52 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
53 static uint32_t tcp_random_anon_port = 1;
54 
55 static int	tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
56 		    cred_t *cr);
57 static in_port_t	tcp_get_next_priv_port(const tcp_t *);
58 
59 /*
60  * Hash list insertion routine for tcp_t structures. Each hash bucket
61  * contains a list of tcp_t entries, and each entry is bound to a unique
62  * port. If there are multiple tcp_t's that are bound to the same port, then
63  * one of them will be linked into the hash bucket list, and the rest will
64  * hang off of that one entry. For each port, entries bound to a specific IP
65  * address will be inserted before those those bound to INADDR_ANY.
66  */
67 void
tcp_bind_hash_insert(tf_t * tbf,tcp_t * tcp,int caller_holds_lock)68 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
69 {
70 	tcp_t	**tcpp;
71 	tcp_t	*tcpnext;
72 	tcp_t	*tcphash;
73 	conn_t	*connp = tcp->tcp_connp;
74 	conn_t	*connext;
75 
76 	if (tcp->tcp_ptpbhn != NULL) {
77 		ASSERT(!caller_holds_lock);
78 		tcp_bind_hash_remove(tcp);
79 	}
80 	tcpp = &tbf->tf_tcp;
81 	if (!caller_holds_lock) {
82 		mutex_enter(&tbf->tf_lock);
83 	} else {
84 		ASSERT(MUTEX_HELD(&tbf->tf_lock));
85 	}
86 	tcphash = tcpp[0];
87 	tcpnext = NULL;
88 	if (tcphash != NULL) {
89 		/* Look for an entry using the same port */
90 		while ((tcphash = tcpp[0]) != NULL &&
91 		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
92 			tcpp = &(tcphash->tcp_bind_hash);
93 
94 		/* The port was not found, just add to the end */
95 		if (tcphash == NULL)
96 			goto insert;
97 
98 		/*
99 		 * OK, there already exists an entry bound to the
100 		 * same port.
101 		 *
102 		 * If the new tcp bound to the INADDR_ANY address
103 		 * and the first one in the list is not bound to
104 		 * INADDR_ANY we skip all entries until we find the
105 		 * first one bound to INADDR_ANY.
106 		 * This makes sure that applications binding to a
107 		 * specific address get preference over those binding to
108 		 * INADDR_ANY.
109 		 */
110 		tcpnext = tcphash;
111 		connext = tcpnext->tcp_connp;
112 		tcphash = NULL;
113 		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
114 		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
115 			while ((tcpnext = tcpp[0]) != NULL) {
116 				connext = tcpnext->tcp_connp;
117 				if (!V6_OR_V4_INADDR_ANY(
118 				    connext->conn_bound_addr_v6))
119 					tcpp = &(tcpnext->tcp_bind_hash_port);
120 				else
121 					break;
122 			}
123 			if (tcpnext != NULL) {
124 				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
125 				tcphash = tcpnext->tcp_bind_hash;
126 				if (tcphash != NULL) {
127 					tcphash->tcp_ptpbhn =
128 					    &(tcp->tcp_bind_hash);
129 					tcpnext->tcp_bind_hash = NULL;
130 				}
131 			}
132 		} else {
133 			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
134 			tcphash = tcpnext->tcp_bind_hash;
135 			if (tcphash != NULL) {
136 				tcphash->tcp_ptpbhn =
137 				    &(tcp->tcp_bind_hash);
138 				tcpnext->tcp_bind_hash = NULL;
139 			}
140 		}
141 	}
142 insert:
143 	tcp->tcp_bind_hash_port = tcpnext;
144 	tcp->tcp_bind_hash = tcphash;
145 	tcp->tcp_ptpbhn = tcpp;
146 	tcpp[0] = tcp;
147 	if (!caller_holds_lock)
148 		mutex_exit(&tbf->tf_lock);
149 }
150 
151 /*
152  * Hash list removal routine for tcp_t structures.
153  */
154 void
tcp_bind_hash_remove(tcp_t * tcp)155 tcp_bind_hash_remove(tcp_t *tcp)
156 {
157 	tcp_t	*tcpnext;
158 	kmutex_t *lockp;
159 	tcp_stack_t	*tcps = tcp->tcp_tcps;
160 	conn_t		*connp = tcp->tcp_connp;
161 
162 	if (tcp->tcp_ptpbhn == NULL)
163 		return;
164 
165 	/*
166 	 * Extract the lock pointer in case there are concurrent
167 	 * hash_remove's for this instance.
168 	 */
169 	ASSERT(connp->conn_lport != 0);
170 	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
171 	    connp->conn_lport)].tf_lock;
172 
173 	ASSERT(lockp != NULL);
174 	mutex_enter(lockp);
175 	if (tcp->tcp_ptpbhn) {
176 		tcpnext = tcp->tcp_bind_hash_port;
177 		if (tcpnext != NULL) {
178 			tcp->tcp_bind_hash_port = NULL;
179 			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
180 			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
181 			if (tcpnext->tcp_bind_hash != NULL) {
182 				tcpnext->tcp_bind_hash->tcp_ptpbhn =
183 				    &(tcpnext->tcp_bind_hash);
184 				tcp->tcp_bind_hash = NULL;
185 			}
186 		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
187 			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
188 			tcp->tcp_bind_hash = NULL;
189 		}
190 		*tcp->tcp_ptpbhn = tcpnext;
191 		tcp->tcp_ptpbhn = NULL;
192 	}
193 	mutex_exit(lockp);
194 }
195 
196 /*
197  * Don't let port fall into the privileged range.
198  * Since the extra privileged ports can be arbitrary we also
199  * ensure that we exclude those from consideration.
200  * tcp_g_epriv_ports is not sorted thus we loop over it until
201  * there are no changes.
202  *
203  * Note: No locks are held when inspecting tcp_g_*epriv_ports
204  * but instead the code relies on:
205  * - the fact that the address of the array and its size never changes
206  * - the atomic assignment of the elements of the array
207  *
208  * Returns 0 if there are no more ports available.
209  *
210  * TS note: skip multilevel ports.
211  */
212 in_port_t
tcp_update_next_port(in_port_t port,const tcp_t * tcp,boolean_t random)213 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
214 {
215 	int i, bump;
216 	boolean_t restart = B_FALSE;
217 	tcp_stack_t *tcps = tcp->tcp_tcps;
218 
219 	if (random && tcp_random_anon_port != 0) {
220 		(void) random_get_pseudo_bytes((uint8_t *)&port,
221 		    sizeof (in_port_t));
222 		/*
223 		 * Unless changed by a sys admin, the smallest anon port
224 		 * is 32768 and the largest anon port is 65535.  It is
225 		 * very likely (50%) for the random port to be smaller
226 		 * than the smallest anon port.  When that happens,
227 		 * add port % (anon port range) to the smallest anon
228 		 * port to get the random port.  It should fall into the
229 		 * valid anon port range.
230 		 */
231 		if ((port < tcps->tcps_smallest_anon_port) ||
232 		    (port > tcps->tcps_largest_anon_port)) {
233 			if (tcps->tcps_smallest_anon_port ==
234 			    tcps->tcps_largest_anon_port) {
235 				bump = 0;
236 			} else {
237 				bump = port % (tcps->tcps_largest_anon_port -
238 				    tcps->tcps_smallest_anon_port);
239 			}
240 			port = tcps->tcps_smallest_anon_port + bump;
241 		}
242 	}
243 
244 retry:
245 	if (port < tcps->tcps_smallest_anon_port)
246 		port = (in_port_t)tcps->tcps_smallest_anon_port;
247 
248 	if (port > tcps->tcps_largest_anon_port) {
249 		if (restart)
250 			return (0);
251 		restart = B_TRUE;
252 		port = (in_port_t)tcps->tcps_smallest_anon_port;
253 	}
254 
255 	if (port < tcps->tcps_smallest_nonpriv_port)
256 		port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
257 
258 	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
259 		if (port == tcps->tcps_g_epriv_ports[i]) {
260 			port++;
261 			/*
262 			 * Make sure whether the port is in the
263 			 * valid range.
264 			 */
265 			goto retry;
266 		}
267 	}
268 	if (is_system_labeled() &&
269 	    (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
270 	    IPPROTO_TCP, B_TRUE)) != 0) {
271 		port = i;
272 		goto retry;
273 	}
274 	return (port);
275 }
276 
277 /*
278  * Return the next anonymous port in the privileged port range for
279  * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
280  * downwards.  This is the same behavior as documented in the userland
281  * library call rresvport(3N).
282  *
283  * TS note: skip multilevel ports.
284  */
285 static in_port_t
tcp_get_next_priv_port(const tcp_t * tcp)286 tcp_get_next_priv_port(const tcp_t *tcp)
287 {
288 	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
289 	in_port_t nextport;
290 	boolean_t restart = B_FALSE;
291 	tcp_stack_t *tcps = tcp->tcp_tcps;
292 retry:
293 	if (next_priv_port < tcps->tcps_min_anonpriv_port ||
294 	    next_priv_port >= IPPORT_RESERVED) {
295 		next_priv_port = IPPORT_RESERVED - 1;
296 		if (restart)
297 			return (0);
298 		restart = B_TRUE;
299 	}
300 	if (is_system_labeled() &&
301 	    (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
302 	    next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
303 		next_priv_port = nextport;
304 		goto retry;
305 	}
306 	return (next_priv_port--);
307 }
308 
309 static int
tcp_bind_select_lport(tcp_t * tcp,in_port_t * requested_port_ptr,boolean_t bind_to_req_port_only,cred_t * cr)310 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
311     boolean_t bind_to_req_port_only, cred_t *cr)
312 {
313 	in_port_t	mlp_port;
314 	mlp_type_t 	addrtype, mlptype;
315 	boolean_t	user_specified;
316 	in_port_t	allocated_port;
317 	in_port_t	requested_port = *requested_port_ptr;
318 	conn_t		*connp = tcp->tcp_connp;
319 	zone_t		*zone;
320 	tcp_stack_t	*tcps = tcp->tcp_tcps;
321 	in6_addr_t	v6addr = connp->conn_laddr_v6;
322 
323 	/*
324 	 * XXX It's up to the caller to specify bind_to_req_port_only or not.
325 	 */
326 	ASSERT(cr != NULL);
327 
328 	/*
329 	 * Get a valid port (within the anonymous range and should not
330 	 * be a privileged one) to use if the user has not given a port.
331 	 * If multiple threads are here, they may all start with
332 	 * with the same initial port. But, it should be fine as long as
333 	 * tcp_bindi will ensure that no two threads will be assigned
334 	 * the same port.
335 	 *
336 	 * NOTE: XXX If a privileged process asks for an anonymous port, we
337 	 * still check for ports only in the range > tcp_smallest_non_priv_port,
338 	 * unless TCP_ANONPRIVBIND option is set.
339 	 */
340 	mlptype = mlptSingle;
341 	mlp_port = requested_port;
342 	if (requested_port == 0) {
343 		requested_port = connp->conn_anon_priv_bind ?
344 		    tcp_get_next_priv_port(tcp) :
345 		    tcp_update_next_port(tcps->tcps_next_port_to_try,
346 		    tcp, B_TRUE);
347 		if (requested_port == 0) {
348 			return (-TNOADDR);
349 		}
350 		user_specified = B_FALSE;
351 
352 		/*
353 		 * If the user went through one of the RPC interfaces to create
354 		 * this socket and RPC is MLP in this zone, then give him an
355 		 * anonymous MLP.
356 		 */
357 		if (connp->conn_anon_mlp && is_system_labeled()) {
358 			zone = crgetzone(cr);
359 			addrtype = tsol_mlp_addr_type(
360 			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
361 			    IPV6_VERSION, &v6addr,
362 			    tcps->tcps_netstack->netstack_ip);
363 			if (addrtype == mlptSingle) {
364 				return (-TNOADDR);
365 			}
366 			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
367 			    PMAPPORT, addrtype);
368 			mlp_port = PMAPPORT;
369 		}
370 	} else {
371 		int i;
372 		boolean_t priv = B_FALSE;
373 
374 		/*
375 		 * If the requested_port is in the well-known privileged range,
376 		 * verify that the stream was opened by a privileged user.
377 		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
378 		 * but instead the code relies on:
379 		 * - the fact that the address of the array and its size never
380 		 *   changes
381 		 * - the atomic assignment of the elements of the array
382 		 */
383 		if (requested_port < tcps->tcps_smallest_nonpriv_port) {
384 			priv = B_TRUE;
385 		} else {
386 			for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
387 				if (requested_port ==
388 				    tcps->tcps_g_epriv_ports[i]) {
389 					priv = B_TRUE;
390 					break;
391 				}
392 			}
393 		}
394 		if (priv) {
395 			if (secpolicy_net_privaddr(cr, requested_port,
396 			    IPPROTO_TCP) != 0) {
397 				if (connp->conn_debug) {
398 					(void) strlog(TCP_MOD_ID, 0, 1,
399 					    SL_ERROR|SL_TRACE,
400 					    "tcp_bind: no priv for port %d",
401 					    requested_port);
402 				}
403 				return (-TACCES);
404 			}
405 		}
406 		user_specified = B_TRUE;
407 
408 		connp = tcp->tcp_connp;
409 		if (is_system_labeled()) {
410 			zone = crgetzone(cr);
411 			addrtype = tsol_mlp_addr_type(
412 			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
413 			    IPV6_VERSION, &v6addr,
414 			    tcps->tcps_netstack->netstack_ip);
415 			if (addrtype == mlptSingle) {
416 				return (-TNOADDR);
417 			}
418 			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
419 			    requested_port, addrtype);
420 		}
421 	}
422 
423 	if (mlptype != mlptSingle) {
424 		if (secpolicy_net_bindmlp(cr) != 0) {
425 			if (connp->conn_debug) {
426 				(void) strlog(TCP_MOD_ID, 0, 1,
427 				    SL_ERROR|SL_TRACE,
428 				    "tcp_bind: no priv for multilevel port %d",
429 				    requested_port);
430 			}
431 			return (-TACCES);
432 		}
433 
434 		/*
435 		 * If we're specifically binding a shared IP address and the
436 		 * port is MLP on shared addresses, then check to see if this
437 		 * zone actually owns the MLP.  Reject if not.
438 		 */
439 		if (mlptype == mlptShared && addrtype == mlptShared) {
440 			/*
441 			 * No need to handle exclusive-stack zones since
442 			 * ALL_ZONES only applies to the shared stack.
443 			 */
444 			zoneid_t mlpzone;
445 
446 			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
447 			    htons(mlp_port));
448 			if (connp->conn_zoneid != mlpzone) {
449 				if (connp->conn_debug) {
450 					(void) strlog(TCP_MOD_ID, 0, 1,
451 					    SL_ERROR|SL_TRACE,
452 					    "tcp_bind: attempt to bind port "
453 					    "%d on shared addr in zone %d "
454 					    "(should be %d)",
455 					    mlp_port, connp->conn_zoneid,
456 					    mlpzone);
457 				}
458 				return (-TACCES);
459 			}
460 		}
461 
462 		if (!user_specified) {
463 			int err;
464 			err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
465 			    requested_port, B_TRUE);
466 			if (err != 0) {
467 				if (connp->conn_debug) {
468 					(void) strlog(TCP_MOD_ID, 0, 1,
469 					    SL_ERROR|SL_TRACE,
470 					    "tcp_bind: cannot establish anon "
471 					    "MLP for port %d",
472 					    requested_port);
473 				}
474 				return (err);
475 			}
476 			connp->conn_anon_port = B_TRUE;
477 		}
478 		connp->conn_mlp_type = mlptype;
479 	}
480 
481 	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
482 	    connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
483 	    user_specified);
484 
485 	if (allocated_port == 0) {
486 		connp->conn_mlp_type = mlptSingle;
487 		if (connp->conn_anon_port) {
488 			connp->conn_anon_port = B_FALSE;
489 			(void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
490 			    requested_port, B_FALSE);
491 		}
492 		if (bind_to_req_port_only) {
493 			if (connp->conn_debug) {
494 				(void) strlog(TCP_MOD_ID, 0, 1,
495 				    SL_ERROR|SL_TRACE,
496 				    "tcp_bind: requested addr busy");
497 			}
498 			return (-TADDRBUSY);
499 		} else {
500 			/* If we are out of ports, fail the bind. */
501 			if (connp->conn_debug) {
502 				(void) strlog(TCP_MOD_ID, 0, 1,
503 				    SL_ERROR|SL_TRACE,
504 				    "tcp_bind: out of ports?");
505 			}
506 			return (-TNOADDR);
507 		}
508 	}
509 
510 	/* Pass the allocated port back */
511 	*requested_port_ptr = allocated_port;
512 	return (0);
513 }
514 
515 /*
516  * Check the address and check/pick a local port number.
517  */
518 int
tcp_bind_check(conn_t * connp,struct sockaddr * sa,socklen_t len,cred_t * cr,boolean_t bind_to_req_port_only)519 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
520     boolean_t bind_to_req_port_only)
521 {
522 	tcp_t	*tcp = connp->conn_tcp;
523 	sin_t	*sin;
524 	sin6_t  *sin6;
525 	in_port_t	requested_port;
526 	ipaddr_t	v4addr;
527 	in6_addr_t	v6addr;
528 	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
529 	zoneid_t	zoneid = IPCL_ZONEID(connp);
530 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
531 	uint_t		scopeid = 0;
532 	int		error = 0;
533 	ip_xmit_attr_t	*ixa = connp->conn_ixa;
534 
535 	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
536 
537 	if (tcp->tcp_state == TCPS_BOUND) {
538 		return (0);
539 	} else if (tcp->tcp_state > TCPS_BOUND) {
540 		if (connp->conn_debug) {
541 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
542 			    "tcp_bind: bad state, %d", tcp->tcp_state);
543 		}
544 		return (-TOUTSTATE);
545 	}
546 
547 	ASSERT(sa != NULL && len != 0);
548 
549 	if (!OK_32PTR((char *)sa)) {
550 		if (connp->conn_debug) {
551 			(void) strlog(TCP_MOD_ID, 0, 1,
552 			    SL_ERROR|SL_TRACE,
553 			    "tcp_bind: bad address parameter, "
554 			    "address %p, len %d",
555 			    (void *)sa, len);
556 		}
557 		return (-TPROTO);
558 	}
559 
560 	error = proto_verify_ip_addr(connp->conn_family, sa, len);
561 	if (error != 0) {
562 		return (error);
563 	}
564 
565 	switch (len) {
566 	case sizeof (sin_t):	/* Complete IPv4 address */
567 		sin = (sin_t *)sa;
568 		requested_port = ntohs(sin->sin_port);
569 		v4addr = sin->sin_addr.s_addr;
570 		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
571 		if (v4addr != INADDR_ANY) {
572 			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
573 			    B_FALSE);
574 		}
575 		break;
576 
577 	case sizeof (sin6_t): /* Complete IPv6 address */
578 		sin6 = (sin6_t *)sa;
579 		v6addr = sin6->sin6_addr;
580 		requested_port = ntohs(sin6->sin6_port);
581 		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
582 			if (connp->conn_ipv6_v6only)
583 				return (EADDRNOTAVAIL);
584 
585 			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
586 			if (v4addr != INADDR_ANY) {
587 				laddr_type = ip_laddr_verify_v4(v4addr,
588 				    zoneid, ipst, B_FALSE);
589 			}
590 		} else {
591 			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
592 				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
593 					scopeid = sin6->sin6_scope_id;
594 				laddr_type = ip_laddr_verify_v6(&v6addr,
595 				    zoneid, ipst, B_FALSE, scopeid);
596 			}
597 		}
598 		break;
599 
600 	default:
601 		if (connp->conn_debug) {
602 			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
603 			    "tcp_bind: bad address length, %d", len);
604 		}
605 		return (EAFNOSUPPORT);
606 		/* return (-TBADADDR); */
607 	}
608 
609 	/* Is the local address a valid unicast address? */
610 	if (laddr_type == IPVL_BAD)
611 		return (EADDRNOTAVAIL);
612 
613 	connp->conn_bound_addr_v6 = v6addr;
614 	if (scopeid != 0) {
615 		ixa->ixa_flags |= IXAF_SCOPEID_SET;
616 		ixa->ixa_scopeid = scopeid;
617 		connp->conn_incoming_ifindex = scopeid;
618 	} else {
619 		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
620 		connp->conn_incoming_ifindex = connp->conn_bound_if;
621 	}
622 
623 	connp->conn_laddr_v6 = v6addr;
624 	connp->conn_saddr_v6 = v6addr;
625 
626 	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
627 
628 	error = tcp_bind_select_lport(tcp, &requested_port,
629 	    bind_to_req_port_only, cr);
630 	if (error != 0) {
631 		connp->conn_laddr_v6 = ipv6_all_zeros;
632 		connp->conn_saddr_v6 = ipv6_all_zeros;
633 		connp->conn_bound_addr_v6 = ipv6_all_zeros;
634 	}
635 	return (error);
636 }
637 
638 /*
639  * If the "bind_to_req_port_only" parameter is set, if the requested port
640  * number is available, return it, If not return 0
641  *
642  * If "bind_to_req_port_only" parameter is not set and
643  * If the requested port number is available, return it.  If not, return
644  * the first anonymous port we happen across.  If no anonymous ports are
645  * available, return 0. addr is the requested local address, if any.
646  *
647  * In either case, when succeeding update the tcp_t to record the port number
648  * and insert it in the bind hash table.
649  *
650  * Note that TCP over IPv4 and IPv6 sockets can use the same port number
651  * without setting SO_REUSEADDR. This is needed so that they
652  * can be viewed as two independent transport protocols.
653  */
654 in_port_t
tcp_bindi(tcp_t * tcp,in_port_t port,const in6_addr_t * laddr,int reuseaddr,boolean_t quick_connect,boolean_t bind_to_req_port_only,boolean_t user_specified)655 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
656     int reuseaddr, boolean_t quick_connect,
657     boolean_t bind_to_req_port_only, boolean_t user_specified)
658 {
659 	/* number of times we have run around the loop */
660 	int count = 0;
661 	/* maximum number of times to run around the loop */
662 	int loopmax;
663 	conn_t *connp = tcp->tcp_connp;
664 	tcp_stack_t	*tcps = tcp->tcp_tcps;
665 
666 	/*
667 	 * Lookup for free addresses is done in a loop and "loopmax"
668 	 * influences how long we spin in the loop
669 	 */
670 	if (bind_to_req_port_only) {
671 		/*
672 		 * If the requested port is busy, don't bother to look
673 		 * for a new one. Setting loop maximum count to 1 has
674 		 * that effect.
675 		 */
676 		loopmax = 1;
677 	} else {
678 		/*
679 		 * If the requested port is busy, look for a free one
680 		 * in the anonymous port range.
681 		 * Set loopmax appropriately so that one does not look
682 		 * forever in the case all of the anonymous ports are in use.
683 		 */
684 		if (connp->conn_anon_priv_bind) {
685 			/*
686 			 * loopmax =
687 			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
688 			 */
689 			loopmax = IPPORT_RESERVED -
690 			    tcps->tcps_min_anonpriv_port;
691 		} else {
692 			loopmax = (tcps->tcps_largest_anon_port -
693 			    tcps->tcps_smallest_anon_port + 1);
694 		}
695 	}
696 	do {
697 		uint16_t	lport;
698 		tf_t		*tbf;
699 		tcp_t		*ltcp;
700 		conn_t		*lconnp;
701 
702 		lport = htons(port);
703 
704 		/*
705 		 * Ensure that the tcp_t is not currently in the bind hash.
706 		 * Hold the lock on the hash bucket to ensure that
707 		 * the duplicate check plus the insertion is an atomic
708 		 * operation.
709 		 *
710 		 * This function does an inline lookup on the bind hash list
711 		 * Make sure that we access only members of tcp_t
712 		 * and that we don't look at tcp_tcp, since we are not
713 		 * doing a CONN_INC_REF.
714 		 */
715 		tcp_bind_hash_remove(tcp);
716 		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
717 		mutex_enter(&tbf->tf_lock);
718 		for (ltcp = tbf->tf_tcp; ltcp != NULL;
719 		    ltcp = ltcp->tcp_bind_hash) {
720 			if (lport == ltcp->tcp_connp->conn_lport)
721 				break;
722 		}
723 
724 		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
725 			boolean_t not_socket;
726 			boolean_t exclbind;
727 
728 			lconnp = ltcp->tcp_connp;
729 
730 			/*
731 			 * On a labeled system, we must treat bindings to ports
732 			 * on shared IP addresses by sockets with MAC exemption
733 			 * privilege as being in all zones, as there's
734 			 * otherwise no way to identify the right receiver.
735 			 */
736 			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
737 				continue;
738 
739 			/*
740 			 * If TCP_EXCLBIND is set for either the bound or
741 			 * binding endpoint, the semantics of bind
742 			 * is changed according to the following.
743 			 *
744 			 * spec = specified address (v4 or v6)
745 			 * unspec = unspecified address (v4 or v6)
746 			 * A = specified addresses are different for endpoints
747 			 *
748 			 * bound	bind to		allowed
749 			 * -------------------------------------
750 			 * unspec	unspec		no
751 			 * unspec	spec		no
752 			 * spec		unspec		no
753 			 * spec		spec		yes if A
754 			 *
755 			 * For labeled systems, SO_MAC_EXEMPT behaves the same
756 			 * as TCP_EXCLBIND, except that zoneid is ignored.
757 			 *
758 			 * Note:
759 			 *
760 			 * 1. Because of TLI semantics, an endpoint can go
761 			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
762 			 * TCPS_BOUND, depending on whether it is originally
763 			 * a listener or not.  That is why we need to check
764 			 * for states greater than or equal to TCPS_BOUND
765 			 * here.
766 			 *
767 			 * 2. Ideally, we should only check for state equals
768 			 * to TCPS_LISTEN. And the following check should be
769 			 * added.
770 			 *
771 			 * if (ltcp->tcp_state == TCPS_LISTEN ||
772 			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
773 			 *		...
774 			 * }
775 			 *
776 			 * The semantics will be changed to this.  If the
777 			 * endpoint on the list is in state not equal to
778 			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
779 			 * set, let the bind succeed.
780 			 *
781 			 * Because of (1), we cannot do that for TLI
782 			 * endpoints.  But we can do that for socket endpoints.
783 			 * If in future, we can change this going back
784 			 * semantics, we can use the above check for TLI also.
785 			 */
786 			not_socket = !(TCP_IS_SOCKET(ltcp) &&
787 			    TCP_IS_SOCKET(tcp));
788 			exclbind = lconnp->conn_exclbind ||
789 			    connp->conn_exclbind;
790 
791 			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
792 			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
793 			    (exclbind && (not_socket ||
794 			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
795 				if (V6_OR_V4_INADDR_ANY(
796 				    lconnp->conn_bound_addr_v6) ||
797 				    V6_OR_V4_INADDR_ANY(*laddr) ||
798 				    IN6_ARE_ADDR_EQUAL(laddr,
799 				    &lconnp->conn_bound_addr_v6)) {
800 					break;
801 				}
802 				continue;
803 			}
804 
805 			/*
806 			 * Check ipversion to allow IPv4 and IPv6 sockets to
807 			 * have disjoint port number spaces, if *_EXCLBIND
808 			 * is not set and only if the application binds to a
809 			 * specific port. We use the same autoassigned port
810 			 * number space for IPv4 and IPv6 sockets.
811 			 */
812 			if (connp->conn_ipversion != lconnp->conn_ipversion &&
813 			    bind_to_req_port_only)
814 				continue;
815 
816 			/*
817 			 * Ideally, we should make sure that the source
818 			 * address, remote address, and remote port in the
819 			 * four tuple for this tcp-connection is unique.
820 			 * However, trying to find out the local source
821 			 * address would require too much code duplication
822 			 * with IP, since IP needs needs to have that code
823 			 * to support userland TCP implementations.
824 			 */
825 			if (quick_connect &&
826 			    (ltcp->tcp_state > TCPS_LISTEN) &&
827 			    ((connp->conn_fport != lconnp->conn_fport) ||
828 			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
829 			    &lconnp->conn_faddr_v6)))
830 				continue;
831 
832 			if (!reuseaddr) {
833 				/*
834 				 * No socket option SO_REUSEADDR.
835 				 * If existing port is bound to
836 				 * a non-wildcard IP address
837 				 * and the requesting stream is
838 				 * bound to a distinct
839 				 * different IP addresses
840 				 * (non-wildcard, also), keep
841 				 * going.
842 				 */
843 				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
844 				    !V6_OR_V4_INADDR_ANY(
845 				    lconnp->conn_bound_addr_v6) &&
846 				    !IN6_ARE_ADDR_EQUAL(laddr,
847 				    &lconnp->conn_bound_addr_v6))
848 					continue;
849 				if (ltcp->tcp_state >= TCPS_BOUND) {
850 					/*
851 					 * This port is being used and
852 					 * its state is >= TCPS_BOUND,
853 					 * so we can't bind to it.
854 					 */
855 					break;
856 				}
857 			} else {
858 				/*
859 				 * socket option SO_REUSEADDR is set on the
860 				 * binding tcp_t.
861 				 *
862 				 * If two streams are bound to
863 				 * same IP address or both addr
864 				 * and bound source are wildcards
865 				 * (INADDR_ANY), we want to stop
866 				 * searching.
867 				 * We have found a match of IP source
868 				 * address and source port, which is
869 				 * refused regardless of the
870 				 * SO_REUSEADDR setting, so we break.
871 				 */
872 				if (IN6_ARE_ADDR_EQUAL(laddr,
873 				    &lconnp->conn_bound_addr_v6) &&
874 				    (ltcp->tcp_state == TCPS_LISTEN ||
875 				    ltcp->tcp_state == TCPS_BOUND))
876 					break;
877 			}
878 		}
879 		if (ltcp != NULL) {
880 			/* The port number is busy */
881 			mutex_exit(&tbf->tf_lock);
882 		} else {
883 			/*
884 			 * This port is ours. Insert in fanout and mark as
885 			 * bound to prevent others from getting the port
886 			 * number.
887 			 */
888 			tcp->tcp_state = TCPS_BOUND;
889 			DTRACE_TCP6(state__change, void, NULL,
890 			    ip_xmit_attr_t *, connp->conn_ixa,
891 			    void, NULL, tcp_t *, tcp, void, NULL,
892 			    int32_t, TCPS_IDLE);
893 
894 			connp->conn_lport = htons(port);
895 
896 			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
897 			    connp->conn_lport)] == tbf);
898 			tcp_bind_hash_insert(tbf, tcp, 1);
899 
900 			mutex_exit(&tbf->tf_lock);
901 
902 			/*
903 			 * We don't want tcp_next_port_to_try to "inherit"
904 			 * a port number supplied by the user in a bind.
905 			 */
906 			if (user_specified)
907 				return (port);
908 
909 			/*
910 			 * This is the only place where tcp_next_port_to_try
911 			 * is updated. After the update, it may or may not
912 			 * be in the valid range.
913 			 */
914 			if (!connp->conn_anon_priv_bind)
915 				tcps->tcps_next_port_to_try = port + 1;
916 			return (port);
917 		}
918 
919 		if (connp->conn_anon_priv_bind) {
920 			port = tcp_get_next_priv_port(tcp);
921 		} else {
922 			if (count == 0 && user_specified) {
923 				/*
924 				 * We may have to return an anonymous port. So
925 				 * get one to start with.
926 				 */
927 				port =
928 				    tcp_update_next_port(
929 				    tcps->tcps_next_port_to_try,
930 				    tcp, B_TRUE);
931 				user_specified = B_FALSE;
932 			} else {
933 				port = tcp_update_next_port(port + 1, tcp,
934 				    B_FALSE);
935 			}
936 		}
937 		if (port == 0)
938 			break;
939 
940 		/*
941 		 * Don't let this loop run forever in the case where
942 		 * all of the anonymous ports are in use.
943 		 */
944 	} while (++count < loopmax);
945 	return (0);
946 }
947