1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 * Copyright (c) 2016 by Delphix. All rights reserved.
26 * Copyright 2024 Bill Sommerfeld <sommerfeld@hamachi.org>
27 */
28
29 #include <sys/types.h>
30 #include <sys/stream.h>
31 #include <sys/strsun.h>
32 #include <sys/strsubr.h>
33 #include <sys/stropts.h>
34 #include <sys/strlog.h>
35 #define _SUN_TPI_VERSION 2
36 #include <sys/tihdr.h>
37 #include <sys/suntpi.h>
38 #include <sys/xti_inet.h>
39 #include <sys/policy.h>
40 #include <sys/squeue_impl.h>
41 #include <sys/squeue.h>
42 #include <sys/tsol/tnet.h>
43
44 #include <rpc/pmap_prot.h>
45
46 #include <inet/common.h>
47 #include <inet/ip.h>
48 #include <inet/tcp.h>
49 #include <inet/tcp_impl.h>
50 #include <inet/proto_set.h>
51 #include <inet/ipsec_impl.h>
52
/*
 * Settable in /etc/system.
 *
 * If set to 0, pick ephemeral port sequentially; otherwise randomly.
 * tcp_update_next_port() consults this only when its caller asks for a
 * random starting port.
 */
static uint32_t tcp_random_anon_port = 1;

/* Forward declarations for file-local helpers defined further down. */
static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
    cred_t *cr);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
60
61 /*
62 * Hash list insertion routine for tcp_t structures. Each hash bucket
63 * contains a list of tcp_t entries, and each entry is bound to a unique
64 * port. If there are multiple tcp_t's that are bound to the same port, then
65 * one of them will be linked into the hash bucket list, and the rest will
66 * hang off of that one entry. For each port, entries bound to a specific IP
67 * address will be inserted before those those bound to INADDR_ANY.
68 */
69 void
tcp_bind_hash_insert(tf_t * tbf,tcp_t * tcp,int caller_holds_lock)70 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
71 {
72 tcp_t **tcpp;
73 tcp_t *tcpnext;
74 tcp_t *tcphash;
75 conn_t *connp = tcp->tcp_connp;
76 conn_t *connext;
77
78 if (tcp->tcp_ptpbhn != NULL) {
79 ASSERT(!caller_holds_lock);
80 tcp_bind_hash_remove(tcp);
81 }
82 tcpp = &tbf->tf_tcp;
83 if (!caller_holds_lock) {
84 mutex_enter(&tbf->tf_lock);
85 } else {
86 ASSERT(MUTEX_HELD(&tbf->tf_lock));
87 }
88 tcphash = tcpp[0];
89 tcpnext = NULL;
90 if (tcphash != NULL) {
91 /* Look for an entry using the same port */
92 while ((tcphash = tcpp[0]) != NULL &&
93 connp->conn_lport != tcphash->tcp_connp->conn_lport)
94 tcpp = &(tcphash->tcp_bind_hash);
95
96 /* The port was not found, just add to the end */
97 if (tcphash == NULL)
98 goto insert;
99
100 /*
101 * OK, there already exists an entry bound to the
102 * same port.
103 *
104 * If the new tcp bound to the INADDR_ANY address
105 * and the first one in the list is not bound to
106 * INADDR_ANY we skip all entries until we find the
107 * first one bound to INADDR_ANY.
108 * This makes sure that applications binding to a
109 * specific address get preference over those binding to
110 * INADDR_ANY.
111 */
112 tcpnext = tcphash;
113 connext = tcpnext->tcp_connp;
114 tcphash = NULL;
115 if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
116 !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
117 while ((tcpnext = tcpp[0]) != NULL) {
118 connext = tcpnext->tcp_connp;
119 if (!V6_OR_V4_INADDR_ANY(
120 connext->conn_bound_addr_v6))
121 tcpp = &(tcpnext->tcp_bind_hash_port);
122 else
123 break;
124 }
125 if (tcpnext != NULL) {
126 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
127 tcphash = tcpnext->tcp_bind_hash;
128 if (tcphash != NULL) {
129 tcphash->tcp_ptpbhn =
130 &(tcp->tcp_bind_hash);
131 tcpnext->tcp_bind_hash = NULL;
132 }
133 }
134 } else {
135 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
136 tcphash = tcpnext->tcp_bind_hash;
137 if (tcphash != NULL) {
138 tcphash->tcp_ptpbhn =
139 &(tcp->tcp_bind_hash);
140 tcpnext->tcp_bind_hash = NULL;
141 }
142 }
143 }
144 insert:
145 tcp->tcp_bind_hash_port = tcpnext;
146 tcp->tcp_bind_hash = tcphash;
147 tcp->tcp_ptpbhn = tcpp;
148 tcpp[0] = tcp;
149 if (!caller_holds_lock)
150 mutex_exit(&tbf->tf_lock);
151 }
152
153 /*
154 * Hash list removal routine for tcp_t structures.
155 */
156 void
tcp_bind_hash_remove(tcp_t * tcp)157 tcp_bind_hash_remove(tcp_t *tcp)
158 {
159 tcp_t *tcpnext;
160 kmutex_t *lockp;
161 tcp_stack_t *tcps = tcp->tcp_tcps;
162 conn_t *connp = tcp->tcp_connp;
163
164 if (tcp->tcp_ptpbhn == NULL)
165 return;
166
167 /*
168 * Extract the lock pointer in case there are concurrent
169 * hash_remove's for this instance.
170 */
171 ASSERT(connp->conn_lport != 0);
172 lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
173 connp->conn_lport)].tf_lock;
174
175 ASSERT(lockp != NULL);
176 mutex_enter(lockp);
177 if (tcp->tcp_ptpbhn) {
178 tcpnext = tcp->tcp_bind_hash_port;
179 if (tcpnext != NULL) {
180 tcp->tcp_bind_hash_port = NULL;
181 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
182 tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
183 if (tcpnext->tcp_bind_hash != NULL) {
184 tcpnext->tcp_bind_hash->tcp_ptpbhn =
185 &(tcpnext->tcp_bind_hash);
186 tcp->tcp_bind_hash = NULL;
187 }
188 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
189 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
190 tcp->tcp_bind_hash = NULL;
191 }
192 *tcp->tcp_ptpbhn = tcpnext;
193 tcp->tcp_ptpbhn = NULL;
194 }
195 mutex_exit(lockp);
196 }
197
198 /*
199 * Don't let port fall into the privileged range.
200 * Since the extra privileged ports can be arbitrary we also
201 * ensure that we exclude those from consideration.
202 * tcp_g_epriv_ports is not sorted thus we loop over it until
203 * there are no changes.
204 *
205 * Note: No locks are held when inspecting tcp_g_*epriv_ports
206 * but instead the code relies on:
207 * - the fact that the address of the array and its size never changes
208 * - the atomic assignment of the elements of the array
209 *
210 * Returns 0 if there are no more ports available.
211 *
212 * TS note: skip multilevel ports.
213 */
214 in_port_t
tcp_update_next_port(in_port_t port,const tcp_t * tcp,boolean_t random)215 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
216 {
217 int i, bump;
218 boolean_t restart = B_FALSE;
219 tcp_stack_t *tcps = tcp->tcp_tcps;
220
221 if (random && tcp_random_anon_port != 0) {
222 (void) random_get_pseudo_bytes((uint8_t *)&port,
223 sizeof (in_port_t));
224 /*
225 * Unless changed by a sys admin, the smallest anon port
226 * is 32768 and the largest anon port is 65535. It is
227 * very likely (50%) for the random port to be smaller
228 * than the smallest anon port. When that happens,
229 * add port % (anon port range) to the smallest anon
230 * port to get the random port. It should fall into the
231 * valid anon port range.
232 */
233 if ((port < tcps->tcps_smallest_anon_port) ||
234 (port > tcps->tcps_largest_anon_port)) {
235 if (tcps->tcps_smallest_anon_port ==
236 tcps->tcps_largest_anon_port) {
237 bump = 0;
238 } else {
239 bump = port % (tcps->tcps_largest_anon_port -
240 tcps->tcps_smallest_anon_port);
241 }
242 port = tcps->tcps_smallest_anon_port + bump;
243 }
244 }
245
246 retry:
247 if (port < tcps->tcps_smallest_anon_port)
248 port = (in_port_t)tcps->tcps_smallest_anon_port;
249
250 if (port > tcps->tcps_largest_anon_port) {
251 if (restart)
252 return (0);
253 restart = B_TRUE;
254 port = (in_port_t)tcps->tcps_smallest_anon_port;
255 }
256
257 if (port < tcps->tcps_smallest_nonpriv_port)
258 port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
259
260 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
261 if (port == tcps->tcps_g_epriv_ports[i]) {
262 port++;
263 /*
264 * Make sure whether the port is in the
265 * valid range.
266 */
267 goto retry;
268 }
269 }
270 if (is_system_labeled() &&
271 (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
272 IPPROTO_TCP, B_TRUE)) != 0) {
273 port = i;
274 goto retry;
275 }
276 return (port);
277 }
278
279 /*
280 * Return the next anonymous port in the privileged port range for
281 * bind checking. It starts at IPPORT_RESERVED - 1 and goes
282 * downwards. This is the same behavior as documented in the userland
283 * library call rresvport(3SOCKET).
284 *
285 * TS note: skip multilevel ports.
286 */
287 static in_port_t
tcp_get_next_priv_port(const tcp_t * tcp)288 tcp_get_next_priv_port(const tcp_t *tcp)
289 {
290 static in_port_t next_priv_port = IPPORT_RESERVED - 1;
291 in_port_t nextport;
292 boolean_t restart = B_FALSE;
293 tcp_stack_t *tcps = tcp->tcp_tcps;
294 retry:
295 if (next_priv_port < tcps->tcps_min_anonpriv_port ||
296 next_priv_port >= IPPORT_RESERVED) {
297 next_priv_port = IPPORT_RESERVED - 1;
298 if (restart)
299 return (0);
300 restart = B_TRUE;
301 }
302 if (is_system_labeled() &&
303 (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
304 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
305 next_priv_port = nextport;
306 goto retry;
307 }
308 return (next_priv_port--);
309 }
310
311 static int
tcp_bind_select_lport(tcp_t * tcp,in_port_t * requested_port_ptr,boolean_t bind_to_req_port_only,cred_t * cr)312 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
313 boolean_t bind_to_req_port_only, cred_t *cr)
314 {
315 in_port_t mlp_port;
316 mlp_type_t addrtype, mlptype;
317 boolean_t user_specified;
318 in_port_t allocated_port;
319 in_port_t requested_port = *requested_port_ptr;
320 conn_t *connp = tcp->tcp_connp;
321 zone_t *zone;
322 tcp_stack_t *tcps = tcp->tcp_tcps;
323 in6_addr_t v6addr = connp->conn_laddr_v6;
324
325 zone = NULL;
326 /*
327 * XXX It's up to the caller to specify bind_to_req_port_only or not.
328 */
329 ASSERT(cr != NULL);
330
331 /*
332 * Get a valid port (within the anonymous range and should not
333 * be a privileged one) to use if the user has not given a port.
334 * If multiple threads are here, they may all start with
335 * with the same initial port. But, it should be fine as long as
336 * tcp_bindi will ensure that no two threads will be assigned
337 * the same port.
338 *
339 * NOTE: XXX If a privileged process asks for an anonymous port, we
340 * still check for ports only in the range > tcp_smallest_non_priv_port,
341 * unless TCP_ANONPRIVBIND option is set.
342 */
343 mlptype = mlptSingle;
344 mlp_port = requested_port;
345 if (requested_port == 0) {
346 requested_port = connp->conn_anon_priv_bind ?
347 tcp_get_next_priv_port(tcp) :
348 tcp_update_next_port(tcps->tcps_next_port_to_try,
349 tcp, B_TRUE);
350 if (requested_port == 0) {
351 return (-TNOADDR);
352 }
353 user_specified = B_FALSE;
354
355 /*
356 * If the user went through one of the RPC interfaces to create
357 * this socket and RPC is MLP in this zone, then give them an
358 * anonymous MLP.
359 */
360 if (connp->conn_anon_mlp && is_system_labeled()) {
361 zone = crgetzone(cr);
362 addrtype = tsol_mlp_addr_type(
363 connp->conn_allzones ? ALL_ZONES : zone->zone_id,
364 IPV6_VERSION, &v6addr,
365 tcps->tcps_netstack->netstack_ip);
366 if (addrtype == mlptSingle) {
367 return (-TNOADDR);
368 }
369 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
370 PMAPPORT, addrtype);
371 mlp_port = PMAPPORT;
372 }
373 } else {
374 int i;
375 boolean_t priv = B_FALSE;
376
377 /*
378 * If the requested_port is in the well-known privileged range,
379 * verify that the stream was opened by a privileged user.
380 * Note: No locks are held when inspecting tcp_g_*epriv_ports
381 * but instead the code relies on:
382 * - the fact that the address of the array and its size never
383 * changes
384 * - the atomic assignment of the elements of the array
385 */
386 if (requested_port < tcps->tcps_smallest_nonpriv_port) {
387 priv = B_TRUE;
388 } else {
389 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
390 if (requested_port ==
391 tcps->tcps_g_epriv_ports[i]) {
392 priv = B_TRUE;
393 break;
394 }
395 }
396 }
397 if (priv) {
398 if (secpolicy_net_privaddr(cr, requested_port,
399 IPPROTO_TCP) != 0) {
400 if (connp->conn_debug) {
401 (void) strlog(TCP_MOD_ID, 0, 1,
402 SL_ERROR|SL_TRACE,
403 "tcp_bind: no priv for port %d",
404 requested_port);
405 }
406 return (-TACCES);
407 }
408 }
409 user_specified = B_TRUE;
410
411 connp = tcp->tcp_connp;
412 if (is_system_labeled()) {
413 zone = crgetzone(cr);
414 addrtype = tsol_mlp_addr_type(
415 connp->conn_allzones ? ALL_ZONES : zone->zone_id,
416 IPV6_VERSION, &v6addr,
417 tcps->tcps_netstack->netstack_ip);
418 if (addrtype == mlptSingle) {
419 return (-TNOADDR);
420 }
421 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
422 requested_port, addrtype);
423 }
424 }
425
426 if (mlptype != mlptSingle) {
427 if (secpolicy_net_bindmlp(cr) != 0) {
428 if (connp->conn_debug) {
429 (void) strlog(TCP_MOD_ID, 0, 1,
430 SL_ERROR|SL_TRACE,
431 "tcp_bind: no priv for multilevel port %d",
432 requested_port);
433 }
434 return (-TACCES);
435 }
436
437 /*
438 * If we're specifically binding a shared IP address and the
439 * port is MLP on shared addresses, then check to see if this
440 * zone actually owns the MLP. Reject if not.
441 */
442 if (mlptype == mlptShared && addrtype == mlptShared) {
443 /*
444 * No need to handle exclusive-stack zones since
445 * ALL_ZONES only applies to the shared stack.
446 */
447 zoneid_t mlpzone;
448
449 mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
450 htons(mlp_port));
451 if (connp->conn_zoneid != mlpzone) {
452 if (connp->conn_debug) {
453 (void) strlog(TCP_MOD_ID, 0, 1,
454 SL_ERROR|SL_TRACE,
455 "tcp_bind: attempt to bind port "
456 "%d on shared addr in zone %d "
457 "(should be %d)",
458 mlp_port, connp->conn_zoneid,
459 mlpzone);
460 }
461 return (-TACCES);
462 }
463 }
464
465 if (!user_specified) {
466 int err;
467 err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
468 requested_port, B_TRUE);
469 if (err != 0) {
470 if (connp->conn_debug) {
471 (void) strlog(TCP_MOD_ID, 0, 1,
472 SL_ERROR|SL_TRACE,
473 "tcp_bind: cannot establish anon "
474 "MLP for port %d",
475 requested_port);
476 }
477 return (err);
478 }
479 connp->conn_anon_port = B_TRUE;
480 }
481 connp->conn_mlp_type = mlptype;
482 }
483
484 allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
485 connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
486 user_specified);
487
488 if (allocated_port == 0) {
489 connp->conn_mlp_type = mlptSingle;
490 if (connp->conn_anon_port) {
491 connp->conn_anon_port = B_FALSE;
492 (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
493 requested_port, B_FALSE);
494 }
495 if (bind_to_req_port_only) {
496 if (connp->conn_debug) {
497 (void) strlog(TCP_MOD_ID, 0, 1,
498 SL_ERROR|SL_TRACE,
499 "tcp_bind: requested addr busy");
500 }
501 return (-TADDRBUSY);
502 } else {
503 /* If we are out of ports, fail the bind. */
504 if (connp->conn_debug) {
505 (void) strlog(TCP_MOD_ID, 0, 1,
506 SL_ERROR|SL_TRACE,
507 "tcp_bind: out of ports?");
508 }
509 return (-TNOADDR);
510 }
511 }
512
513 /* Pass the allocated port back */
514 *requested_port_ptr = allocated_port;
515 return (0);
516 }
517
518 /*
519 * Check the address and check/pick a local port number.
520 */
521 int
tcp_bind_check(conn_t * connp,struct sockaddr * sa,socklen_t len,cred_t * cr,boolean_t bind_to_req_port_only)522 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
523 boolean_t bind_to_req_port_only)
524 {
525 tcp_t *tcp = connp->conn_tcp;
526 sin_t *sin;
527 sin6_t *sin6;
528 in_port_t requested_port;
529 ipaddr_t v4addr;
530 in6_addr_t v6addr;
531 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */
532 zoneid_t zoneid = IPCL_ZONEID(connp);
533 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
534 uint_t scopeid = 0;
535 int error = 0;
536 ip_xmit_attr_t *ixa = connp->conn_ixa;
537
538 ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
539
540 if (tcp->tcp_state == TCPS_BOUND) {
541 return (0);
542 } else if (tcp->tcp_state > TCPS_BOUND) {
543 if (connp->conn_debug) {
544 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
545 "tcp_bind: bad state, %d", tcp->tcp_state);
546 }
547 return (-TOUTSTATE);
548 }
549
550 ASSERT(sa != NULL && len != 0);
551
552 if (!OK_32PTR((char *)sa)) {
553 if (connp->conn_debug) {
554 (void) strlog(TCP_MOD_ID, 0, 1,
555 SL_ERROR|SL_TRACE,
556 "tcp_bind: bad address parameter, "
557 "address %p, len %d",
558 (void *)sa, len);
559 }
560 return (-TPROTO);
561 }
562
563 error = proto_verify_ip_addr(connp->conn_family, sa, len);
564 if (error != 0) {
565 return (error);
566 }
567
568 switch (len) {
569 case sizeof (sin_t): /* Complete IPv4 address */
570 sin = (sin_t *)sa;
571 requested_port = ntohs(sin->sin_port);
572 v4addr = sin->sin_addr.s_addr;
573 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
574 if (v4addr != INADDR_ANY) {
575 laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
576 B_FALSE);
577 }
578 break;
579
580 case sizeof (sin6_t): /* Complete IPv6 address */
581 sin6 = (sin6_t *)sa;
582 v6addr = sin6->sin6_addr;
583 requested_port = ntohs(sin6->sin6_port);
584 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
585 if (connp->conn_ipv6_v6only)
586 return (EADDRNOTAVAIL);
587
588 IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
589 if (v4addr != INADDR_ANY) {
590 laddr_type = ip_laddr_verify_v4(v4addr,
591 zoneid, ipst, B_FALSE);
592 }
593 } else {
594 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
595 if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
596 scopeid = sin6->sin6_scope_id;
597 laddr_type = ip_laddr_verify_v6(&v6addr,
598 zoneid, ipst, B_FALSE, scopeid);
599 }
600 }
601 break;
602
603 default:
604 if (connp->conn_debug) {
605 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
606 "tcp_bind: bad address length, %d", len);
607 }
608 return (EAFNOSUPPORT);
609 /* return (-TBADADDR); */
610 }
611
612 /* Is the local address a valid unicast address? */
613 if (laddr_type == IPVL_BAD)
614 return (EADDRNOTAVAIL);
615
616 connp->conn_bound_addr_v6 = v6addr;
617 if (scopeid != 0) {
618 ixa->ixa_flags |= IXAF_SCOPEID_SET;
619 ixa->ixa_scopeid = scopeid;
620 connp->conn_incoming_ifindex = scopeid;
621 } else {
622 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
623 connp->conn_incoming_ifindex = connp->conn_bound_if;
624 }
625
626 connp->conn_laddr_v6 = v6addr;
627 connp->conn_saddr_v6 = v6addr;
628
629 bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
630
631 error = tcp_bind_select_lport(tcp, &requested_port,
632 bind_to_req_port_only, cr);
633 if (error != 0) {
634 connp->conn_laddr_v6 = ipv6_all_zeros;
635 connp->conn_saddr_v6 = ipv6_all_zeros;
636 connp->conn_bound_addr_v6 = ipv6_all_zeros;
637 }
638 return (error);
639 }
640
/*
 * Try to bind "tcp" to a local port, checking for conflicts against the
 * bind hash.
 *
 * If the "bind_to_req_port_only" parameter is set: if the requested port
 * number is available, return it.  If not, return 0.
 *
 * If the "bind_to_req_port_only" parameter is not set: if the requested
 * port number is available, return it.  If not, return the first anonymous
 * port we happen across.  If no anonymous ports are available, return 0.
 *
 * In either case, when succeeding update the tcp_t to record the port number
 * and insert it in the bind hash table.
 *
 * Parameters:
 *   tcp			endpoint being bound
 *   port			first port to try, in host byte order
 *   laddr			the requested local address, if any
 *   reuseaddr			non-zero if SO_REUSEADDR is set on the binder
 *   quick_connect		when set, an existing endpoint past
 *				TCPS_LISTEN conflicts only if its remote
 *				address and port match ours
 *   bind_to_req_port_only	see above
 *   user_specified		B_TRUE if "port" was supplied by the user; in
 *				that case tcps_next_port_to_try is not
 *				updated if the bind succeeds on that port
 *
 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 * without setting SO_REUSEADDR. This is needed so that they
 * can be viewed as two independent transport protocols.
 */
in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect,
    boolean_t bind_to_req_port_only, boolean_t user_specified)
{
	/* number of times we have run around the loop */
	int count = 0;
	/* maximum number of times to run around the loop */
	int loopmax;
	conn_t *connp = tcp->tcp_connp;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	/*
	 * The search for a free port is done in a loop and "loopmax"
	 * bounds how many candidates we try.
	 */
	if (bind_to_req_port_only) {
		/*
		 * If the requested port is busy, don't bother to look
		 * for a new one. Setting loop maximum count to 1 has
		 * that effect.
		 */
		loopmax = 1;
	} else {
		/*
		 * If the requested port is busy, look for a free one
		 * in the anonymous port range.
		 * Set loopmax appropriately so that one does not look
		 * forever in the case all of the anonymous ports are in use.
		 */
		if (connp->conn_anon_priv_bind) {
			/*
			 * loopmax =
			 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
			 */
			loopmax = IPPORT_RESERVED -
			    tcps->tcps_min_anonpriv_port;
		} else {
			loopmax = (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port + 1);
		}
	}
	do {
		uint16_t lport;
		tf_t *tbf;
		tcp_t *ltcp;
		conn_t *lconnp;

		/* conn_lport is compared and stored in network byte order. */
		lport = htons(port);

		/*
		 * Ensure that the tcp_t is not currently in the bind hash.
		 * Hold the lock on the hash bucket to ensure that
		 * the duplicate check plus the insertion is an atomic
		 * operation.
		 *
		 * This function does an inline lookup on the bind hash list
		 * Make sure that we access only members of tcp_t
		 * and that we don't look at tcp_tcp, since we are not
		 * doing a CONN_INC_REF.
		 */
		tcp_bind_hash_remove(tcp);
		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
		mutex_enter(&tbf->tf_lock);
		/* Find the bucket entry, if any, already using this port. */
		for (ltcp = tbf->tf_tcp; ltcp != NULL;
		    ltcp = ltcp->tcp_bind_hash) {
			if (lport == ltcp->tcp_connp->conn_lport)
				break;
		}

		/*
		 * Walk every endpoint bound to this port.  "continue" means
		 * that endpoint does not conflict with us; "break" leaves
		 * ltcp non-NULL, which marks the port as busy.
		 */
		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
			boolean_t not_socket;
			boolean_t exclbind;

			lconnp = ltcp->tcp_connp;

			/*
			 * On a labeled system, we must treat bindings to ports
			 * on shared IP addresses by sockets with MAC exemption
			 * privilege as being in all zones, as there's
			 * otherwise no way to identify the right receiver.
			 */
			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
				continue;

			/*
			 * allow multiple interface-specific binds to coexist.
			 */
			if (connp->conn_incoming_ifindex !=
			    lconnp->conn_incoming_ifindex) {
				if ((connp->conn_incoming_ifindex != 0) &&
				    (lconnp->conn_incoming_ifindex != 0))
					continue;
			}

			/*
			 * If TCP_EXCLBIND is set for either the bound or
			 * binding endpoint, the semantics of bind
			 * is changed according to the following.
			 *
			 * spec = specified address (v4 or v6)
			 * unspec = unspecified address (v4 or v6)
			 * A = specified addresses are different for endpoints
			 *
			 * bound	bind to		allowed
			 * -------------------------------------
			 * unspec	unspec		no
			 * unspec	spec		no
			 * spec		unspec		no
			 * spec		spec		yes if A
			 *
			 * For labeled systems, SO_MAC_EXEMPT behaves the same
			 * as TCP_EXCLBIND, except that zoneid is ignored.
			 *
			 * Note:
			 *
			 * 1. Because of TLI semantics, an endpoint can go
			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
			 * TCPS_BOUND, depending on whether it is originally
			 * a listener or not.  That is why we need to check
			 * for states greater than or equal to TCPS_BOUND
			 * here.
			 *
			 * 2. Ideally, we should only check for state equals
			 * to TCPS_LISTEN. And the following check should be
			 * added.
			 *
			 * if (ltcp->tcp_state == TCPS_LISTEN ||
			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
			 *		...
			 * }
			 *
			 * The semantics will be changed to this.  If the
			 * endpoint on the list is in state not equal to
			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
			 * set, let the bind succeed.
			 *
			 * Because of (1), we cannot do that for TLI
			 * endpoints.  But we can do that for socket endpoints.
			 * If in future, we can change this going back
			 * semantics, we can use the above check for TLI also.
			 */
			not_socket = !(TCP_IS_SOCKET(ltcp) &&
			    TCP_IS_SOCKET(tcp));
			exclbind = lconnp->conn_exclbind ||
			    connp->conn_exclbind;

			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (exclbind && (not_socket ||
			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
				if (V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) ||
				    V6_OR_V4_INADDR_ANY(*laddr) ||
				    IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6)) {
					break;
				}
				continue;
			}

			/*
			 * Check ipversion to allow IPv4 and IPv6 sockets to
			 * have disjoint port number spaces, if *_EXCLBIND
			 * is not set and only if the application binds to a
			 * specific port. We use the same autoassigned port
			 * number space for IPv4 and IPv6 sockets.
			 */
			if (connp->conn_ipversion != lconnp->conn_ipversion &&
			    bind_to_req_port_only)
				continue;

			/*
			 * Ideally, we should make sure that the source
			 * address, remote address, and remote port in the
			 * four tuple for this tcp-connection is unique.
			 * However, trying to find out the local source
			 * address would require too much code duplication
			 * with IP, since IP needs to have that code
			 * to support userland TCP implementations.
			 */
			if (quick_connect &&
			    (ltcp->tcp_state > TCPS_LISTEN) &&
			    ((connp->conn_fport != lconnp->conn_fport) ||
			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
			    &lconnp->conn_faddr_v6)))
				continue;

			if (!reuseaddr) {
				/*
				 * No socket option SO_REUSEADDR.
				 * If existing port is bound to
				 * a non-wildcard IP address
				 * and the requesting stream is
				 * bound to a distinct
				 * different IP addresses
				 * (non-wildcard, also), keep
				 * going.
				 */
				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
				    !V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) &&
				    !IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6))
					continue;
				if (ltcp->tcp_state >= TCPS_BOUND) {
					/*
					 * This port is being used and
					 * its state is >= TCPS_BOUND,
					 * so we can't bind to it.
					 */
					break;
				}
			} else {
				/*
				 * socket option SO_REUSEADDR is set on the
				 * binding tcp_t.
				 *
				 * If two streams are bound to
				 * same IP address or both addr
				 * and bound source are wildcards
				 * (INADDR_ANY), we want to stop
				 * searching.
				 * We have found a match of IP source
				 * address and source port, which is
				 * refused regardless of the
				 * SO_REUSEADDR setting, so we break.
				 */
				if (IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6) &&
				    (ltcp->tcp_state == TCPS_LISTEN ||
				    ltcp->tcp_state == TCPS_BOUND))
					break;
			}
		}
		if (ltcp != NULL) {
			/* The port number is busy */
			mutex_exit(&tbf->tf_lock);
		} else {
			/*
			 * This port is ours. Insert in fanout and mark as
			 * bound to prevent others from getting the port
			 * number.
			 */
			tcp->tcp_state = TCPS_BOUND;
			DTRACE_TCP6(state__change, void, NULL,
			    ip_xmit_attr_t *, connp->conn_ixa,
			    void, NULL, tcp_t *, tcp, void, NULL,
			    int32_t, TCPS_IDLE);

			connp->conn_lport = htons(port);

			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
			    connp->conn_lport)] == tbf);
			/* Third argument: we already hold tbf->tf_lock. */
			tcp_bind_hash_insert(tbf, tcp, 1);

			mutex_exit(&tbf->tf_lock);

			/*
			 * We don't want tcp_next_port_to_try to "inherit"
			 * a port number supplied by the user in a bind.
			 */
			if (user_specified)
				return (port);

			/*
			 * This is the only place where tcp_next_port_to_try
			 * is updated. After the update, it may or may not
			 * be in the valid range.
			 */
			if (!connp->conn_anon_priv_bind)
				tcps->tcps_next_port_to_try = port + 1;
			return (port);
		}

		/* The candidate was busy; choose the next one to try. */
		if (connp->conn_anon_priv_bind) {
			port = tcp_get_next_priv_port(tcp);
		} else {
			if (count == 0 && user_specified) {
				/*
				 * We may have to return an anonymous port. So
				 * get one to start with.
				 */
				port =
				    tcp_update_next_port(
				    tcps->tcps_next_port_to_try,
				    tcp, B_TRUE);
				/* From here on the port is our own choice. */
				user_specified = B_FALSE;
			} else {
				port = tcp_update_next_port(port + 1, tcp,
				    B_FALSE);
			}
		}
		/* 0 from either helper means no candidate ports remain. */
		if (port == 0)
			break;

		/*
		 * Don't let this loop run forever in the case where
		 * all of the anonymous ports are in use.
		 */
	} while (++count < loopmax);
	return (0);
}
960