1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
25 */
26
27 #include <sys/types.h>
28 #include <sys/stream.h>
29 #include <sys/strsun.h>
30 #include <sys/strsubr.h>
31 #include <sys/stropts.h>
32 #include <sys/strlog.h>
33 #define _SUN_TPI_VERSION 2
34 #include <sys/tihdr.h>
35 #include <sys/suntpi.h>
36 #include <sys/xti_inet.h>
37 #include <sys/policy.h>
38 #include <sys/squeue_impl.h>
39 #include <sys/squeue.h>
40 #include <sys/tsol/tnet.h>
41
42 #include <rpc/pmap_prot.h>
43
44 #include <inet/common.h>
45 #include <inet/ip.h>
46 #include <inet/tcp.h>
47 #include <inet/tcp_impl.h>
48 #include <inet/proto_set.h>
49 #include <inet/ipsec_impl.h>
50
51 /* Setable in /etc/system */
52 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
53 static uint32_t tcp_random_anon_port = 1;
54
55 static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
56 cred_t *cr);
57 static in_port_t tcp_get_next_priv_port(const tcp_t *);
58
59 /*
60 * Hash list insertion routine for tcp_t structures. Each hash bucket
61 * contains a list of tcp_t entries, and each entry is bound to a unique
62 * port. If there are multiple tcp_t's that are bound to the same port, then
63 * one of them will be linked into the hash bucket list, and the rest will
64 * hang off of that one entry. For each port, entries bound to a specific IP
65 * address will be inserted before those those bound to INADDR_ANY.
66 */
67 void
tcp_bind_hash_insert(tf_t * tbf,tcp_t * tcp,int caller_holds_lock)68 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
69 {
70 tcp_t **tcpp;
71 tcp_t *tcpnext;
72 tcp_t *tcphash;
73 conn_t *connp = tcp->tcp_connp;
74 conn_t *connext;
75
76 if (tcp->tcp_ptpbhn != NULL) {
77 ASSERT(!caller_holds_lock);
78 tcp_bind_hash_remove(tcp);
79 }
80 tcpp = &tbf->tf_tcp;
81 if (!caller_holds_lock) {
82 mutex_enter(&tbf->tf_lock);
83 } else {
84 ASSERT(MUTEX_HELD(&tbf->tf_lock));
85 }
86 tcphash = tcpp[0];
87 tcpnext = NULL;
88 if (tcphash != NULL) {
89 /* Look for an entry using the same port */
90 while ((tcphash = tcpp[0]) != NULL &&
91 connp->conn_lport != tcphash->tcp_connp->conn_lport)
92 tcpp = &(tcphash->tcp_bind_hash);
93
94 /* The port was not found, just add to the end */
95 if (tcphash == NULL)
96 goto insert;
97
98 /*
99 * OK, there already exists an entry bound to the
100 * same port.
101 *
102 * If the new tcp bound to the INADDR_ANY address
103 * and the first one in the list is not bound to
104 * INADDR_ANY we skip all entries until we find the
105 * first one bound to INADDR_ANY.
106 * This makes sure that applications binding to a
107 * specific address get preference over those binding to
108 * INADDR_ANY.
109 */
110 tcpnext = tcphash;
111 connext = tcpnext->tcp_connp;
112 tcphash = NULL;
113 if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
114 !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
115 while ((tcpnext = tcpp[0]) != NULL) {
116 connext = tcpnext->tcp_connp;
117 if (!V6_OR_V4_INADDR_ANY(
118 connext->conn_bound_addr_v6))
119 tcpp = &(tcpnext->tcp_bind_hash_port);
120 else
121 break;
122 }
123 if (tcpnext != NULL) {
124 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
125 tcphash = tcpnext->tcp_bind_hash;
126 if (tcphash != NULL) {
127 tcphash->tcp_ptpbhn =
128 &(tcp->tcp_bind_hash);
129 tcpnext->tcp_bind_hash = NULL;
130 }
131 }
132 } else {
133 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
134 tcphash = tcpnext->tcp_bind_hash;
135 if (tcphash != NULL) {
136 tcphash->tcp_ptpbhn =
137 &(tcp->tcp_bind_hash);
138 tcpnext->tcp_bind_hash = NULL;
139 }
140 }
141 }
142 insert:
143 tcp->tcp_bind_hash_port = tcpnext;
144 tcp->tcp_bind_hash = tcphash;
145 tcp->tcp_ptpbhn = tcpp;
146 tcpp[0] = tcp;
147 if (!caller_holds_lock)
148 mutex_exit(&tbf->tf_lock);
149 }
150
151 /*
152 * Hash list removal routine for tcp_t structures.
153 */
154 void
tcp_bind_hash_remove(tcp_t * tcp)155 tcp_bind_hash_remove(tcp_t *tcp)
156 {
157 tcp_t *tcpnext;
158 kmutex_t *lockp;
159 tcp_stack_t *tcps = tcp->tcp_tcps;
160 conn_t *connp = tcp->tcp_connp;
161
162 if (tcp->tcp_ptpbhn == NULL)
163 return;
164
165 /*
166 * Extract the lock pointer in case there are concurrent
167 * hash_remove's for this instance.
168 */
169 ASSERT(connp->conn_lport != 0);
170 lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
171 connp->conn_lport)].tf_lock;
172
173 ASSERT(lockp != NULL);
174 mutex_enter(lockp);
175 if (tcp->tcp_ptpbhn) {
176 tcpnext = tcp->tcp_bind_hash_port;
177 if (tcpnext != NULL) {
178 tcp->tcp_bind_hash_port = NULL;
179 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
180 tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
181 if (tcpnext->tcp_bind_hash != NULL) {
182 tcpnext->tcp_bind_hash->tcp_ptpbhn =
183 &(tcpnext->tcp_bind_hash);
184 tcp->tcp_bind_hash = NULL;
185 }
186 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
187 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
188 tcp->tcp_bind_hash = NULL;
189 }
190 *tcp->tcp_ptpbhn = tcpnext;
191 tcp->tcp_ptpbhn = NULL;
192 }
193 mutex_exit(lockp);
194 }
195
196 /*
197 * Don't let port fall into the privileged range.
198 * Since the extra privileged ports can be arbitrary we also
199 * ensure that we exclude those from consideration.
200 * tcp_g_epriv_ports is not sorted thus we loop over it until
201 * there are no changes.
202 *
203 * Note: No locks are held when inspecting tcp_g_*epriv_ports
204 * but instead the code relies on:
205 * - the fact that the address of the array and its size never changes
206 * - the atomic assignment of the elements of the array
207 *
208 * Returns 0 if there are no more ports available.
209 *
210 * TS note: skip multilevel ports.
211 */
212 in_port_t
tcp_update_next_port(in_port_t port,const tcp_t * tcp,boolean_t random)213 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
214 {
215 int i, bump;
216 boolean_t restart = B_FALSE;
217 tcp_stack_t *tcps = tcp->tcp_tcps;
218
219 if (random && tcp_random_anon_port != 0) {
220 (void) random_get_pseudo_bytes((uint8_t *)&port,
221 sizeof (in_port_t));
222 /*
223 * Unless changed by a sys admin, the smallest anon port
224 * is 32768 and the largest anon port is 65535. It is
225 * very likely (50%) for the random port to be smaller
226 * than the smallest anon port. When that happens,
227 * add port % (anon port range) to the smallest anon
228 * port to get the random port. It should fall into the
229 * valid anon port range.
230 */
231 if ((port < tcps->tcps_smallest_anon_port) ||
232 (port > tcps->tcps_largest_anon_port)) {
233 if (tcps->tcps_smallest_anon_port ==
234 tcps->tcps_largest_anon_port) {
235 bump = 0;
236 } else {
237 bump = port % (tcps->tcps_largest_anon_port -
238 tcps->tcps_smallest_anon_port);
239 }
240 port = tcps->tcps_smallest_anon_port + bump;
241 }
242 }
243
244 retry:
245 if (port < tcps->tcps_smallest_anon_port)
246 port = (in_port_t)tcps->tcps_smallest_anon_port;
247
248 if (port > tcps->tcps_largest_anon_port) {
249 if (restart)
250 return (0);
251 restart = B_TRUE;
252 port = (in_port_t)tcps->tcps_smallest_anon_port;
253 }
254
255 if (port < tcps->tcps_smallest_nonpriv_port)
256 port = (in_port_t)tcps->tcps_smallest_nonpriv_port;
257
258 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
259 if (port == tcps->tcps_g_epriv_ports[i]) {
260 port++;
261 /*
262 * Make sure whether the port is in the
263 * valid range.
264 */
265 goto retry;
266 }
267 }
268 if (is_system_labeled() &&
269 (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
270 IPPROTO_TCP, B_TRUE)) != 0) {
271 port = i;
272 goto retry;
273 }
274 return (port);
275 }
276
277 /*
278 * Return the next anonymous port in the privileged port range for
279 * bind checking. It starts at IPPORT_RESERVED - 1 and goes
280 * downwards. This is the same behavior as documented in the userland
281 * library call rresvport(3N).
282 *
283 * TS note: skip multilevel ports.
284 */
285 static in_port_t
tcp_get_next_priv_port(const tcp_t * tcp)286 tcp_get_next_priv_port(const tcp_t *tcp)
287 {
288 static in_port_t next_priv_port = IPPORT_RESERVED - 1;
289 in_port_t nextport;
290 boolean_t restart = B_FALSE;
291 tcp_stack_t *tcps = tcp->tcp_tcps;
292 retry:
293 if (next_priv_port < tcps->tcps_min_anonpriv_port ||
294 next_priv_port >= IPPORT_RESERVED) {
295 next_priv_port = IPPORT_RESERVED - 1;
296 if (restart)
297 return (0);
298 restart = B_TRUE;
299 }
300 if (is_system_labeled() &&
301 (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
302 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
303 next_priv_port = nextport;
304 goto retry;
305 }
306 return (next_priv_port--);
307 }
308
309 static int
tcp_bind_select_lport(tcp_t * tcp,in_port_t * requested_port_ptr,boolean_t bind_to_req_port_only,cred_t * cr)310 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
311 boolean_t bind_to_req_port_only, cred_t *cr)
312 {
313 in_port_t mlp_port;
314 mlp_type_t addrtype, mlptype;
315 boolean_t user_specified;
316 in_port_t allocated_port;
317 in_port_t requested_port = *requested_port_ptr;
318 conn_t *connp = tcp->tcp_connp;
319 zone_t *zone;
320 tcp_stack_t *tcps = tcp->tcp_tcps;
321 in6_addr_t v6addr = connp->conn_laddr_v6;
322
323 /*
324 * XXX It's up to the caller to specify bind_to_req_port_only or not.
325 */
326 ASSERT(cr != NULL);
327
328 /*
329 * Get a valid port (within the anonymous range and should not
330 * be a privileged one) to use if the user has not given a port.
331 * If multiple threads are here, they may all start with
332 * with the same initial port. But, it should be fine as long as
333 * tcp_bindi will ensure that no two threads will be assigned
334 * the same port.
335 *
336 * NOTE: XXX If a privileged process asks for an anonymous port, we
337 * still check for ports only in the range > tcp_smallest_non_priv_port,
338 * unless TCP_ANONPRIVBIND option is set.
339 */
340 mlptype = mlptSingle;
341 mlp_port = requested_port;
342 if (requested_port == 0) {
343 requested_port = connp->conn_anon_priv_bind ?
344 tcp_get_next_priv_port(tcp) :
345 tcp_update_next_port(tcps->tcps_next_port_to_try,
346 tcp, B_TRUE);
347 if (requested_port == 0) {
348 return (-TNOADDR);
349 }
350 user_specified = B_FALSE;
351
352 /*
353 * If the user went through one of the RPC interfaces to create
354 * this socket and RPC is MLP in this zone, then give him an
355 * anonymous MLP.
356 */
357 if (connp->conn_anon_mlp && is_system_labeled()) {
358 zone = crgetzone(cr);
359 addrtype = tsol_mlp_addr_type(
360 connp->conn_allzones ? ALL_ZONES : zone->zone_id,
361 IPV6_VERSION, &v6addr,
362 tcps->tcps_netstack->netstack_ip);
363 if (addrtype == mlptSingle) {
364 return (-TNOADDR);
365 }
366 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
367 PMAPPORT, addrtype);
368 mlp_port = PMAPPORT;
369 }
370 } else {
371 int i;
372 boolean_t priv = B_FALSE;
373
374 /*
375 * If the requested_port is in the well-known privileged range,
376 * verify that the stream was opened by a privileged user.
377 * Note: No locks are held when inspecting tcp_g_*epriv_ports
378 * but instead the code relies on:
379 * - the fact that the address of the array and its size never
380 * changes
381 * - the atomic assignment of the elements of the array
382 */
383 if (requested_port < tcps->tcps_smallest_nonpriv_port) {
384 priv = B_TRUE;
385 } else {
386 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
387 if (requested_port ==
388 tcps->tcps_g_epriv_ports[i]) {
389 priv = B_TRUE;
390 break;
391 }
392 }
393 }
394 if (priv) {
395 if (secpolicy_net_privaddr(cr, requested_port,
396 IPPROTO_TCP) != 0) {
397 if (connp->conn_debug) {
398 (void) strlog(TCP_MOD_ID, 0, 1,
399 SL_ERROR|SL_TRACE,
400 "tcp_bind: no priv for port %d",
401 requested_port);
402 }
403 return (-TACCES);
404 }
405 }
406 user_specified = B_TRUE;
407
408 connp = tcp->tcp_connp;
409 if (is_system_labeled()) {
410 zone = crgetzone(cr);
411 addrtype = tsol_mlp_addr_type(
412 connp->conn_allzones ? ALL_ZONES : zone->zone_id,
413 IPV6_VERSION, &v6addr,
414 tcps->tcps_netstack->netstack_ip);
415 if (addrtype == mlptSingle) {
416 return (-TNOADDR);
417 }
418 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
419 requested_port, addrtype);
420 }
421 }
422
423 if (mlptype != mlptSingle) {
424 if (secpolicy_net_bindmlp(cr) != 0) {
425 if (connp->conn_debug) {
426 (void) strlog(TCP_MOD_ID, 0, 1,
427 SL_ERROR|SL_TRACE,
428 "tcp_bind: no priv for multilevel port %d",
429 requested_port);
430 }
431 return (-TACCES);
432 }
433
434 /*
435 * If we're specifically binding a shared IP address and the
436 * port is MLP on shared addresses, then check to see if this
437 * zone actually owns the MLP. Reject if not.
438 */
439 if (mlptype == mlptShared && addrtype == mlptShared) {
440 /*
441 * No need to handle exclusive-stack zones since
442 * ALL_ZONES only applies to the shared stack.
443 */
444 zoneid_t mlpzone;
445
446 mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
447 htons(mlp_port));
448 if (connp->conn_zoneid != mlpzone) {
449 if (connp->conn_debug) {
450 (void) strlog(TCP_MOD_ID, 0, 1,
451 SL_ERROR|SL_TRACE,
452 "tcp_bind: attempt to bind port "
453 "%d on shared addr in zone %d "
454 "(should be %d)",
455 mlp_port, connp->conn_zoneid,
456 mlpzone);
457 }
458 return (-TACCES);
459 }
460 }
461
462 if (!user_specified) {
463 int err;
464 err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
465 requested_port, B_TRUE);
466 if (err != 0) {
467 if (connp->conn_debug) {
468 (void) strlog(TCP_MOD_ID, 0, 1,
469 SL_ERROR|SL_TRACE,
470 "tcp_bind: cannot establish anon "
471 "MLP for port %d",
472 requested_port);
473 }
474 return (err);
475 }
476 connp->conn_anon_port = B_TRUE;
477 }
478 connp->conn_mlp_type = mlptype;
479 }
480
481 allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
482 connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
483 user_specified);
484
485 if (allocated_port == 0) {
486 connp->conn_mlp_type = mlptSingle;
487 if (connp->conn_anon_port) {
488 connp->conn_anon_port = B_FALSE;
489 (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
490 requested_port, B_FALSE);
491 }
492 if (bind_to_req_port_only) {
493 if (connp->conn_debug) {
494 (void) strlog(TCP_MOD_ID, 0, 1,
495 SL_ERROR|SL_TRACE,
496 "tcp_bind: requested addr busy");
497 }
498 return (-TADDRBUSY);
499 } else {
500 /* If we are out of ports, fail the bind. */
501 if (connp->conn_debug) {
502 (void) strlog(TCP_MOD_ID, 0, 1,
503 SL_ERROR|SL_TRACE,
504 "tcp_bind: out of ports?");
505 }
506 return (-TNOADDR);
507 }
508 }
509
510 /* Pass the allocated port back */
511 *requested_port_ptr = allocated_port;
512 return (0);
513 }
514
515 /*
516 * Check the address and check/pick a local port number.
517 */
518 int
tcp_bind_check(conn_t * connp,struct sockaddr * sa,socklen_t len,cred_t * cr,boolean_t bind_to_req_port_only)519 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
520 boolean_t bind_to_req_port_only)
521 {
522 tcp_t *tcp = connp->conn_tcp;
523 sin_t *sin;
524 sin6_t *sin6;
525 in_port_t requested_port;
526 ipaddr_t v4addr;
527 in6_addr_t v6addr;
528 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */
529 zoneid_t zoneid = IPCL_ZONEID(connp);
530 ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
531 uint_t scopeid = 0;
532 int error = 0;
533 ip_xmit_attr_t *ixa = connp->conn_ixa;
534
535 ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);
536
537 if (tcp->tcp_state == TCPS_BOUND) {
538 return (0);
539 } else if (tcp->tcp_state > TCPS_BOUND) {
540 if (connp->conn_debug) {
541 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
542 "tcp_bind: bad state, %d", tcp->tcp_state);
543 }
544 return (-TOUTSTATE);
545 }
546
547 ASSERT(sa != NULL && len != 0);
548
549 if (!OK_32PTR((char *)sa)) {
550 if (connp->conn_debug) {
551 (void) strlog(TCP_MOD_ID, 0, 1,
552 SL_ERROR|SL_TRACE,
553 "tcp_bind: bad address parameter, "
554 "address %p, len %d",
555 (void *)sa, len);
556 }
557 return (-TPROTO);
558 }
559
560 error = proto_verify_ip_addr(connp->conn_family, sa, len);
561 if (error != 0) {
562 return (error);
563 }
564
565 switch (len) {
566 case sizeof (sin_t): /* Complete IPv4 address */
567 sin = (sin_t *)sa;
568 requested_port = ntohs(sin->sin_port);
569 v4addr = sin->sin_addr.s_addr;
570 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
571 if (v4addr != INADDR_ANY) {
572 laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
573 B_FALSE);
574 }
575 break;
576
577 case sizeof (sin6_t): /* Complete IPv6 address */
578 sin6 = (sin6_t *)sa;
579 v6addr = sin6->sin6_addr;
580 requested_port = ntohs(sin6->sin6_port);
581 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
582 if (connp->conn_ipv6_v6only)
583 return (EADDRNOTAVAIL);
584
585 IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
586 if (v4addr != INADDR_ANY) {
587 laddr_type = ip_laddr_verify_v4(v4addr,
588 zoneid, ipst, B_FALSE);
589 }
590 } else {
591 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
592 if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
593 scopeid = sin6->sin6_scope_id;
594 laddr_type = ip_laddr_verify_v6(&v6addr,
595 zoneid, ipst, B_FALSE, scopeid);
596 }
597 }
598 break;
599
600 default:
601 if (connp->conn_debug) {
602 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
603 "tcp_bind: bad address length, %d", len);
604 }
605 return (EAFNOSUPPORT);
606 /* return (-TBADADDR); */
607 }
608
609 /* Is the local address a valid unicast address? */
610 if (laddr_type == IPVL_BAD)
611 return (EADDRNOTAVAIL);
612
613 connp->conn_bound_addr_v6 = v6addr;
614 if (scopeid != 0) {
615 ixa->ixa_flags |= IXAF_SCOPEID_SET;
616 ixa->ixa_scopeid = scopeid;
617 connp->conn_incoming_ifindex = scopeid;
618 } else {
619 ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
620 connp->conn_incoming_ifindex = connp->conn_bound_if;
621 }
622
623 connp->conn_laddr_v6 = v6addr;
624 connp->conn_saddr_v6 = v6addr;
625
626 bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;
627
628 error = tcp_bind_select_lport(tcp, &requested_port,
629 bind_to_req_port_only, cr);
630 if (error != 0) {
631 connp->conn_laddr_v6 = ipv6_all_zeros;
632 connp->conn_saddr_v6 = ipv6_all_zeros;
633 connp->conn_bound_addr_v6 = ipv6_all_zeros;
634 }
635 return (error);
636 }
637
638 /*
639 * If the "bind_to_req_port_only" parameter is set, if the requested port
640 * number is available, return it, If not return 0
641 *
642 * If "bind_to_req_port_only" parameter is not set and
643 * If the requested port number is available, return it. If not, return
644 * the first anonymous port we happen across. If no anonymous ports are
645 * available, return 0. addr is the requested local address, if any.
646 *
647 * In either case, when succeeding update the tcp_t to record the port number
648 * and insert it in the bind hash table.
649 *
650 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
651 * without setting SO_REUSEADDR. This is needed so that they
652 * can be viewed as two independent transport protocols.
653 */
654 in_port_t
tcp_bindi(tcp_t * tcp,in_port_t port,const in6_addr_t * laddr,int reuseaddr,boolean_t quick_connect,boolean_t bind_to_req_port_only,boolean_t user_specified)655 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
656 int reuseaddr, boolean_t quick_connect,
657 boolean_t bind_to_req_port_only, boolean_t user_specified)
658 {
659 /* number of times we have run around the loop */
660 int count = 0;
661 /* maximum number of times to run around the loop */
662 int loopmax;
663 conn_t *connp = tcp->tcp_connp;
664 tcp_stack_t *tcps = tcp->tcp_tcps;
665
666 /*
667 * Lookup for free addresses is done in a loop and "loopmax"
668 * influences how long we spin in the loop
669 */
670 if (bind_to_req_port_only) {
671 /*
672 * If the requested port is busy, don't bother to look
673 * for a new one. Setting loop maximum count to 1 has
674 * that effect.
675 */
676 loopmax = 1;
677 } else {
678 /*
679 * If the requested port is busy, look for a free one
680 * in the anonymous port range.
681 * Set loopmax appropriately so that one does not look
682 * forever in the case all of the anonymous ports are in use.
683 */
684 if (connp->conn_anon_priv_bind) {
685 /*
686 * loopmax =
687 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
688 */
689 loopmax = IPPORT_RESERVED -
690 tcps->tcps_min_anonpriv_port;
691 } else {
692 loopmax = (tcps->tcps_largest_anon_port -
693 tcps->tcps_smallest_anon_port + 1);
694 }
695 }
696 do {
697 uint16_t lport;
698 tf_t *tbf;
699 tcp_t *ltcp;
700 conn_t *lconnp;
701
702 lport = htons(port);
703
704 /*
705 * Ensure that the tcp_t is not currently in the bind hash.
706 * Hold the lock on the hash bucket to ensure that
707 * the duplicate check plus the insertion is an atomic
708 * operation.
709 *
710 * This function does an inline lookup on the bind hash list
711 * Make sure that we access only members of tcp_t
712 * and that we don't look at tcp_tcp, since we are not
713 * doing a CONN_INC_REF.
714 */
715 tcp_bind_hash_remove(tcp);
716 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
717 mutex_enter(&tbf->tf_lock);
718 for (ltcp = tbf->tf_tcp; ltcp != NULL;
719 ltcp = ltcp->tcp_bind_hash) {
720 if (lport == ltcp->tcp_connp->conn_lport)
721 break;
722 }
723
724 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
725 boolean_t not_socket;
726 boolean_t exclbind;
727
728 lconnp = ltcp->tcp_connp;
729
730 /*
731 * On a labeled system, we must treat bindings to ports
732 * on shared IP addresses by sockets with MAC exemption
733 * privilege as being in all zones, as there's
734 * otherwise no way to identify the right receiver.
735 */
736 if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
737 continue;
738
739 /*
740 * If TCP_EXCLBIND is set for either the bound or
741 * binding endpoint, the semantics of bind
742 * is changed according to the following.
743 *
744 * spec = specified address (v4 or v6)
745 * unspec = unspecified address (v4 or v6)
746 * A = specified addresses are different for endpoints
747 *
748 * bound bind to allowed
749 * -------------------------------------
750 * unspec unspec no
751 * unspec spec no
752 * spec unspec no
753 * spec spec yes if A
754 *
755 * For labeled systems, SO_MAC_EXEMPT behaves the same
756 * as TCP_EXCLBIND, except that zoneid is ignored.
757 *
758 * Note:
759 *
760 * 1. Because of TLI semantics, an endpoint can go
761 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
762 * TCPS_BOUND, depending on whether it is originally
763 * a listener or not. That is why we need to check
764 * for states greater than or equal to TCPS_BOUND
765 * here.
766 *
767 * 2. Ideally, we should only check for state equals
768 * to TCPS_LISTEN. And the following check should be
769 * added.
770 *
771 * if (ltcp->tcp_state == TCPS_LISTEN ||
772 * !reuseaddr || !lconnp->conn_reuseaddr) {
773 * ...
774 * }
775 *
776 * The semantics will be changed to this. If the
777 * endpoint on the list is in state not equal to
778 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
779 * set, let the bind succeed.
780 *
781 * Because of (1), we cannot do that for TLI
782 * endpoints. But we can do that for socket endpoints.
783 * If in future, we can change this going back
784 * semantics, we can use the above check for TLI also.
785 */
786 not_socket = !(TCP_IS_SOCKET(ltcp) &&
787 TCP_IS_SOCKET(tcp));
788 exclbind = lconnp->conn_exclbind ||
789 connp->conn_exclbind;
790
791 if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
792 (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
793 (exclbind && (not_socket ||
794 ltcp->tcp_state <= TCPS_ESTABLISHED))) {
795 if (V6_OR_V4_INADDR_ANY(
796 lconnp->conn_bound_addr_v6) ||
797 V6_OR_V4_INADDR_ANY(*laddr) ||
798 IN6_ARE_ADDR_EQUAL(laddr,
799 &lconnp->conn_bound_addr_v6)) {
800 break;
801 }
802 continue;
803 }
804
805 /*
806 * Check ipversion to allow IPv4 and IPv6 sockets to
807 * have disjoint port number spaces, if *_EXCLBIND
808 * is not set and only if the application binds to a
809 * specific port. We use the same autoassigned port
810 * number space for IPv4 and IPv6 sockets.
811 */
812 if (connp->conn_ipversion != lconnp->conn_ipversion &&
813 bind_to_req_port_only)
814 continue;
815
816 /*
817 * Ideally, we should make sure that the source
818 * address, remote address, and remote port in the
819 * four tuple for this tcp-connection is unique.
820 * However, trying to find out the local source
821 * address would require too much code duplication
822 * with IP, since IP needs needs to have that code
823 * to support userland TCP implementations.
824 */
825 if (quick_connect &&
826 (ltcp->tcp_state > TCPS_LISTEN) &&
827 ((connp->conn_fport != lconnp->conn_fport) ||
828 !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
829 &lconnp->conn_faddr_v6)))
830 continue;
831
832 if (!reuseaddr) {
833 /*
834 * No socket option SO_REUSEADDR.
835 * If existing port is bound to
836 * a non-wildcard IP address
837 * and the requesting stream is
838 * bound to a distinct
839 * different IP addresses
840 * (non-wildcard, also), keep
841 * going.
842 */
843 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
844 !V6_OR_V4_INADDR_ANY(
845 lconnp->conn_bound_addr_v6) &&
846 !IN6_ARE_ADDR_EQUAL(laddr,
847 &lconnp->conn_bound_addr_v6))
848 continue;
849 if (ltcp->tcp_state >= TCPS_BOUND) {
850 /*
851 * This port is being used and
852 * its state is >= TCPS_BOUND,
853 * so we can't bind to it.
854 */
855 break;
856 }
857 } else {
858 /*
859 * socket option SO_REUSEADDR is set on the
860 * binding tcp_t.
861 *
862 * If two streams are bound to
863 * same IP address or both addr
864 * and bound source are wildcards
865 * (INADDR_ANY), we want to stop
866 * searching.
867 * We have found a match of IP source
868 * address and source port, which is
869 * refused regardless of the
870 * SO_REUSEADDR setting, so we break.
871 */
872 if (IN6_ARE_ADDR_EQUAL(laddr,
873 &lconnp->conn_bound_addr_v6) &&
874 (ltcp->tcp_state == TCPS_LISTEN ||
875 ltcp->tcp_state == TCPS_BOUND))
876 break;
877 }
878 }
879 if (ltcp != NULL) {
880 /* The port number is busy */
881 mutex_exit(&tbf->tf_lock);
882 } else {
883 /*
884 * This port is ours. Insert in fanout and mark as
885 * bound to prevent others from getting the port
886 * number.
887 */
888 tcp->tcp_state = TCPS_BOUND;
889 DTRACE_TCP6(state__change, void, NULL,
890 ip_xmit_attr_t *, connp->conn_ixa,
891 void, NULL, tcp_t *, tcp, void, NULL,
892 int32_t, TCPS_IDLE);
893
894 connp->conn_lport = htons(port);
895
896 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
897 connp->conn_lport)] == tbf);
898 tcp_bind_hash_insert(tbf, tcp, 1);
899
900 mutex_exit(&tbf->tf_lock);
901
902 /*
903 * We don't want tcp_next_port_to_try to "inherit"
904 * a port number supplied by the user in a bind.
905 */
906 if (user_specified)
907 return (port);
908
909 /*
910 * This is the only place where tcp_next_port_to_try
911 * is updated. After the update, it may or may not
912 * be in the valid range.
913 */
914 if (!connp->conn_anon_priv_bind)
915 tcps->tcps_next_port_to_try = port + 1;
916 return (port);
917 }
918
919 if (connp->conn_anon_priv_bind) {
920 port = tcp_get_next_priv_port(tcp);
921 } else {
922 if (count == 0 && user_specified) {
923 /*
924 * We may have to return an anonymous port. So
925 * get one to start with.
926 */
927 port =
928 tcp_update_next_port(
929 tcps->tcps_next_port_to_try,
930 tcp, B_TRUE);
931 user_specified = B_FALSE;
932 } else {
933 port = tcp_update_next_port(port + 1, tcp,
934 B_FALSE);
935 }
936 }
937 if (port == 0)
938 break;
939
940 /*
941 * Don't let this loop run forever in the case where
942 * all of the anonymous ports are in use.
943 */
944 } while (++count < loopmax);
945 return (0);
946 }
947