1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2016 by Delphix. All rights reserved. 26 * Copyright 2024 Bill Sommerfeld <sommerfeld@hamachi.org> 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/strsun.h> 32 #include <sys/strsubr.h> 33 #include <sys/stropts.h> 34 #include <sys/strlog.h> 35 #define _SUN_TPI_VERSION 2 36 #include <sys/tihdr.h> 37 #include <sys/suntpi.h> 38 #include <sys/xti_inet.h> 39 #include <sys/policy.h> 40 #include <sys/squeue_impl.h> 41 #include <sys/squeue.h> 42 #include <sys/tsol/tnet.h> 43 44 #include <rpc/pmap_prot.h> 45 46 #include <inet/common.h> 47 #include <inet/ip.h> 48 #include <inet/tcp.h> 49 #include <inet/tcp_impl.h> 50 #include <inet/proto_set.h> 51 #include <inet/ipsec_impl.h> 52 53 /* Setable in /etc/system */ 54 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. 
 */
static uint32_t tcp_random_anon_port = 1;

static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
    cred_t *cr);
static in_port_t tcp_get_next_priv_port(const tcp_t *);

/*
 * Hash list insertion routine for tcp_t structures. Each hash bucket
 * contains a list of tcp_t entries, and each entry is bound to a unique
 * port. If there are multiple tcp_t's that are bound to the same port, then
 * one of them will be linked into the hash bucket list, and the rest will
 * hang off of that one entry. For each port, entries bound to a specific IP
 * address will be inserted before those bound to INADDR_ANY.
 *
 * If 'caller_holds_lock' is non-zero, the caller must already hold
 * tbf->tf_lock; otherwise the lock is acquired (and dropped) here.  A
 * tcp_t that is already hashed is first removed from its current bucket.
 */
void
tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
{
	tcp_t **tcpp;
	tcp_t *tcpnext;
	tcp_t *tcphash;
	conn_t *connp = tcp->tcp_connp;
	conn_t *connext;

	if (tcp->tcp_ptpbhn != NULL) {
		ASSERT(!caller_holds_lock);
		tcp_bind_hash_remove(tcp);
	}
	tcpp = &tbf->tf_tcp;
	if (!caller_holds_lock) {
		mutex_enter(&tbf->tf_lock);
	} else {
		ASSERT(MUTEX_HELD(&tbf->tf_lock));
	}
	tcphash = tcpp[0];
	tcpnext = NULL;
	if (tcphash != NULL) {
		/* Look for an entry using the same port */
		while ((tcphash = tcpp[0]) != NULL &&
		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
			tcpp = &(tcphash->tcp_bind_hash);

		/* The port was not found, just add to the end */
		if (tcphash == NULL)
			goto insert;

		/*
		 * OK, there already exists an entry bound to the
		 * same port.
		 *
		 * If the new tcp bound to the INADDR_ANY address
		 * and the first one in the list is not bound to
		 * INADDR_ANY we skip all entries until we find the
		 * first one bound to INADDR_ANY.
		 * This makes sure that applications binding to a
		 * specific address get preference over those binding to
		 * INADDR_ANY.
		 */
		tcpnext = tcphash;
		connext = tcpnext->tcp_connp;
		tcphash = NULL;
		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
			while ((tcpnext = tcpp[0]) != NULL) {
				connext = tcpnext->tcp_connp;
				if (!V6_OR_V4_INADDR_ANY(
				    connext->conn_bound_addr_v6))
					tcpp = &(tcpnext->tcp_bind_hash_port);
				else
					break;
			}
			if (tcpnext != NULL) {
				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
				tcphash = tcpnext->tcp_bind_hash;
				if (tcphash != NULL) {
					tcphash->tcp_ptpbhn =
					    &(tcp->tcp_bind_hash);
					tcpnext->tcp_bind_hash = NULL;
				}
			}
		} else {
			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
			tcphash = tcpnext->tcp_bind_hash;
			if (tcphash != NULL) {
				tcphash->tcp_ptpbhn =
				    &(tcp->tcp_bind_hash);
				tcpnext->tcp_bind_hash = NULL;
			}
		}
	}
insert:
	tcp->tcp_bind_hash_port = tcpnext;
	tcp->tcp_bind_hash = tcphash;
	tcp->tcp_ptpbhn = tcpp;
	tcpp[0] = tcp;
	if (!caller_holds_lock)
		mutex_exit(&tbf->tf_lock);
}

/*
 * Hash list removal routine for tcp_t structures.  A no-op if the tcp_t
 * is not currently inserted (tcp_ptpbhn == NULL).  If this entry heads a
 * per-port chain, the next entry bound to the same port is promoted into
 * its place in the bucket list.
 */
void
tcp_bind_hash_remove(tcp_t *tcp)
{
	tcp_t *tcpnext;
	kmutex_t *lockp;
	tcp_stack_t *tcps = tcp->tcp_tcps;
	conn_t *connp = tcp->tcp_connp;

	if (tcp->tcp_ptpbhn == NULL)
		return;

	/*
	 * Extract the lock pointer in case there are concurrent
	 * hash_remove's for this instance.
	 */
	ASSERT(connp->conn_lport != 0);
	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
	    connp->conn_lport)].tf_lock;

	ASSERT(lockp != NULL);
	mutex_enter(lockp);
	if (tcp->tcp_ptpbhn) {
		tcpnext = tcp->tcp_bind_hash_port;
		if (tcpnext != NULL) {
			tcp->tcp_bind_hash_port = NULL;
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
			if (tcpnext->tcp_bind_hash != NULL) {
				tcpnext->tcp_bind_hash->tcp_ptpbhn =
				    &(tcpnext->tcp_bind_hash);
				tcp->tcp_bind_hash = NULL;
			}
		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcp->tcp_bind_hash = NULL;
		}
		*tcp->tcp_ptpbhn = tcpnext;
		tcp->tcp_ptpbhn = NULL;
	}
	mutex_exit(lockp);
}

/*
 * Don't let port fall into the privileged range.
 * Since the extra privileged ports can be arbitrary we also
 * ensure that we exclude those from consideration.
 * tcp_g_epriv_ports is not sorted thus we loop over it until
 * there are no changes.
 *
 * Note: No locks are held when inspecting tcp_g_*epriv_ports
 * but instead the code relies on:
 * - the fact that the address of the array and its size never changes
 * - the atomic assignment of the elements of the array
 *
 * The port argument and the return value are in host byte order.
 * Returns 0 if there are no more ports available.
 *
 * TS note: skip multilevel ports.
 */
in_port_t
tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
{
	int i, bump;
	boolean_t restart = B_FALSE;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	if (random && tcp_random_anon_port != 0) {
		(void) random_get_pseudo_bytes((uint8_t *)&port,
		    sizeof (in_port_t));
		/*
		 * Unless changed by a sys admin, the smallest anon port
		 * is 32768 and the largest anon port is 65535.  It is
		 * very likely (50%) for the random port to be smaller
		 * than the smallest anon port.  When that happens,
		 * add port % (anon port range) to the smallest anon
		 * port to get the random port.  It should fall into the
		 * valid anon port range.
		 */
		if ((port < tcps->tcps_smallest_anon_port) ||
		    (port > tcps->tcps_largest_anon_port)) {
			if (tcps->tcps_smallest_anon_port ==
			    tcps->tcps_largest_anon_port) {
				bump = 0;
			} else {
				bump = port % (tcps->tcps_largest_anon_port -
				    tcps->tcps_smallest_anon_port);
			}
			port = tcps->tcps_smallest_anon_port + bump;
		}
	}

retry:
	if (port < tcps->tcps_smallest_anon_port)
		port = (in_port_t)tcps->tcps_smallest_anon_port;

	if (port > tcps->tcps_largest_anon_port) {
		if (restart)
			return (0);
		restart = B_TRUE;
		port = (in_port_t)tcps->tcps_smallest_anon_port;
	}

	if (port < tcps->tcps_smallest_nonpriv_port)
		port = (in_port_t)tcps->tcps_smallest_nonpriv_port;

	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
		if (port == tcps->tcps_g_epriv_ports[i]) {
			port++;
			/*
			 * Make sure whether the port is in the
			 * valid range.
			 */
			goto retry;
		}
	}
	if (is_system_labeled() &&
	    (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
	    IPPROTO_TCP, B_TRUE)) != 0) {
		port = i;
		goto retry;
	}
	return (port);
}

/*
 * Return the next anonymous port in the privileged port range for
 * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
 * downwards.  This is the same behavior as documented in the userland
 * library call rresvport(3SOCKET).
 *
 * Returns 0 if the privileged range has been exhausted.
 *
 * TS note: skip multilevel ports.
 */
static in_port_t
tcp_get_next_priv_port(const tcp_t *tcp)
{
	/*
	 * NOTE(review): next_priv_port is file-static state shared by all
	 * callers and is read/updated without a lock, so concurrent callers
	 * may be handed the same port; tcp_bindi() is expected to resolve
	 * any resulting collision -- confirm this is the intended
	 * serialization.
	 */
	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
	in_port_t nextport;
	boolean_t restart = B_FALSE;
	tcp_stack_t *tcps = tcp->tcp_tcps;
retry:
	if (next_priv_port < tcps->tcps_min_anonpriv_port ||
	    next_priv_port >= IPPORT_RESERVED) {
		next_priv_port = IPPORT_RESERVED - 1;
		if (restart)
			return (0);
		restart = B_TRUE;
	}
	if (is_system_labeled() &&
	    (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
	    next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
		next_priv_port = nextport;
		goto retry;
	}
	return (next_priv_port--);
}

/*
 * Select and reserve a local port for the given tcp_t.  On entry
 * *requested_port_ptr holds the port requested by the user in host byte
 * order (0 means "pick an anonymous port").  On success, 0 is returned
 * and *requested_port_ptr is updated with the allocated port.  On
 * failure a negative TPI error code (-TNOADDR, -TACCES, or -TADDRBUSY)
 * is returned.  On labeled systems this also enforces Trusted Extensions
 * multilevel-port (MLP) policy.
 */
static int
tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
    boolean_t bind_to_req_port_only, cred_t *cr)
{
	in_port_t mlp_port;
	mlp_type_t addrtype, mlptype;
	boolean_t user_specified;
	in_port_t allocated_port;
	in_port_t requested_port = *requested_port_ptr;
	conn_t *connp = tcp->tcp_connp;
	zone_t *zone;
	tcp_stack_t *tcps = tcp->tcp_tcps;
	in6_addr_t v6addr = connp->conn_laddr_v6;

	zone = NULL;
	/*
	 * XXX It's up to the caller to specify bind_to_req_port_only or not.
	 */
	ASSERT(cr != NULL);

	/*
	 * Get a valid port (within the anonymous range and should not
	 * be a privileged one) to use if the user has not given a port.
	 * If multiple threads are here, they may all start with
	 * with the same initial port. But, it should be fine as long as
	 * tcp_bindi will ensure that no two threads will be assigned
	 * the same port.
	 *
	 * NOTE: XXX If a privileged process asks for an anonymous port, we
	 * still check for ports only in the range > tcp_smallest_non_priv_port,
	 * unless TCP_ANONPRIVBIND option is set.
	 */
	mlptype = mlptSingle;
	mlp_port = requested_port;
	if (requested_port == 0) {
		requested_port = connp->conn_anon_priv_bind ?
		    tcp_get_next_priv_port(tcp) :
		    tcp_update_next_port(tcps->tcps_next_port_to_try,
		    tcp, B_TRUE);
		if (requested_port == 0) {
			return (-TNOADDR);
		}
		user_specified = B_FALSE;

		/*
		 * If the user went through one of the RPC interfaces to create
		 * this socket and RPC is MLP in this zone, then give them an
		 * anonymous MLP.
		 */
		if (connp->conn_anon_mlp && is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    PMAPPORT, addrtype);
			mlp_port = PMAPPORT;
		}
	} else {
		int i;
		boolean_t priv = B_FALSE;

		/*
		 * If the requested_port is in the well-known privileged range,
		 * verify that the stream was opened by a privileged user.
		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
		 * but instead the code relies on:
		 * - the fact that the address of the array and its size never
		 *   changes
		 * - the atomic assignment of the elements of the array
		 */
		if (requested_port < tcps->tcps_smallest_nonpriv_port) {
			priv = B_TRUE;
		} else {
			for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
				if (requested_port ==
				    tcps->tcps_g_epriv_ports[i]) {
					priv = B_TRUE;
					break;
				}
			}
		}
		if (priv) {
			if (secpolicy_net_privaddr(cr, requested_port,
			    IPPROTO_TCP) != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: no priv for port %d",
					    requested_port);
				}
				return (-TACCES);
			}
		}
		user_specified = B_TRUE;

		connp = tcp->tcp_connp;
		if (is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    requested_port, addrtype);
		}
	}

	if (mlptype != mlptSingle) {
		if (secpolicy_net_bindmlp(cr) != 0) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: no priv for multilevel port %d",
				    requested_port);
			}
			return (-TACCES);
		}

		/*
		 * If we're specifically binding a shared IP address and the
		 * port is MLP on shared addresses, then check to see if this
		 * zone actually owns the MLP.  Reject if not.
		 */
		if (mlptype == mlptShared && addrtype == mlptShared) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid_t mlpzone;

			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
			    htons(mlp_port));
			if (connp->conn_zoneid != mlpzone) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: attempt to bind port "
					    "%d on shared addr in zone %d "
					    "(should be %d)",
					    mlp_port, connp->conn_zoneid,
					    mlpzone);
				}
				return (-TACCES);
			}
		}

		if (!user_specified) {
			int err;
			err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_TRUE);
			if (err != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: cannot establish anon "
					    "MLP for port %d",
					    requested_port);
				}
				return (err);
			}
			connp->conn_anon_port = B_TRUE;
		}
		connp->conn_mlp_type = mlptype;
	}

	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
	    connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
	    user_specified);

	if (allocated_port == 0) {
		connp->conn_mlp_type = mlptSingle;
		if (connp->conn_anon_port) {
			/* Undo the anonymous MLP reservation made above. */
			connp->conn_anon_port = B_FALSE;
			(void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_FALSE);
		}
		if (bind_to_req_port_only) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: requested addr busy");
			}
			return (-TADDRBUSY);
		} else {
			/* If we are out of ports, fail the bind. */
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: out of ports?");
			}
			return (-TNOADDR);
		}
	}

	/* Pass the allocated port back */
	*requested_port_ptr = allocated_port;
	return (0);
}

/*
 * Check the address and check/pick a local port number.
 *
 * Returns 0 on success.  On failure returns either a negative TPI error
 * code (e.g. -TOUTSTATE, -TPROTO) or a positive errno value such as
 * EADDRNOTAVAIL or EAFNOSUPPORT; any partially-set local address state
 * is cleared before the error is returned.
 */
int
tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
    boolean_t bind_to_req_port_only)
{
	tcp_t *tcp = connp->conn_tcp;
	sin_t *sin;
	sin6_t *sin6;
	in_port_t requested_port;
	ipaddr_t v4addr;
	in6_addr_t v6addr;
	ip_laddr_t laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	zoneid_t zoneid = IPCL_ZONEID(connp);
	ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
	uint_t scopeid = 0;
	int error = 0;
	ip_xmit_attr_t *ixa = connp->conn_ixa;

	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);

	if (tcp->tcp_state == TCPS_BOUND) {
		return (0);
	} else if (tcp->tcp_state > TCPS_BOUND) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad state, %d", tcp->tcp_state);
		}
		return (-TOUTSTATE);
	}

	ASSERT(sa != NULL && len != 0);

	if (!OK_32PTR((char *)sa)) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1,
			    SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address parameter, "
			    "address %p, len %d",
			    (void *)sa, len);
		}
		return (-TPROTO);
	}

	error = proto_verify_ip_addr(connp->conn_family, sa, len);
	if (error != 0) {
		return (error);
	}

	switch (len) {
	case sizeof (sin_t):	/* Complete IPv4 address */
		sin = (sin_t *)sa;
		requested_port = ntohs(sin->sin_port);
		v4addr = sin->sin_addr.s_addr;
		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
		if (v4addr != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
			    B_FALSE);
		}
		break;

	case sizeof (sin6_t):	/* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		v6addr = sin6->sin6_addr;
		requested_port = ntohs(sin6->sin6_port);
		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
			if (connp->conn_ipv6_v6only)
				return (EADDRNOTAVAIL);

			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
			if (v4addr != INADDR_ANY) {
				laddr_type = ip_laddr_verify_v4(v4addr,
				    zoneid, ipst, B_FALSE);
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
					scopeid = sin6->sin6_scope_id;
				laddr_type = ip_laddr_verify_v6(&v6addr,
				    zoneid, ipst, B_FALSE, scopeid);
			}
		}
		break;

	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address length, %d", len);
		}
		return (EAFNOSUPPORT);
		/* return (-TBADADDR); */
	}

	/* Is the local address a valid unicast address? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	connp->conn_bound_addr_v6 = v6addr;
	if (scopeid != 0) {
		ixa->ixa_flags |= IXAF_SCOPEID_SET;
		ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	connp->conn_laddr_v6 = v6addr;
	connp->conn_saddr_v6 = v6addr;

	/* A wildcard (port 0) bind never binds to the requested port only. */
	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;

	error = tcp_bind_select_lport(tcp, &requested_port,
	    bind_to_req_port_only, cr);
	if (error != 0) {
		connp->conn_laddr_v6 = ipv6_all_zeros;
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_bound_addr_v6 = ipv6_all_zeros;
	}
	return (error);
}

/*
 * If the "bind_to_req_port_only" parameter is set and the requested port
 * number is available, return it.  If not, return 0.
 *
 * If the "bind_to_req_port_only" parameter is not set and
 * the requested port number is available, return it.  If not, return
 * the first anonymous port we happen across.  If no anonymous ports are
 * available, return 0.  addr is the requested local address, if any.
 *
 * In either case, when succeeding update the tcp_t to record the port number
 * and insert it in the bind hash table.
 *
 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 * without setting SO_REUSEADDR. This is needed so that they
 * can be viewed as two independent transport protocols.
 */
in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect,
    boolean_t bind_to_req_port_only, boolean_t user_specified)
{
	/*
	 * 'port' is in host byte order; the return value is the port that
	 * was actually bound (host byte order), or 0 on failure.
	 */
	/* number of times we have run around the loop */
	int count = 0;
	/* maximum number of times to run around the loop */
	int loopmax;
	conn_t *connp = tcp->tcp_connp;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	/*
	 * Lookup for free addresses is done in a loop and "loopmax"
	 * influences how long we spin in the loop
	 */
	if (bind_to_req_port_only) {
		/*
		 * If the requested port is busy, don't bother to look
		 * for a new one. Setting loop maximum count to 1 has
		 * that effect.
		 */
		loopmax = 1;
	} else {
		/*
		 * If the requested port is busy, look for a free one
		 * in the anonymous port range.
		 * Set loopmax appropriately so that one does not look
		 * forever in the case all of the anonymous ports are in use.
		 */
		if (connp->conn_anon_priv_bind) {
			/*
			 * loopmax =
			 *	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
			 */
			loopmax = IPPORT_RESERVED -
			    tcps->tcps_min_anonpriv_port;
		} else {
			loopmax = (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port + 1);
		}
	}
	do {
		uint16_t	lport;
		tf_t		*tbf;
		tcp_t		*ltcp;
		conn_t		*lconnp;

		lport = htons(port);

		/*
		 * Ensure that the tcp_t is not currently in the bind hash.
		 * Hold the lock on the hash bucket to ensure that
		 * the duplicate check plus the insertion is an atomic
		 * operation.
		 *
		 * This function does an inline lookup on the bind hash list
		 * Make sure that we access only members of tcp_t
		 * and that we don't look at tcp_tcp, since we are not
		 * doing a CONN_INC_REF.
		 */
		tcp_bind_hash_remove(tcp);
		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
		mutex_enter(&tbf->tf_lock);
		/* Find the bucket-list entry (if any) already using lport. */
		for (ltcp = tbf->tf_tcp; ltcp != NULL;
		    ltcp = ltcp->tcp_bind_hash) {
			if (lport == ltcp->tcp_connp->conn_lport)
				break;
		}

		/* Walk every tcp_t hanging off that per-port chain. */
		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
			boolean_t not_socket;
			boolean_t exclbind;

			lconnp = ltcp->tcp_connp;

			/*
			 * On a labeled system, we must treat bindings to ports
			 * on shared IP addresses by sockets with MAC exemption
			 * privilege as being in all zones, as there's
			 * otherwise no way to identify the right receiver.
			 */
			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
				continue;

			/*
			 * allow multiple interface-specific binds to coexist.
			 */
			if (connp->conn_incoming_ifindex !=
			    lconnp->conn_incoming_ifindex) {
				if ((connp->conn_incoming_ifindex != 0) &&
				    (lconnp->conn_incoming_ifindex != 0))
					continue;
			}

			/*
			 * If TCP_EXCLBIND is set for either the bound or
			 * binding endpoint, the semantics of bind
			 * is changed according to the following.
			 *
			 * spec = specified address (v4 or v6)
			 * unspec = unspecified address (v4 or v6)
			 * A = specified addresses are different for endpoints
			 *
			 * bound	bind to		allowed
			 * -------------------------------------
			 * unspec	unspec		no
			 * unspec	spec		no
			 * spec		unspec		no
			 * spec		spec		yes if A
			 *
			 * For labeled systems, SO_MAC_EXEMPT behaves the same
			 * as TCP_EXCLBIND, except that zoneid is ignored.
			 *
			 * Note:
			 *
			 * 1. Because of TLI semantics, an endpoint can go
			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
			 * TCPS_BOUND, depending on whether it is originally
			 * a listener or not. That is why we need to check
			 * for states greater than or equal to TCPS_BOUND
			 * here.
			 *
			 * 2. Ideally, we should only check for state equals
			 * to TCPS_LISTEN. And the following check should be
			 * added.
			 *
			 * if (ltcp->tcp_state == TCPS_LISTEN ||
			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
			 *		...
			 * }
			 *
			 * The semantics will be changed to this. If the
			 * endpoint on the list is in state not equal to
			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
			 * set, let the bind succeed.
			 *
			 * Because of (1), we cannot do that for TLI
			 * endpoints. But we can do that for socket endpoints.
			 * If in future, we can change this going back
			 * semantics, we can use the above check for TLI also.
			 */
			not_socket = !(TCP_IS_SOCKET(ltcp) &&
			    TCP_IS_SOCKET(tcp));
			exclbind = lconnp->conn_exclbind ||
			    connp->conn_exclbind;

			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (exclbind && (not_socket ||
			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
				if (V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) ||
				    V6_OR_V4_INADDR_ANY(*laddr) ||
				    IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6)) {
					break;
				}
				continue;
			}

			/*
			 * Check ipversion to allow IPv4 and IPv6 sockets to
			 * have disjoint port number spaces, if *_EXCLBIND
			 * is not set and only if the application binds to a
			 * specific port. We use the same autoassigned port
			 * number space for IPv4 and IPv6 sockets.
			 */
			if (connp->conn_ipversion != lconnp->conn_ipversion &&
			    bind_to_req_port_only)
				continue;

			/*
			 * Ideally, we should make sure that the source
			 * address, remote address, and remote port in the
			 * four tuple for this tcp-connection is unique.
			 * However, trying to find out the local source
			 * address would require too much code duplication
			 * with IP, since IP needs to have that code
			 * to support userland TCP implementations.
			 */
			if (quick_connect &&
			    (ltcp->tcp_state > TCPS_LISTEN) &&
			    ((connp->conn_fport != lconnp->conn_fport) ||
			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
			    &lconnp->conn_faddr_v6)))
				continue;

			if (!reuseaddr) {
				/*
				 * No socket option SO_REUSEADDR.
				 * If existing port is bound to
				 * a non-wildcard IP address
				 * and the requesting stream is
				 * bound to a distinct
				 * different IP addresses
				 * (non-wildcard, also), keep
				 * going.
				 */
				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
				    !V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) &&
				    !IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6))
					continue;
				if (ltcp->tcp_state >= TCPS_BOUND) {
					/*
					 * This port is being used and
					 * its state is >= TCPS_BOUND,
					 * so we can't bind to it.
					 */
					break;
				}
			} else {
				/*
				 * socket option SO_REUSEADDR is set on the
				 * binding tcp_t.
				 *
				 * If two streams are bound to
				 * same IP address or both addr
				 * and bound source are wildcards
				 * (INADDR_ANY), we want to stop
				 * searching.
				 * We have found a match of IP source
				 * address and source port, which is
				 * refused regardless of the
				 * SO_REUSEADDR setting, so we break.
				 */
				if (IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6) &&
				    (ltcp->tcp_state == TCPS_LISTEN ||
				    ltcp->tcp_state == TCPS_BOUND))
					break;
			}
		}
		if (ltcp != NULL) {
			/* The port number is busy */
			mutex_exit(&tbf->tf_lock);
		} else {
			/*
			 * This port is ours. Insert in fanout and mark as
			 * bound to prevent others from getting the port
			 * number.
			 */
			tcp->tcp_state = TCPS_BOUND;
			DTRACE_TCP6(state__change, void, NULL,
			    ip_xmit_attr_t *, connp->conn_ixa,
			    void, NULL, tcp_t *, tcp, void, NULL,
			    int32_t, TCPS_IDLE);

			connp->conn_lport = htons(port);

			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
			    connp->conn_lport)] == tbf);
			tcp_bind_hash_insert(tbf, tcp, 1);

			mutex_exit(&tbf->tf_lock);

			/*
			 * We don't want tcp_next_port_to_try to "inherit"
			 * a port number supplied by the user in a bind.
			 */
			if (user_specified)
				return (port);

			/*
			 * This is the only place where tcp_next_port_to_try
			 * is updated. After the update, it may or may not
			 * be in the valid range.
			 */
			if (!connp->conn_anon_priv_bind)
				tcps->tcps_next_port_to_try = port + 1;
			return (port);
		}

		if (connp->conn_anon_priv_bind) {
			port = tcp_get_next_priv_port(tcp);
		} else {
			if (count == 0 && user_specified) {
				/*
				 * We may have to return an anonymous port. So
				 * get one to start with.
				 */
				port =
				    tcp_update_next_port(
				    tcps->tcps_next_port_to_try,
				    tcp, B_TRUE);
				user_specified = B_FALSE;
			} else {
				port = tcp_update_next_port(port + 1, tcp,
				    B_FALSE);
			}
		}
		if (port == 0)
			break;

		/*
		 * Don't let this loop run forever in the case where
		 * all of the anonymous ports are in use.
		 */
	} while (++count < loopmax);
	return (0);
}