1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 */ 26 27 #include <sys/types.h> 28 #include <sys/stream.h> 29 #include <sys/strsun.h> 30 #include <sys/strsubr.h> 31 #include <sys/stropts.h> 32 #include <sys/strlog.h> 33 #define _SUN_TPI_VERSION 2 34 #include <sys/tihdr.h> 35 #include <sys/suntpi.h> 36 #include <sys/xti_inet.h> 37 #include <sys/policy.h> 38 #include <sys/squeue_impl.h> 39 #include <sys/squeue.h> 40 #include <sys/tsol/tnet.h> 41 42 #include <rpc/pmap_prot.h> 43 44 #include <inet/common.h> 45 #include <inet/ip.h> 46 #include <inet/tcp.h> 47 #include <inet/tcp_impl.h> 48 #include <inet/proto_set.h> 49 #include <inet/ipsec_impl.h> 50 51 /* Setable in /etc/system */ 52 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ 53 static uint32_t tcp_random_anon_port = 1; 54 55 static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, 56 cred_t *cr); 57 static in_port_t tcp_get_next_priv_port(const tcp_t *); 58 59 /* 60 * Hash list insertion routine for tcp_t structures. Each hash bucket 61 * contains a list of tcp_t entries, and each entry is bound to a unique 62 * port. If there are multiple tcp_t's that are bound to the same port, then 63 * one of them will be linked into the hash bucket list, and the rest will 64 * hang off of that one entry. For each port, entries bound to a specific IP 65 * address will be inserted before those those bound to INADDR_ANY. 66 */ 67 void 68 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) 69 { 70 tcp_t **tcpp; 71 tcp_t *tcpnext; 72 tcp_t *tcphash; 73 conn_t *connp = tcp->tcp_connp; 74 conn_t *connext; 75 76 if (tcp->tcp_ptpbhn != NULL) { 77 ASSERT(!caller_holds_lock); 78 tcp_bind_hash_remove(tcp); 79 } 80 tcpp = &tbf->tf_tcp; 81 if (!caller_holds_lock) { 82 mutex_enter(&tbf->tf_lock); 83 } else { 84 ASSERT(MUTEX_HELD(&tbf->tf_lock)); 85 } 86 tcphash = tcpp[0]; 87 tcpnext = NULL; 88 if (tcphash != NULL) { 89 /* Look for an entry using the same port */ 90 while ((tcphash = tcpp[0]) != NULL && 91 connp->conn_lport != tcphash->tcp_connp->conn_lport) 92 tcpp = &(tcphash->tcp_bind_hash); 93 94 /* The port was not found, just add to the end */ 95 if (tcphash == NULL) 96 goto insert; 97 98 /* 99 * OK, there already exists an entry bound to the 100 * same port. 101 * 102 * If the new tcp bound to the INADDR_ANY address 103 * and the first one in the list is not bound to 104 * INADDR_ANY we skip all entries until we find the 105 * first one bound to INADDR_ANY. 106 * This makes sure that applications binding to a 107 * specific address get preference over those binding to 108 * INADDR_ANY. 109 */ 110 tcpnext = tcphash; 111 connext = tcpnext->tcp_connp; 112 tcphash = NULL; 113 if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && 114 !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { 115 while ((tcpnext = tcpp[0]) != NULL) { 116 connext = tcpnext->tcp_connp; 117 if (!V6_OR_V4_INADDR_ANY( 118 connext->conn_bound_addr_v6)) 119 tcpp = &(tcpnext->tcp_bind_hash_port); 120 else 121 break; 122 } 123 if (tcpnext != NULL) { 124 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; 125 tcphash = tcpnext->tcp_bind_hash; 126 if (tcphash != NULL) { 127 tcphash->tcp_ptpbhn = 128 &(tcp->tcp_bind_hash); 129 tcpnext->tcp_bind_hash = NULL; 130 } 131 } 132 } else { 133 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; 134 tcphash = tcpnext->tcp_bind_hash; 135 if (tcphash != NULL) { 136 tcphash->tcp_ptpbhn = 137 &(tcp->tcp_bind_hash); 138 tcpnext->tcp_bind_hash = NULL; 139 } 140 } 141 } 142 insert: 143 tcp->tcp_bind_hash_port = tcpnext; 144 tcp->tcp_bind_hash = tcphash; 145 tcp->tcp_ptpbhn = tcpp; 146 tcpp[0] = tcp; 147 if (!caller_holds_lock) 148 mutex_exit(&tbf->tf_lock); 149 } 150 151 /* 152 * Hash list removal routine for tcp_t structures. 153 */ 154 void 155 tcp_bind_hash_remove(tcp_t *tcp) 156 { 157 tcp_t *tcpnext; 158 kmutex_t *lockp; 159 tcp_stack_t *tcps = tcp->tcp_tcps; 160 conn_t *connp = tcp->tcp_connp; 161 162 if (tcp->tcp_ptpbhn == NULL) 163 return; 164 165 /* 166 * Extract the lock pointer in case there are concurrent 167 * hash_remove's for this instance. 168 */ 169 ASSERT(connp->conn_lport != 0); 170 lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH( 171 connp->conn_lport)].tf_lock; 172 173 ASSERT(lockp != NULL); 174 mutex_enter(lockp); 175 if (tcp->tcp_ptpbhn) { 176 tcpnext = tcp->tcp_bind_hash_port; 177 if (tcpnext != NULL) { 178 tcp->tcp_bind_hash_port = NULL; 179 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 180 tcpnext->tcp_bind_hash = tcp->tcp_bind_hash; 181 if (tcpnext->tcp_bind_hash != NULL) { 182 tcpnext->tcp_bind_hash->tcp_ptpbhn = 183 &(tcpnext->tcp_bind_hash); 184 tcp->tcp_bind_hash = NULL; 185 } 186 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) { 187 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 188 tcp->tcp_bind_hash = NULL; 189 } 190 *tcp->tcp_ptpbhn = tcpnext; 191 tcp->tcp_ptpbhn = NULL; 192 } 193 mutex_exit(lockp); 194 } 195 196 /* 197 * Don't let port fall into the privileged range. 198 * Since the extra privileged ports can be arbitrary we also 199 * ensure that we exclude those from consideration. 200 * tcp_g_epriv_ports is not sorted thus we loop over it until 201 * there are no changes. 202 * 203 * Note: No locks are held when inspecting tcp_g_*epriv_ports 204 * but instead the code relies on: 205 * - the fact that the address of the array and its size never changes 206 * - the atomic assignment of the elements of the array 207 * 208 * Returns 0 if there are no more ports available. 209 * 210 * TS note: skip multilevel ports. 211 */ 212 in_port_t 213 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) 214 { 215 int i, bump; 216 boolean_t restart = B_FALSE; 217 tcp_stack_t *tcps = tcp->tcp_tcps; 218 219 if (random && tcp_random_anon_port != 0) { 220 (void) random_get_pseudo_bytes((uint8_t *)&port, 221 sizeof (in_port_t)); 222 /* 223 * Unless changed by a sys admin, the smallest anon port 224 * is 32768 and the largest anon port is 65535. It is 225 * very likely (50%) for the random port to be smaller 226 * than the smallest anon port. When that happens, 227 * add port % (anon port range) to the smallest anon 228 * port to get the random port. It should fall into the 229 * valid anon port range. 230 */ 231 if ((port < tcps->tcps_smallest_anon_port) || 232 (port > tcps->tcps_largest_anon_port)) { 233 if (tcps->tcps_smallest_anon_port == 234 tcps->tcps_largest_anon_port) { 235 bump = 0; 236 } else { 237 bump = port % (tcps->tcps_largest_anon_port - 238 tcps->tcps_smallest_anon_port); 239 } 240 port = tcps->tcps_smallest_anon_port + bump; 241 } 242 } 243 244 retry: 245 if (port < tcps->tcps_smallest_anon_port) 246 port = (in_port_t)tcps->tcps_smallest_anon_port; 247 248 if (port > tcps->tcps_largest_anon_port) { 249 if (restart) 250 return (0); 251 restart = B_TRUE; 252 port = (in_port_t)tcps->tcps_smallest_anon_port; 253 } 254 255 if (port < tcps->tcps_smallest_nonpriv_port) 256 port = (in_port_t)tcps->tcps_smallest_nonpriv_port; 257 258 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 259 if (port == tcps->tcps_g_epriv_ports[i]) { 260 port++; 261 /* 262 * Make sure whether the port is in the 263 * valid range. 264 */ 265 goto retry; 266 } 267 } 268 if (is_system_labeled() && 269 (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port, 270 IPPROTO_TCP, B_TRUE)) != 0) { 271 port = i; 272 goto retry; 273 } 274 return (port); 275 } 276 277 /* 278 * Return the next anonymous port in the privileged port range for 279 * bind checking. It starts at IPPORT_RESERVED - 1 and goes 280 * downwards. This is the same behavior as documented in the userland 281 * library call rresvport(3N). 282 * 283 * TS note: skip multilevel ports. 284 */ 285 static in_port_t 286 tcp_get_next_priv_port(const tcp_t *tcp) 287 { 288 static in_port_t next_priv_port = IPPORT_RESERVED - 1; 289 in_port_t nextport; 290 boolean_t restart = B_FALSE; 291 tcp_stack_t *tcps = tcp->tcp_tcps; 292 retry: 293 if (next_priv_port < tcps->tcps_min_anonpriv_port || 294 next_priv_port >= IPPORT_RESERVED) { 295 next_priv_port = IPPORT_RESERVED - 1; 296 if (restart) 297 return (0); 298 restart = B_TRUE; 299 } 300 if (is_system_labeled() && 301 (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), 302 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { 303 next_priv_port = nextport; 304 goto retry; 305 } 306 return (next_priv_port--); 307 } 308 309 static int 310 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, 311 boolean_t bind_to_req_port_only, cred_t *cr) 312 { 313 in_port_t mlp_port; 314 mlp_type_t addrtype, mlptype; 315 boolean_t user_specified; 316 in_port_t allocated_port; 317 in_port_t requested_port = *requested_port_ptr; 318 conn_t *connp = tcp->tcp_connp; 319 zone_t *zone; 320 tcp_stack_t *tcps = tcp->tcp_tcps; 321 in6_addr_t v6addr = connp->conn_laddr_v6; 322 323 /* 324 * XXX It's up to the caller to specify bind_to_req_port_only or not. 325 */ 326 ASSERT(cr != NULL); 327 328 /* 329 * Get a valid port (within the anonymous range and should not 330 * be a privileged one) to use if the user has not given a port. 331 * If multiple threads are here, they may all start with 332 * with the same initial port. But, it should be fine as long as 333 * tcp_bindi will ensure that no two threads will be assigned 334 * the same port. 335 * 336 * NOTE: XXX If a privileged process asks for an anonymous port, we 337 * still check for ports only in the range > tcp_smallest_non_priv_port, 338 * unless TCP_ANONPRIVBIND option is set. 339 */ 340 mlptype = mlptSingle; 341 mlp_port = requested_port; 342 if (requested_port == 0) { 343 requested_port = connp->conn_anon_priv_bind ? 344 tcp_get_next_priv_port(tcp) : 345 tcp_update_next_port(tcps->tcps_next_port_to_try, 346 tcp, B_TRUE); 347 if (requested_port == 0) { 348 return (-TNOADDR); 349 } 350 user_specified = B_FALSE; 351 352 /* 353 * If the user went through one of the RPC interfaces to create 354 * this socket and RPC is MLP in this zone, then give him an 355 * anonymous MLP. 356 */ 357 if (connp->conn_anon_mlp && is_system_labeled()) { 358 zone = crgetzone(cr); 359 addrtype = tsol_mlp_addr_type( 360 connp->conn_allzones ? ALL_ZONES : zone->zone_id, 361 IPV6_VERSION, &v6addr, 362 tcps->tcps_netstack->netstack_ip); 363 if (addrtype == mlptSingle) { 364 return (-TNOADDR); 365 } 366 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 367 PMAPPORT, addrtype); 368 mlp_port = PMAPPORT; 369 } 370 } else { 371 int i; 372 boolean_t priv = B_FALSE; 373 374 /* 375 * If the requested_port is in the well-known privileged range, 376 * verify that the stream was opened by a privileged user. 377 * Note: No locks are held when inspecting tcp_g_*epriv_ports 378 * but instead the code relies on: 379 * - the fact that the address of the array and its size never 380 * changes 381 * - the atomic assignment of the elements of the array 382 */ 383 if (requested_port < tcps->tcps_smallest_nonpriv_port) { 384 priv = B_TRUE; 385 } else { 386 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 387 if (requested_port == 388 tcps->tcps_g_epriv_ports[i]) { 389 priv = B_TRUE; 390 break; 391 } 392 } 393 } 394 if (priv) { 395 if (secpolicy_net_privaddr(cr, requested_port, 396 IPPROTO_TCP) != 0) { 397 if (connp->conn_debug) { 398 (void) strlog(TCP_MOD_ID, 0, 1, 399 SL_ERROR|SL_TRACE, 400 "tcp_bind: no priv for port %d", 401 requested_port); 402 } 403 return (-TACCES); 404 } 405 } 406 user_specified = B_TRUE; 407 408 connp = tcp->tcp_connp; 409 if (is_system_labeled()) { 410 zone = crgetzone(cr); 411 addrtype = tsol_mlp_addr_type( 412 connp->conn_allzones ? ALL_ZONES : zone->zone_id, 413 IPV6_VERSION, &v6addr, 414 tcps->tcps_netstack->netstack_ip); 415 if (addrtype == mlptSingle) { 416 return (-TNOADDR); 417 } 418 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 419 requested_port, addrtype); 420 } 421 } 422 423 if (mlptype != mlptSingle) { 424 if (secpolicy_net_bindmlp(cr) != 0) { 425 if (connp->conn_debug) { 426 (void) strlog(TCP_MOD_ID, 0, 1, 427 SL_ERROR|SL_TRACE, 428 "tcp_bind: no priv for multilevel port %d", 429 requested_port); 430 } 431 return (-TACCES); 432 } 433 434 /* 435 * If we're specifically binding a shared IP address and the 436 * port is MLP on shared addresses, then check to see if this 437 * zone actually owns the MLP. Reject if not. 438 */ 439 if (mlptype == mlptShared && addrtype == mlptShared) { 440 /* 441 * No need to handle exclusive-stack zones since 442 * ALL_ZONES only applies to the shared stack. 443 */ 444 zoneid_t mlpzone; 445 446 mlpzone = tsol_mlp_findzone(IPPROTO_TCP, 447 htons(mlp_port)); 448 if (connp->conn_zoneid != mlpzone) { 449 if (connp->conn_debug) { 450 (void) strlog(TCP_MOD_ID, 0, 1, 451 SL_ERROR|SL_TRACE, 452 "tcp_bind: attempt to bind port " 453 "%d on shared addr in zone %d " 454 "(should be %d)", 455 mlp_port, connp->conn_zoneid, 456 mlpzone); 457 } 458 return (-TACCES); 459 } 460 } 461 462 if (!user_specified) { 463 int err; 464 err = tsol_mlp_anon(zone, mlptype, connp->conn_proto, 465 requested_port, B_TRUE); 466 if (err != 0) { 467 if (connp->conn_debug) { 468 (void) strlog(TCP_MOD_ID, 0, 1, 469 SL_ERROR|SL_TRACE, 470 "tcp_bind: cannot establish anon " 471 "MLP for port %d", 472 requested_port); 473 } 474 return (err); 475 } 476 connp->conn_anon_port = B_TRUE; 477 } 478 connp->conn_mlp_type = mlptype; 479 } 480 481 allocated_port = tcp_bindi(tcp, requested_port, &v6addr, 482 connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, 483 user_specified); 484 485 if (allocated_port == 0) { 486 connp->conn_mlp_type = mlptSingle; 487 if (connp->conn_anon_port) { 488 connp->conn_anon_port = B_FALSE; 489 (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto, 490 requested_port, B_FALSE); 491 } 492 if (bind_to_req_port_only) { 493 if (connp->conn_debug) { 494 (void) strlog(TCP_MOD_ID, 0, 1, 495 SL_ERROR|SL_TRACE, 496 "tcp_bind: requested addr busy"); 497 } 498 return (-TADDRBUSY); 499 } else { 500 /* If we are out of ports, fail the bind. */ 501 if (connp->conn_debug) { 502 (void) strlog(TCP_MOD_ID, 0, 1, 503 SL_ERROR|SL_TRACE, 504 "tcp_bind: out of ports?"); 505 } 506 return (-TNOADDR); 507 } 508 } 509 510 /* Pass the allocated port back */ 511 *requested_port_ptr = allocated_port; 512 return (0); 513 } 514 515 /* 516 * Check the address and check/pick a local port number. 517 */ 518 int 519 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, 520 boolean_t bind_to_req_port_only) 521 { 522 tcp_t *tcp = connp->conn_tcp; 523 sin_t *sin; 524 sin6_t *sin6; 525 in_port_t requested_port; 526 ipaddr_t v4addr; 527 in6_addr_t v6addr; 528 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 529 zoneid_t zoneid = IPCL_ZONEID(connp); 530 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 531 uint_t scopeid = 0; 532 int error = 0; 533 ip_xmit_attr_t *ixa = connp->conn_ixa; 534 535 ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); 536 537 if (tcp->tcp_state == TCPS_BOUND) { 538 return (0); 539 } else if (tcp->tcp_state > TCPS_BOUND) { 540 if (connp->conn_debug) { 541 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 542 "tcp_bind: bad state, %d", tcp->tcp_state); 543 } 544 return (-TOUTSTATE); 545 } 546 547 ASSERT(sa != NULL && len != 0); 548 549 if (!OK_32PTR((char *)sa)) { 550 if (connp->conn_debug) { 551 (void) strlog(TCP_MOD_ID, 0, 1, 552 SL_ERROR|SL_TRACE, 553 "tcp_bind: bad address parameter, " 554 "address %p, len %d", 555 (void *)sa, len); 556 } 557 return (-TPROTO); 558 } 559 560 error = proto_verify_ip_addr(connp->conn_family, sa, len); 561 if (error != 0) { 562 return (error); 563 } 564 565 switch (len) { 566 case sizeof (sin_t): /* Complete IPv4 address */ 567 sin = (sin_t *)sa; 568 requested_port = ntohs(sin->sin_port); 569 v4addr = sin->sin_addr.s_addr; 570 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); 571 if (v4addr != INADDR_ANY) { 572 laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst, 573 B_FALSE); 574 } 575 break; 576 577 case sizeof (sin6_t): /* Complete IPv6 address */ 578 sin6 = (sin6_t *)sa; 579 v6addr = sin6->sin6_addr; 580 requested_port = ntohs(sin6->sin6_port); 581 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) { 582 if (connp->conn_ipv6_v6only) 583 return (EADDRNOTAVAIL); 584 585 IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr); 586 if (v4addr != INADDR_ANY) { 587 laddr_type = ip_laddr_verify_v4(v4addr, 588 zoneid, ipst, B_FALSE); 589 } 590 } else { 591 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) { 592 if (IN6_IS_ADDR_LINKSCOPE(&v6addr)) 593 scopeid = sin6->sin6_scope_id; 594 laddr_type = ip_laddr_verify_v6(&v6addr, 595 zoneid, ipst, B_FALSE, scopeid); 596 } 597 } 598 break; 599 600 default: 601 if (connp->conn_debug) { 602 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 603 "tcp_bind: bad address length, %d", len); 604 } 605 return (EAFNOSUPPORT); 606 /* return (-TBADADDR); */ 607 } 608 609 /* Is the local address a valid unicast address? */ 610 if (laddr_type == IPVL_BAD) 611 return (EADDRNOTAVAIL); 612 613 connp->conn_bound_addr_v6 = v6addr; 614 if (scopeid != 0) { 615 ixa->ixa_flags |= IXAF_SCOPEID_SET; 616 ixa->ixa_scopeid = scopeid; 617 connp->conn_incoming_ifindex = scopeid; 618 } else { 619 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 620 connp->conn_incoming_ifindex = connp->conn_bound_if; 621 } 622 623 connp->conn_laddr_v6 = v6addr; 624 connp->conn_saddr_v6 = v6addr; 625 626 bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; 627 628 error = tcp_bind_select_lport(tcp, &requested_port, 629 bind_to_req_port_only, cr); 630 if (error != 0) { 631 connp->conn_laddr_v6 = ipv6_all_zeros; 632 connp->conn_saddr_v6 = ipv6_all_zeros; 633 connp->conn_bound_addr_v6 = ipv6_all_zeros; 634 } 635 return (error); 636 } 637 638 /* 639 * If the "bind_to_req_port_only" parameter is set, if the requested port 640 * number is available, return it, If not return 0 641 * 642 * If "bind_to_req_port_only" parameter is not set and 643 * If the requested port number is available, return it. If not, return 644 * the first anonymous port we happen across. If no anonymous ports are 645 * available, return 0. addr is the requested local address, if any. 646 * 647 * In either case, when succeeding update the tcp_t to record the port number 648 * and insert it in the bind hash table. 649 * 650 * Note that TCP over IPv4 and IPv6 sockets can use the same port number 651 * without setting SO_REUSEADDR. This is needed so that they 652 * can be viewed as two independent transport protocols. 653 */ 654 in_port_t 655 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 656 int reuseaddr, boolean_t quick_connect, 657 boolean_t bind_to_req_port_only, boolean_t user_specified) 658 { 659 /* number of times we have run around the loop */ 660 int count = 0; 661 /* maximum number of times to run around the loop */ 662 int loopmax; 663 conn_t *connp = tcp->tcp_connp; 664 tcp_stack_t *tcps = tcp->tcp_tcps; 665 666 /* 667 * Lookup for free addresses is done in a loop and "loopmax" 668 * influences how long we spin in the loop 669 */ 670 if (bind_to_req_port_only) { 671 /* 672 * If the requested port is busy, don't bother to look 673 * for a new one. Setting loop maximum count to 1 has 674 * that effect. 675 */ 676 loopmax = 1; 677 } else { 678 /* 679 * If the requested port is busy, look for a free one 680 * in the anonymous port range. 681 * Set loopmax appropriately so that one does not look 682 * forever in the case all of the anonymous ports are in use. 683 */ 684 if (connp->conn_anon_priv_bind) { 685 /* 686 * loopmax = 687 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 688 */ 689 loopmax = IPPORT_RESERVED - 690 tcps->tcps_min_anonpriv_port; 691 } else { 692 loopmax = (tcps->tcps_largest_anon_port - 693 tcps->tcps_smallest_anon_port + 1); 694 } 695 } 696 do { 697 uint16_t lport; 698 tf_t *tbf; 699 tcp_t *ltcp; 700 conn_t *lconnp; 701 702 lport = htons(port); 703 704 /* 705 * Ensure that the tcp_t is not currently in the bind hash. 706 * Hold the lock on the hash bucket to ensure that 707 * the duplicate check plus the insertion is an atomic 708 * operation. 709 * 710 * This function does an inline lookup on the bind hash list 711 * Make sure that we access only members of tcp_t 712 * and that we don't look at tcp_tcp, since we are not 713 * doing a CONN_INC_REF. 714 */ 715 tcp_bind_hash_remove(tcp); 716 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; 717 mutex_enter(&tbf->tf_lock); 718 for (ltcp = tbf->tf_tcp; ltcp != NULL; 719 ltcp = ltcp->tcp_bind_hash) { 720 if (lport == ltcp->tcp_connp->conn_lport) 721 break; 722 } 723 724 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { 725 boolean_t not_socket; 726 boolean_t exclbind; 727 728 lconnp = ltcp->tcp_connp; 729 730 /* 731 * On a labeled system, we must treat bindings to ports 732 * on shared IP addresses by sockets with MAC exemption 733 * privilege as being in all zones, as there's 734 * otherwise no way to identify the right receiver. 735 */ 736 if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) 737 continue; 738 739 /* 740 * If TCP_EXCLBIND is set for either the bound or 741 * binding endpoint, the semantics of bind 742 * is changed according to the following. 743 * 744 * spec = specified address (v4 or v6) 745 * unspec = unspecified address (v4 or v6) 746 * A = specified addresses are different for endpoints 747 * 748 * bound bind to allowed 749 * ------------------------------------- 750 * unspec unspec no 751 * unspec spec no 752 * spec unspec no 753 * spec spec yes if A 754 * 755 * For labeled systems, SO_MAC_EXEMPT behaves the same 756 * as TCP_EXCLBIND, except that zoneid is ignored. 757 * 758 * Note: 759 * 760 * 1. Because of TLI semantics, an endpoint can go 761 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or 762 * TCPS_BOUND, depending on whether it is originally 763 * a listener or not. That is why we need to check 764 * for states greater than or equal to TCPS_BOUND 765 * here. 766 * 767 * 2. Ideally, we should only check for state equals 768 * to TCPS_LISTEN. And the following check should be 769 * added. 770 * 771 * if (ltcp->tcp_state == TCPS_LISTEN || 772 * !reuseaddr || !lconnp->conn_reuseaddr) { 773 * ... 774 * } 775 * 776 * The semantics will be changed to this. If the 777 * endpoint on the list is in state not equal to 778 * TCPS_LISTEN and both endpoints have SO_REUSEADDR 779 * set, let the bind succeed. 780 * 781 * Because of (1), we cannot do that for TLI 782 * endpoints. But we can do that for socket endpoints. 783 * If in future, we can change this going back 784 * semantics, we can use the above check for TLI also. 785 */ 786 not_socket = !(TCP_IS_SOCKET(ltcp) && 787 TCP_IS_SOCKET(tcp)); 788 exclbind = lconnp->conn_exclbind || 789 connp->conn_exclbind; 790 791 if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || 792 (connp->conn_mac_mode != CONN_MAC_DEFAULT) || 793 (exclbind && (not_socket || 794 ltcp->tcp_state <= TCPS_ESTABLISHED))) { 795 if (V6_OR_V4_INADDR_ANY( 796 lconnp->conn_bound_addr_v6) || 797 V6_OR_V4_INADDR_ANY(*laddr) || 798 IN6_ARE_ADDR_EQUAL(laddr, 799 &lconnp->conn_bound_addr_v6)) { 800 break; 801 } 802 continue; 803 } 804 805 /* 806 * Check ipversion to allow IPv4 and IPv6 sockets to 807 * have disjoint port number spaces, if *_EXCLBIND 808 * is not set and only if the application binds to a 809 * specific port. We use the same autoassigned port 810 * number space for IPv4 and IPv6 sockets. 811 */ 812 if (connp->conn_ipversion != lconnp->conn_ipversion && 813 bind_to_req_port_only) 814 continue; 815 816 /* 817 * Ideally, we should make sure that the source 818 * address, remote address, and remote port in the 819 * four tuple for this tcp-connection is unique. 820 * However, trying to find out the local source 821 * address would require too much code duplication 822 * with IP, since IP needs needs to have that code 823 * to support userland TCP implementations. 824 */ 825 if (quick_connect && 826 (ltcp->tcp_state > TCPS_LISTEN) && 827 ((connp->conn_fport != lconnp->conn_fport) || 828 !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, 829 &lconnp->conn_faddr_v6))) 830 continue; 831 832 if (!reuseaddr) { 833 /* 834 * No socket option SO_REUSEADDR. 835 * If existing port is bound to 836 * a non-wildcard IP address 837 * and the requesting stream is 838 * bound to a distinct 839 * different IP addresses 840 * (non-wildcard, also), keep 841 * going. 842 */ 843 if (!V6_OR_V4_INADDR_ANY(*laddr) && 844 !V6_OR_V4_INADDR_ANY( 845 lconnp->conn_bound_addr_v6) && 846 !IN6_ARE_ADDR_EQUAL(laddr, 847 &lconnp->conn_bound_addr_v6)) 848 continue; 849 if (ltcp->tcp_state >= TCPS_BOUND) { 850 /* 851 * This port is being used and 852 * its state is >= TCPS_BOUND, 853 * so we can't bind to it. 854 */ 855 break; 856 } 857 } else { 858 /* 859 * socket option SO_REUSEADDR is set on the 860 * binding tcp_t. 861 * 862 * If two streams are bound to 863 * same IP address or both addr 864 * and bound source are wildcards 865 * (INADDR_ANY), we want to stop 866 * searching. 867 * We have found a match of IP source 868 * address and source port, which is 869 * refused regardless of the 870 * SO_REUSEADDR setting, so we break. 871 */ 872 if (IN6_ARE_ADDR_EQUAL(laddr, 873 &lconnp->conn_bound_addr_v6) && 874 (ltcp->tcp_state == TCPS_LISTEN || 875 ltcp->tcp_state == TCPS_BOUND)) 876 break; 877 } 878 } 879 if (ltcp != NULL) { 880 /* The port number is busy */ 881 mutex_exit(&tbf->tf_lock); 882 } else { 883 /* 884 * This port is ours. Insert in fanout and mark as 885 * bound to prevent others from getting the port 886 * number. 887 */ 888 tcp->tcp_state = TCPS_BOUND; 889 DTRACE_TCP6(state__change, void, NULL, 890 ip_xmit_attr_t *, connp->conn_ixa, 891 void, NULL, tcp_t *, tcp, void, NULL, 892 int32_t, TCPS_IDLE); 893 894 connp->conn_lport = htons(port); 895 896 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( 897 connp->conn_lport)] == tbf); 898 tcp_bind_hash_insert(tbf, tcp, 1); 899 900 mutex_exit(&tbf->tf_lock); 901 902 /* 903 * We don't want tcp_next_port_to_try to "inherit" 904 * a port number supplied by the user in a bind. 905 */ 906 if (user_specified) 907 return (port); 908 909 /* 910 * This is the only place where tcp_next_port_to_try 911 * is updated. After the update, it may or may not 912 * be in the valid range. 913 */ 914 if (!connp->conn_anon_priv_bind) 915 tcps->tcps_next_port_to_try = port + 1; 916 return (port); 917 } 918 919 if (connp->conn_anon_priv_bind) { 920 port = tcp_get_next_priv_port(tcp); 921 } else { 922 if (count == 0 && user_specified) { 923 /* 924 * We may have to return an anonymous port. So 925 * get one to start with. 926 */ 927 port = 928 tcp_update_next_port( 929 tcps->tcps_next_port_to_try, 930 tcp, B_TRUE); 931 user_specified = B_FALSE; 932 } else { 933 port = tcp_update_next_port(port + 1, tcp, 934 B_FALSE); 935 } 936 } 937 if (port == 0) 938 break; 939 940 /* 941 * Don't let this loop run forever in the case where 942 * all of the anonymous ports are in use. 943 */ 944 } while (++count < loopmax); 945 return (0); 946 } 947