/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#define	_SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/policy.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <sys/tsol/tnet.h>

#include <rpc/pmap_prot.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/proto_set.h>
#include <inet/ipsec_impl.h>

/* Setable in /etc/system */
/* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
static uint32_t tcp_random_anon_port = 1;

static int	tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
		    cred_t *cr);
static in_port_t	tcp_get_next_priv_port(const tcp_t *);

/*
 * Hash list insertion routine for tcp_t structures.  Each hash bucket
 * contains a list of tcp_t entries, and each entry is bound to a unique
 * port.  If there are multiple tcp_t's that are bound to the same port, then
 * one of them will be linked into the hash bucket list, and the rest will
 * hang off of that one entry.  For each port, entries bound to a specific IP
 * address will be inserted before those bound to INADDR_ANY.
 *
 * If "caller_holds_lock" is non-zero the caller must already hold the
 * bucket's tf_lock (asserted below); otherwise the lock is taken and
 * dropped here.  If the tcp_t is already on some bucket it is removed
 * first, which is only legal when the caller does NOT hold a bucket lock
 * (tcp_bind_hash_remove() acquires the old bucket's lock itself).
 */
void
tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock)
{
	tcp_t	**tcpp;
	tcp_t	*tcpnext;
	tcp_t	*tcphash;
	conn_t	*connp = tcp->tcp_connp;
	conn_t	*connext;

	if (tcp->tcp_ptpbhn != NULL) {
		ASSERT(!caller_holds_lock);
		tcp_bind_hash_remove(tcp);
	}
	tcpp = &tbf->tf_tcp;
	if (!caller_holds_lock) {
		mutex_enter(&tbf->tf_lock);
	} else {
		ASSERT(MUTEX_HELD(&tbf->tf_lock));
	}
	tcphash = tcpp[0];
	tcpnext = NULL;
	if (tcphash != NULL) {
		/* Look for an entry using the same port */
		while ((tcphash = tcpp[0]) != NULL &&
		    connp->conn_lport != tcphash->tcp_connp->conn_lport)
			tcpp = &(tcphash->tcp_bind_hash);

		/* The port was not found, just add to the end */
		if (tcphash == NULL)
			goto insert;

		/*
		 * OK, there already exists an entry bound to the
		 * same port.
		 *
		 * If the new tcp bound to the INADDR_ANY address
		 * and the first one in the list is not bound to
		 * INADDR_ANY we skip all entries until we find the
		 * first one bound to INADDR_ANY.
		 * This makes sure that applications binding to a
		 * specific address get preference over those binding to
		 * INADDR_ANY.
		 */
		tcpnext = tcphash;
		connext = tcpnext->tcp_connp;
		tcphash = NULL;
		if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) &&
		    !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) {
			/*
			 * Walk the per-port chain (linked through
			 * tcp_bind_hash_port) until the first INADDR_ANY
			 * entry; the new tcp_t is spliced in just before it.
			 */
			while ((tcpnext = tcpp[0]) != NULL) {
				connext = tcpnext->tcp_connp;
				if (!V6_OR_V4_INADDR_ANY(
				    connext->conn_bound_addr_v6))
					tcpp = &(tcpnext->tcp_bind_hash_port);
				else
					break;
			}
			if (tcpnext != NULL) {
				tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
				/*
				 * The new entry takes over the bucket-level
				 * link (tcp_bind_hash) from its successor so
				 * only one entry per port sits on the bucket
				 * list proper.
				 */
				tcphash = tcpnext->tcp_bind_hash;
				if (tcphash != NULL) {
					tcphash->tcp_ptpbhn =
					    &(tcp->tcp_bind_hash);
					tcpnext->tcp_bind_hash = NULL;
				}
			}
		} else {
			/* Insert at the head of this port's chain. */
			tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port;
			tcphash = tcpnext->tcp_bind_hash;
			if (tcphash != NULL) {
				tcphash->tcp_ptpbhn =
				    &(tcp->tcp_bind_hash);
				tcpnext->tcp_bind_hash = NULL;
			}
		}
	}
insert:
	tcp->tcp_bind_hash_port = tcpnext;
	tcp->tcp_bind_hash = tcphash;
	tcp->tcp_ptpbhn = tcpp;
	tcpp[0] = tcp;
	if (!caller_holds_lock)
		mutex_exit(&tbf->tf_lock);
}

/*
 * Hash list removal routine for tcp_t structures.
 *
 * Safe to call when the tcp_t is not on any bucket (tcp_ptpbhn == NULL);
 * acquires the owning bucket's tf_lock itself.
 */
void
tcp_bind_hash_remove(tcp_t *tcp)
{
	tcp_t	*tcpnext;
	kmutex_t *lockp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	conn_t		*connp = tcp->tcp_connp;

	if (tcp->tcp_ptpbhn == NULL)
		return;

	/*
	 * Extract the lock pointer in case there are concurrent
	 * hash_remove's for this instance.
	 */
	ASSERT(connp->conn_lport != 0);
	lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(
	    connp->conn_lport)].tf_lock;

	ASSERT(lockp != NULL);
	mutex_enter(lockp);
	/* Re-check under the lock: a racing remove may have unlinked us. */
	if (tcp->tcp_ptpbhn) {
		tcpnext = tcp->tcp_bind_hash_port;
		if (tcpnext != NULL) {
			/*
			 * Another tcp_t is bound to the same port; promote
			 * it into our position on the bucket list.
			 */
			tcp->tcp_bind_hash_port = NULL;
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcpnext->tcp_bind_hash = tcp->tcp_bind_hash;
			if (tcpnext->tcp_bind_hash != NULL) {
				tcpnext->tcp_bind_hash->tcp_ptpbhn =
				    &(tcpnext->tcp_bind_hash);
				tcp->tcp_bind_hash = NULL;
			}
		} else if ((tcpnext = tcp->tcp_bind_hash) != NULL) {
			/* Last entry for this port; unlink from the bucket. */
			tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn;
			tcp->tcp_bind_hash = NULL;
		}
		*tcp->tcp_ptpbhn = tcpnext;
		tcp->tcp_ptpbhn = NULL;
	}
	mutex_exit(lockp);
}

/*
 * Don't let port fall into the privileged range.
 * Since the extra privileged ports can be arbitrary we also
 * ensure that we exclude those from consideration.
 * tcp_g_epriv_ports is not sorted thus we loop over it until
 * there are no changes.
 *
 * Note: No locks are held when inspecting tcp_g_*epriv_ports
 * but instead the code relies on:
 * - the fact that the address of the array and its size never changes
 * - the atomic assignment of the elements of the array
 *
 * Returns 0 if there are no more ports available.
 *
 * TS note: skip multilevel ports.
 */
in_port_t
tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random)
{
	int i;
	boolean_t restart = B_FALSE;
	tcp_stack_t *tcps = tcp->tcp_tcps;

	if (random && tcp_random_anon_port != 0) {
		(void) random_get_pseudo_bytes((uint8_t *)&port,
		    sizeof (in_port_t));
		/*
		 * Unless changed by a sys admin, the smallest anon port
		 * is 32768 and the largest anon port is 65535.  It is
		 * very likely (50%) for the random port to be smaller
		 * than the smallest anon port.  When that happens,
		 * add port % (anon port range) to the smallest anon
		 * port to get the random port.  It should fall into the
		 * valid anon port range.
		 */
		if (port < tcps->tcps_smallest_anon_port) {
			port = tcps->tcps_smallest_anon_port +
			    port % (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port);
		}
	}

retry:
	if (port < tcps->tcps_smallest_anon_port)
		port = (in_port_t)tcps->tcps_smallest_anon_port;

	if (port > tcps->tcps_largest_anon_port) {
		/*
		 * Wrap around once; a second pass past the top means the
		 * whole range has been scanned without success.
		 */
		if (restart)
			return (0);
		restart = B_TRUE;
		port = (in_port_t)tcps->tcps_smallest_anon_port;
	}

	if (port < tcps->tcps_smallest_nonpriv_port)
		port = (in_port_t)tcps->tcps_smallest_nonpriv_port;

	for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
		if (port == tcps->tcps_g_epriv_ports[i]) {
			port++;
			/*
			 * Make sure whether the port is in the
			 * valid range.
			 */
			goto retry;
		}
	}
	if (is_system_labeled() &&
	    (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port,
	    IPPROTO_TCP, B_TRUE)) != 0) {
		port = i;
		goto retry;
	}
	return (port);
}

/*
 * Return the next anonymous port in the privileged port range for
 * bind checking.  It starts at IPPORT_RESERVED - 1 and goes
 * downwards.  This is the same behavior as documented in the userland
 * library call rresvport(3N).
 *
 * Returns 0 once the whole range has been exhausted.
 *
 * TS note: skip multilevel ports.
 */
static in_port_t
tcp_get_next_priv_port(const tcp_t *tcp)
{
	/*
	 * NOTE(review): static local — shared across all callers in this
	 * stack instance, updated without a lock; presumably benign races
	 * are tolerated here (a duplicate pick is caught by tcp_bindi).
	 */
	static in_port_t next_priv_port = IPPORT_RESERVED - 1;
	in_port_t nextport;
	boolean_t restart = B_FALSE;
	tcp_stack_t *tcps = tcp->tcp_tcps;
retry:
	if (next_priv_port < tcps->tcps_min_anonpriv_port ||
	    next_priv_port >= IPPORT_RESERVED) {
		next_priv_port = IPPORT_RESERVED - 1;
		if (restart)
			return (0);
		restart = B_TRUE;
	}
	if (is_system_labeled() &&
	    (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred),
	    next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) {
		next_priv_port = nextport;
		goto retry;
	}
	return (next_priv_port--);
}

/*
 * Select (or validate) the local port for a bind.
 *
 * On entry *requested_port_ptr is the port the user asked for (0 means
 * "pick one for me"); on successful return it holds the port actually
 * allocated by tcp_bindi().  Enforces privileged-port and Trusted
 * Extensions multilevel-port (MLP) policy against "cr".  Returns 0 on
 * success or a negative TPI error code (-TNOADDR, -TACCES, -TADDRBUSY).
 */
static int
tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr,
    boolean_t bind_to_req_port_only, cred_t *cr)
{
	in_port_t	mlp_port;
	mlp_type_t	addrtype, mlptype;
	boolean_t	user_specified;
	in_port_t	allocated_port;
	in_port_t	requested_port = *requested_port_ptr;
	conn_t		*connp = tcp->tcp_connp;
	zone_t		*zone;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	in6_addr_t	v6addr = connp->conn_laddr_v6;

	/*
	 * XXX It's up to the caller to specify bind_to_req_port_only or not.
	 */
	ASSERT(cr != NULL);

	/*
	 * Get a valid port (within the anonymous range and should not
	 * be a privileged one) to use if the user has not given a port.
	 * If multiple threads are here, they may all start with
	 * the same initial port.  But, it should be fine as long as
	 * tcp_bindi will ensure that no two threads will be assigned
	 * the same port.
	 *
	 * NOTE: XXX If a privileged process asks for an anonymous port, we
	 * still check for ports only in the range > tcp_smallest_non_priv_port,
	 * unless TCP_ANONPRIVBIND option is set.
	 */
	mlptype = mlptSingle;
	mlp_port = requested_port;
	if (requested_port == 0) {
		requested_port = connp->conn_anon_priv_bind ?
		    tcp_get_next_priv_port(tcp) :
		    tcp_update_next_port(tcps->tcps_next_port_to_try,
		    tcp, B_TRUE);
		if (requested_port == 0) {
			return (-TNOADDR);
		}
		user_specified = B_FALSE;

		/*
		 * If the user went through one of the RPC interfaces to create
		 * this socket and RPC is MLP in this zone, then give him an
		 * anonymous MLP.
		 */
		if (connp->conn_anon_mlp && is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    PMAPPORT, addrtype);
			mlp_port = PMAPPORT;
		}
	} else {
		int i;
		boolean_t priv = B_FALSE;

		/*
		 * If the requested_port is in the well-known privileged range,
		 * verify that the stream was opened by a privileged user.
		 * Note: No locks are held when inspecting tcp_g_*epriv_ports
		 * but instead the code relies on:
		 * - the fact that the address of the array and its size never
		 *   changes
		 * - the atomic assignment of the elements of the array
		 */
		if (requested_port < tcps->tcps_smallest_nonpriv_port) {
			priv = B_TRUE;
		} else {
			for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
				if (requested_port ==
				    tcps->tcps_g_epriv_ports[i]) {
					priv = B_TRUE;
					break;
				}
			}
		}
		if (priv) {
			if (secpolicy_net_privaddr(cr, requested_port,
			    IPPROTO_TCP) != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: no priv for port %d",
					    requested_port);
				}
				return (-TACCES);
			}
		}
		user_specified = B_TRUE;

		connp = tcp->tcp_connp;
		if (is_system_labeled()) {
			zone = crgetzone(cr);
			addrtype = tsol_mlp_addr_type(
			    connp->conn_allzones ? ALL_ZONES : zone->zone_id,
			    IPV6_VERSION, &v6addr,
			    tcps->tcps_netstack->netstack_ip);
			if (addrtype == mlptSingle) {
				return (-TNOADDR);
			}
			mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
			    requested_port, addrtype);
		}
	}

	if (mlptype != mlptSingle) {
		/* Binding to an MLP requires the net_bindmlp privilege. */
		if (secpolicy_net_bindmlp(cr) != 0) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: no priv for multilevel port %d",
				    requested_port);
			}
			return (-TACCES);
		}

		/*
		 * If we're specifically binding a shared IP address and the
		 * port is MLP on shared addresses, then check to see if this
		 * zone actually owns the MLP.  Reject if not.
		 */
		if (mlptype == mlptShared && addrtype == mlptShared) {
			/*
			 * No need to handle exclusive-stack zones since
			 * ALL_ZONES only applies to the shared stack.
			 */
			zoneid_t mlpzone;

			mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
			    htons(mlp_port));
			if (connp->conn_zoneid != mlpzone) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: attempt to bind port "
					    "%d on shared addr in zone %d "
					    "(should be %d)",
					    mlp_port, connp->conn_zoneid,
					    mlpzone);
				}
				return (-TACCES);
			}
		}

		if (!user_specified) {
			int err;
			err = tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_TRUE);
			if (err != 0) {
				if (connp->conn_debug) {
					(void) strlog(TCP_MOD_ID, 0, 1,
					    SL_ERROR|SL_TRACE,
					    "tcp_bind: cannot establish anon "
					    "MLP for port %d",
					    requested_port);
				}
				return (err);
			}
			connp->conn_anon_port = B_TRUE;
		}
		connp->conn_mlp_type = mlptype;
	}

	allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
	    connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only,
	    user_specified);

	if (allocated_port == 0) {
		/* Undo any anonymous-MLP reservation made above. */
		connp->conn_mlp_type = mlptSingle;
		if (connp->conn_anon_port) {
			connp->conn_anon_port = B_FALSE;
			(void) tsol_mlp_anon(zone, mlptype, connp->conn_proto,
			    requested_port, B_FALSE);
		}
		if (bind_to_req_port_only) {
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: requested addr busy");
			}
			return (-TADDRBUSY);
		} else {
			/* If we are out of ports, fail the bind. */
			if (connp->conn_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_bind: out of ports?");
			}
			return (-TNOADDR);
		}
	}

	/* Pass the allocated port back */
	*requested_port_ptr = allocated_port;
	return (0);
}

/*
 * Check the address and check/pick a local port number.
 *
 * Validates the sockaddr in "sa"/"len" (IPv4 or IPv6, chosen by length),
 * verifies the local address is usable in this zone, then delegates port
 * selection to tcp_bind_select_lport().  Returns 0 on success, a positive
 * errno (EADDRNOTAVAIL, EAFNOSUPPORT) or a negative TPI error code.  On
 * failure the conn's local/bound addresses are reset to unspecified.
 */
int
tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
    boolean_t bind_to_req_port_only)
{
	tcp_t	*tcp = connp->conn_tcp;
	sin_t	*sin;
	sin6_t	*sin6;
	in_port_t	requested_port;
	ipaddr_t	v4addr;
	in6_addr_t	v6addr;
	ip_laddr_t	laddr_type = IPVL_UNICAST_UP;	/* INADDR_ANY */
	zoneid_t	zoneid = IPCL_ZONEID(connp);
	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
	uint_t		scopeid = 0;
	int		error = 0;
	ip_xmit_attr_t	*ixa = connp->conn_ixa;

	ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX);

	if (tcp->tcp_state == TCPS_BOUND) {
		/* Already bound; nothing to do. */
		return (0);
	} else if (tcp->tcp_state > TCPS_BOUND) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad state, %d", tcp->tcp_state);
		}
		return (-TOUTSTATE);
	}

	ASSERT(sa != NULL && len != 0);

	if (!OK_32PTR((char *)sa)) {
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1,
			    SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address parameter, "
			    "address %p, len %d",
			    (void *)sa, len);
		}
		return (-TPROTO);
	}

	error = proto_verify_ip_addr(connp->conn_family, sa, len);
	if (error != 0) {
		return (error);
	}

	switch (len) {
	case sizeof (sin_t):	/* Complete IPv4 address */
		sin = (sin_t *)sa;
		requested_port = ntohs(sin->sin_port);
		v4addr = sin->sin_addr.s_addr;
		/* Internally the local address is always kept as IPv6. */
		IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
		if (v4addr != INADDR_ANY) {
			laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst,
			    B_FALSE);
		}
		break;

	case sizeof (sin6_t): /* Complete IPv6 address */
		sin6 = (sin6_t *)sa;
		v6addr = sin6->sin6_addr;
		requested_port = ntohs(sin6->sin6_port);
		if (IN6_IS_ADDR_V4MAPPED(&v6addr)) {
			/* V4-mapped binds are rejected on v6-only sockets. */
			if (connp->conn_ipv6_v6only)
				return (EADDRNOTAVAIL);

			IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr);
			if (v4addr != INADDR_ANY) {
				laddr_type = ip_laddr_verify_v4(v4addr,
				    zoneid, ipst, B_FALSE);
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) {
				if (IN6_IS_ADDR_LINKSCOPE(&v6addr))
					scopeid = sin6->sin6_scope_id;
				laddr_type = ip_laddr_verify_v6(&v6addr,
				    zoneid, ipst, B_FALSE, scopeid);
			}
		}
		break;

	default:
		if (connp->conn_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
			    "tcp_bind: bad address length, %d", len);
		}
		return (EAFNOSUPPORT);
		/* return (-TBADADDR); */
	}

	/* Is the local address a valid unicast address? */
	if (laddr_type == IPVL_BAD)
		return (EADDRNOTAVAIL);

	connp->conn_bound_addr_v6 = v6addr;
	if (scopeid != 0) {
		ixa->ixa_flags |= IXAF_SCOPEID_SET;
		ixa->ixa_scopeid = scopeid;
		connp->conn_incoming_ifindex = scopeid;
	} else {
		ixa->ixa_flags &= ~IXAF_SCOPEID_SET;
		connp->conn_incoming_ifindex = connp->conn_bound_if;
	}

	connp->conn_laddr_v6 = v6addr;
	connp->conn_saddr_v6 = v6addr;

	/* A wildcard port never forces bind-to-requested-port-only. */
	bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only;

	error = tcp_bind_select_lport(tcp, &requested_port,
	    bind_to_req_port_only, cr);
	if (error != 0) {
		/* Roll back the address assignments made above. */
		connp->conn_laddr_v6 = ipv6_all_zeros;
		connp->conn_saddr_v6 = ipv6_all_zeros;
		connp->conn_bound_addr_v6 = ipv6_all_zeros;
	}
	return (error);
}

/*
 * If the "bind_to_req_port_only" parameter is set, and the requested port
 * number is available, return it.  If not, return 0.
 *
 * If "bind_to_req_port_only" parameter is not set and
 * the requested port number is available, return it.  If not, return
 * the first anonymous port we happen across.  If no anonymous ports are
 * available, return 0.  addr is the requested local address, if any.
 *
 * In either case, when succeeding update the tcp_t to record the port number
 * and insert it in the bind hash table.
 *
 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
 * without setting SO_REUSEADDR.  This is needed so that they
 * can be viewed as two independent transport protocols.
 */
in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect,
    boolean_t bind_to_req_port_only, boolean_t user_specified)
{
	/* number of times we have run around the loop */
	int count = 0;
	/* maximum number of times to run around the loop */
	int loopmax;
	conn_t *connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	/*
	 * Lookup for free addresses is done in a loop and "loopmax"
	 * influences how long we spin in the loop
	 */
	if (bind_to_req_port_only) {
		/*
		 * If the requested port is busy, don't bother to look
		 * for a new one. Setting loop maximum count to 1 has
		 * that effect.
		 */
		loopmax = 1;
	} else {
		/*
		 * If the requested port is busy, look for a free one
		 * in the anonymous port range.
		 * Set loopmax appropriately so that one does not look
		 * forever in the case all of the anonymous ports are in use.
		 */
		if (connp->conn_anon_priv_bind) {
			/*
			 * loopmax =
			 * 	(IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
			 */
			loopmax = IPPORT_RESERVED -
			    tcps->tcps_min_anonpriv_port;
		} else {
			loopmax = (tcps->tcps_largest_anon_port -
			    tcps->tcps_smallest_anon_port + 1);
		}
	}
	do {
		uint16_t	lport;
		tf_t		*tbf;
		tcp_t		*ltcp;
		conn_t		*lconnp;

		lport = htons(port);

		/*
		 * Ensure that the tcp_t is not currently in the bind hash.
		 * Hold the lock on the hash bucket to ensure that
		 * the duplicate check plus the insertion is an atomic
		 * operation.
		 *
		 * This function does an inline lookup on the bind hash list
		 * Make sure that we access only members of tcp_t
		 * and that we don't look at tcp_tcp, since we are not
		 * doing a CONN_INC_REF.
		 */
		tcp_bind_hash_remove(tcp);
		tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
		mutex_enter(&tbf->tf_lock);
		/* Find the bucket-list entry (if any) for this port. */
		for (ltcp = tbf->tf_tcp; ltcp != NULL;
		    ltcp = ltcp->tcp_bind_hash) {
			if (lport == ltcp->tcp_connp->conn_lport)
				break;
		}

		/* Walk every tcp_t bound to this port (per-port chain). */
		for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
			boolean_t not_socket;
			boolean_t exclbind;

			lconnp = ltcp->tcp_connp;

			/*
			 * On a labeled system, we must treat bindings to ports
			 * on shared IP addresses by sockets with MAC exemption
			 * privilege as being in all zones, as there's
			 * otherwise no way to identify the right receiver.
			 */
			if (!IPCL_BIND_ZONE_MATCH(lconnp, connp))
				continue;

			/*
			 * If TCP_EXCLBIND is set for either the bound or
			 * binding endpoint, the semantics of bind
			 * is changed according to the following.
			 *
			 * spec = specified address (v4 or v6)
			 * unspec = unspecified address (v4 or v6)
			 * A = specified addresses are different for endpoints
			 *
			 * bound	bind to		allowed
			 * -------------------------------------
			 * unspec	unspec		no
			 * unspec	spec		no
			 * spec		unspec		no
			 * spec		spec		yes if A
			 *
			 * For labeled systems, SO_MAC_EXEMPT behaves the same
			 * as TCP_EXCLBIND, except that zoneid is ignored.
			 *
			 * Note:
			 *
			 * 1. Because of TLI semantics, an endpoint can go
			 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
			 * TCPS_BOUND, depending on whether it is originally
			 * a listener or not. That is why we need to check
			 * for states greater than or equal to TCPS_BOUND
			 * here.
			 *
			 * 2. Ideally, we should only check for state equals
			 * to TCPS_LISTEN. And the following check should be
			 * added.
			 *
			 * if (ltcp->tcp_state == TCPS_LISTEN ||
			 *	!reuseaddr || !lconnp->conn_reuseaddr) {
			 *		...
			 * }
			 *
			 * The semantics will be changed to this. If the
			 * endpoint on the list is in state not equal to
			 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
			 * set, let the bind succeed.
			 *
			 * Because of (1), we cannot do that for TLI
			 * endpoints. But we can do that for socket endpoints.
			 * If in future, we can change this going back
			 * semantics, we can use the above check for TLI also.
			 */
			not_socket = !(TCP_IS_SOCKET(ltcp) &&
			    TCP_IS_SOCKET(tcp));
			exclbind = lconnp->conn_exclbind ||
			    connp->conn_exclbind;

			if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (connp->conn_mac_mode != CONN_MAC_DEFAULT) ||
			    (exclbind && (not_socket ||
			    ltcp->tcp_state <= TCPS_ESTABLISHED))) {
				/*
				 * Exclusive-bind semantics: any wildcard on
				 * either side, or an exact address match,
				 * makes the port unavailable ("break" means
				 * the port is busy).
				 */
				if (V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) ||
				    V6_OR_V4_INADDR_ANY(*laddr) ||
				    IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6)) {
					break;
				}
				continue;
			}

			/*
			 * Check ipversion to allow IPv4 and IPv6 sockets to
			 * have disjoint port number spaces, if *_EXCLBIND
			 * is not set and only if the application binds to a
			 * specific port. We use the same autoassigned port
			 * number space for IPv4 and IPv6 sockets.
			 */
			if (connp->conn_ipversion != lconnp->conn_ipversion &&
			    bind_to_req_port_only)
				continue;

			/*
			 * Ideally, we should make sure that the source
			 * address, remote address, and remote port in the
			 * four tuple for this tcp-connection is unique.
			 * However, trying to find out the local source
			 * address would require too much code duplication
			 * with IP, since IP needs to have that code
			 * to support userland TCP implementations.
			 */
			if (quick_connect &&
			    (ltcp->tcp_state > TCPS_LISTEN) &&
			    ((connp->conn_fport != lconnp->conn_fport) ||
			    !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6,
			    &lconnp->conn_faddr_v6)))
				continue;

			if (!reuseaddr) {
				/*
				 * No socket option SO_REUSEADDR.
				 * If existing port is bound to
				 * a non-wildcard IP address
				 * and the requesting stream is
				 * bound to a distinct
				 * different IP addresses
				 * (non-wildcard, also), keep
				 * going.
				 */
				if (!V6_OR_V4_INADDR_ANY(*laddr) &&
				    !V6_OR_V4_INADDR_ANY(
				    lconnp->conn_bound_addr_v6) &&
				    !IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6))
					continue;
				if (ltcp->tcp_state >= TCPS_BOUND) {
					/*
					 * This port is being used and
					 * its state is >= TCPS_BOUND,
					 * so we can't bind to it.
					 */
					break;
				}
			} else {
				/*
				 * socket option SO_REUSEADDR is set on the
				 * binding tcp_t.
				 *
				 * If two streams are bound to
				 * same IP address or both addr
				 * and bound source are wildcards
				 * (INADDR_ANY), we want to stop
				 * searching.
				 * We have found a match of IP source
				 * address and source port, which is
				 * refused regardless of the
				 * SO_REUSEADDR setting, so we break.
				 */
				if (IN6_ARE_ADDR_EQUAL(laddr,
				    &lconnp->conn_bound_addr_v6) &&
				    (ltcp->tcp_state == TCPS_LISTEN ||
				    ltcp->tcp_state == TCPS_BOUND))
					break;
			}
		}
		if (ltcp != NULL) {
			/* The port number is busy */
			mutex_exit(&tbf->tf_lock);
		} else {
			/*
			 * This port is ours. Insert in fanout and mark as
			 * bound to prevent others from getting the port
			 * number.
			 */
			tcp->tcp_state = TCPS_BOUND;
			DTRACE_TCP6(state__change, void, NULL,
			    ip_xmit_attr_t *, connp->conn_ixa,
			    void, NULL, tcp_t *, tcp, void, NULL,
			    int32_t, TCPS_IDLE);

			connp->conn_lport = htons(port);

			ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
			    connp->conn_lport)] == tbf);
			/* Still holding tf_lock, hence caller_holds_lock=1. */
			tcp_bind_hash_insert(tbf, tcp, 1);

			mutex_exit(&tbf->tf_lock);

			/*
			 * We don't want tcp_next_port_to_try to "inherit"
			 * a port number supplied by the user in a bind.
			 */
			if (user_specified)
				return (port);

			/*
			 * This is the only place where tcp_next_port_to_try
			 * is updated. After the update, it may or may not
			 * be in the valid range.
			 */
			if (!connp->conn_anon_priv_bind)
				tcps->tcps_next_port_to_try = port + 1;
			return (port);
		}

		if (connp->conn_anon_priv_bind) {
			port = tcp_get_next_priv_port(tcp);
		} else {
			if (count == 0 && user_specified) {
				/*
				 * We may have to return an anonymous port. So
				 * get one to start with.
				 */
				port =
				    tcp_update_next_port(
				    tcps->tcps_next_port_to_try,
				    tcp, B_TRUE);
				user_specified = B_FALSE;
			} else {
				port = tcp_update_next_port(port + 1, tcp,
				    B_FALSE);
			}
		}
		if (port == 0)
			break;

		/*
		 * Don't let this loop run forever in the case where
		 * all of the anonymous ports are in use.
		 */
	} while (++count < loopmax);
	return (0);
}