1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2016 by Delphix. All rights reserved. 26 */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsun.h> 31 #include <sys/strsubr.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/suntpi.h> 37 #include <sys/xti_inet.h> 38 #include <sys/policy.h> 39 #include <sys/squeue_impl.h> 40 #include <sys/squeue.h> 41 #include <sys/tsol/tnet.h> 42 43 #include <rpc/pmap_prot.h> 44 45 #include <inet/common.h> 46 #include <inet/ip.h> 47 #include <inet/tcp.h> 48 #include <inet/tcp_impl.h> 49 #include <inet/proto_set.h> 50 #include <inet/ipsec_impl.h> 51 52 /* Setable in /etc/system */ 53 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ 54 static uint32_t tcp_random_anon_port = 1; 55 56 static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, 57 cred_t *cr); 58 static in_port_t tcp_get_next_priv_port(const tcp_t *); 59 60 /* 61 * Hash list insertion routine for tcp_t structures. Each hash bucket 62 * contains a list of tcp_t entries, and each entry is bound to a unique 63 * port. If there are multiple tcp_t's that are bound to the same port, then 64 * one of them will be linked into the hash bucket list, and the rest will 65 * hang off of that one entry. For each port, entries bound to a specific IP 66 * address will be inserted before those those bound to INADDR_ANY. 67 */ 68 void 69 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) 70 { 71 tcp_t **tcpp; 72 tcp_t *tcpnext; 73 tcp_t *tcphash; 74 conn_t *connp = tcp->tcp_connp; 75 conn_t *connext; 76 77 if (tcp->tcp_ptpbhn != NULL) { 78 ASSERT(!caller_holds_lock); 79 tcp_bind_hash_remove(tcp); 80 } 81 tcpp = &tbf->tf_tcp; 82 if (!caller_holds_lock) { 83 mutex_enter(&tbf->tf_lock); 84 } else { 85 ASSERT(MUTEX_HELD(&tbf->tf_lock)); 86 } 87 tcphash = tcpp[0]; 88 tcpnext = NULL; 89 if (tcphash != NULL) { 90 /* Look for an entry using the same port */ 91 while ((tcphash = tcpp[0]) != NULL && 92 connp->conn_lport != tcphash->tcp_connp->conn_lport) 93 tcpp = &(tcphash->tcp_bind_hash); 94 95 /* The port was not found, just add to the end */ 96 if (tcphash == NULL) 97 goto insert; 98 99 /* 100 * OK, there already exists an entry bound to the 101 * same port. 102 * 103 * If the new tcp bound to the INADDR_ANY address 104 * and the first one in the list is not bound to 105 * INADDR_ANY we skip all entries until we find the 106 * first one bound to INADDR_ANY. 107 * This makes sure that applications binding to a 108 * specific address get preference over those binding to 109 * INADDR_ANY. 110 */ 111 tcpnext = tcphash; 112 connext = tcpnext->tcp_connp; 113 tcphash = NULL; 114 if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && 115 !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { 116 while ((tcpnext = tcpp[0]) != NULL) { 117 connext = tcpnext->tcp_connp; 118 if (!V6_OR_V4_INADDR_ANY( 119 connext->conn_bound_addr_v6)) 120 tcpp = &(tcpnext->tcp_bind_hash_port); 121 else 122 break; 123 } 124 if (tcpnext != NULL) { 125 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; 126 tcphash = tcpnext->tcp_bind_hash; 127 if (tcphash != NULL) { 128 tcphash->tcp_ptpbhn = 129 &(tcp->tcp_bind_hash); 130 tcpnext->tcp_bind_hash = NULL; 131 } 132 } 133 } else { 134 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; 135 tcphash = tcpnext->tcp_bind_hash; 136 if (tcphash != NULL) { 137 tcphash->tcp_ptpbhn = 138 &(tcp->tcp_bind_hash); 139 tcpnext->tcp_bind_hash = NULL; 140 } 141 } 142 } 143 insert: 144 tcp->tcp_bind_hash_port = tcpnext; 145 tcp->tcp_bind_hash = tcphash; 146 tcp->tcp_ptpbhn = tcpp; 147 tcpp[0] = tcp; 148 if (!caller_holds_lock) 149 mutex_exit(&tbf->tf_lock); 150 } 151 152 /* 153 * Hash list removal routine for tcp_t structures. 154 */ 155 void 156 tcp_bind_hash_remove(tcp_t *tcp) 157 { 158 tcp_t *tcpnext; 159 kmutex_t *lockp; 160 tcp_stack_t *tcps = tcp->tcp_tcps; 161 conn_t *connp = tcp->tcp_connp; 162 163 if (tcp->tcp_ptpbhn == NULL) 164 return; 165 166 /* 167 * Extract the lock pointer in case there are concurrent 168 * hash_remove's for this instance. 169 */ 170 ASSERT(connp->conn_lport != 0); 171 lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH( 172 connp->conn_lport)].tf_lock; 173 174 ASSERT(lockp != NULL); 175 mutex_enter(lockp); 176 if (tcp->tcp_ptpbhn) { 177 tcpnext = tcp->tcp_bind_hash_port; 178 if (tcpnext != NULL) { 179 tcp->tcp_bind_hash_port = NULL; 180 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 181 tcpnext->tcp_bind_hash = tcp->tcp_bind_hash; 182 if (tcpnext->tcp_bind_hash != NULL) { 183 tcpnext->tcp_bind_hash->tcp_ptpbhn = 184 &(tcpnext->tcp_bind_hash); 185 tcp->tcp_bind_hash = NULL; 186 } 187 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) { 188 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 189 tcp->tcp_bind_hash = NULL; 190 } 191 *tcp->tcp_ptpbhn = tcpnext; 192 tcp->tcp_ptpbhn = NULL; 193 } 194 mutex_exit(lockp); 195 } 196 197 /* 198 * Don't let port fall into the privileged range. 199 * Since the extra privileged ports can be arbitrary we also 200 * ensure that we exclude those from consideration. 201 * tcp_g_epriv_ports is not sorted thus we loop over it until 202 * there are no changes. 203 * 204 * Note: No locks are held when inspecting tcp_g_*epriv_ports 205 * but instead the code relies on: 206 * - the fact that the address of the array and its size never changes 207 * - the atomic assignment of the elements of the array 208 * 209 * Returns 0 if there are no more ports available. 210 * 211 * TS note: skip multilevel ports. 212 */ 213 in_port_t 214 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) 215 { 216 int i, bump; 217 boolean_t restart = B_FALSE; 218 tcp_stack_t *tcps = tcp->tcp_tcps; 219 220 if (random && tcp_random_anon_port != 0) { 221 (void) random_get_pseudo_bytes((uint8_t *)&port, 222 sizeof (in_port_t)); 223 /* 224 * Unless changed by a sys admin, the smallest anon port 225 * is 32768 and the largest anon port is 65535. It is 226 * very likely (50%) for the random port to be smaller 227 * than the smallest anon port. When that happens, 228 * add port % (anon port range) to the smallest anon 229 * port to get the random port. It should fall into the 230 * valid anon port range. 231 */ 232 if ((port < tcps->tcps_smallest_anon_port) || 233 (port > tcps->tcps_largest_anon_port)) { 234 if (tcps->tcps_smallest_anon_port == 235 tcps->tcps_largest_anon_port) { 236 bump = 0; 237 } else { 238 bump = port % (tcps->tcps_largest_anon_port - 239 tcps->tcps_smallest_anon_port); 240 } 241 port = tcps->tcps_smallest_anon_port + bump; 242 } 243 } 244 245 retry: 246 if (port < tcps->tcps_smallest_anon_port) 247 port = (in_port_t)tcps->tcps_smallest_anon_port; 248 249 if (port > tcps->tcps_largest_anon_port) { 250 if (restart) 251 return (0); 252 restart = B_TRUE; 253 port = (in_port_t)tcps->tcps_smallest_anon_port; 254 } 255 256 if (port < tcps->tcps_smallest_nonpriv_port) 257 port = (in_port_t)tcps->tcps_smallest_nonpriv_port; 258 259 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 260 if (port == tcps->tcps_g_epriv_ports[i]) { 261 port++; 262 /* 263 * Make sure whether the port is in the 264 * valid range. 265 */ 266 goto retry; 267 } 268 } 269 if (is_system_labeled() && 270 (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port, 271 IPPROTO_TCP, B_TRUE)) != 0) { 272 port = i; 273 goto retry; 274 } 275 return (port); 276 } 277 278 /* 279 * Return the next anonymous port in the privileged port range for 280 * bind checking. It starts at IPPORT_RESERVED - 1 and goes 281 * downwards. This is the same behavior as documented in the userland 282 * library call rresvport(3N). 283 * 284 * TS note: skip multilevel ports. 285 */ 286 static in_port_t 287 tcp_get_next_priv_port(const tcp_t *tcp) 288 { 289 static in_port_t next_priv_port = IPPORT_RESERVED - 1; 290 in_port_t nextport; 291 boolean_t restart = B_FALSE; 292 tcp_stack_t *tcps = tcp->tcp_tcps; 293 retry: 294 if (next_priv_port < tcps->tcps_min_anonpriv_port || 295 next_priv_port >= IPPORT_RESERVED) { 296 next_priv_port = IPPORT_RESERVED - 1; 297 if (restart) 298 return (0); 299 restart = B_TRUE; 300 } 301 if (is_system_labeled() && 302 (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), 303 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { 304 next_priv_port = nextport; 305 goto retry; 306 } 307 return (next_priv_port--); 308 } 309 310 static int 311 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, 312 boolean_t bind_to_req_port_only, cred_t *cr) 313 { 314 in_port_t mlp_port; 315 mlp_type_t addrtype, mlptype; 316 boolean_t user_specified; 317 in_port_t allocated_port; 318 in_port_t requested_port = *requested_port_ptr; 319 conn_t *connp = tcp->tcp_connp; 320 zone_t *zone; 321 tcp_stack_t *tcps = tcp->tcp_tcps; 322 in6_addr_t v6addr = connp->conn_laddr_v6; 323 324 /* 325 * XXX It's up to the caller to specify bind_to_req_port_only or not. 326 */ 327 ASSERT(cr != NULL); 328 329 /* 330 * Get a valid port (within the anonymous range and should not 331 * be a privileged one) to use if the user has not given a port. 332 * If multiple threads are here, they may all start with 333 * with the same initial port. But, it should be fine as long as 334 * tcp_bindi will ensure that no two threads will be assigned 335 * the same port. 336 * 337 * NOTE: XXX If a privileged process asks for an anonymous port, we 338 * still check for ports only in the range > tcp_smallest_non_priv_port, 339 * unless TCP_ANONPRIVBIND option is set. 340 */ 341 mlptype = mlptSingle; 342 mlp_port = requested_port; 343 if (requested_port == 0) { 344 requested_port = connp->conn_anon_priv_bind ? 345 tcp_get_next_priv_port(tcp) : 346 tcp_update_next_port(tcps->tcps_next_port_to_try, 347 tcp, B_TRUE); 348 if (requested_port == 0) { 349 return (-TNOADDR); 350 } 351 user_specified = B_FALSE; 352 353 /* 354 * If the user went through one of the RPC interfaces to create 355 * this socket and RPC is MLP in this zone, then give them an 356 * anonymous MLP. 357 */ 358 if (connp->conn_anon_mlp && is_system_labeled()) { 359 zone = crgetzone(cr); 360 addrtype = tsol_mlp_addr_type( 361 connp->conn_allzones ? ALL_ZONES : zone->zone_id, 362 IPV6_VERSION, &v6addr, 363 tcps->tcps_netstack->netstack_ip); 364 if (addrtype == mlptSingle) { 365 return (-TNOADDR); 366 } 367 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 368 PMAPPORT, addrtype); 369 mlp_port = PMAPPORT; 370 } 371 } else { 372 int i; 373 boolean_t priv = B_FALSE; 374 375 /* 376 * If the requested_port is in the well-known privileged range, 377 * verify that the stream was opened by a privileged user. 378 * Note: No locks are held when inspecting tcp_g_*epriv_ports 379 * but instead the code relies on: 380 * - the fact that the address of the array and its size never 381 * changes 382 * - the atomic assignment of the elements of the array 383 */ 384 if (requested_port < tcps->tcps_smallest_nonpriv_port) { 385 priv = B_TRUE; 386 } else { 387 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 388 if (requested_port == 389 tcps->tcps_g_epriv_ports[i]) { 390 priv = B_TRUE; 391 break; 392 } 393 } 394 } 395 if (priv) { 396 if (secpolicy_net_privaddr(cr, requested_port, 397 IPPROTO_TCP) != 0) { 398 if (connp->conn_debug) { 399 (void) strlog(TCP_MOD_ID, 0, 1, 400 SL_ERROR|SL_TRACE, 401 "tcp_bind: no priv for port %d", 402 requested_port); 403 } 404 return (-TACCES); 405 } 406 } 407 user_specified = B_TRUE; 408 409 connp = tcp->tcp_connp; 410 if (is_system_labeled()) { 411 zone = crgetzone(cr); 412 addrtype = tsol_mlp_addr_type( 413 connp->conn_allzones ? ALL_ZONES : zone->zone_id, 414 IPV6_VERSION, &v6addr, 415 tcps->tcps_netstack->netstack_ip); 416 if (addrtype == mlptSingle) { 417 return (-TNOADDR); 418 } 419 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 420 requested_port, addrtype); 421 } 422 } 423 424 if (mlptype != mlptSingle) { 425 if (secpolicy_net_bindmlp(cr) != 0) { 426 if (connp->conn_debug) { 427 (void) strlog(TCP_MOD_ID, 0, 1, 428 SL_ERROR|SL_TRACE, 429 "tcp_bind: no priv for multilevel port %d", 430 requested_port); 431 } 432 return (-TACCES); 433 } 434 435 /* 436 * If we're specifically binding a shared IP address and the 437 * port is MLP on shared addresses, then check to see if this 438 * zone actually owns the MLP. Reject if not. 439 */ 440 if (mlptype == mlptShared && addrtype == mlptShared) { 441 /* 442 * No need to handle exclusive-stack zones since 443 * ALL_ZONES only applies to the shared stack. 444 */ 445 zoneid_t mlpzone; 446 447 mlpzone = tsol_mlp_findzone(IPPROTO_TCP, 448 htons(mlp_port)); 449 if (connp->conn_zoneid != mlpzone) { 450 if (connp->conn_debug) { 451 (void) strlog(TCP_MOD_ID, 0, 1, 452 SL_ERROR|SL_TRACE, 453 "tcp_bind: attempt to bind port " 454 "%d on shared addr in zone %d " 455 "(should be %d)", 456 mlp_port, connp->conn_zoneid, 457 mlpzone); 458 } 459 return (-TACCES); 460 } 461 } 462 463 if (!user_specified) { 464 int err; 465 err = tsol_mlp_anon(zone, mlptype, connp->conn_proto, 466 requested_port, B_TRUE); 467 if (err != 0) { 468 if (connp->conn_debug) { 469 (void) strlog(TCP_MOD_ID, 0, 1, 470 SL_ERROR|SL_TRACE, 471 "tcp_bind: cannot establish anon " 472 "MLP for port %d", 473 requested_port); 474 } 475 return (err); 476 } 477 connp->conn_anon_port = B_TRUE; 478 } 479 connp->conn_mlp_type = mlptype; 480 } 481 482 allocated_port = tcp_bindi(tcp, requested_port, &v6addr, 483 connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, 484 user_specified); 485 486 if (allocated_port == 0) { 487 connp->conn_mlp_type = mlptSingle; 488 if (connp->conn_anon_port) { 489 connp->conn_anon_port = B_FALSE; 490 (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto, 491 requested_port, B_FALSE); 492 } 493 if (bind_to_req_port_only) { 494 if (connp->conn_debug) { 495 (void) strlog(TCP_MOD_ID, 0, 1, 496 SL_ERROR|SL_TRACE, 497 "tcp_bind: requested addr busy"); 498 } 499 return (-TADDRBUSY); 500 } else { 501 /* If we are out of ports, fail the bind. */ 502 if (connp->conn_debug) { 503 (void) strlog(TCP_MOD_ID, 0, 1, 504 SL_ERROR|SL_TRACE, 505 "tcp_bind: out of ports?"); 506 } 507 return (-TNOADDR); 508 } 509 } 510 511 /* Pass the allocated port back */ 512 *requested_port_ptr = allocated_port; 513 return (0); 514 } 515 516 /* 517 * Check the address and check/pick a local port number. 518 */ 519 int 520 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, 521 boolean_t bind_to_req_port_only) 522 { 523 tcp_t *tcp = connp->conn_tcp; 524 sin_t *sin; 525 sin6_t *sin6; 526 in_port_t requested_port; 527 ipaddr_t v4addr; 528 in6_addr_t v6addr; 529 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 530 zoneid_t zoneid = IPCL_ZONEID(connp); 531 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 532 uint_t scopeid = 0; 533 int error = 0; 534 ip_xmit_attr_t *ixa = connp->conn_ixa; 535 536 ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); 537 538 if (tcp->tcp_state == TCPS_BOUND) { 539 return (0); 540 } else if (tcp->tcp_state > TCPS_BOUND) { 541 if (connp->conn_debug) { 542 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 543 "tcp_bind: bad state, %d", tcp->tcp_state); 544 } 545 return (-TOUTSTATE); 546 } 547 548 ASSERT(sa != NULL && len != 0); 549 550 if (!OK_32PTR((char *)sa)) { 551 if (connp->conn_debug) { 552 (void) strlog(TCP_MOD_ID, 0, 1, 553 SL_ERROR|SL_TRACE, 554 "tcp_bind: bad address parameter, " 555 "address %p, len %d", 556 (void *)sa, len); 557 } 558 return (-TPROTO); 559 } 560 561 error = proto_verify_ip_addr(connp->conn_family, sa, len); 562 if (error != 0) { 563 return (error); 564 } 565 566 switch (len) { 567 case sizeof (sin_t): /* Complete IPv4 address */ 568 sin = (sin_t *)sa; 569 requested_port = ntohs(sin->sin_port); 570 v4addr = sin->sin_addr.s_addr; 571 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); 572 if (v4addr != INADDR_ANY) { 573 laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst, 574 B_FALSE); 575 } 576 break; 577 578 case sizeof (sin6_t): /* Complete IPv6 address */ 579 sin6 = (sin6_t *)sa; 580 v6addr = sin6->sin6_addr; 581 requested_port = ntohs(sin6->sin6_port); 582 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) { 583 if (connp->conn_ipv6_v6only) 584 return (EADDRNOTAVAIL); 585 586 IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr); 587 if (v4addr != INADDR_ANY) { 588 laddr_type = ip_laddr_verify_v4(v4addr, 589 zoneid, ipst, B_FALSE); 590 } 591 } else { 592 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) { 593 if (IN6_IS_ADDR_LINKSCOPE(&v6addr)) 594 scopeid = sin6->sin6_scope_id; 595 laddr_type = ip_laddr_verify_v6(&v6addr, 596 zoneid, ipst, B_FALSE, scopeid); 597 } 598 } 599 break; 600 601 default: 602 if (connp->conn_debug) { 603 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 604 "tcp_bind: bad address length, %d", len); 605 } 606 return (EAFNOSUPPORT); 607 /* return (-TBADADDR); */ 608 } 609 610 /* Is the local address a valid unicast address? */ 611 if (laddr_type == IPVL_BAD) 612 return (EADDRNOTAVAIL); 613 614 connp->conn_bound_addr_v6 = v6addr; 615 if (scopeid != 0) { 616 ixa->ixa_flags |= IXAF_SCOPEID_SET; 617 ixa->ixa_scopeid = scopeid; 618 connp->conn_incoming_ifindex = scopeid; 619 } else { 620 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 621 connp->conn_incoming_ifindex = connp->conn_bound_if; 622 } 623 624 connp->conn_laddr_v6 = v6addr; 625 connp->conn_saddr_v6 = v6addr; 626 627 bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; 628 629 error = tcp_bind_select_lport(tcp, &requested_port, 630 bind_to_req_port_only, cr); 631 if (error != 0) { 632 connp->conn_laddr_v6 = ipv6_all_zeros; 633 connp->conn_saddr_v6 = ipv6_all_zeros; 634 connp->conn_bound_addr_v6 = ipv6_all_zeros; 635 } 636 return (error); 637 } 638 639 /* 640 * If the "bind_to_req_port_only" parameter is set, if the requested port 641 * number is available, return it, If not return 0 642 * 643 * If "bind_to_req_port_only" parameter is not set and 644 * If the requested port number is available, return it. If not, return 645 * the first anonymous port we happen across. If no anonymous ports are 646 * available, return 0. addr is the requested local address, if any. 647 * 648 * In either case, when succeeding update the tcp_t to record the port number 649 * and insert it in the bind hash table. 650 * 651 * Note that TCP over IPv4 and IPv6 sockets can use the same port number 652 * without setting SO_REUSEADDR. This is needed so that they 653 * can be viewed as two independent transport protocols. 654 */ 655 in_port_t 656 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 657 int reuseaddr, boolean_t quick_connect, 658 boolean_t bind_to_req_port_only, boolean_t user_specified) 659 { 660 /* number of times we have run around the loop */ 661 int count = 0; 662 /* maximum number of times to run around the loop */ 663 int loopmax; 664 conn_t *connp = tcp->tcp_connp; 665 tcp_stack_t *tcps = tcp->tcp_tcps; 666 667 /* 668 * Lookup for free addresses is done in a loop and "loopmax" 669 * influences how long we spin in the loop 670 */ 671 if (bind_to_req_port_only) { 672 /* 673 * If the requested port is busy, don't bother to look 674 * for a new one. Setting loop maximum count to 1 has 675 * that effect. 676 */ 677 loopmax = 1; 678 } else { 679 /* 680 * If the requested port is busy, look for a free one 681 * in the anonymous port range. 682 * Set loopmax appropriately so that one does not look 683 * forever in the case all of the anonymous ports are in use. 684 */ 685 if (connp->conn_anon_priv_bind) { 686 /* 687 * loopmax = 688 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 689 */ 690 loopmax = IPPORT_RESERVED - 691 tcps->tcps_min_anonpriv_port; 692 } else { 693 loopmax = (tcps->tcps_largest_anon_port - 694 tcps->tcps_smallest_anon_port + 1); 695 } 696 } 697 do { 698 uint16_t lport; 699 tf_t *tbf; 700 tcp_t *ltcp; 701 conn_t *lconnp; 702 703 lport = htons(port); 704 705 /* 706 * Ensure that the tcp_t is not currently in the bind hash. 707 * Hold the lock on the hash bucket to ensure that 708 * the duplicate check plus the insertion is an atomic 709 * operation. 710 * 711 * This function does an inline lookup on the bind hash list 712 * Make sure that we access only members of tcp_t 713 * and that we don't look at tcp_tcp, since we are not 714 * doing a CONN_INC_REF. 715 */ 716 tcp_bind_hash_remove(tcp); 717 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; 718 mutex_enter(&tbf->tf_lock); 719 for (ltcp = tbf->tf_tcp; ltcp != NULL; 720 ltcp = ltcp->tcp_bind_hash) { 721 if (lport == ltcp->tcp_connp->conn_lport) 722 break; 723 } 724 725 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { 726 boolean_t not_socket; 727 boolean_t exclbind; 728 729 lconnp = ltcp->tcp_connp; 730 731 /* 732 * On a labeled system, we must treat bindings to ports 733 * on shared IP addresses by sockets with MAC exemption 734 * privilege as being in all zones, as there's 735 * otherwise no way to identify the right receiver. 736 */ 737 if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) 738 continue; 739 740 /* 741 * If TCP_EXCLBIND is set for either the bound or 742 * binding endpoint, the semantics of bind 743 * is changed according to the following. 744 * 745 * spec = specified address (v4 or v6) 746 * unspec = unspecified address (v4 or v6) 747 * A = specified addresses are different for endpoints 748 * 749 * bound bind to allowed 750 * ------------------------------------- 751 * unspec unspec no 752 * unspec spec no 753 * spec unspec no 754 * spec spec yes if A 755 * 756 * For labeled systems, SO_MAC_EXEMPT behaves the same 757 * as TCP_EXCLBIND, except that zoneid is ignored. 758 * 759 * Note: 760 * 761 * 1. Because of TLI semantics, an endpoint can go 762 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or 763 * TCPS_BOUND, depending on whether it is originally 764 * a listener or not. That is why we need to check 765 * for states greater than or equal to TCPS_BOUND 766 * here. 767 * 768 * 2. Ideally, we should only check for state equals 769 * to TCPS_LISTEN. And the following check should be 770 * added. 771 * 772 * if (ltcp->tcp_state == TCPS_LISTEN || 773 * !reuseaddr || !lconnp->conn_reuseaddr) { 774 * ... 775 * } 776 * 777 * The semantics will be changed to this. If the 778 * endpoint on the list is in state not equal to 779 * TCPS_LISTEN and both endpoints have SO_REUSEADDR 780 * set, let the bind succeed. 781 * 782 * Because of (1), we cannot do that for TLI 783 * endpoints. But we can do that for socket endpoints. 784 * If in future, we can change this going back 785 * semantics, we can use the above check for TLI also. 786 */ 787 not_socket = !(TCP_IS_SOCKET(ltcp) && 788 TCP_IS_SOCKET(tcp)); 789 exclbind = lconnp->conn_exclbind || 790 connp->conn_exclbind; 791 792 if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || 793 (connp->conn_mac_mode != CONN_MAC_DEFAULT) || 794 (exclbind && (not_socket || 795 ltcp->tcp_state <= TCPS_ESTABLISHED))) { 796 if (V6_OR_V4_INADDR_ANY( 797 lconnp->conn_bound_addr_v6) || 798 V6_OR_V4_INADDR_ANY(*laddr) || 799 IN6_ARE_ADDR_EQUAL(laddr, 800 &lconnp->conn_bound_addr_v6)) { 801 break; 802 } 803 continue; 804 } 805 806 /* 807 * Check ipversion to allow IPv4 and IPv6 sockets to 808 * have disjoint port number spaces, if *_EXCLBIND 809 * is not set and only if the application binds to a 810 * specific port. We use the same autoassigned port 811 * number space for IPv4 and IPv6 sockets. 812 */ 813 if (connp->conn_ipversion != lconnp->conn_ipversion && 814 bind_to_req_port_only) 815 continue; 816 817 /* 818 * Ideally, we should make sure that the source 819 * address, remote address, and remote port in the 820 * four tuple for this tcp-connection is unique. 821 * However, trying to find out the local source 822 * address would require too much code duplication 823 * with IP, since IP needs needs to have that code 824 * to support userland TCP implementations. 825 */ 826 if (quick_connect && 827 (ltcp->tcp_state > TCPS_LISTEN) && 828 ((connp->conn_fport != lconnp->conn_fport) || 829 !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, 830 &lconnp->conn_faddr_v6))) 831 continue; 832 833 if (!reuseaddr) { 834 /* 835 * No socket option SO_REUSEADDR. 836 * If existing port is bound to 837 * a non-wildcard IP address 838 * and the requesting stream is 839 * bound to a distinct 840 * different IP addresses 841 * (non-wildcard, also), keep 842 * going. 843 */ 844 if (!V6_OR_V4_INADDR_ANY(*laddr) && 845 !V6_OR_V4_INADDR_ANY( 846 lconnp->conn_bound_addr_v6) && 847 !IN6_ARE_ADDR_EQUAL(laddr, 848 &lconnp->conn_bound_addr_v6)) 849 continue; 850 if (ltcp->tcp_state >= TCPS_BOUND) { 851 /* 852 * This port is being used and 853 * its state is >= TCPS_BOUND, 854 * so we can't bind to it. 855 */ 856 break; 857 } 858 } else { 859 /* 860 * socket option SO_REUSEADDR is set on the 861 * binding tcp_t. 862 * 863 * If two streams are bound to 864 * same IP address or both addr 865 * and bound source are wildcards 866 * (INADDR_ANY), we want to stop 867 * searching. 868 * We have found a match of IP source 869 * address and source port, which is 870 * refused regardless of the 871 * SO_REUSEADDR setting, so we break. 872 */ 873 if (IN6_ARE_ADDR_EQUAL(laddr, 874 &lconnp->conn_bound_addr_v6) && 875 (ltcp->tcp_state == TCPS_LISTEN || 876 ltcp->tcp_state == TCPS_BOUND)) 877 break; 878 } 879 } 880 if (ltcp != NULL) { 881 /* The port number is busy */ 882 mutex_exit(&tbf->tf_lock); 883 } else { 884 /* 885 * This port is ours. Insert in fanout and mark as 886 * bound to prevent others from getting the port 887 * number. 888 */ 889 tcp->tcp_state = TCPS_BOUND; 890 DTRACE_TCP6(state__change, void, NULL, 891 ip_xmit_attr_t *, connp->conn_ixa, 892 void, NULL, tcp_t *, tcp, void, NULL, 893 int32_t, TCPS_IDLE); 894 895 connp->conn_lport = htons(port); 896 897 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( 898 connp->conn_lport)] == tbf); 899 tcp_bind_hash_insert(tbf, tcp, 1); 900 901 mutex_exit(&tbf->tf_lock); 902 903 /* 904 * We don't want tcp_next_port_to_try to "inherit" 905 * a port number supplied by the user in a bind. 906 */ 907 if (user_specified) 908 return (port); 909 910 /* 911 * This is the only place where tcp_next_port_to_try 912 * is updated. After the update, it may or may not 913 * be in the valid range. 914 */ 915 if (!connp->conn_anon_priv_bind) 916 tcps->tcps_next_port_to_try = port + 1; 917 return (port); 918 } 919 920 if (connp->conn_anon_priv_bind) { 921 port = tcp_get_next_priv_port(tcp); 922 } else { 923 if (count == 0 && user_specified) { 924 /* 925 * We may have to return an anonymous port. So 926 * get one to start with. 927 */ 928 port = 929 tcp_update_next_port( 930 tcps->tcps_next_port_to_try, 931 tcp, B_TRUE); 932 user_specified = B_FALSE; 933 } else { 934 port = tcp_update_next_port(port + 1, tcp, 935 B_FALSE); 936 } 937 } 938 if (port == 0) 939 break; 940 941 /* 942 * Don't let this loop run forever in the case where 943 * all of the anonymous ports are in use. 944 */ 945 } while (++count < loopmax); 946 return (0); 947 } 948