1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 25 * Copyright (c) 2016 by Delphix. All rights reserved. 26 */ 27 28 #include <sys/types.h> 29 #include <sys/stream.h> 30 #include <sys/strsun.h> 31 #include <sys/strsubr.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #define _SUN_TPI_VERSION 2 35 #include <sys/tihdr.h> 36 #include <sys/suntpi.h> 37 #include <sys/xti_inet.h> 38 #include <sys/policy.h> 39 #include <sys/squeue_impl.h> 40 #include <sys/squeue.h> 41 #include <sys/tsol/tnet.h> 42 43 #include <rpc/pmap_prot.h> 44 45 #include <inet/common.h> 46 #include <inet/ip.h> 47 #include <inet/tcp.h> 48 #include <inet/tcp_impl.h> 49 #include <inet/proto_set.h> 50 #include <inet/ipsec_impl.h> 51 52 /* Setable in /etc/system */ 53 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ 54 static uint32_t tcp_random_anon_port = 1; 55 56 static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, 57 cred_t *cr); 58 static in_port_t tcp_get_next_priv_port(const tcp_t *); 59 60 /* 61 * Hash list insertion routine for tcp_t structures. Each hash bucket 62 * contains a list of tcp_t entries, and each entry is bound to a unique 63 * port. If there are multiple tcp_t's that are bound to the same port, then 64 * one of them will be linked into the hash bucket list, and the rest will 65 * hang off of that one entry. For each port, entries bound to a specific IP 66 * address will be inserted before those those bound to INADDR_ANY. 67 */ 68 void 69 tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) 70 { 71 tcp_t **tcpp; 72 tcp_t *tcpnext; 73 tcp_t *tcphash; 74 conn_t *connp = tcp->tcp_connp; 75 conn_t *connext; 76 77 if (tcp->tcp_ptpbhn != NULL) { 78 ASSERT(!caller_holds_lock); 79 tcp_bind_hash_remove(tcp); 80 } 81 tcpp = &tbf->tf_tcp; 82 if (!caller_holds_lock) { 83 mutex_enter(&tbf->tf_lock); 84 } else { 85 ASSERT(MUTEX_HELD(&tbf->tf_lock)); 86 } 87 tcphash = tcpp[0]; 88 tcpnext = NULL; 89 if (tcphash != NULL) { 90 /* Look for an entry using the same port */ 91 while ((tcphash = tcpp[0]) != NULL && 92 connp->conn_lport != tcphash->tcp_connp->conn_lport) 93 tcpp = &(tcphash->tcp_bind_hash); 94 95 /* The port was not found, just add to the end */ 96 if (tcphash == NULL) 97 goto insert; 98 99 /* 100 * OK, there already exists an entry bound to the 101 * same port. 102 * 103 * If the new tcp bound to the INADDR_ANY address 104 * and the first one in the list is not bound to 105 * INADDR_ANY we skip all entries until we find the 106 * first one bound to INADDR_ANY. 107 * This makes sure that applications binding to a 108 * specific address get preference over those binding to 109 * INADDR_ANY. 110 */ 111 tcpnext = tcphash; 112 connext = tcpnext->tcp_connp; 113 tcphash = NULL; 114 if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && 115 !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { 116 while ((tcpnext = tcpp[0]) != NULL) { 117 connext = tcpnext->tcp_connp; 118 if (!V6_OR_V4_INADDR_ANY( 119 connext->conn_bound_addr_v6)) 120 tcpp = &(tcpnext->tcp_bind_hash_port); 121 else 122 break; 123 } 124 if (tcpnext != NULL) { 125 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; 126 tcphash = tcpnext->tcp_bind_hash; 127 if (tcphash != NULL) { 128 tcphash->tcp_ptpbhn = 129 &(tcp->tcp_bind_hash); 130 tcpnext->tcp_bind_hash = NULL; 131 } 132 } 133 } else { 134 tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; 135 tcphash = tcpnext->tcp_bind_hash; 136 if (tcphash != NULL) { 137 tcphash->tcp_ptpbhn = 138 &(tcp->tcp_bind_hash); 139 tcpnext->tcp_bind_hash = NULL; 140 } 141 } 142 } 143 insert: 144 tcp->tcp_bind_hash_port = tcpnext; 145 tcp->tcp_bind_hash = tcphash; 146 tcp->tcp_ptpbhn = tcpp; 147 tcpp[0] = tcp; 148 if (!caller_holds_lock) 149 mutex_exit(&tbf->tf_lock); 150 } 151 152 /* 153 * Hash list removal routine for tcp_t structures. 154 */ 155 void 156 tcp_bind_hash_remove(tcp_t *tcp) 157 { 158 tcp_t *tcpnext; 159 kmutex_t *lockp; 160 tcp_stack_t *tcps = tcp->tcp_tcps; 161 conn_t *connp = tcp->tcp_connp; 162 163 if (tcp->tcp_ptpbhn == NULL) 164 return; 165 166 /* 167 * Extract the lock pointer in case there are concurrent 168 * hash_remove's for this instance. 169 */ 170 ASSERT(connp->conn_lport != 0); 171 lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH( 172 connp->conn_lport)].tf_lock; 173 174 ASSERT(lockp != NULL); 175 mutex_enter(lockp); 176 if (tcp->tcp_ptpbhn) { 177 tcpnext = tcp->tcp_bind_hash_port; 178 if (tcpnext != NULL) { 179 tcp->tcp_bind_hash_port = NULL; 180 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 181 tcpnext->tcp_bind_hash = tcp->tcp_bind_hash; 182 if (tcpnext->tcp_bind_hash != NULL) { 183 tcpnext->tcp_bind_hash->tcp_ptpbhn = 184 &(tcpnext->tcp_bind_hash); 185 tcp->tcp_bind_hash = NULL; 186 } 187 } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) { 188 tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; 189 tcp->tcp_bind_hash = NULL; 190 } 191 *tcp->tcp_ptpbhn = tcpnext; 192 tcp->tcp_ptpbhn = NULL; 193 } 194 mutex_exit(lockp); 195 } 196 197 /* 198 * Don't let port fall into the privileged range. 199 * Since the extra privileged ports can be arbitrary we also 200 * ensure that we exclude those from consideration. 201 * tcp_g_epriv_ports is not sorted thus we loop over it until 202 * there are no changes. 203 * 204 * Note: No locks are held when inspecting tcp_g_*epriv_ports 205 * but instead the code relies on: 206 * - the fact that the address of the array and its size never changes 207 * - the atomic assignment of the elements of the array 208 * 209 * Returns 0 if there are no more ports available. 210 * 211 * TS note: skip multilevel ports. 212 */ 213 in_port_t 214 tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) 215 { 216 int i, bump; 217 boolean_t restart = B_FALSE; 218 tcp_stack_t *tcps = tcp->tcp_tcps; 219 220 if (random && tcp_random_anon_port != 0) { 221 (void) random_get_pseudo_bytes((uint8_t *)&port, 222 sizeof (in_port_t)); 223 /* 224 * Unless changed by a sys admin, the smallest anon port 225 * is 32768 and the largest anon port is 65535. It is 226 * very likely (50%) for the random port to be smaller 227 * than the smallest anon port. When that happens, 228 * add port % (anon port range) to the smallest anon 229 * port to get the random port. It should fall into the 230 * valid anon port range. 231 */ 232 if ((port < tcps->tcps_smallest_anon_port) || 233 (port > tcps->tcps_largest_anon_port)) { 234 if (tcps->tcps_smallest_anon_port == 235 tcps->tcps_largest_anon_port) { 236 bump = 0; 237 } else { 238 bump = port % (tcps->tcps_largest_anon_port - 239 tcps->tcps_smallest_anon_port); 240 } 241 port = tcps->tcps_smallest_anon_port + bump; 242 } 243 } 244 245 retry: 246 if (port < tcps->tcps_smallest_anon_port) 247 port = (in_port_t)tcps->tcps_smallest_anon_port; 248 249 if (port > tcps->tcps_largest_anon_port) { 250 if (restart) 251 return (0); 252 restart = B_TRUE; 253 port = (in_port_t)tcps->tcps_smallest_anon_port; 254 } 255 256 if (port < tcps->tcps_smallest_nonpriv_port) 257 port = (in_port_t)tcps->tcps_smallest_nonpriv_port; 258 259 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 260 if (port == tcps->tcps_g_epriv_ports[i]) { 261 port++; 262 /* 263 * Make sure whether the port is in the 264 * valid range. 265 */ 266 goto retry; 267 } 268 } 269 if (is_system_labeled() && 270 (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port, 271 IPPROTO_TCP, B_TRUE)) != 0) { 272 port = i; 273 goto retry; 274 } 275 return (port); 276 } 277 278 /* 279 * Return the next anonymous port in the privileged port range for 280 * bind checking. It starts at IPPORT_RESERVED - 1 and goes 281 * downwards. This is the same behavior as documented in the userland 282 * library call rresvport(3SOCKET). 283 * 284 * TS note: skip multilevel ports. 285 */ 286 static in_port_t 287 tcp_get_next_priv_port(const tcp_t *tcp) 288 { 289 static in_port_t next_priv_port = IPPORT_RESERVED - 1; 290 in_port_t nextport; 291 boolean_t restart = B_FALSE; 292 tcp_stack_t *tcps = tcp->tcp_tcps; 293 retry: 294 if (next_priv_port < tcps->tcps_min_anonpriv_port || 295 next_priv_port >= IPPORT_RESERVED) { 296 next_priv_port = IPPORT_RESERVED - 1; 297 if (restart) 298 return (0); 299 restart = B_TRUE; 300 } 301 if (is_system_labeled() && 302 (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), 303 next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { 304 next_priv_port = nextport; 305 goto retry; 306 } 307 return (next_priv_port--); 308 } 309 310 static int 311 tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, 312 boolean_t bind_to_req_port_only, cred_t *cr) 313 { 314 in_port_t mlp_port; 315 mlp_type_t addrtype, mlptype; 316 boolean_t user_specified; 317 in_port_t allocated_port; 318 in_port_t requested_port = *requested_port_ptr; 319 conn_t *connp = tcp->tcp_connp; 320 zone_t *zone; 321 tcp_stack_t *tcps = tcp->tcp_tcps; 322 in6_addr_t v6addr = connp->conn_laddr_v6; 323 324 zone = NULL; 325 /* 326 * XXX It's up to the caller to specify bind_to_req_port_only or not. 327 */ 328 ASSERT(cr != NULL); 329 330 /* 331 * Get a valid port (within the anonymous range and should not 332 * be a privileged one) to use if the user has not given a port. 333 * If multiple threads are here, they may all start with 334 * with the same initial port. But, it should be fine as long as 335 * tcp_bindi will ensure that no two threads will be assigned 336 * the same port. 337 * 338 * NOTE: XXX If a privileged process asks for an anonymous port, we 339 * still check for ports only in the range > tcp_smallest_non_priv_port, 340 * unless TCP_ANONPRIVBIND option is set. 341 */ 342 mlptype = mlptSingle; 343 mlp_port = requested_port; 344 if (requested_port == 0) { 345 requested_port = connp->conn_anon_priv_bind ? 346 tcp_get_next_priv_port(tcp) : 347 tcp_update_next_port(tcps->tcps_next_port_to_try, 348 tcp, B_TRUE); 349 if (requested_port == 0) { 350 return (-TNOADDR); 351 } 352 user_specified = B_FALSE; 353 354 /* 355 * If the user went through one of the RPC interfaces to create 356 * this socket and RPC is MLP in this zone, then give them an 357 * anonymous MLP. 358 */ 359 if (connp->conn_anon_mlp && is_system_labeled()) { 360 zone = crgetzone(cr); 361 addrtype = tsol_mlp_addr_type( 362 connp->conn_allzones ? ALL_ZONES : zone->zone_id, 363 IPV6_VERSION, &v6addr, 364 tcps->tcps_netstack->netstack_ip); 365 if (addrtype == mlptSingle) { 366 return (-TNOADDR); 367 } 368 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 369 PMAPPORT, addrtype); 370 mlp_port = PMAPPORT; 371 } 372 } else { 373 int i; 374 boolean_t priv = B_FALSE; 375 376 /* 377 * If the requested_port is in the well-known privileged range, 378 * verify that the stream was opened by a privileged user. 379 * Note: No locks are held when inspecting tcp_g_*epriv_ports 380 * but instead the code relies on: 381 * - the fact that the address of the array and its size never 382 * changes 383 * - the atomic assignment of the elements of the array 384 */ 385 if (requested_port < tcps->tcps_smallest_nonpriv_port) { 386 priv = B_TRUE; 387 } else { 388 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { 389 if (requested_port == 390 tcps->tcps_g_epriv_ports[i]) { 391 priv = B_TRUE; 392 break; 393 } 394 } 395 } 396 if (priv) { 397 if (secpolicy_net_privaddr(cr, requested_port, 398 IPPROTO_TCP) != 0) { 399 if (connp->conn_debug) { 400 (void) strlog(TCP_MOD_ID, 0, 1, 401 SL_ERROR|SL_TRACE, 402 "tcp_bind: no priv for port %d", 403 requested_port); 404 } 405 return (-TACCES); 406 } 407 } 408 user_specified = B_TRUE; 409 410 connp = tcp->tcp_connp; 411 if (is_system_labeled()) { 412 zone = crgetzone(cr); 413 addrtype = tsol_mlp_addr_type( 414 connp->conn_allzones ? ALL_ZONES : zone->zone_id, 415 IPV6_VERSION, &v6addr, 416 tcps->tcps_netstack->netstack_ip); 417 if (addrtype == mlptSingle) { 418 return (-TNOADDR); 419 } 420 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, 421 requested_port, addrtype); 422 } 423 } 424 425 if (mlptype != mlptSingle) { 426 if (secpolicy_net_bindmlp(cr) != 0) { 427 if (connp->conn_debug) { 428 (void) strlog(TCP_MOD_ID, 0, 1, 429 SL_ERROR|SL_TRACE, 430 "tcp_bind: no priv for multilevel port %d", 431 requested_port); 432 } 433 return (-TACCES); 434 } 435 436 /* 437 * If we're specifically binding a shared IP address and the 438 * port is MLP on shared addresses, then check to see if this 439 * zone actually owns the MLP. Reject if not. 440 */ 441 if (mlptype == mlptShared && addrtype == mlptShared) { 442 /* 443 * No need to handle exclusive-stack zones since 444 * ALL_ZONES only applies to the shared stack. 445 */ 446 zoneid_t mlpzone; 447 448 mlpzone = tsol_mlp_findzone(IPPROTO_TCP, 449 htons(mlp_port)); 450 if (connp->conn_zoneid != mlpzone) { 451 if (connp->conn_debug) { 452 (void) strlog(TCP_MOD_ID, 0, 1, 453 SL_ERROR|SL_TRACE, 454 "tcp_bind: attempt to bind port " 455 "%d on shared addr in zone %d " 456 "(should be %d)", 457 mlp_port, connp->conn_zoneid, 458 mlpzone); 459 } 460 return (-TACCES); 461 } 462 } 463 464 if (!user_specified) { 465 int err; 466 err = tsol_mlp_anon(zone, mlptype, connp->conn_proto, 467 requested_port, B_TRUE); 468 if (err != 0) { 469 if (connp->conn_debug) { 470 (void) strlog(TCP_MOD_ID, 0, 1, 471 SL_ERROR|SL_TRACE, 472 "tcp_bind: cannot establish anon " 473 "MLP for port %d", 474 requested_port); 475 } 476 return (err); 477 } 478 connp->conn_anon_port = B_TRUE; 479 } 480 connp->conn_mlp_type = mlptype; 481 } 482 483 allocated_port = tcp_bindi(tcp, requested_port, &v6addr, 484 connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, 485 user_specified); 486 487 if (allocated_port == 0) { 488 connp->conn_mlp_type = mlptSingle; 489 if (connp->conn_anon_port) { 490 connp->conn_anon_port = B_FALSE; 491 (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto, 492 requested_port, B_FALSE); 493 } 494 if (bind_to_req_port_only) { 495 if (connp->conn_debug) { 496 (void) strlog(TCP_MOD_ID, 0, 1, 497 SL_ERROR|SL_TRACE, 498 "tcp_bind: requested addr busy"); 499 } 500 return (-TADDRBUSY); 501 } else { 502 /* If we are out of ports, fail the bind. */ 503 if (connp->conn_debug) { 504 (void) strlog(TCP_MOD_ID, 0, 1, 505 SL_ERROR|SL_TRACE, 506 "tcp_bind: out of ports?"); 507 } 508 return (-TNOADDR); 509 } 510 } 511 512 /* Pass the allocated port back */ 513 *requested_port_ptr = allocated_port; 514 return (0); 515 } 516 517 /* 518 * Check the address and check/pick a local port number. 519 */ 520 int 521 tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, 522 boolean_t bind_to_req_port_only) 523 { 524 tcp_t *tcp = connp->conn_tcp; 525 sin_t *sin; 526 sin6_t *sin6; 527 in_port_t requested_port; 528 ipaddr_t v4addr; 529 in6_addr_t v6addr; 530 ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ 531 zoneid_t zoneid = IPCL_ZONEID(connp); 532 ip_stack_t *ipst = connp->conn_netstack->netstack_ip; 533 uint_t scopeid = 0; 534 int error = 0; 535 ip_xmit_attr_t *ixa = connp->conn_ixa; 536 537 ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); 538 539 if (tcp->tcp_state == TCPS_BOUND) { 540 return (0); 541 } else if (tcp->tcp_state > TCPS_BOUND) { 542 if (connp->conn_debug) { 543 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 544 "tcp_bind: bad state, %d", tcp->tcp_state); 545 } 546 return (-TOUTSTATE); 547 } 548 549 ASSERT(sa != NULL && len != 0); 550 551 if (!OK_32PTR((char *)sa)) { 552 if (connp->conn_debug) { 553 (void) strlog(TCP_MOD_ID, 0, 1, 554 SL_ERROR|SL_TRACE, 555 "tcp_bind: bad address parameter, " 556 "address %p, len %d", 557 (void *)sa, len); 558 } 559 return (-TPROTO); 560 } 561 562 error = proto_verify_ip_addr(connp->conn_family, sa, len); 563 if (error != 0) { 564 return (error); 565 } 566 567 switch (len) { 568 case sizeof (sin_t): /* Complete IPv4 address */ 569 sin = (sin_t *)sa; 570 requested_port = ntohs(sin->sin_port); 571 v4addr = sin->sin_addr.s_addr; 572 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); 573 if (v4addr != INADDR_ANY) { 574 laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst, 575 B_FALSE); 576 } 577 break; 578 579 case sizeof (sin6_t): /* Complete IPv6 address */ 580 sin6 = (sin6_t *)sa; 581 v6addr = sin6->sin6_addr; 582 requested_port = ntohs(sin6->sin6_port); 583 if (IN6_IS_ADDR_V4MAPPED(&v6addr)) { 584 if (connp->conn_ipv6_v6only) 585 return (EADDRNOTAVAIL); 586 587 IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr); 588 if (v4addr != INADDR_ANY) { 589 laddr_type = ip_laddr_verify_v4(v4addr, 590 zoneid, ipst, B_FALSE); 591 } 592 } else { 593 if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) { 594 if (IN6_IS_ADDR_LINKSCOPE(&v6addr)) 595 scopeid = sin6->sin6_scope_id; 596 laddr_type = ip_laddr_verify_v6(&v6addr, 597 zoneid, ipst, B_FALSE, scopeid); 598 } 599 } 600 break; 601 602 default: 603 if (connp->conn_debug) { 604 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, 605 "tcp_bind: bad address length, %d", len); 606 } 607 return (EAFNOSUPPORT); 608 /* return (-TBADADDR); */ 609 } 610 611 /* Is the local address a valid unicast address? */ 612 if (laddr_type == IPVL_BAD) 613 return (EADDRNOTAVAIL); 614 615 connp->conn_bound_addr_v6 = v6addr; 616 if (scopeid != 0) { 617 ixa->ixa_flags |= IXAF_SCOPEID_SET; 618 ixa->ixa_scopeid = scopeid; 619 connp->conn_incoming_ifindex = scopeid; 620 } else { 621 ixa->ixa_flags &= ~IXAF_SCOPEID_SET; 622 connp->conn_incoming_ifindex = connp->conn_bound_if; 623 } 624 625 connp->conn_laddr_v6 = v6addr; 626 connp->conn_saddr_v6 = v6addr; 627 628 bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; 629 630 error = tcp_bind_select_lport(tcp, &requested_port, 631 bind_to_req_port_only, cr); 632 if (error != 0) { 633 connp->conn_laddr_v6 = ipv6_all_zeros; 634 connp->conn_saddr_v6 = ipv6_all_zeros; 635 connp->conn_bound_addr_v6 = ipv6_all_zeros; 636 } 637 return (error); 638 } 639 640 /* 641 * If the "bind_to_req_port_only" parameter is set, if the requested port 642 * number is available, return it, If not return 0 643 * 644 * If "bind_to_req_port_only" parameter is not set and 645 * If the requested port number is available, return it. If not, return 646 * the first anonymous port we happen across. If no anonymous ports are 647 * available, return 0. addr is the requested local address, if any. 648 * 649 * In either case, when succeeding update the tcp_t to record the port number 650 * and insert it in the bind hash table. 651 * 652 * Note that TCP over IPv4 and IPv6 sockets can use the same port number 653 * without setting SO_REUSEADDR. This is needed so that they 654 * can be viewed as two independent transport protocols. 655 */ 656 in_port_t 657 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, 658 int reuseaddr, boolean_t quick_connect, 659 boolean_t bind_to_req_port_only, boolean_t user_specified) 660 { 661 /* number of times we have run around the loop */ 662 int count = 0; 663 /* maximum number of times to run around the loop */ 664 int loopmax; 665 conn_t *connp = tcp->tcp_connp; 666 tcp_stack_t *tcps = tcp->tcp_tcps; 667 668 /* 669 * Lookup for free addresses is done in a loop and "loopmax" 670 * influences how long we spin in the loop 671 */ 672 if (bind_to_req_port_only) { 673 /* 674 * If the requested port is busy, don't bother to look 675 * for a new one. Setting loop maximum count to 1 has 676 * that effect. 677 */ 678 loopmax = 1; 679 } else { 680 /* 681 * If the requested port is busy, look for a free one 682 * in the anonymous port range. 683 * Set loopmax appropriately so that one does not look 684 * forever in the case all of the anonymous ports are in use. 685 */ 686 if (connp->conn_anon_priv_bind) { 687 /* 688 * loopmax = 689 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 690 */ 691 loopmax = IPPORT_RESERVED - 692 tcps->tcps_min_anonpriv_port; 693 } else { 694 loopmax = (tcps->tcps_largest_anon_port - 695 tcps->tcps_smallest_anon_port + 1); 696 } 697 } 698 do { 699 uint16_t lport; 700 tf_t *tbf; 701 tcp_t *ltcp; 702 conn_t *lconnp; 703 704 lport = htons(port); 705 706 /* 707 * Ensure that the tcp_t is not currently in the bind hash. 708 * Hold the lock on the hash bucket to ensure that 709 * the duplicate check plus the insertion is an atomic 710 * operation. 711 * 712 * This function does an inline lookup on the bind hash list 713 * Make sure that we access only members of tcp_t 714 * and that we don't look at tcp_tcp, since we are not 715 * doing a CONN_INC_REF. 716 */ 717 tcp_bind_hash_remove(tcp); 718 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; 719 mutex_enter(&tbf->tf_lock); 720 for (ltcp = tbf->tf_tcp; ltcp != NULL; 721 ltcp = ltcp->tcp_bind_hash) { 722 if (lport == ltcp->tcp_connp->conn_lport) 723 break; 724 } 725 726 for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { 727 boolean_t not_socket; 728 boolean_t exclbind; 729 730 lconnp = ltcp->tcp_connp; 731 732 /* 733 * On a labeled system, we must treat bindings to ports 734 * on shared IP addresses by sockets with MAC exemption 735 * privilege as being in all zones, as there's 736 * otherwise no way to identify the right receiver. 737 */ 738 if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) 739 continue; 740 741 /* 742 * If TCP_EXCLBIND is set for either the bound or 743 * binding endpoint, the semantics of bind 744 * is changed according to the following. 745 * 746 * spec = specified address (v4 or v6) 747 * unspec = unspecified address (v4 or v6) 748 * A = specified addresses are different for endpoints 749 * 750 * bound bind to allowed 751 * ------------------------------------- 752 * unspec unspec no 753 * unspec spec no 754 * spec unspec no 755 * spec spec yes if A 756 * 757 * For labeled systems, SO_MAC_EXEMPT behaves the same 758 * as TCP_EXCLBIND, except that zoneid is ignored. 759 * 760 * Note: 761 * 762 * 1. Because of TLI semantics, an endpoint can go 763 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or 764 * TCPS_BOUND, depending on whether it is originally 765 * a listener or not. That is why we need to check 766 * for states greater than or equal to TCPS_BOUND 767 * here. 768 * 769 * 2. Ideally, we should only check for state equals 770 * to TCPS_LISTEN. And the following check should be 771 * added. 772 * 773 * if (ltcp->tcp_state == TCPS_LISTEN || 774 * !reuseaddr || !lconnp->conn_reuseaddr) { 775 * ... 776 * } 777 * 778 * The semantics will be changed to this. If the 779 * endpoint on the list is in state not equal to 780 * TCPS_LISTEN and both endpoints have SO_REUSEADDR 781 * set, let the bind succeed. 782 * 783 * Because of (1), we cannot do that for TLI 784 * endpoints. But we can do that for socket endpoints. 785 * If in future, we can change this going back 786 * semantics, we can use the above check for TLI also. 787 */ 788 not_socket = !(TCP_IS_SOCKET(ltcp) && 789 TCP_IS_SOCKET(tcp)); 790 exclbind = lconnp->conn_exclbind || 791 connp->conn_exclbind; 792 793 if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || 794 (connp->conn_mac_mode != CONN_MAC_DEFAULT) || 795 (exclbind && (not_socket || 796 ltcp->tcp_state <= TCPS_ESTABLISHED))) { 797 if (V6_OR_V4_INADDR_ANY( 798 lconnp->conn_bound_addr_v6) || 799 V6_OR_V4_INADDR_ANY(*laddr) || 800 IN6_ARE_ADDR_EQUAL(laddr, 801 &lconnp->conn_bound_addr_v6)) { 802 break; 803 } 804 continue; 805 } 806 807 /* 808 * Check ipversion to allow IPv4 and IPv6 sockets to 809 * have disjoint port number spaces, if *_EXCLBIND 810 * is not set and only if the application binds to a 811 * specific port. We use the same autoassigned port 812 * number space for IPv4 and IPv6 sockets. 813 */ 814 if (connp->conn_ipversion != lconnp->conn_ipversion && 815 bind_to_req_port_only) 816 continue; 817 818 /* 819 * Ideally, we should make sure that the source 820 * address, remote address, and remote port in the 821 * four tuple for this tcp-connection is unique. 822 * However, trying to find out the local source 823 * address would require too much code duplication 824 * with IP, since IP needs needs to have that code 825 * to support userland TCP implementations. 826 */ 827 if (quick_connect && 828 (ltcp->tcp_state > TCPS_LISTEN) && 829 ((connp->conn_fport != lconnp->conn_fport) || 830 !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, 831 &lconnp->conn_faddr_v6))) 832 continue; 833 834 if (!reuseaddr) { 835 /* 836 * No socket option SO_REUSEADDR. 837 * If existing port is bound to 838 * a non-wildcard IP address 839 * and the requesting stream is 840 * bound to a distinct 841 * different IP addresses 842 * (non-wildcard, also), keep 843 * going. 844 */ 845 if (!V6_OR_V4_INADDR_ANY(*laddr) && 846 !V6_OR_V4_INADDR_ANY( 847 lconnp->conn_bound_addr_v6) && 848 !IN6_ARE_ADDR_EQUAL(laddr, 849 &lconnp->conn_bound_addr_v6)) 850 continue; 851 if (ltcp->tcp_state >= TCPS_BOUND) { 852 /* 853 * This port is being used and 854 * its state is >= TCPS_BOUND, 855 * so we can't bind to it. 856 */ 857 break; 858 } 859 } else { 860 /* 861 * socket option SO_REUSEADDR is set on the 862 * binding tcp_t. 863 * 864 * If two streams are bound to 865 * same IP address or both addr 866 * and bound source are wildcards 867 * (INADDR_ANY), we want to stop 868 * searching. 869 * We have found a match of IP source 870 * address and source port, which is 871 * refused regardless of the 872 * SO_REUSEADDR setting, so we break. 873 */ 874 if (IN6_ARE_ADDR_EQUAL(laddr, 875 &lconnp->conn_bound_addr_v6) && 876 (ltcp->tcp_state == TCPS_LISTEN || 877 ltcp->tcp_state == TCPS_BOUND)) 878 break; 879 } 880 } 881 if (ltcp != NULL) { 882 /* The port number is busy */ 883 mutex_exit(&tbf->tf_lock); 884 } else { 885 /* 886 * This port is ours. Insert in fanout and mark as 887 * bound to prevent others from getting the port 888 * number. 889 */ 890 tcp->tcp_state = TCPS_BOUND; 891 DTRACE_TCP6(state__change, void, NULL, 892 ip_xmit_attr_t *, connp->conn_ixa, 893 void, NULL, tcp_t *, tcp, void, NULL, 894 int32_t, TCPS_IDLE); 895 896 connp->conn_lport = htons(port); 897 898 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( 899 connp->conn_lport)] == tbf); 900 tcp_bind_hash_insert(tbf, tcp, 1); 901 902 mutex_exit(&tbf->tf_lock); 903 904 /* 905 * We don't want tcp_next_port_to_try to "inherit" 906 * a port number supplied by the user in a bind. 907 */ 908 if (user_specified) 909 return (port); 910 911 /* 912 * This is the only place where tcp_next_port_to_try 913 * is updated. After the update, it may or may not 914 * be in the valid range. 915 */ 916 if (!connp->conn_anon_priv_bind) 917 tcps->tcps_next_port_to_try = port + 1; 918 return (port); 919 } 920 921 if (connp->conn_anon_priv_bind) { 922 port = tcp_get_next_priv_port(tcp); 923 } else { 924 if (count == 0 && user_specified) { 925 /* 926 * We may have to return an anonymous port. So 927 * get one to start with. 928 */ 929 port = 930 tcp_update_next_port( 931 tcps->tcps_next_port_to_try, 932 tcp, B_TRUE); 933 user_specified = B_FALSE; 934 } else { 935 port = tcp_update_next_port(port + 1, tcp, 936 B_FALSE); 937 } 938 } 939 if (port == 0) 940 break; 941 942 /* 943 * Don't let this loop run forever in the case where 944 * all of the anonymous ports are in use. 945 */ 946 } while (++count < loopmax); 947 return (0); 948 } 949