1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org> 9 * All rights reserved. 10 * 11 * Portions of this software were developed by Robert N. M. Watson under 12 * contract to Juniper Networks, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
37 */ 38 39 #include "opt_ddb.h" 40 #include "opt_ipsec.h" 41 #include "opt_inet.h" 42 #include "opt_inet6.h" 43 #include "opt_ratelimit.h" 44 #include "opt_rss.h" 45 46 #include <sys/param.h> 47 #include <sys/hash.h> 48 #include <sys/systm.h> 49 #include <sys/libkern.h> 50 #include <sys/lock.h> 51 #include <sys/malloc.h> 52 #include <sys/mbuf.h> 53 #include <sys/eventhandler.h> 54 #include <sys/domain.h> 55 #include <sys/proc.h> 56 #include <sys/protosw.h> 57 #include <sys/smp.h> 58 #include <sys/smr.h> 59 #include <sys/socket.h> 60 #include <sys/socketvar.h> 61 #include <sys/sockio.h> 62 #include <sys/priv.h> 63 #include <sys/proc.h> 64 #include <sys/refcount.h> 65 #include <sys/jail.h> 66 #include <sys/kernel.h> 67 #include <sys/sysctl.h> 68 69 #ifdef DDB 70 #include <ddb/ddb.h> 71 #endif 72 73 #include <vm/uma.h> 74 #include <vm/vm.h> 75 76 #include <net/if.h> 77 #include <net/if_var.h> 78 #include <net/if_private.h> 79 #include <net/if_types.h> 80 #include <net/if_llatbl.h> 81 #include <net/route.h> 82 #include <net/rss_config.h> 83 #include <net/vnet.h> 84 85 #if defined(INET) || defined(INET6) 86 #include <netinet/in.h> 87 #include <netinet/in_pcb.h> 88 #include <netinet/in_pcb_var.h> 89 #include <netinet/tcp.h> 90 #ifdef INET 91 #include <netinet/in_var.h> 92 #include <netinet/in_fib.h> 93 #endif 94 #include <netinet/ip_var.h> 95 #ifdef INET6 96 #include <netinet/ip6.h> 97 #include <netinet6/in6_pcb.h> 98 #include <netinet6/in6_var.h> 99 #include <netinet6/ip6_var.h> 100 #endif /* INET6 */ 101 #include <net/route/nhop.h> 102 #endif 103 104 #include <netipsec/ipsec_support.h> 105 106 #include <security/mac/mac_framework.h> 107 108 #define INPCBLBGROUP_SIZMIN 8 109 #define INPCBLBGROUP_SIZMAX 256 110 111 #define INP_FREED 0x00000200 /* Went through in_pcbfree(). */ 112 #define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. 
*/ 113 114 /* 115 * These configure the range of local port addresses assigned to 116 * "unspecified" outgoing connections/packets/whatever. 117 */ 118 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 119 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 120 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 121 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 122 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 123 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 124 125 /* 126 * Reserved ports accessible only to root. There are significant 127 * security considerations that must be accounted for when changing these, 128 * but the security benefits can be great. Please be careful. 129 */ 130 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 131 VNET_DEFINE(int, ipport_reservedlow); 132 133 /* Enable random ephemeral port allocation by default. 
*/ 134 VNET_DEFINE(int, ipport_randomized) = 1; 135 136 #ifdef INET 137 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 138 struct in_addr faddr, u_int fport_arg, 139 struct in_addr laddr, u_int lport_arg, 140 int lookupflags, uint8_t numa_domain, int fib); 141 142 #define RANGECHK(var, min, max) \ 143 if ((var) < (min)) { (var) = (min); } \ 144 else if ((var) > (max)) { (var) = (max); } 145 146 static int 147 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 148 { 149 int error; 150 151 error = sysctl_handle_int(oidp, arg1, arg2, req); 152 if (error == 0) { 153 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 154 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 155 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 156 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 157 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 158 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 159 } 160 return (error); 161 } 162 163 #undef RANGECHK 164 165 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 166 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 167 "IP Ports"); 168 169 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 170 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 171 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 172 ""); 173 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 174 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 175 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 176 ""); 177 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 178 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 179 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 180 ""); 181 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 182 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 183 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 184 ""); 185 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 
186 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 187 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 188 ""); 189 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 190 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 191 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 192 ""); 193 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 194 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 195 &VNET_NAME(ipport_reservedhigh), 0, ""); 196 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 197 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 198 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 199 CTLFLAG_VNET | CTLFLAG_RW, 200 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 201 202 #ifdef RATELIMIT 203 counter_u64_t rate_limit_new; 204 counter_u64_t rate_limit_chg; 205 counter_u64_t rate_limit_active; 206 counter_u64_t rate_limit_alloc_fail; 207 counter_u64_t rate_limit_set_ok; 208 209 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 210 "IP Rate Limiting"); 211 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 212 &rate_limit_active, "Active rate limited connections"); 213 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 214 &rate_limit_alloc_fail, "Rate limited connection failures"); 215 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 216 &rate_limit_set_ok, "Rate limited setting succeeded"); 217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 218 &rate_limit_new, "Total Rate limit new attempts"); 219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 220 &rate_limit_chg, "Total Rate limited change attempts"); 221 #endif /* RATELIMIT */ 222 223 #endif /* INET */ 224 225 VNET_DEFINE(uint32_t, in_pcbhashseed); 226 static void 227 in_pcbhashseed_init(void) 228 { 229 230 V_in_pcbhashseed = arc4random(); 231 } 232 
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, 233 in_pcbhashseed_init, NULL); 234 235 #ifdef INET 236 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0; 237 #define V_connect_inaddr_wild VNET(connect_inaddr_wild) 238 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild, 239 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0, 240 "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)"); 241 #endif 242 243 /* 244 * in_pcb.c: manage the Protocol Control Blocks. 245 * 246 * NOTE: It is assumed that most of these functions will be called with 247 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 248 * functions often modify hash chains or addresses in pcbs. 249 */ 250 251 static struct inpcblbgroup * 252 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port, 253 const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib) 254 { 255 struct inpcblbgroup *grp; 256 size_t bytes; 257 258 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 259 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 260 if (grp == NULL) 261 return (NULL); 262 LIST_INIT(&grp->il_pending); 263 grp->il_cred = crhold(cred); 264 grp->il_vflag = vflag; 265 grp->il_lport = port; 266 grp->il_numa_domain = numa_domain; 267 grp->il_fibnum = fib; 268 grp->il_dependladdr = *addr; 269 grp->il_inpsiz = size; 270 return (grp); 271 } 272 273 static void 274 in_pcblbgroup_free_deferred(epoch_context_t ctx) 275 { 276 struct inpcblbgroup *grp; 277 278 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 279 crfree(grp->il_cred); 280 free(grp, M_PCB); 281 } 282 283 static void 284 in_pcblbgroup_free(struct inpcblbgroup *grp) 285 { 286 KASSERT(LIST_EMPTY(&grp->il_pending), 287 ("local group %p still has pending inps", grp)); 288 289 CK_LIST_REMOVE(grp, il_list); 290 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 291 } 292 293 static struct inpcblbgroup * 294 in_pcblbgroup_find(struct inpcb *inp) 
295 { 296 struct inpcbinfo *pcbinfo; 297 struct inpcblbgroup *grp; 298 struct inpcblbgrouphead *hdr; 299 300 INP_LOCK_ASSERT(inp); 301 302 pcbinfo = inp->inp_pcbinfo; 303 INP_HASH_LOCK_ASSERT(pcbinfo); 304 305 hdr = &pcbinfo->ipi_lbgrouphashbase[ 306 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 307 CK_LIST_FOREACH(grp, hdr, il_list) { 308 struct inpcb *inp1; 309 310 for (unsigned int i = 0; i < grp->il_inpcnt; i++) { 311 if (inp == grp->il_inp[i]) 312 goto found; 313 } 314 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { 315 if (inp == inp1) 316 goto found; 317 } 318 } 319 found: 320 return (grp); 321 } 322 323 static void 324 in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp) 325 { 326 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 327 ("invalid local group size %d and count %d", grp->il_inpsiz, 328 grp->il_inpcnt)); 329 INP_WLOCK_ASSERT(inp); 330 331 if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp && 332 !SOLISTENING(inp->inp_socket)) { 333 /* 334 * If this is a TCP socket, it should not be visible to lbgroup 335 * lookups until listen() has been called. 336 */ 337 LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list); 338 grp->il_pendcnt++; 339 } else { 340 grp->il_inp[grp->il_inpcnt] = inp; 341 342 /* 343 * Synchronize with in_pcblookup_lbgroup(): make sure that we 344 * don't expose a null slot to the lookup path. 
345 */ 346 atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1); 347 } 348 349 inp->inp_flags |= INP_INLBGROUP; 350 } 351 352 static struct inpcblbgroup * 353 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 354 struct inpcblbgroup *old_grp, int size) 355 { 356 struct inpcblbgroup *grp; 357 int i; 358 359 grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag, 360 old_grp->il_lport, &old_grp->il_dependladdr, size, 361 old_grp->il_numa_domain, old_grp->il_fibnum); 362 if (grp == NULL) 363 return (NULL); 364 365 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 366 ("invalid new local group size %d and old local group count %d", 367 grp->il_inpsiz, old_grp->il_inpcnt)); 368 369 for (i = 0; i < old_grp->il_inpcnt; ++i) 370 grp->il_inp[i] = old_grp->il_inp[i]; 371 grp->il_inpcnt = old_grp->il_inpcnt; 372 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 373 LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb, 374 inp_lbgroup_list); 375 grp->il_pendcnt = old_grp->il_pendcnt; 376 old_grp->il_pendcnt = 0; 377 in_pcblbgroup_free(old_grp); 378 return (grp); 379 } 380 381 /* 382 * Add PCB to load balance group for SO_REUSEPORT_LB option. 383 */ 384 static int 385 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) 386 { 387 const static struct timeval interval = { 60, 0 }; 388 static struct timeval lastprint; 389 struct inpcbinfo *pcbinfo; 390 struct inpcblbgrouphead *hdr; 391 struct inpcblbgroup *grp; 392 uint32_t idx; 393 int fib; 394 395 pcbinfo = inp->inp_pcbinfo; 396 397 INP_WLOCK_ASSERT(inp); 398 INP_HASH_WLOCK_ASSERT(pcbinfo); 399 400 fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ? 401 inp->inp_inc.inc_fibnum : RT_ALL_FIBS; 402 403 #ifdef INET6 404 /* 405 * Don't allow IPv4 mapped INET6 wild socket. 
406 */ 407 if ((inp->inp_vflag & INP_IPV4) && 408 inp->inp_laddr.s_addr == INADDR_ANY && 409 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 410 return (0); 411 } 412 #endif 413 414 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask); 415 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 416 CK_LIST_FOREACH(grp, hdr, il_list) { 417 if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && 418 grp->il_vflag == inp->inp_vflag && 419 grp->il_lport == inp->inp_lport && 420 grp->il_numa_domain == numa_domain && 421 grp->il_fibnum == fib && 422 memcmp(&grp->il_dependladdr, 423 &inp->inp_inc.inc_ie.ie_dependladdr, 424 sizeof(grp->il_dependladdr)) == 0) { 425 break; 426 } 427 } 428 if (grp == NULL) { 429 /* Create new load balance group. */ 430 grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag, 431 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 432 INPCBLBGROUP_SIZMIN, numa_domain, fib); 433 if (grp == NULL) 434 return (ENOMEM); 435 in_pcblbgroup_insert(grp, inp); 436 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 437 } else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) { 438 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 439 if (ratecheck(&lastprint, &interval)) 440 printf("lb group port %d, limit reached\n", 441 ntohs(grp->il_lport)); 442 return (0); 443 } 444 445 /* Expand this local group. */ 446 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 447 if (grp == NULL) 448 return (ENOMEM); 449 in_pcblbgroup_insert(grp, inp); 450 } else { 451 in_pcblbgroup_insert(grp, inp); 452 } 453 return (0); 454 } 455 456 /* 457 * Remove PCB from load balance group. 
458 */ 459 static void 460 in_pcbremlbgrouphash(struct inpcb *inp) 461 { 462 struct inpcbinfo *pcbinfo; 463 struct inpcblbgrouphead *hdr; 464 struct inpcblbgroup *grp; 465 struct inpcb *inp1; 466 int i; 467 468 pcbinfo = inp->inp_pcbinfo; 469 470 INP_WLOCK_ASSERT(inp); 471 MPASS(inp->inp_flags & INP_INLBGROUP); 472 INP_HASH_WLOCK_ASSERT(pcbinfo); 473 474 hdr = &pcbinfo->ipi_lbgrouphashbase[ 475 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 476 CK_LIST_FOREACH(grp, hdr, il_list) { 477 for (i = 0; i < grp->il_inpcnt; ++i) { 478 if (grp->il_inp[i] != inp) 479 continue; 480 481 if (grp->il_inpcnt == 1 && 482 LIST_EMPTY(&grp->il_pending)) { 483 /* We are the last, free this local group. */ 484 in_pcblbgroup_free(grp); 485 } else { 486 grp->il_inp[i] = 487 grp->il_inp[grp->il_inpcnt - 1]; 488 489 /* 490 * Synchronize with in_pcblookup_lbgroup(). 491 */ 492 atomic_store_rel_int(&grp->il_inpcnt, 493 grp->il_inpcnt - 1); 494 } 495 inp->inp_flags &= ~INP_INLBGROUP; 496 return; 497 } 498 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { 499 if (inp == inp1) { 500 LIST_REMOVE(inp, inp_lbgroup_list); 501 grp->il_pendcnt--; 502 inp->inp_flags &= ~INP_INLBGROUP; 503 return; 504 } 505 } 506 } 507 __assert_unreachable(); 508 } 509 510 int 511 in_pcblbgroup_numa(struct inpcb *inp, int arg) 512 { 513 struct inpcbinfo *pcbinfo; 514 int error; 515 uint8_t numa_domain; 516 517 switch (arg) { 518 case TCP_REUSPORT_LB_NUMA_NODOM: 519 numa_domain = M_NODOM; 520 break; 521 case TCP_REUSPORT_LB_NUMA_CURDOM: 522 numa_domain = PCPU_GET(domain); 523 break; 524 default: 525 if (arg < 0 || arg >= vm_ndomains) 526 return (EINVAL); 527 numa_domain = arg; 528 } 529 530 pcbinfo = inp->inp_pcbinfo; 531 INP_WLOCK_ASSERT(inp); 532 INP_HASH_WLOCK(pcbinfo); 533 if (in_pcblbgroup_find(inp) != NULL) { 534 /* Remove it from the old group. */ 535 in_pcbremlbgrouphash(inp); 536 /* Add it to the new group based on numa domain. 
*/ 537 in_pcbinslbgrouphash(inp, numa_domain); 538 error = 0; 539 } else { 540 error = ENOENT; 541 } 542 INP_HASH_WUNLOCK(pcbinfo); 543 return (error); 544 } 545 546 /* 547 * Initialize an inpcbinfo - a per-VNET instance of connections db. 548 */ 549 void 550 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 551 u_int hash_nelements, u_int porthash_nelements) 552 { 553 struct hashalloc_args ha = { 554 .mtype = M_PCB, 555 .mflags = M_WAITOK, 556 .head = HASH_HEAD_CK_LIST, 557 }; 558 559 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 560 NULL, MTX_DEF); 561 CK_LIST_INIT(&pcbinfo->ipi_list_unconn); 562 pcbinfo->ipi_count = 0; 563 564 ha.size = hash_nelements; 565 pcbinfo->ipi_hash_exact = hashalloc(&ha); 566 pcbinfo->ipi_hash_wild = hashalloc(&ha); 567 pcbinfo->ipi_hashmask = ha.size - 1; 568 569 ha.size = imin(porthash_nelements, IPPORT_MAX + 1); 570 pcbinfo->ipi_porthashbase = hashalloc(&ha); 571 pcbinfo->ipi_lbgrouphashbase = hashalloc(&ha); 572 pcbinfo->ipi_porthashmask = ha.size - 1; 573 574 pcbinfo->ipi_zone = pcbstor->ips_zone; 575 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 576 } 577 578 /* 579 * Destroy an inpcbinfo. 580 */ 581 void 582 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 583 { 584 struct hashalloc_args ha = { 585 .mtype = M_PCB, 586 .head = HASH_HEAD_CK_LIST, 587 }; 588 589 KASSERT(pcbinfo->ipi_count == 0, 590 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 591 592 ha.size = pcbinfo->ipi_hashmask + 1; 593 hashfree(pcbinfo->ipi_hash_exact, &ha); 594 hashfree(pcbinfo->ipi_hash_wild, &ha); 595 ha.size = pcbinfo->ipi_porthashmask + 1; 596 hashfree(pcbinfo->ipi_porthashbase, &ha); 597 hashfree(pcbinfo->ipi_lbgrouphashbase, &ha); 598 mtx_destroy(&pcbinfo->ipi_hash_lock); 599 } 600 601 /* 602 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 
603 */ 604 static void inpcb_fini(void *, int); 605 void 606 in_pcbstorage_init(void *arg) 607 { 608 struct inpcbstorage *pcbstor = arg; 609 610 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 611 pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit, 612 inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); 613 } 614 615 /* 616 * Destroy a pcbstorage - used by unloadable protocols. 617 */ 618 void 619 in_pcbstorage_destroy(void *arg) 620 { 621 struct inpcbstorage *pcbstor = arg; 622 623 uma_zdestroy(pcbstor->ips_zone); 624 } 625 626 /* 627 * Allocate a PCB and associate it with the socket. 628 * On success return with the PCB locked. 629 */ 630 int 631 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 632 { 633 struct inpcb *inp; 634 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 635 int error; 636 #endif 637 638 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 639 if (inp == NULL) 640 return (ENOBUFS); 641 bzero(&inp->inp_start_zero, inp_zero_size); 642 #ifdef NUMA 643 inp->inp_numa_domain = M_NODOM; 644 #endif 645 inp->inp_pcbinfo = pcbinfo; 646 inp->inp_socket = so; 647 inp->inp_cred = crhold(so->so_cred); 648 inp->inp_inc.inc_fibnum = so->so_fibnum; 649 #ifdef MAC 650 error = mac_inpcb_init(inp, M_NOWAIT); 651 if (error != 0) 652 goto out; 653 mac_inpcb_create(so, inp); 654 #endif 655 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 656 error = ipsec_init_pcbpolicy(inp); 657 if (error != 0) { 658 #ifdef MAC 659 mac_inpcb_destroy(inp); 660 #endif 661 goto out; 662 } 663 #endif /*IPSEC*/ 664 #ifdef INET6 665 if (INP_SOCKAF(so) == AF_INET6) { 666 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; 667 if (V_ip6_v6only) 668 inp->inp_flags |= IN6P_IPV6_V6ONLY; 669 #ifdef INET 670 else 671 inp->inp_vflag |= INP_IPV4; 672 #endif 673 if (V_ip6_auto_flowlabel) 674 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 675 inp->in6p_hops = -1; /* use kernel default */ 676 } 677 #endif 678 #if defined(INET) && defined(INET6) 679 else 680 #endif 681 #ifdef INET 682 inp->inp_vflag 
|= INP_IPV4; 683 #endif 684 inp->inp_smr = SMR_SEQ_INVALID; 685 686 /* 687 * Routes in inpcb's can cache L2 as well; they are guaranteed 688 * to be cleaned up. 689 */ 690 inp->inp_route.ro_flags = RT_LLE_CACHE; 691 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ 692 inp->inp_flags |= INP_UNCONNECTED; 693 INP_WLOCK(inp); 694 INP_HASH_WLOCK(pcbinfo); 695 pcbinfo->ipi_count++; 696 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 697 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list); 698 INP_HASH_WUNLOCK(pcbinfo); 699 so->so_pcb = inp; 700 701 return (0); 702 703 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 704 out: 705 crfree(inp->inp_cred); 706 #ifdef INVARIANTS 707 inp->inp_cred = NULL; 708 #endif 709 uma_zfree_smr(pcbinfo->ipi_zone, inp); 710 return (error); 711 #endif 712 } 713 714 #if defined(INET) || defined(INET6) 715 /* 716 * Assign a local port like in_pcb_lport(), but also used with connect() 717 * and a foreign address and port. If fsa is non-NULL, choose a local port 718 * that is unused with those, otherwise one that is completely unused. 719 * lsa can be NULL for IPv6. 720 */ 721 int 722 in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa, 723 u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, 724 int lookupflags) 725 { 726 struct inpcbinfo *pcbinfo; 727 struct inpcb *tmpinp; 728 unsigned short *lastport; 729 int count, error; 730 u_short aux, first, last, lport; 731 #ifdef INET 732 struct in_addr laddr, faddr; 733 #endif 734 #ifdef INET6 735 struct in6_addr *laddr6, *faddr6; 736 #endif 737 738 pcbinfo = inp->inp_pcbinfo; 739 740 /* 741 * Because no actual state changes occur here, a global write lock on 742 * the pcbinfo isn't required. 
743 */ 744 INP_LOCK_ASSERT(inp); 745 INP_HASH_LOCK_ASSERT(pcbinfo); 746 747 if (inp->inp_flags & INP_HIGHPORT) { 748 first = V_ipport_hifirstauto; /* sysctl */ 749 last = V_ipport_hilastauto; 750 lastport = &pcbinfo->ipi_lasthi; 751 } else if (inp->inp_flags & INP_LOWPORT) { 752 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 753 if (error) 754 return (error); 755 first = V_ipport_lowfirstauto; /* 1023 */ 756 last = V_ipport_lowlastauto; /* 600 */ 757 lastport = &pcbinfo->ipi_lastlow; 758 } else { 759 first = V_ipport_firstauto; /* sysctl */ 760 last = V_ipport_lastauto; 761 lastport = &pcbinfo->ipi_lastport; 762 } 763 764 /* 765 * Instead of having two loops further down counting up or down 766 * make sure that first is always <= last and go with only one 767 * code path implementing all logic. 768 */ 769 if (first > last) { 770 aux = first; 771 first = last; 772 last = aux; 773 } 774 775 #ifdef INET 776 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ 777 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 778 if (lsa != NULL) 779 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 780 if (fsa != NULL) 781 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 782 } 783 #endif 784 #ifdef INET6 785 laddr6 = NULL; 786 if ((inp->inp_vflag & INP_IPV6) != 0) { 787 if (lsa != NULL) 788 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 789 if (fsa != NULL) 790 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 791 } 792 #endif 793 794 tmpinp = NULL; 795 796 if (V_ipport_randomized) 797 *lastport = first + (arc4random() % (last - first)); 798 799 count = last - first; 800 801 do { 802 if (count-- < 0) /* completely used? 
*/ 803 return (EADDRNOTAVAIL); 804 ++*lastport; 805 if (*lastport < first || *lastport > last) 806 *lastport = first; 807 lport = htons(*lastport); 808 809 if (fsa != NULL) { 810 #ifdef INET 811 if (lsa->sa_family == AF_INET) { 812 tmpinp = in_pcblookup_hash_locked(pcbinfo, 813 faddr, fport, laddr, lport, lookupflags, 814 M_NODOM, RT_ALL_FIBS); 815 } 816 #endif 817 #ifdef INET6 818 if (lsa->sa_family == AF_INET6) { 819 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 820 faddr6, fport, laddr6, lport, lookupflags, 821 M_NODOM, RT_ALL_FIBS); 822 } 823 #endif 824 } else { 825 #ifdef INET6 826 if ((inp->inp_vflag & INP_IPV6) != 0) { 827 tmpinp = in6_pcblookup_local(pcbinfo, 828 &inp->in6p_laddr, lport, RT_ALL_FIBS, 829 lookupflags, cred); 830 #ifdef INET 831 if (tmpinp == NULL && 832 (inp->inp_vflag & INP_IPV4)) 833 tmpinp = in_pcblookup_local(pcbinfo, 834 laddr, lport, RT_ALL_FIBS, 835 lookupflags, cred); 836 #endif 837 } 838 #endif 839 #if defined(INET) && defined(INET6) 840 else 841 #endif 842 #ifdef INET 843 tmpinp = in_pcblookup_local(pcbinfo, laddr, 844 lport, RT_ALL_FIBS, lookupflags, cred); 845 #endif 846 } 847 } while (tmpinp != NULL); 848 849 *lportp = lport; 850 851 return (0); 852 } 853 854 /* 855 * Select a local port (number) to use. 856 */ 857 int 858 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 859 struct ucred *cred, int lookupflags) 860 { 861 struct sockaddr_in laddr; 862 863 if (laddrp) { 864 bzero(&laddr, sizeof(laddr)); 865 laddr.sin_family = AF_INET; 866 laddr.sin_addr = *laddrp; 867 } 868 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 869 NULL, lportp, NULL, 0, cred, lookupflags)); 870 } 871 #endif /* INET || INET6 */ 872 873 #ifdef INET 874 /* 875 * Determine whether the inpcb can be bound to the specified address/port tuple. 
876 */ 877 static int 878 in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr, 879 const u_short lport, const int fib, int sooptions, int lookupflags, 880 struct ucred *cred) 881 { 882 int reuseport, reuseport_lb; 883 884 INP_LOCK_ASSERT(inp); 885 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 886 887 reuseport = (sooptions & SO_REUSEPORT); 888 reuseport_lb = (sooptions & SO_REUSEPORT_LB); 889 890 if (IN_MULTICAST(ntohl(laddr.s_addr))) { 891 /* 892 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 893 * allow complete duplication of binding if 894 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 895 * and a multicast address is bound on both 896 * new and duplicated sockets. 897 */ 898 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0) 899 reuseport = SO_REUSEADDR | SO_REUSEPORT; 900 /* 901 * XXX: How to deal with SO_REUSEPORT_LB here? 902 * Treat same as SO_REUSEPORT for now. 903 */ 904 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0) 905 reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB; 906 } else if (!in_nullhost(laddr)) { 907 struct sockaddr_in sin; 908 909 memset(&sin, 0, sizeof(sin)); 910 sin.sin_family = AF_INET; 911 sin.sin_len = sizeof(sin); 912 sin.sin_addr = laddr; 913 914 /* 915 * Is the address a local IP address? 916 * If INP_BINDANY is set, then the socket may be bound 917 * to any endpoint address, local or not. 918 */ 919 if ((inp->inp_flags & INP_BINDANY) == 0 && 920 ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0) 921 return (EADDRNOTAVAIL); 922 } 923 924 if (lport != 0) { 925 struct inpcb *t; 926 927 if (ntohs(lport) <= V_ipport_reservedhigh && 928 ntohs(lport) >= V_ipport_reservedlow && 929 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 930 return (EACCES); 931 932 if (!IN_MULTICAST(ntohl(laddr.s_addr)) && 933 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 934 /* 935 * If a socket owned by a different user is already 936 * bound to this port, fail. 
In particular, SO_REUSE* 937 * can only be used to share a port among sockets owned 938 * by the same user. 939 * 940 * However, we can share a port with a connected socket 941 * which has a unique 4-tuple. 942 */ 943 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, 944 RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred); 945 if (t != NULL && 946 (inp->inp_socket->so_type != SOCK_STREAM || 947 in_nullhost(t->inp_faddr)) && 948 (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) 949 return (EADDRINUSE); 950 } 951 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib, 952 lookupflags, cred); 953 if (t != NULL && ((reuseport | reuseport_lb) & 954 t->inp_socket->so_options) == 0) { 955 #ifdef INET6 956 if (!in_nullhost(laddr) || 957 !in_nullhost(t->inp_laddr) || 958 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 959 (t->inp_vflag & INP_IPV6PROTO) == 0) 960 #endif 961 return (EADDRINUSE); 962 } 963 } 964 return (0); 965 } 966 967 /* 968 * Set up a bind operation on a PCB, performing port allocation 969 * as required, but do not actually modify the PCB. Callers can 970 * either complete the bind by setting inp_laddr/inp_lport and 971 * calling in_pcbinshash(), or they can just use the resulting 972 * port and address to authorise the sending of a once-off packet. 973 * 974 * On error, the values of *laddrp and *lportp are not changed. 975 */ 976 static int 977 in_pcbbind_setup_locked(struct inpcb *inp, struct sockaddr_in *sin, 978 in_addr_t *laddrp, u_short *lportp, int flags, struct ucred *cred) 979 { 980 struct socket *so = inp->inp_socket; 981 struct in_addr laddr; 982 u_short lport = 0; 983 int error, fib, lookupflags, sooptions; 984 985 /* 986 * No state changes, so read locks are sufficient here. 
987 */ 988 INP_LOCK_ASSERT(inp); 989 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 990 991 laddr.s_addr = *laddrp; 992 if (sin != NULL && laddr.s_addr != INADDR_ANY) 993 return (EINVAL); 994 995 lookupflags = 0; 996 sooptions = atomic_load_int(&so->so_options); 997 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0) 998 lookupflags = INPLOOKUP_WILDCARD; 999 if (sin == NULL) { 1000 if ((error = prison_local_ip4(cred, &laddr)) != 0) 1001 return (error); 1002 } else { 1003 KASSERT(sin->sin_family == AF_INET, 1004 ("%s: invalid family for address %p", __func__, sin)); 1005 KASSERT(sin->sin_len == sizeof(*sin), 1006 ("%s: invalid length for address %p", __func__, sin)); 1007 1008 error = prison_local_ip4(cred, &sin->sin_addr); 1009 if (error) 1010 return (error); 1011 if (sin->sin_port != *lportp) { 1012 /* Don't allow the port to change. */ 1013 if (*lportp != 0) 1014 return (EINVAL); 1015 lport = sin->sin_port; 1016 } 1017 laddr = sin->sin_addr; 1018 1019 fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum : 1020 RT_ALL_FIBS; 1021 1022 /* See if this address/port combo is available. 
*/ 1023 error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions, 1024 lookupflags, cred); 1025 if (error != 0) 1026 return (error); 1027 } 1028 if (*lportp != 0) 1029 lport = *lportp; 1030 if (lport == 0) { 1031 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); 1032 if (error != 0) 1033 return (error); 1034 } 1035 *laddrp = laddr.s_addr; 1036 *lportp = lport; 1037 if ((flags & INPBIND_FIB) != 0) 1038 inp->inp_flags |= INP_BOUNDFIB; 1039 return (0); 1040 } 1041 1042 int 1043 in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, 1044 u_short *lportp, int flags, struct ucred *cred) 1045 { 1046 int error; 1047 1048 INP_HASH_WLOCK(inp->inp_pcbinfo); 1049 error = in_pcbbind_setup_locked(inp, sin, laddrp, lportp, flags, cred); 1050 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1051 1052 return (error); 1053 } 1054 1055 #ifdef INET 1056 int 1057 in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags, 1058 struct ucred *cred) 1059 { 1060 int error; 1061 bool anonport; 1062 1063 KASSERT(sin == NULL || sin->sin_family == AF_INET, 1064 ("%s: invalid address family for %p", __func__, sin)); 1065 KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in), 1066 ("%s: invalid address length for %p", __func__, sin)); 1067 INP_WLOCK_ASSERT(inp); 1068 1069 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 1070 return (EINVAL); 1071 anonport = sin == NULL || sin->sin_port == 0; 1072 1073 INP_HASH_WLOCK(inp->inp_pcbinfo); 1074 error = in_pcbbind_setup_locked(inp, sin, &inp->inp_laddr.s_addr, 1075 &inp->inp_lport, flags, cred); 1076 if (error) { 1077 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1078 return (error); 1079 } 1080 if (__predict_false((error = in_pcbinshash(inp)) != 0)) { 1081 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1082 MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB); 1083 inp->inp_laddr.s_addr = INADDR_ANY; 1084 inp->inp_lport = 0; 1085 inp->inp_flags &= ~INP_BOUNDFIB; 1086 return (error); 1087 } 1088 
INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1089 if (anonport) 1090 inp->inp_flags |= INP_ANONPORT; 1091 return (0); 1092 } 1093 #endif 1094 1095 /* 1096 * Connect from a socket to a specified address. 1097 * Both address and port must be specified in argument sin. 1098 * If don't have a local address for this socket yet, 1099 * then pick one. 1100 */ 1101 int 1102 in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) 1103 { 1104 struct in_addr laddr, faddr; 1105 u_short lport; 1106 int error; 1107 bool anonport; 1108 1109 NET_EPOCH_ASSERT(); 1110 INP_WLOCK_ASSERT(inp); 1111 KASSERT(in_nullhost(inp->inp_faddr), 1112 ("%s: inp is already connected", __func__)); 1113 KASSERT(sin->sin_family == AF_INET, 1114 ("%s: invalid address family for %p", __func__, sin)); 1115 KASSERT(sin->sin_len == sizeof(*sin), 1116 ("%s: invalid address length for %p", __func__, sin)); 1117 1118 if (sin->sin_port == 0) 1119 return (EADDRNOTAVAIL); 1120 1121 anonport = (inp->inp_lport == 0); 1122 1123 if (__predict_false(in_broadcast(sin->sin_addr))) { 1124 if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead)) 1125 return (ENETUNREACH); 1126 /* 1127 * If the destination address is INADDR_ANY, use the primary 1128 * local address. If the supplied address is INADDR_BROADCAST, 1129 * and the primary interface supports broadcast, choose the 1130 * broadcast address for that interface. 
1131 */ 1132 if (in_nullhost(sin->sin_addr)) { 1133 faddr = 1134 IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; 1135 if ((error = prison_get_ip4(cred, &faddr)) != 0) 1136 return (error); 1137 } else if (sin->sin_addr.s_addr == INADDR_BROADCAST && 1138 CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags 1139 & IFF_BROADCAST) { 1140 faddr = satosin(&CK_STAILQ_FIRST( 1141 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; 1142 } else 1143 faddr = sin->sin_addr; 1144 } else 1145 faddr = sin->sin_addr; 1146 1147 INP_HASH_WLOCK(inp->inp_pcbinfo); 1148 if (in_nullhost(inp->inp_laddr)) { 1149 error = in_pcbladdr(inp, &faddr, &laddr, cred); 1150 if (__predict_false(error)) { 1151 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1152 return (error); 1153 } 1154 } else 1155 laddr = inp->inp_laddr; 1156 1157 if (anonport) { 1158 struct sockaddr_in lsin = { 1159 .sin_family = AF_INET, 1160 .sin_addr = laddr, 1161 }; 1162 struct sockaddr_in fsin = { 1163 .sin_family = AF_INET, 1164 .sin_addr = faddr, 1165 }; 1166 1167 error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin, 1168 &lport, (struct sockaddr *)&fsin, sin->sin_port, cred, 1169 INPLOOKUP_WILDCARD); 1170 if (__predict_false(error)) { 1171 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1172 return (error); 1173 } 1174 } else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, 1175 sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) != 1176 NULL) { 1177 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1178 return (EADDRINUSE); 1179 } else 1180 lport = inp->inp_lport; 1181 1182 MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 || 1183 (inp->inp_flags & INP_UNCONNECTED)); 1184 1185 inp->inp_faddr = faddr; 1186 inp->inp_fport = sin->sin_port; 1187 inp->inp_laddr = laddr; 1188 inp->inp_lport = lport; 1189 1190 if (inp->inp_flags & INP_UNCONNECTED) { 1191 error = in_pcbinshash(inp); 1192 MPASS(error == 0); 1193 } else 1194 in_pcbrehash(inp); 1195 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1196 1197 if (V_fib_hash_outbound) { 1198 uint32_t hash_val, 
hash_type; 1199 1200 hash_val = fib4_calc_software_hash(inp->inp_laddr, 1201 inp->inp_faddr, 0, sin->sin_port, 1202 inp->inp_socket->so_proto->pr_protocol, &hash_type); 1203 1204 inp->inp_flowid = hash_val; 1205 inp->inp_flowtype = hash_type; 1206 } 1207 if (anonport) 1208 inp->inp_flags |= INP_ANONPORT; 1209 return (0); 1210 } 1211 1212 /* 1213 * Do proper source address selection on an unbound socket in case 1214 * of connect. Take jails into account as well. 1215 */ 1216 int 1217 in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr, 1218 struct in_addr *laddr, struct ucred *cred) 1219 { 1220 struct ifaddr *ifa; 1221 struct sockaddr *sa; 1222 struct sockaddr_in *sin, dst; 1223 struct nhop_object *nh; 1224 int error; 1225 1226 NET_EPOCH_ASSERT(); 1227 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1228 1229 /* 1230 * Bypass source address selection and use the primary jail IP 1231 * if requested. 1232 */ 1233 if (!prison_saddrsel_ip4(cred, laddr)) 1234 return (0); 1235 1236 /* 1237 * If the destination address is multicast and an outgoing 1238 * interface has been set as a multicast option, prefer the 1239 * address of that interface as our source address. 1240 */ 1241 if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL && 1242 inp->inp_moptions->imo_multicast_ifp != NULL) { 1243 struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp; 1244 struct in_ifaddr *ia; 1245 1246 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 1247 if (ia->ia_ifp == ifp && 1248 prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0) 1249 break; 1250 } 1251 if (ia == NULL) 1252 return (EADDRNOTAVAIL); 1253 *laddr = ia->ia_addr.sin_addr; 1254 return (0); 1255 } 1256 1257 error = 0; 1258 1259 nh = NULL; 1260 bzero(&dst, sizeof(dst)); 1261 sin = &dst; 1262 sin->sin_family = AF_INET; 1263 sin->sin_len = sizeof(struct sockaddr_in); 1264 sin->sin_addr.s_addr = faddr->s_addr; 1265 1266 /* 1267 * If route is known our src addr is taken from the i/f, 1268 * else punt. 
1269 * 1270 * Find out route to destination. 1271 */ 1272 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1273 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1274 0, NHR_NONE, 0); 1275 1276 /* 1277 * If we found a route, use the address corresponding to 1278 * the outgoing interface. 1279 * 1280 * Otherwise assume faddr is reachable on a directly connected 1281 * network and try to find a corresponding interface to take 1282 * the source address from. 1283 */ 1284 if (nh == NULL || nh->nh_ifp == NULL) { 1285 struct in_ifaddr *ia; 1286 struct ifnet *ifp; 1287 1288 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1289 inp->inp_socket->so_fibnum)); 1290 if (ia == NULL) { 1291 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1292 inp->inp_socket->so_fibnum)); 1293 } 1294 if (ia == NULL) { 1295 error = ENETUNREACH; 1296 goto done; 1297 } 1298 1299 if (!prison_flag(cred, PR_IP4)) { 1300 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1301 goto done; 1302 } 1303 1304 ifp = ia->ia_ifp; 1305 ia = NULL; 1306 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1307 sa = ifa->ifa_addr; 1308 if (sa->sa_family != AF_INET) 1309 continue; 1310 sin = (struct sockaddr_in *)sa; 1311 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1312 ia = (struct in_ifaddr *)ifa; 1313 break; 1314 } 1315 } 1316 if (ia != NULL) { 1317 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1318 goto done; 1319 } 1320 1321 /* 3. As a last resort return the 'default' jail address. */ 1322 error = prison_get_ip4(cred, laddr); 1323 goto done; 1324 } 1325 1326 /* 1327 * If the outgoing interface on the route found is not 1328 * a loopback interface, use the address from that interface. 1329 * In case of jails do those three steps: 1330 * 1. check if the interface address belongs to the jail. If so use it. 1331 * 2. check if we have any address on the outgoing interface 1332 * belonging to this jail. If so use it. 1333 * 3. as a last resort return the 'default' jail address. 
1334 */ 1335 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1336 struct in_ifaddr *ia; 1337 struct ifnet *ifp; 1338 1339 /* If not jailed, use the default returned. */ 1340 if (!prison_flag(cred, PR_IP4)) { 1341 ia = (struct in_ifaddr *)nh->nh_ifa; 1342 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1343 goto done; 1344 } 1345 1346 /* Jailed. */ 1347 /* 1. Check if the iface address belongs to the jail. */ 1348 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1349 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1350 ia = (struct in_ifaddr *)nh->nh_ifa; 1351 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1352 goto done; 1353 } 1354 1355 /* 1356 * 2. Check if we have any address on the outgoing interface 1357 * belonging to this jail. 1358 */ 1359 ia = NULL; 1360 ifp = nh->nh_ifp; 1361 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1362 sa = ifa->ifa_addr; 1363 if (sa->sa_family != AF_INET) 1364 continue; 1365 sin = (struct sockaddr_in *)sa; 1366 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1367 ia = (struct in_ifaddr *)ifa; 1368 break; 1369 } 1370 } 1371 if (ia != NULL) { 1372 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1373 goto done; 1374 } 1375 1376 /* 3. As a last resort return the 'default' jail address. */ 1377 error = prison_get_ip4(cred, laddr); 1378 goto done; 1379 } 1380 1381 /* 1382 * The outgoing interface is marked with 'loopback net', so a route 1383 * to ourselves is here. 1384 * Try to find the interface of the destination address and then 1385 * take the address from there. That interface is not necessarily 1386 * a loopback interface. 1387 * In case of jails, check that it is an address of the jail 1388 * and if we cannot find, fall back to the 'default' jail address. 
1389 */ 1390 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1391 struct in_ifaddr *ia; 1392 1393 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1394 inp->inp_socket->so_fibnum)); 1395 if (ia == NULL) 1396 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1397 inp->inp_socket->so_fibnum)); 1398 if (ia == NULL) 1399 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1400 1401 if (!prison_flag(cred, PR_IP4)) { 1402 if (ia == NULL) { 1403 error = ENETUNREACH; 1404 goto done; 1405 } 1406 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1407 goto done; 1408 } 1409 1410 /* Jailed. */ 1411 if (ia != NULL) { 1412 struct ifnet *ifp; 1413 1414 ifp = ia->ia_ifp; 1415 ia = NULL; 1416 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1417 sa = ifa->ifa_addr; 1418 if (sa->sa_family != AF_INET) 1419 continue; 1420 sin = (struct sockaddr_in *)sa; 1421 if (prison_check_ip4(cred, 1422 &sin->sin_addr) == 0) { 1423 ia = (struct in_ifaddr *)ifa; 1424 break; 1425 } 1426 } 1427 if (ia != NULL) { 1428 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1429 goto done; 1430 } 1431 } 1432 1433 /* 3. As a last resort return the 'default' jail address. */ 1434 error = prison_get_ip4(cred, laddr); 1435 goto done; 1436 } 1437 1438 done: 1439 if (error == 0 && laddr->s_addr == INADDR_ANY) 1440 return (EHOSTUNREACH); 1441 return (error); 1442 } 1443 1444 void 1445 in_pcbdisconnect(struct inpcb *inp) 1446 { 1447 1448 INP_WLOCK_ASSERT(inp); 1449 KASSERT(inp->inp_smr == SMR_SEQ_INVALID, 1450 ("%s: inp %p was already disconnected", __func__, inp)); 1451 1452 if (inp->inp_flags & INP_UNCONNECTED) 1453 return; 1454 1455 INP_HASH_WLOCK(inp->inp_pcbinfo); 1456 in_pcbremhash(inp); 1457 CK_LIST_INSERT_HEAD(&inp->inp_pcbinfo->ipi_list_unconn, inp, 1458 inp_unconn_list); 1459 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1460 inp->inp_flags |= INP_UNCONNECTED; 1461 1462 if ((inp->inp_socket->so_proto->pr_flags & PR_CONNREQUIRED) == 0) { 1463 /* See the comment in in_pcbinshash(). 
*/ 1464 inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr); 1465 inp->inp_faddr.s_addr = INADDR_ANY; 1466 inp->inp_fport = 0; 1467 } 1468 } 1469 #endif /* INET */ 1470 1471 void 1472 in_pcblisten(struct inpcb *inp) 1473 { 1474 struct inpcblbgroup *grp; 1475 1476 INP_WLOCK_ASSERT(inp); 1477 1478 if ((inp->inp_flags & INP_INLBGROUP) != 0) { 1479 struct inpcbinfo *pcbinfo; 1480 1481 pcbinfo = inp->inp_pcbinfo; 1482 INP_HASH_WLOCK(pcbinfo); 1483 grp = in_pcblbgroup_find(inp); 1484 LIST_REMOVE(inp, inp_lbgroup_list); 1485 grp->il_pendcnt--; 1486 in_pcblbgroup_insert(grp, inp); 1487 INP_HASH_WUNLOCK(pcbinfo); 1488 } 1489 } 1490 1491 /* 1492 * inpcb hash lookups are protected by SMR section. 1493 * 1494 * Once desired pcb has been found, switching from SMR section to a pcb 1495 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK 1496 * here because SMR is a critical section. 1497 * In 99%+ cases inp_smr_lock() would obtain the lock immediately. 1498 */ 1499 void 1500 inp_lock(struct inpcb *inp, const inp_lookup_t lock) 1501 { 1502 1503 lock == INPLOOKUP_RLOCKPCB ? 1504 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); 1505 } 1506 1507 void 1508 inp_unlock(struct inpcb *inp, const inp_lookup_t lock) 1509 { 1510 1511 lock == INPLOOKUP_RLOCKPCB ? 1512 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); 1513 } 1514 1515 int 1516 inp_trylock(struct inpcb *inp, const inp_lookup_t lock) 1517 { 1518 1519 return (lock == INPLOOKUP_RLOCKPCB ? 
1520 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); 1521 } 1522 1523 static inline bool 1524 _inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags) 1525 { 1526 1527 MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); 1528 SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); 1529 1530 if (__predict_true(inp_trylock(inp, lock))) { 1531 if (__predict_false(inp->inp_flags & ignflags)) { 1532 smr_exit(inp->inp_pcbinfo->ipi_smr); 1533 inp_unlock(inp, lock); 1534 return (false); 1535 } 1536 smr_exit(inp->inp_pcbinfo->ipi_smr); 1537 return (true); 1538 } 1539 1540 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1541 smr_exit(inp->inp_pcbinfo->ipi_smr); 1542 inp_lock(inp, lock); 1543 if (__predict_false(in_pcbrele(inp, lock))) 1544 return (false); 1545 /* 1546 * inp acquired through refcount & lock for sure didn't went 1547 * through uma_zfree(). However, it may have already went 1548 * through in_pcbfree() and has another reference, that 1549 * prevented its release by our in_pcbrele(). 1550 */ 1551 if (__predict_false(inp->inp_flags & ignflags)) { 1552 inp_unlock(inp, lock); 1553 return (false); 1554 } 1555 return (true); 1556 } else { 1557 smr_exit(inp->inp_pcbinfo->ipi_smr); 1558 return (false); 1559 } 1560 } 1561 1562 bool 1563 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) 1564 { 1565 1566 /* 1567 * in_pcblookup() family of functions shall ignore not onlu pcbs that 1568 * had been freed that may be found due to lockless access to the hash, 1569 * but also pcbs that were removed from the hash, but are still around. 1570 */ 1571 return (_inp_smr_lock(inp, lock, INP_FREED | INP_UNCONNECTED)); 1572 } 1573 1574 /* 1575 * inp_next() - inpcb hash/list traversal iterator 1576 * 1577 * Requires initialized struct inpcb_iterator for context. 1578 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). 
 *
 * - Iterator can have either write-lock or read-lock semantics, which can not
 *   be changed later.
 * - Iterator has three modes of operation, defined by the value of the .mode
 *   member, which is copied into .hash on the first call:
 *   - .mode = INP_ALL_LIST: the iterator will go through the unconnected
 *     list, then all wildcard hash slots and then all exact hash slots.
 *   - .mode = INP_UNCONN_LIST: the iterator will go through the list of
 *     unconnected pcbs only.
 *   - .mode initialized with an arbitrary non-negative value: the iterator
 *     will go through this exact hash slot only.
 *   Note: only rip_input() and sysctl_setsockopt() use the latter.
 *   The interface may be extended for iteration over a single wildcard hash
 *   slot, but there is no use case for that today.
 * - Iterator may have an optional boolean matching function.  The matching
 *   function will be executed for each inpcb in the SMR context, so it can
 *   not acquire locks and can safely access only immutable fields of inpcb.
 *
 * A freshly initialized iterator has a NULL inpcb in its context and that
 * means that the inp_next() call would return the very first inpcb on the
 * list, locked with the desired semantics.  In all following calls the
 * context pointer shall hold the current inpcb pointer.  The KPI user is not
 * supposed to unlock the current inpcb!  Upon end of traversal inp_next()
 * will return NULL and write NULL to its context.  After end of traversal an
 * iterator can be reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list.  This is violated by in_pcbrehash().
1609 */ 1610 static inline struct inpcb * 1611 ii_list_first(const struct inpcb_iterator *ii) 1612 { 1613 const struct inpcbinfo *ipi = ii->ipi; 1614 const int hash = ii->hash; 1615 1616 if (hash < 0) 1617 return (CK_LIST_FIRST(&ipi->ipi_list_unconn)); 1618 else if (hash <= ipi->ipi_hashmask) 1619 return (CK_LIST_FIRST(&ipi->ipi_hash_wild[hash])); 1620 else 1621 return (CK_LIST_FIRST( 1622 &ipi->ipi_hash_exact[hash - ipi->ipi_hashmask - 1])); 1623 } 1624 1625 static inline struct inpcb * 1626 ii_list_next(const struct inpcb_iterator *ii, struct inpcb *inp) 1627 { 1628 if (ii->hash < 0) 1629 return (CK_LIST_NEXT(inp, inp_unconn_list)); 1630 else if (ii->hash <= ii->ipi->ipi_hashmask) 1631 return (CK_LIST_NEXT(inp, inp_hash_wild)); 1632 else 1633 return (CK_LIST_NEXT(inp, inp_hash_exact)); 1634 } 1635 1636 struct inpcb * 1637 inp_next(struct inpcb_iterator *ii) 1638 { 1639 const struct inpcbinfo *ipi = ii->ipi; 1640 const int hashmax = (ipi->ipi_hashmask + 1) * 2; 1641 inp_match_t *match = ii->match; 1642 void *ctx = ii->ctx; 1643 inp_lookup_t lock = ii->lock; 1644 struct inpcb *inp; 1645 1646 if (ii->inp == NULL) { /* First call. */ 1647 if ((ii->hash = ii->mode) >= 0) { 1648 /* Targeted iterators support only the exact hash. */ 1649 MPASS(ii->hash <= ipi->ipi_hashmask); 1650 ii->hash += ipi->ipi_hashmask + 1; 1651 } 1652 smr_enter(ipi->ipi_smr); 1653 next_first: 1654 /* This is unrolled CK_LIST_FOREACH() over different headers. 
*/ 1655 for (inp = ii_list_first(ii); 1656 inp != NULL; 1657 inp = ii_list_next(ii, inp)) { 1658 if (match != NULL && (match)(inp, ctx) == false) 1659 continue; 1660 if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED))) 1661 break; 1662 else { 1663 smr_enter(ipi->ipi_smr); 1664 MPASS(inp != ii_list_first(ii)); 1665 inp = ii_list_first(ii); 1666 if (inp == NULL) 1667 break; 1668 } 1669 } 1670 1671 if (inp == NULL) { 1672 if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax) 1673 goto next_first; 1674 smr_exit(ipi->ipi_smr); 1675 } else 1676 ii->inp = inp; 1677 1678 return (inp); 1679 } 1680 1681 /* Not a first call. */ 1682 smr_enter(ipi->ipi_smr); 1683 restart: 1684 inp = ii->inp; 1685 rw_assert(&inp->inp_lock, 1686 lock == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED); 1687 next: 1688 inp = ii_list_next(ii, inp); 1689 if (inp == NULL) { 1690 if (ii->mode == INP_ALL_LIST && ++ii->hash < hashmax) { 1691 inp_unlock(ii->inp, lock); 1692 ii->inp = NULL; 1693 goto next_first; 1694 } 1695 smr_exit(ipi->ipi_smr); 1696 goto found; 1697 } 1698 1699 if (match != NULL && (match)(inp, ctx) == false) 1700 goto next; 1701 1702 if (__predict_true(inp_trylock(inp, lock))) { 1703 if (__predict_false(inp->inp_flags & INP_FREED)) { 1704 /* 1705 * Entries are never inserted in middle of a list, thus 1706 * as long as we are in SMR, we can continue traversal. 1707 * Jump to 'next' should yield in the same result, but 1708 * could produce unnecessary looping. Could this 1709 * looping be unbound? 1710 */ 1711 inp_unlock(inp, lock); 1712 goto next; 1713 } else { 1714 smr_exit(ipi->ipi_smr); 1715 goto found; 1716 } 1717 } 1718 1719 /* 1720 * Can't obtain lock immediately, thus going hard. Once we exit the 1721 * SMR section we can no longer jump to 'next', and our only stable 1722 * anchoring point is ii->inp, which we keep locked for this case, so 1723 * we jump to 'restart'. 
1724 */ 1725 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1726 smr_exit(ipi->ipi_smr); 1727 inp_lock(inp, lock); 1728 if (__predict_false(in_pcbrele(inp, lock))) { 1729 smr_enter(ipi->ipi_smr); 1730 goto restart; 1731 } 1732 /* 1733 * See comment in inp_smr_lock(). 1734 */ 1735 if (__predict_false(inp->inp_flags & INP_FREED)) { 1736 inp_unlock(inp, lock); 1737 smr_enter(ipi->ipi_smr); 1738 goto restart; 1739 } 1740 } else 1741 goto next; 1742 1743 found: 1744 inp_unlock(ii->inp, lock); 1745 ii->inp = inp; 1746 1747 return (ii->inp); 1748 } 1749 1750 /* 1751 * in_pcbref() bumps the reference count on an inpcb in order to maintain 1752 * stability of an inpcb pointer despite the inpcb lock being released or 1753 * SMR section exited. 1754 * 1755 * To free a reference later in_pcbrele_(r|w)locked() must be performed. 1756 */ 1757 void 1758 in_pcbref(struct inpcb *inp) 1759 { 1760 u_int old __diagused; 1761 1762 old = refcount_acquire(&inp->inp_refcount); 1763 KASSERT(old > 0, ("%s: refcount 0", __func__)); 1764 } 1765 1766 /* 1767 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially 1768 * freeing the pcb, if the reference was very last. 
1769 */ 1770 bool 1771 in_pcbrele_rlocked(struct inpcb *inp) 1772 { 1773 1774 INP_RLOCK_ASSERT(inp); 1775 1776 if (!refcount_release(&inp->inp_refcount)) 1777 return (false); 1778 1779 MPASS(inp->inp_flags & INP_FREED); 1780 MPASS(inp->inp_socket == NULL); 1781 crfree(inp->inp_cred); 1782 #ifdef INVARIANTS 1783 inp->inp_cred = NULL; 1784 #endif 1785 INP_RUNLOCK(inp); 1786 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); 1787 return (true); 1788 } 1789 1790 bool 1791 in_pcbrele_wlocked(struct inpcb *inp) 1792 { 1793 1794 INP_WLOCK_ASSERT(inp); 1795 1796 if (!refcount_release(&inp->inp_refcount)) 1797 return (false); 1798 1799 MPASS(inp->inp_flags & INP_FREED); 1800 MPASS(inp->inp_socket == NULL); 1801 crfree(inp->inp_cred); 1802 #ifdef INVARIANTS 1803 inp->inp_cred = NULL; 1804 #endif 1805 INP_WUNLOCK(inp); 1806 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); 1807 return (true); 1808 } 1809 1810 bool 1811 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) 1812 { 1813 1814 return (lock == INPLOOKUP_RLOCKPCB ? 1815 in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); 1816 } 1817 1818 /* 1819 * Dereference and rlock inp, for which the caller must own the 1820 * reference. Returns true if inp no longer usable, false otherwise. 1821 */ 1822 bool 1823 in_pcbrele_rlock(struct inpcb *inp) 1824 { 1825 INP_RLOCK(inp); 1826 if (in_pcbrele_rlocked(inp)) 1827 return (true); 1828 if ((inp->inp_flags & INP_FREED) != 0) { 1829 INP_RUNLOCK(inp); 1830 return (true); 1831 } 1832 return (false); 1833 } 1834 1835 /* 1836 * Unconditionally schedule an inpcb to be freed by decrementing its 1837 * reference count, which should occur only after the inpcb has been detached 1838 * from its socket. If another thread holds a temporary reference (acquired 1839 * using in_pcbref()) then the free is deferred until that reference is 1840 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. 
1841 * Almost all work, including removal from global lists, is done in this 1842 * context, where the pcbinfo lock is held. 1843 */ 1844 void 1845 in_pcbfree(struct inpcb *inp) 1846 { 1847 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1848 #ifdef INET 1849 struct ip_moptions *imo; 1850 #endif 1851 #ifdef INET6 1852 struct ip6_moptions *im6o; 1853 #endif 1854 1855 INP_WLOCK_ASSERT(inp); 1856 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); 1857 KASSERT((inp->inp_flags & INP_FREED) == 0, 1858 ("%s: called twice for pcb %p", __func__, inp)); 1859 1860 /* 1861 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb 1862 * from the hash without acquiring inpcb lock, they rely on the hash 1863 * lock, thus in_pcbremhash() should be the first action. 1864 */ 1865 INP_HASH_WLOCK(pcbinfo); 1866 if (inp->inp_flags & INP_UNCONNECTED) 1867 CK_LIST_REMOVE(inp, inp_unconn_list); 1868 else 1869 in_pcbremhash(inp); 1870 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 1871 pcbinfo->ipi_count--; 1872 INP_HASH_WUNLOCK(pcbinfo); 1873 1874 #ifdef RATELIMIT 1875 if (inp->inp_snd_tag != NULL) 1876 in_pcbdetach_txrtlmt(inp); 1877 #endif 1878 inp->inp_flags |= INP_FREED; 1879 inp->inp_socket->so_pcb = NULL; 1880 inp->inp_socket = NULL; 1881 1882 RO_INVALIDATE_CACHE(&inp->inp_route); 1883 #ifdef MAC 1884 mac_inpcb_destroy(inp); 1885 #endif 1886 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 1887 if (inp->inp_sp != NULL) 1888 ipsec_delete_pcbpolicy(inp); 1889 #endif 1890 #ifdef INET 1891 if (inp->inp_options) 1892 (void)m_free(inp->inp_options); 1893 DEBUG_POISON_POINTER(inp->inp_options); 1894 imo = inp->inp_moptions; 1895 DEBUG_POISON_POINTER(inp->inp_moptions); 1896 #endif 1897 #ifdef INET6 1898 if (inp->inp_vflag & INP_IPV6PROTO) { 1899 ip6_freepcbopts(inp->in6p_outputopts); 1900 DEBUG_POISON_POINTER(inp->in6p_outputopts); 1901 im6o = inp->in6p_moptions; 1902 DEBUG_POISON_POINTER(inp->in6p_moptions); 1903 } else 1904 im6o = NULL; 1905 #endif 1906 1907 if 
(__predict_false(in_pcbrele_wlocked(inp) == false)) { 1908 INP_WUNLOCK(inp); 1909 } 1910 #ifdef INET6 1911 ip6_freemoptions(im6o); 1912 #endif 1913 #ifdef INET 1914 inp_freemoptions(imo); 1915 #endif 1916 } 1917 1918 /* 1919 * Different protocols initialize their inpcbs differently - giving 1920 * different name to the lock. But they all are disposed the same. 1921 */ 1922 static void 1923 inpcb_fini(void *mem, int size) 1924 { 1925 struct inpcb *inp = mem; 1926 1927 INP_LOCK_DESTROY(inp); 1928 } 1929 1930 #ifdef INET 1931 /* 1932 * Common routines to return the socket addresses associated with inpcbs. 1933 */ 1934 int 1935 in_getsockaddr(struct socket *so, struct sockaddr *sa) 1936 { 1937 struct inpcb *inp; 1938 1939 inp = sotoinpcb(so); 1940 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1941 1942 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 1943 .sin_len = sizeof(struct sockaddr_in), 1944 .sin_family = AF_INET, 1945 .sin_port = inp->inp_lport, 1946 .sin_addr = inp->inp_laddr, 1947 }; 1948 1949 return (0); 1950 } 1951 1952 int 1953 in_getpeeraddr(struct socket *so, struct sockaddr *sa) 1954 { 1955 struct inpcb *inp; 1956 1957 inp = sotoinpcb(so); 1958 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1959 1960 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 1961 .sin_len = sizeof(struct sockaddr_in), 1962 .sin_family = AF_INET, 1963 .sin_port = inp->inp_fport, 1964 .sin_addr = inp->inp_faddr, 1965 }; 1966 1967 return (0); 1968 } 1969 1970 static bool 1971 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 1972 { 1973 1974 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) 1975 return (true); 1976 else 1977 return (false); 1978 } 1979 1980 void 1981 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 1982 { 1983 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 1984 inp_v4_multi_match, NULL); 1985 struct inpcb *inp; 1986 struct in_multi *inm; 1987 struct in_mfilter *imf; 1988 struct 
ip_moptions *imo; 1989 1990 IN_MULTI_LOCK_ASSERT(); 1991 1992 while ((inp = inp_next(&inpi)) != NULL) { 1993 INP_WLOCK_ASSERT(inp); 1994 1995 imo = inp->inp_moptions; 1996 /* 1997 * Unselect the outgoing interface if it is being 1998 * detached. 1999 */ 2000 if (imo->imo_multicast_ifp == ifp) 2001 imo->imo_multicast_ifp = NULL; 2002 2003 /* 2004 * Drop multicast group membership if we joined 2005 * through the interface being detached. 2006 * 2007 * XXX This can all be deferred to an epoch_call 2008 */ 2009 restart: 2010 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 2011 if ((inm = imf->imf_inm) == NULL) 2012 continue; 2013 if (inm->inm_ifp != ifp) 2014 continue; 2015 ip_mfilter_remove(&imo->imo_head, imf); 2016 in_leavegroup_locked(inm, NULL); 2017 ip_mfilter_free(imf); 2018 goto restart; 2019 } 2020 } 2021 } 2022 2023 /* 2024 * Lookup a PCB based on the local address and port. Caller must hold the 2025 * hash lock. No inpcb locks or references are acquired. 2026 */ 2027 #define INP_LOOKUP_MAPPED_PCB_COST 3 2028 struct inpcb * 2029 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2030 u_short lport, int fib, int lookupflags, struct ucred *cred) 2031 { 2032 struct inpcb *inp; 2033 #ifdef INET6 2034 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 2035 #else 2036 int matchwild = 3; 2037 #endif 2038 int wildcard; 2039 2040 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2041 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2042 KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs), 2043 ("%s: invalid fib %d", __func__, fib)); 2044 2045 INP_HASH_LOCK_ASSERT(pcbinfo); 2046 2047 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { 2048 struct inpcbhead *head; 2049 /* 2050 * Look for an unconnected (wildcard foreign addr) PCB that 2051 * matches the local address and port we're looking for. 
2052 */ 2053 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2054 pcbinfo->ipi_hashmask)]; 2055 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2056 #ifdef INET6 2057 /* XXX inp locking */ 2058 if ((inp->inp_vflag & INP_IPV4) == 0) 2059 continue; 2060 #endif 2061 if (inp->inp_faddr.s_addr == INADDR_ANY && 2062 inp->inp_laddr.s_addr == laddr.s_addr && 2063 inp->inp_lport == lport && (fib == RT_ALL_FIBS || 2064 inp->inp_inc.inc_fibnum == fib)) { 2065 /* 2066 * Found? 2067 */ 2068 if (prison_equal_ip4(cred->cr_prison, 2069 inp->inp_cred->cr_prison)) 2070 return (inp); 2071 } 2072 } 2073 /* 2074 * Not found. 2075 */ 2076 return (NULL); 2077 } else { 2078 struct inpcbhead *porthash; 2079 struct inpcb *match = NULL; 2080 2081 /* 2082 * Port is in use by one or more PCBs. Look for best 2083 * fit. 2084 */ 2085 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 2086 pcbinfo->ipi_porthashmask)]; 2087 CK_LIST_FOREACH(inp, porthash, inp_portlist) { 2088 if (inp->inp_lport != lport) 2089 continue; 2090 if (!prison_equal_ip4(inp->inp_cred->cr_prison, 2091 cred->cr_prison)) 2092 continue; 2093 if (fib != RT_ALL_FIBS && 2094 inp->inp_inc.inc_fibnum != fib) 2095 continue; 2096 wildcard = 0; 2097 #ifdef INET6 2098 /* XXX inp locking */ 2099 if ((inp->inp_vflag & INP_IPV4) == 0) 2100 continue; 2101 /* 2102 * We never select the PCB that has INP_IPV6 flag and 2103 * is bound to :: if we have another PCB which is bound 2104 * to 0.0.0.0. If a PCB has the INP_IPV6 flag, then we 2105 * set its cost higher than IPv4 only PCBs. 2106 * 2107 * Note that the case only happens when a socket is 2108 * bound to ::, under the condition that the use of the 2109 * mapped address is allowed. 
2110 */ 2111 if ((inp->inp_vflag & INP_IPV6) != 0) 2112 wildcard += INP_LOOKUP_MAPPED_PCB_COST; 2113 #endif 2114 if (inp->inp_faddr.s_addr != INADDR_ANY) 2115 wildcard++; 2116 if (inp->inp_laddr.s_addr != INADDR_ANY) { 2117 if (laddr.s_addr == INADDR_ANY) 2118 wildcard++; 2119 else if (inp->inp_laddr.s_addr != laddr.s_addr) 2120 continue; 2121 } else { 2122 if (laddr.s_addr != INADDR_ANY) 2123 wildcard++; 2124 } 2125 if (wildcard < matchwild) { 2126 match = inp; 2127 matchwild = wildcard; 2128 if (matchwild == 0) 2129 break; 2130 } 2131 } 2132 return (match); 2133 } 2134 } 2135 #undef INP_LOOKUP_MAPPED_PCB_COST 2136 2137 static bool 2138 in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib) 2139 { 2140 return ((domain == M_NODOM || domain == grp->il_numa_domain) && 2141 (fib == RT_ALL_FIBS || fib == grp->il_fibnum)); 2142 } 2143 2144 static struct inpcb * 2145 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, 2146 const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr, 2147 uint16_t lport, int domain, int fib) 2148 { 2149 const struct inpcblbgrouphead *hdr; 2150 struct inpcblbgroup *grp; 2151 struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; 2152 struct inpcb *inp; 2153 u_int count; 2154 2155 INP_HASH_LOCK_ASSERT(pcbinfo); 2156 NET_EPOCH_ASSERT(); 2157 2158 hdr = &pcbinfo->ipi_lbgrouphashbase[ 2159 INP_PCBPORTHASH(lport, pcbinfo->ipi_porthashmask)]; 2160 2161 /* 2162 * Search for an LB group match based on the following criteria: 2163 * - prefer jailed groups to non-jailed groups 2164 * - prefer exact source address matches to wildcard matches 2165 * - prefer groups bound to the specified NUMA domain 2166 */ 2167 jail_exact = jail_wild = local_exact = local_wild = NULL; 2168 CK_LIST_FOREACH(grp, hdr, il_list) { 2169 bool injail; 2170 2171 #ifdef INET6 2172 if (!(grp->il_vflag & INP_IPV4)) 2173 continue; 2174 #endif 2175 if (grp->il_lport != lport) 2176 continue; 2177 2178 injail = 
prison_flag(grp->il_cred, PR_IP4) != 0; 2179 if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison, 2180 laddr) != 0) 2181 continue; 2182 2183 if (grp->il_laddr.s_addr == laddr->s_addr) { 2184 if (injail) { 2185 jail_exact = grp; 2186 if (in_pcblookup_lb_match(grp, domain, fib)) 2187 /* This is a perfect match. */ 2188 goto out; 2189 } else if (local_exact == NULL || 2190 in_pcblookup_lb_match(grp, domain, fib)) { 2191 local_exact = grp; 2192 } 2193 } else if (grp->il_laddr.s_addr == INADDR_ANY) { 2194 if (injail) { 2195 if (jail_wild == NULL || 2196 in_pcblookup_lb_match(grp, domain, fib)) 2197 jail_wild = grp; 2198 } else if (local_wild == NULL || 2199 in_pcblookup_lb_match(grp, domain, fib)) { 2200 local_wild = grp; 2201 } 2202 } 2203 } 2204 2205 if (jail_exact != NULL) 2206 grp = jail_exact; 2207 else if (jail_wild != NULL) 2208 grp = jail_wild; 2209 else if (local_exact != NULL) 2210 grp = local_exact; 2211 else 2212 grp = local_wild; 2213 if (grp == NULL) 2214 return (NULL); 2215 2216 out: 2217 /* 2218 * Synchronize with in_pcblbgroup_insert(). 
2219 */ 2220 count = atomic_load_acq_int(&grp->il_inpcnt); 2221 if (count == 0) 2222 return (NULL); 2223 inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count]; 2224 KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); 2225 return (inp); 2226 } 2227 2228 static bool 2229 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr, 2230 u_short fport, struct in_addr laddr, u_short lport) 2231 { 2232 #ifdef INET6 2233 /* XXX inp locking */ 2234 if ((inp->inp_vflag & INP_IPV4) == 0) 2235 return (false); 2236 #endif 2237 if (inp->inp_faddr.s_addr == faddr.s_addr && 2238 inp->inp_laddr.s_addr == laddr.s_addr && 2239 inp->inp_fport == fport && 2240 inp->inp_lport == lport) 2241 return (true); 2242 return (false); 2243 } 2244 2245 static struct inpcb * 2246 in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2247 u_short fport, struct in_addr laddr, u_short lport) 2248 { 2249 struct inpcbhead *head; 2250 struct inpcb *inp; 2251 2252 INP_HASH_LOCK_ASSERT(pcbinfo); 2253 2254 head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport, 2255 pcbinfo->ipi_hashmask)]; 2256 CK_LIST_FOREACH(inp, head, inp_hash_exact) { 2257 if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport)) 2258 return (inp); 2259 } 2260 return (NULL); 2261 } 2262 2263 typedef enum { 2264 INPLOOKUP_MATCH_NONE = 0, 2265 INPLOOKUP_MATCH_WILD = 1, 2266 INPLOOKUP_MATCH_LADDR = 2, 2267 } inp_lookup_match_t; 2268 2269 static inp_lookup_match_t 2270 in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr, 2271 u_short lport, int fib) 2272 { 2273 #ifdef INET6 2274 /* XXX inp locking */ 2275 if ((inp->inp_vflag & INP_IPV4) == 0) 2276 return (INPLOOKUP_MATCH_NONE); 2277 #endif 2278 if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) 2279 return (INPLOOKUP_MATCH_NONE); 2280 if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib) 2281 return (INPLOOKUP_MATCH_NONE); 2282 if (inp->inp_laddr.s_addr == INADDR_ANY) 2283 return 
(INPLOOKUP_MATCH_WILD); 2284 if (inp->inp_laddr.s_addr == laddr.s_addr) 2285 return (INPLOOKUP_MATCH_LADDR); 2286 return (INPLOOKUP_MATCH_NONE); 2287 } 2288 2289 #define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1) 2290 2291 static struct inpcb * 2292 in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2293 u_short lport, int fib, const inp_lookup_t lockflags) 2294 { 2295 struct inpcbhead *head; 2296 struct inpcb *inp; 2297 2298 KASSERT(SMR_ENTERED(pcbinfo->ipi_smr), 2299 ("%s: not in SMR read section", __func__)); 2300 2301 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2302 pcbinfo->ipi_hashmask)]; 2303 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2304 inp_lookup_match_t match; 2305 2306 match = in_pcblookup_wild_match(inp, laddr, lport, fib); 2307 if (match == INPLOOKUP_MATCH_NONE) 2308 continue; 2309 2310 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2311 match = in_pcblookup_wild_match(inp, laddr, lport, fib); 2312 if (match != INPLOOKUP_MATCH_NONE && 2313 prison_check_ip4_locked(inp->inp_cred->cr_prison, 2314 &laddr) == 0) 2315 return (inp); 2316 inp_unlock(inp, lockflags); 2317 } 2318 2319 /* 2320 * The matching socket disappeared out from under us. Fall back 2321 * to a serialized lookup. 2322 */ 2323 return (INP_LOOKUP_AGAIN); 2324 } 2325 return (NULL); 2326 } 2327 2328 static struct inpcb * 2329 in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2330 u_short lport, int fib) 2331 { 2332 struct inpcbhead *head; 2333 struct inpcb *inp, *local_wild, *local_exact, *jail_wild; 2334 #ifdef INET6 2335 struct inpcb *local_wild_mapped; 2336 #endif 2337 2338 INP_HASH_LOCK_ASSERT(pcbinfo); 2339 2340 /* 2341 * Order of socket selection - we always prefer jails. 2342 * 1. jailed, non-wild. 2343 * 2. jailed, wild. 2344 * 3. non-jailed, non-wild. 2345 * 4. non-jailed, wild. 
2346 */ 2347 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2348 pcbinfo->ipi_hashmask)]; 2349 local_wild = local_exact = jail_wild = NULL; 2350 #ifdef INET6 2351 local_wild_mapped = NULL; 2352 #endif 2353 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2354 inp_lookup_match_t match; 2355 bool injail; 2356 2357 match = in_pcblookup_wild_match(inp, laddr, lport, fib); 2358 if (match == INPLOOKUP_MATCH_NONE) 2359 continue; 2360 2361 injail = prison_flag(inp->inp_cred, PR_IP4) != 0; 2362 if (injail) { 2363 if (prison_check_ip4_locked(inp->inp_cred->cr_prison, 2364 &laddr) != 0) 2365 continue; 2366 } else { 2367 if (local_exact != NULL) 2368 continue; 2369 } 2370 2371 if (match == INPLOOKUP_MATCH_LADDR) { 2372 if (injail) 2373 return (inp); 2374 local_exact = inp; 2375 } else { 2376 #ifdef INET6 2377 /* XXX inp locking, NULL check */ 2378 if (inp->inp_vflag & INP_IPV6PROTO) 2379 local_wild_mapped = inp; 2380 else 2381 #endif 2382 if (injail) 2383 jail_wild = inp; 2384 else 2385 local_wild = inp; 2386 } 2387 } 2388 if (jail_wild != NULL) 2389 return (jail_wild); 2390 if (local_exact != NULL) 2391 return (local_exact); 2392 if (local_wild != NULL) 2393 return (local_wild); 2394 #ifdef INET6 2395 if (local_wild_mapped != NULL) 2396 return (local_wild_mapped); 2397 #endif 2398 return (NULL); 2399 } 2400 2401 /* 2402 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes 2403 * that the caller has either locked the hash list, which usually happens 2404 * for bind(2) operations, or is in SMR section, which happens when sorting 2405 * out incoming packets. 
2406 */ 2407 static struct inpcb * 2408 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2409 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2410 uint8_t numa_domain, int fib) 2411 { 2412 struct inpcb *inp; 2413 const u_short fport = fport_arg, lport = lport_arg; 2414 2415 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0, 2416 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2417 KASSERT(faddr.s_addr != INADDR_ANY, 2418 ("%s: invalid foreign address", __func__)); 2419 KASSERT(laddr.s_addr != INADDR_ANY, 2420 ("%s: invalid local address", __func__)); 2421 INP_HASH_WLOCK_ASSERT(pcbinfo); 2422 2423 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); 2424 if (inp != NULL) 2425 return (inp); 2426 2427 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2428 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, 2429 &laddr, lport, numa_domain, fib); 2430 if (inp == NULL) { 2431 inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr, 2432 lport, fib); 2433 } 2434 } 2435 2436 return (inp); 2437 } 2438 2439 static struct inpcb * 2440 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2441 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2442 uint8_t numa_domain, int fib) 2443 { 2444 struct inpcb *inp; 2445 const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; 2446 2447 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2448 ("%s: LOCKPCB not set", __func__)); 2449 2450 INP_HASH_WLOCK(pcbinfo); 2451 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2452 lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib); 2453 if (inp != NULL && !inp_trylock(inp, lockflags)) { 2454 in_pcbref(inp); 2455 INP_HASH_WUNLOCK(pcbinfo); 2456 inp_lock(inp, lockflags); 2457 if (in_pcbrele(inp, lockflags)) 2458 /* XXX-MJ or retry until we get a negative match? 
*/ 2459 inp = NULL; 2460 } else { 2461 INP_HASH_WUNLOCK(pcbinfo); 2462 } 2463 return (inp); 2464 } 2465 2466 static struct inpcb * 2467 in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2468 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2469 uint8_t numa_domain, int fib) 2470 { 2471 struct inpcb *inp; 2472 const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; 2473 const u_short fport = fport_arg, lport = lport_arg; 2474 2475 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2476 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2477 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2478 ("%s: LOCKPCB not set", __func__)); 2479 2480 smr_enter(pcbinfo->ipi_smr); 2481 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); 2482 if (inp != NULL) { 2483 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2484 /* 2485 * Revalidate the 4-tuple, the socket could have been 2486 * disconnected. 2487 */ 2488 if (__predict_true(in_pcblookup_exact_match(inp, 2489 faddr, fport, laddr, lport))) 2490 return (inp); 2491 inp_unlock(inp, lockflags); 2492 } 2493 2494 /* 2495 * We failed to lock the inpcb, or its connection state changed 2496 * out from under us. Fall back to a precise search. 
2497 */ 2498 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2499 lookupflags, numa_domain, fib)); 2500 } 2501 2502 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2503 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, 2504 &laddr, lport, numa_domain, fib); 2505 if (inp != NULL) { 2506 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2507 if (__predict_true(in_pcblookup_wild_match(inp, 2508 laddr, lport, fib) != INPLOOKUP_MATCH_NONE)) 2509 return (inp); 2510 inp_unlock(inp, lockflags); 2511 } 2512 inp = INP_LOOKUP_AGAIN; 2513 } else { 2514 inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport, 2515 fib, lockflags); 2516 } 2517 if (inp == INP_LOOKUP_AGAIN) { 2518 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, 2519 lport, lookupflags, numa_domain, fib)); 2520 } 2521 } 2522 2523 if (inp == NULL) 2524 smr_exit(pcbinfo->ipi_smr); 2525 2526 return (inp); 2527 } 2528 2529 /* 2530 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2531 * from which a pre-calculated hash value may be extracted. 2532 */ 2533 struct inpcb * 2534 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2535 struct in_addr laddr, u_int lport, int lookupflags, 2536 struct ifnet *ifp) 2537 { 2538 int fib; 2539 2540 fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS; 2541 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, 2542 lookupflags, M_NODOM, fib)); 2543 } 2544 2545 struct inpcb * 2546 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2547 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2548 struct ifnet *ifp __unused, struct mbuf *m) 2549 { 2550 int fib; 2551 2552 M_ASSERTPKTHDR(m); 2553 fib = (lookupflags & INPLOOKUP_FIB) ? 
M_GETFIB(m) : RT_ALL_FIBS; 2554 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, 2555 lookupflags, m->m_pkthdr.numa_domain, fib)); 2556 } 2557 #endif /* INET */ 2558 2559 static bool 2560 in_pcbjailed(const struct inpcb *inp, unsigned int flag) 2561 { 2562 return (prison_flag(inp->inp_cred, flag) != 0); 2563 } 2564 2565 /* 2566 * Insert the PCB into a hash chain using ordering rules which ensure that 2567 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first. 2568 * 2569 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs 2570 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs 2571 * always appear last no matter whether they are jailed. 2572 */ 2573 static void 2574 _in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) 2575 { 2576 struct inpcb *last; 2577 bool bound, injail; 2578 2579 INP_LOCK_ASSERT(inp); 2580 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2581 2582 last = NULL; 2583 bound = inp->inp_laddr.s_addr != INADDR_ANY; 2584 if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) { 2585 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2586 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2587 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2588 return; 2589 } 2590 } 2591 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2592 return; 2593 } 2594 2595 injail = in_pcbjailed(inp, PR_IP4); 2596 if (!injail) { 2597 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2598 if (!in_pcbjailed(last, PR_IP4)) 2599 break; 2600 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2601 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2602 return; 2603 } 2604 } 2605 } else if (!CK_LIST_EMPTY(pcbhash) && 2606 !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) { 2607 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2608 return; 2609 } 2610 if (!bound) { 2611 CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { 2612 if (last->inp_laddr.s_addr == INADDR_ANY) 2613 break; 2614 if 
(CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2615 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2616 return; 2617 } 2618 } 2619 } 2620 if (last == NULL) 2621 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2622 else 2623 CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); 2624 } 2625 2626 #ifdef INET6 2627 /* 2628 * See the comment above _in_pcbinshash_wild(). 2629 */ 2630 static void 2631 _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) 2632 { 2633 struct inpcb *last; 2634 bool bound, injail; 2635 2636 INP_LOCK_ASSERT(inp); 2637 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2638 2639 last = NULL; 2640 bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr); 2641 injail = in_pcbjailed(inp, PR_IP6); 2642 if (!injail) { 2643 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2644 if (!in_pcbjailed(last, PR_IP6)) 2645 break; 2646 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2647 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2648 return; 2649 } 2650 } 2651 } else if (!CK_LIST_EMPTY(pcbhash) && 2652 !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) { 2653 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2654 return; 2655 } 2656 if (!bound) { 2657 CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { 2658 if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr)) 2659 break; 2660 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2661 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2662 return; 2663 } 2664 } 2665 } 2666 if (last == NULL) 2667 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2668 else 2669 CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); 2670 } 2671 #endif 2672 2673 /* 2674 * Insert PCB onto various hash lists. 2675 * 2676 * With normal sockets this function shall not fail, so it could return void. 2677 * But for SO_REUSEPORT_LB it may need to allocate memory with locks held, 2678 * that's the only condition when it can fail. 
2679 */ 2680 int 2681 in_pcbinshash(struct inpcb *inp) 2682 { 2683 struct inpcbhead *pcbhash, *pcbporthash; 2684 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2685 uint32_t hash; 2686 bool connected; 2687 2688 INP_WLOCK_ASSERT(inp); 2689 INP_HASH_WLOCK_ASSERT(pcbinfo); 2690 MPASS(inp->inp_flags & INP_UNCONNECTED); 2691 2692 #ifdef INET6 2693 if (inp->inp_vflag & INP_IPV6) { 2694 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2695 inp->inp_fport, pcbinfo->ipi_hashmask); 2696 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2697 } else 2698 #endif 2699 { 2700 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2701 inp->inp_fport, pcbinfo->ipi_hashmask); 2702 connected = !in_nullhost(inp->inp_faddr); 2703 } 2704 2705 if (connected) 2706 pcbhash = &pcbinfo->ipi_hash_exact[hash]; 2707 else 2708 pcbhash = &pcbinfo->ipi_hash_wild[hash]; 2709 2710 pcbporthash = &pcbinfo->ipi_porthashbase[ 2711 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2712 2713 /* 2714 * Ignore SO_REUSEPORT_LB if the socket is connected. Really this case 2715 * should be an error, but for UDP sockets it is not, and some 2716 * applications erroneously set it on connected UDP sockets, so we can't 2717 * change this without breaking compatibility. 2718 */ 2719 if (!connected && 2720 (inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) { 2721 int error = in_pcbinslbgrouphash(inp, M_NODOM); 2722 if (error != 0) 2723 return (error); 2724 } 2725 2726 /* 2727 * The PCB may have been disconnected in the past. Before we can safely 2728 * make it visible in the hash table, we must wait for all readers which 2729 * may be traversing this PCB to finish. 
2730 */ 2731 if (inp->inp_smr != SMR_SEQ_INVALID) { 2732 smr_wait(pcbinfo->ipi_smr, inp->inp_smr); 2733 inp->inp_smr = SMR_SEQ_INVALID; 2734 } 2735 2736 CK_LIST_REMOVE(inp, inp_unconn_list); 2737 2738 if (connected) 2739 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); 2740 else { 2741 #ifdef INET6 2742 if ((inp->inp_vflag & INP_IPV6) != 0) 2743 _in6_pcbinshash_wild(pcbhash, inp); 2744 else 2745 #endif 2746 _in_pcbinshash_wild(pcbhash, inp); 2747 } 2748 CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist); 2749 inp->inp_flags &= ~INP_UNCONNECTED; 2750 2751 return (0); 2752 } 2753 2754 void 2755 in_pcbremhash(struct inpcb *inp) 2756 { 2757 2758 INP_WLOCK_ASSERT(inp); 2759 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2760 MPASS(!(inp->inp_flags & INP_UNCONNECTED)); 2761 2762 if ((inp->inp_flags & INP_INLBGROUP) != 0) 2763 in_pcbremlbgrouphash(inp); 2764 #ifdef INET6 2765 if (inp->inp_vflag & INP_IPV6) { 2766 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) 2767 CK_LIST_REMOVE(inp, inp_hash_wild); 2768 else 2769 CK_LIST_REMOVE(inp, inp_hash_exact); 2770 } else 2771 #endif 2772 { 2773 if (in_nullhost(inp->inp_faddr)) 2774 CK_LIST_REMOVE(inp, inp_hash_wild); 2775 else 2776 CK_LIST_REMOVE(inp, inp_hash_exact); 2777 } 2778 CK_LIST_REMOVE(inp, inp_portlist); 2779 } 2780 2781 /* 2782 * Move PCB to the proper hash bucket when { faddr, fport } have been 2783 * changed. NOTE: This does not handle the case of the lport changing (the 2784 * hashed port list would have to be updated as well), so the lport must 2785 * not change after in_pcbinshash() has been called. 
2786 */ 2787 void 2788 in_pcbrehash(struct inpcb *inp) 2789 { 2790 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2791 struct inpcbhead *head; 2792 uint32_t hash; 2793 bool connected; 2794 2795 INP_WLOCK_ASSERT(inp); 2796 INP_HASH_WLOCK_ASSERT(pcbinfo); 2797 MPASS(!(inp->inp_flags & INP_UNCONNECTED)); 2798 KASSERT(inp->inp_smr == SMR_SEQ_INVALID, 2799 ("%s: inp was disconnected", __func__)); 2800 2801 #ifdef INET6 2802 if (inp->inp_vflag & INP_IPV6) { 2803 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2804 inp->inp_fport, pcbinfo->ipi_hashmask); 2805 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2806 } else 2807 #endif 2808 { 2809 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2810 inp->inp_fport, pcbinfo->ipi_hashmask); 2811 connected = !in_nullhost(inp->inp_faddr); 2812 } 2813 2814 /* See the comment in in_pcbinshash(). */ 2815 if (connected && (inp->inp_flags & INP_INLBGROUP) != 0) 2816 in_pcbremlbgrouphash(inp); 2817 2818 /* 2819 * When rehashing, the caller must ensure that either the new or the old 2820 * foreign address was unspecified. 
2821 */ 2822 if (connected) { 2823 CK_LIST_REMOVE(inp, inp_hash_wild); 2824 head = &pcbinfo->ipi_hash_exact[hash]; 2825 CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact); 2826 } else { 2827 CK_LIST_REMOVE(inp, inp_hash_exact); 2828 head = &pcbinfo->ipi_hash_wild[hash]; 2829 CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild); 2830 } 2831 } 2832 2833 void 2834 ripcb_connect(struct inpcb *inp) 2835 { 2836 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2837 uint32_t hash; 2838 2839 INP_WLOCK_ASSERT(inp); 2840 MPASS(inp->inp_flags & INP_UNCONNECTED); 2841 2842 hash = RIPCB_HASH(inp) & pcbinfo->ipi_hashmask; 2843 2844 INP_HASH_WLOCK(pcbinfo); 2845 CK_LIST_REMOVE(inp, inp_unconn_list); 2846 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_hash_exact[hash], inp, 2847 inp_hash_exact); 2848 INP_HASH_WUNLOCK(pcbinfo); 2849 inp->inp_flags &= ~INP_UNCONNECTED; 2850 } 2851 2852 void 2853 ripcb_disconnect(struct inpcb *inp) 2854 { 2855 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2856 2857 INP_WLOCK_ASSERT(inp); 2858 2859 if (inp->inp_flags & INP_UNCONNECTED) 2860 return; 2861 2862 INP_HASH_WLOCK(pcbinfo); 2863 CK_LIST_REMOVE(inp, inp_hash_exact); 2864 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_list_unconn, inp, inp_unconn_list); 2865 INP_HASH_WUNLOCK(pcbinfo); 2866 inp->inp_flags |= INP_UNCONNECTED; 2867 } 2868 2869 /* 2870 * Check for alternatives when higher level complains 2871 * about service problems. For now, invalidate cached 2872 * routing information. If the route was created dynamically 2873 * (by a redirect), time to try a default gateway again. 2874 */ 2875 void 2876 in_losing(struct inpcb *inp) 2877 { 2878 2879 RO_INVALIDATE_CACHE(&inp->inp_route); 2880 return; 2881 } 2882 2883 /* 2884 * A set label operation has occurred at the socket layer, propagate the 2885 * label change into the in_pcb for the socket. 
2886 */ 2887 void 2888 in_pcbsosetlabel(struct socket *so) 2889 { 2890 #ifdef MAC 2891 struct inpcb *inp; 2892 2893 inp = sotoinpcb(so); 2894 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2895 2896 INP_WLOCK(inp); 2897 SOCK_LOCK(so); 2898 mac_inpcb_sosetlabel(so, inp); 2899 SOCK_UNLOCK(so); 2900 INP_WUNLOCK(inp); 2901 #endif 2902 } 2903 2904 void 2905 inp_wlock(struct inpcb *inp) 2906 { 2907 2908 INP_WLOCK(inp); 2909 } 2910 2911 void 2912 inp_wunlock(struct inpcb *inp) 2913 { 2914 2915 INP_WUNLOCK(inp); 2916 } 2917 2918 void 2919 inp_rlock(struct inpcb *inp) 2920 { 2921 2922 INP_RLOCK(inp); 2923 } 2924 2925 void 2926 inp_runlock(struct inpcb *inp) 2927 { 2928 2929 INP_RUNLOCK(inp); 2930 } 2931 2932 #ifdef INVARIANT_SUPPORT 2933 void 2934 inp_lock_assert(struct inpcb *inp) 2935 { 2936 2937 INP_WLOCK_ASSERT(inp); 2938 } 2939 2940 void 2941 inp_unlock_assert(struct inpcb *inp) 2942 { 2943 2944 INP_UNLOCK_ASSERT(inp); 2945 } 2946 #endif 2947 2948 void 2949 inp_apply_all(struct inpcbinfo *pcbinfo, 2950 void (*func)(struct inpcb *, void *), void *arg) 2951 { 2952 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2953 INPLOOKUP_WLOCKPCB); 2954 struct inpcb *inp; 2955 2956 while ((inp = inp_next(&inpi)) != NULL) 2957 func(inp, arg); 2958 } 2959 2960 struct socket * 2961 inp_inpcbtosocket(struct inpcb *inp) 2962 { 2963 2964 INP_WLOCK_ASSERT(inp); 2965 return (inp->inp_socket); 2966 } 2967 2968 void 2969 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2970 uint32_t *faddr, uint16_t *fp) 2971 { 2972 2973 INP_LOCK_ASSERT(inp); 2974 *laddr = inp->inp_laddr.s_addr; 2975 *faddr = inp->inp_faddr.s_addr; 2976 *lp = inp->inp_lport; 2977 *fp = inp->inp_fport; 2978 } 2979 2980 /* 2981 * Create an external-format (``xinpcb'') structure using the information in 2982 * the kernel-format in_pcb structure pointed to by inp. 
This is done to 2983 * reduce the spew of irrelevant information over this interface, to isolate 2984 * user code from changes in the kernel structure, and potentially to provide 2985 * information-hiding if we decide that some of this information should be 2986 * hidden from users. 2987 */ 2988 void 2989 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2990 { 2991 2992 bzero(xi, sizeof(*xi)); 2993 xi->xi_len = sizeof(struct xinpcb); 2994 if (inp->inp_socket) 2995 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2996 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2997 xi->inp_gencnt = inp->inp_gencnt; 2998 xi->inp_flow = inp->inp_flow; 2999 xi->inp_flowid = inp->inp_flowid; 3000 xi->inp_flowtype = inp->inp_flowtype; 3001 xi->inp_flags = inp->inp_flags; 3002 xi->inp_flags2 = inp->inp_flags2; 3003 xi->in6p_cksum = inp->in6p_cksum; 3004 xi->in6p_hops = inp->in6p_hops; 3005 xi->inp_ip_tos = inp->inp_ip_tos; 3006 xi->inp_vflag = inp->inp_vflag; 3007 xi->inp_ip_ttl = inp->inp_ip_ttl; 3008 xi->inp_ip_p = inp->inp_ip_p; 3009 xi->inp_ip_minttl = inp->inp_ip_minttl; 3010 } 3011 3012 int 3013 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 3014 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 3015 { 3016 struct sockopt sopt; 3017 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 3018 INPLOOKUP_WLOCKPCB); 3019 struct inpcb *inp; 3020 struct sockopt_parameters *params; 3021 struct socket *so; 3022 int error; 3023 char buf[1024]; 3024 3025 if (req->oldptr != NULL || req->oldlen != 0) 3026 return (EINVAL); 3027 if (req->newptr == NULL) 3028 return (EPERM); 3029 if (req->newlen > sizeof(buf)) 3030 return (ENOMEM); 3031 error = SYSCTL_IN(req, buf, req->newlen); 3032 if (error != 0) 3033 return (error); 3034 if (req->newlen < sizeof(struct sockopt_parameters)) 3035 return (EINVAL); 3036 params = (struct sockopt_parameters *)buf; 3037 sopt.sopt_level = params->sop_level; 3038 sopt.sopt_name = params->sop_optname; 3039 
sopt.sopt_dir = SOPT_SET; 3040 sopt.sopt_val = params->sop_optval; 3041 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 3042 sopt.sopt_td = NULL; 3043 #ifdef INET6 3044 if (params->sop_inc.inc_flags & INC_ISIPV6) { 3045 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 3046 params->sop_inc.inc6_laddr.s6_addr16[1] = 3047 htons(params->sop_inc.inc6_zoneid & 0xffff); 3048 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 3049 params->sop_inc.inc6_faddr.s6_addr16[1] = 3050 htons(params->sop_inc.inc6_zoneid & 0xffff); 3051 } 3052 #endif 3053 if (params->sop_inc.inc_lport != htons(0) && 3054 params->sop_inc.inc_fport != htons(0)) { 3055 #ifdef INET6 3056 if (params->sop_inc.inc_flags & INC_ISIPV6) 3057 inpi.hash = INP6_PCBHASH( 3058 ¶ms->sop_inc.inc6_faddr, 3059 params->sop_inc.inc_lport, 3060 params->sop_inc.inc_fport, 3061 pcbinfo->ipi_hashmask); 3062 else 3063 #endif 3064 inpi.hash = INP_PCBHASH( 3065 ¶ms->sop_inc.inc_faddr, 3066 params->sop_inc.inc_lport, 3067 params->sop_inc.inc_fport, 3068 pcbinfo->ipi_hashmask); 3069 } 3070 while ((inp = inp_next(&inpi)) != NULL) 3071 if (inp->inp_gencnt == params->sop_id) { 3072 /* 3073 * XXXGL 3074 * 1) the inp_next() that ignores INP_UNCONNECTED needs 3075 * to be generally supported. 3076 * 2) Why do we ECONNRESET instead of continueing? 
3077 */ 3078 if (inp->inp_flags & INP_UNCONNECTED) { 3079 INP_WUNLOCK(inp); 3080 return (ECONNRESET); 3081 } 3082 so = inp->inp_socket; 3083 KASSERT(so != NULL, ("inp_socket == NULL")); 3084 soref(so); 3085 if (params->sop_level == SOL_SOCKET) { 3086 INP_WUNLOCK(inp); 3087 error = sosetopt(so, &sopt); 3088 } else 3089 error = (*ctloutput_set)(inp, &sopt); 3090 sorele(so); 3091 break; 3092 } 3093 if (inp == NULL) 3094 error = ESRCH; 3095 return (error); 3096 } 3097 3098 #ifdef DDB 3099 static void 3100 db_print_indent(int indent) 3101 { 3102 int i; 3103 3104 for (i = 0; i < indent; i++) 3105 db_printf(" "); 3106 } 3107 3108 static void 3109 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 3110 { 3111 char faddr_str[48], laddr_str[48]; 3112 3113 db_print_indent(indent); 3114 db_printf("%s at %p\n", name, inc); 3115 3116 indent += 2; 3117 3118 #ifdef INET6 3119 if (inc->inc_flags & INC_ISIPV6) { 3120 /* IPv6. */ 3121 ip6_sprintf(laddr_str, &inc->inc6_laddr); 3122 ip6_sprintf(faddr_str, &inc->inc6_faddr); 3123 } else 3124 #endif 3125 { 3126 /* IPv4. 
*/ 3127 inet_ntoa_r(inc->inc_laddr, laddr_str); 3128 inet_ntoa_r(inc->inc_faddr, faddr_str); 3129 } 3130 db_print_indent(indent); 3131 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 3132 ntohs(inc->inc_lport)); 3133 db_print_indent(indent); 3134 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 3135 ntohs(inc->inc_fport)); 3136 } 3137 3138 void 3139 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 3140 { 3141 3142 db_print_indent(indent); 3143 db_printf("%s at %p\n", name, inp); 3144 3145 indent += 2; 3146 3147 db_print_indent(indent); 3148 db_printf("inp_flow: 0x%x inp_label: %p\n", inp->inp_flow, 3149 inp->inp_label); 3150 3151 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 3152 3153 db_print_indent(indent); 3154 db_printf("inp_flags: 0x%b\n", inp->inp_flags, INP_FLAGS_BITS); 3155 3156 db_print_indent(indent); 3157 db_printf("inp_flags2: 0x%b\n", inp->inp_flags2, INP_FLAGS2_BITS); 3158 3159 db_print_indent(indent); 3160 db_printf("inp_sp: %p inp_vflag: 0x%b\n", inp->inp_sp, 3161 inp->inp_vflag, INP_VFLAGS_BITS); 3162 3163 db_print_indent(indent); 3164 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3165 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3166 3167 #ifdef INET6 3168 if (inp->inp_vflag & INP_IPV6) { 3169 db_print_indent(indent); 3170 db_printf("in6p_options: %p in6p_outputopts: %p " 3171 "in6p_moptions: %p\n", inp->in6p_options, 3172 inp->in6p_outputopts, inp->in6p_moptions); 3173 db_print_indent(indent); 3174 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3175 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3176 inp->in6p_hops); 3177 } else 3178 #endif 3179 { 3180 db_print_indent(indent); 3181 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3182 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3183 inp->inp_options, inp->inp_moptions); 3184 } 3185 3186 db_print_indent(indent); 3187 db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt); 3188 } 3189 3190 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3191 
{ 3192 struct inpcb *inp; 3193 3194 if (!have_addr) { 3195 db_printf("usage: show inpcb <addr>\n"); 3196 return; 3197 } 3198 inp = (struct inpcb *)addr; 3199 3200 db_print_inpcb(inp, "inpcb", 0); 3201 } 3202 #endif /* DDB */ 3203 3204 #ifdef RATELIMIT 3205 /* 3206 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3207 * if any. 3208 */ 3209 int 3210 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3211 { 3212 union if_snd_tag_modify_params params = { 3213 .rate_limit.max_rate = max_pacing_rate, 3214 .rate_limit.flags = M_NOWAIT, 3215 }; 3216 struct m_snd_tag *mst; 3217 int error; 3218 3219 mst = inp->inp_snd_tag; 3220 if (mst == NULL) 3221 return (EINVAL); 3222 3223 if (mst->sw->snd_tag_modify == NULL) { 3224 error = EOPNOTSUPP; 3225 } else { 3226 error = mst->sw->snd_tag_modify(mst, ¶ms); 3227 } 3228 return (error); 3229 } 3230 3231 /* 3232 * Query existing TX rate limit based on the existing 3233 * "inp->inp_snd_tag", if any. 3234 */ 3235 int 3236 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3237 { 3238 union if_snd_tag_query_params params = { }; 3239 struct m_snd_tag *mst; 3240 int error; 3241 3242 mst = inp->inp_snd_tag; 3243 if (mst == NULL) 3244 return (EINVAL); 3245 3246 if (mst->sw->snd_tag_query == NULL) { 3247 error = EOPNOTSUPP; 3248 } else { 3249 error = mst->sw->snd_tag_query(mst, ¶ms); 3250 if (error == 0 && p_max_pacing_rate != NULL) 3251 *p_max_pacing_rate = params.rate_limit.max_rate; 3252 } 3253 return (error); 3254 } 3255 3256 /* 3257 * Query existing TX queue level based on the existing 3258 * "inp->inp_snd_tag", if any. 
3259 */ 3260 int 3261 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3262 { 3263 union if_snd_tag_query_params params = { }; 3264 struct m_snd_tag *mst; 3265 int error; 3266 3267 mst = inp->inp_snd_tag; 3268 if (mst == NULL) 3269 return (EINVAL); 3270 3271 if (mst->sw->snd_tag_query == NULL) 3272 return (EOPNOTSUPP); 3273 3274 error = mst->sw->snd_tag_query(mst, ¶ms); 3275 if (error == 0 && p_txqueue_level != NULL) 3276 *p_txqueue_level = params.rate_limit.queue_level; 3277 return (error); 3278 } 3279 3280 /* 3281 * Allocate a new TX rate limit send tag from the network interface 3282 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3283 */ 3284 int 3285 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3286 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3287 3288 { 3289 union if_snd_tag_alloc_params params = { 3290 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 3291 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3292 .rate_limit.hdr.flowid = flowid, 3293 .rate_limit.hdr.flowtype = flowtype, 3294 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3295 .rate_limit.max_rate = max_pacing_rate, 3296 .rate_limit.flags = M_NOWAIT, 3297 }; 3298 int error; 3299 3300 INP_WLOCK_ASSERT(inp); 3301 3302 /* 3303 * If there is already a send tag, or the INP is being torn 3304 * down, allocating a new send tag is not allowed. Else send 3305 * tags may leak. 
3306 */ 3307 if (*st != NULL || (inp->inp_flags & INP_UNCONNECTED)) 3308 return (EINVAL); 3309 3310 error = m_snd_tag_alloc(ifp, ¶ms, st); 3311 #ifdef INET 3312 if (error == 0) { 3313 counter_u64_add(rate_limit_set_ok, 1); 3314 counter_u64_add(rate_limit_active, 1); 3315 } else if (error != EOPNOTSUPP) 3316 counter_u64_add(rate_limit_alloc_fail, 1); 3317 #endif 3318 return (error); 3319 } 3320 3321 void 3322 in_pcbdetach_tag(struct m_snd_tag *mst) 3323 { 3324 3325 m_snd_tag_rele(mst); 3326 #ifdef INET 3327 counter_u64_add(rate_limit_active, -1); 3328 #endif 3329 } 3330 3331 /* 3332 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3333 * if any: 3334 */ 3335 void 3336 in_pcbdetach_txrtlmt(struct inpcb *inp) 3337 { 3338 struct m_snd_tag *mst; 3339 3340 INP_WLOCK_ASSERT(inp); 3341 3342 mst = inp->inp_snd_tag; 3343 inp->inp_snd_tag = NULL; 3344 3345 if (mst == NULL) 3346 return; 3347 3348 m_snd_tag_rele(mst); 3349 #ifdef INET 3350 counter_u64_add(rate_limit_active, -1); 3351 #endif 3352 } 3353 3354 int 3355 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3356 { 3357 int error; 3358 3359 /* 3360 * If the existing send tag is for the wrong interface due to 3361 * a route change, first drop the existing tag. Set the 3362 * CHANGED flag so that we will keep trying to allocate a new 3363 * tag if we fail to allocate one this time. 3364 */ 3365 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3366 in_pcbdetach_txrtlmt(inp); 3367 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3368 } 3369 3370 /* 3371 * NOTE: When attaching to a network interface a reference is 3372 * made to ensure the network interface doesn't go away until 3373 * all ratelimit connections are gone. The network interface 3374 * pointers compared below represent valid network interfaces, 3375 * except when comparing towards NULL. 
3376 */ 3377 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3378 error = 0; 3379 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3380 if (inp->inp_snd_tag != NULL) 3381 in_pcbdetach_txrtlmt(inp); 3382 error = 0; 3383 } else if (inp->inp_snd_tag == NULL) { 3384 /* 3385 * In order to utilize packet pacing with RSS, we need 3386 * to wait until there is a valid RSS hash before we 3387 * can proceed: 3388 */ 3389 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3390 error = EAGAIN; 3391 } else { 3392 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3393 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3394 } 3395 } else { 3396 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3397 } 3398 if (error == 0 || error == EOPNOTSUPP) 3399 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3400 3401 return (error); 3402 } 3403 3404 /* 3405 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3406 * is set in the fast path and will attach/detach/modify the TX rate 3407 * limit send tag based on the socket's so_max_pacing_rate value. 3408 */ 3409 void 3410 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3411 { 3412 struct socket *socket; 3413 uint32_t max_pacing_rate; 3414 bool did_upgrade; 3415 3416 if (inp == NULL) 3417 return; 3418 3419 socket = inp->inp_socket; 3420 if (socket == NULL) 3421 return; 3422 3423 if (!INP_WLOCKED(inp)) { 3424 /* 3425 * NOTE: If the write locking fails, we need to bail 3426 * out and use the non-ratelimited ring for the 3427 * transmit until there is a new chance to get the 3428 * write lock. 3429 */ 3430 if (!INP_TRY_UPGRADE(inp)) 3431 return; 3432 did_upgrade = 1; 3433 } else { 3434 did_upgrade = 0; 3435 } 3436 3437 /* 3438 * NOTE: The so_max_pacing_rate value is read unlocked, 3439 * because atomic updates are not required since the variable 3440 * is checked at every mbuf we send. It is assumed that the 3441 * variable read itself will be atomic. 
3442 */ 3443 max_pacing_rate = socket->so_max_pacing_rate; 3444 3445 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3446 3447 if (did_upgrade) 3448 INP_DOWNGRADE(inp); 3449 } 3450 3451 /* 3452 * Track route changes for TX rate limiting. 3453 */ 3454 void 3455 in_pcboutput_eagain(struct inpcb *inp) 3456 { 3457 bool did_upgrade; 3458 3459 if (inp == NULL) 3460 return; 3461 3462 if (inp->inp_snd_tag == NULL) 3463 return; 3464 3465 if (!INP_WLOCKED(inp)) { 3466 /* 3467 * NOTE: If the write locking fails, we need to bail 3468 * out and use the non-ratelimited ring for the 3469 * transmit until there is a new chance to get the 3470 * write lock. 3471 */ 3472 if (!INP_TRY_UPGRADE(inp)) 3473 return; 3474 did_upgrade = 1; 3475 } else { 3476 did_upgrade = 0; 3477 } 3478 3479 /* detach rate limiting */ 3480 in_pcbdetach_txrtlmt(inp); 3481 3482 /* make sure new mbuf send tag allocation is made */ 3483 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3484 3485 if (did_upgrade) 3486 INP_DOWNGRADE(inp); 3487 } 3488 3489 #ifdef INET 3490 static void 3491 rl_init(void *st) 3492 { 3493 rate_limit_new = counter_u64_alloc(M_WAITOK); 3494 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3495 rate_limit_active = counter_u64_alloc(M_WAITOK); 3496 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3497 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3498 } 3499 3500 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3501 #endif 3502 #endif /* RATELIMIT */ 3503