1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org> 9 * All rights reserved. 10 * 11 * Portions of this software were developed by Robert N. M. Watson under 12 * contract to Juniper Networks, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
37 */ 38 39 #include <sys/cdefs.h> 40 #include "opt_ddb.h" 41 #include "opt_ipsec.h" 42 #include "opt_inet.h" 43 #include "opt_inet6.h" 44 #include "opt_ratelimit.h" 45 #include "opt_route.h" 46 #include "opt_rss.h" 47 48 #include <sys/param.h> 49 #include <sys/hash.h> 50 #include <sys/systm.h> 51 #include <sys/libkern.h> 52 #include <sys/lock.h> 53 #include <sys/malloc.h> 54 #include <sys/mbuf.h> 55 #include <sys/eventhandler.h> 56 #include <sys/domain.h> 57 #include <sys/proc.h> 58 #include <sys/protosw.h> 59 #include <sys/smp.h> 60 #include <sys/smr.h> 61 #include <sys/socket.h> 62 #include <sys/socketvar.h> 63 #include <sys/sockio.h> 64 #include <sys/priv.h> 65 #include <sys/proc.h> 66 #include <sys/refcount.h> 67 #include <sys/jail.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 71 #ifdef DDB 72 #include <ddb/ddb.h> 73 #endif 74 75 #include <vm/uma.h> 76 #include <vm/vm.h> 77 78 #include <net/if.h> 79 #include <net/if_var.h> 80 #include <net/if_private.h> 81 #include <net/if_types.h> 82 #include <net/if_llatbl.h> 83 #include <net/route.h> 84 #include <net/rss_config.h> 85 #include <net/vnet.h> 86 87 #if defined(INET) || defined(INET6) 88 #include <netinet/in.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/in_pcb_var.h> 91 #include <netinet/tcp.h> 92 #ifdef INET 93 #include <netinet/in_var.h> 94 #include <netinet/in_fib.h> 95 #endif 96 #include <netinet/ip_var.h> 97 #ifdef INET6 98 #include <netinet/ip6.h> 99 #include <netinet6/in6_pcb.h> 100 #include <netinet6/in6_var.h> 101 #include <netinet6/ip6_var.h> 102 #endif /* INET6 */ 103 #include <net/route/nhop.h> 104 #endif 105 106 #include <netipsec/ipsec_support.h> 107 108 #include <security/mac/mac_framework.h> 109 110 #define INPCBLBGROUP_SIZMIN 8 111 #define INPCBLBGROUP_SIZMAX 256 112 113 #define INP_FREED 0x00000200 /* Went through in_pcbfree(). */ 114 #define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. 
*/ 115 116 /* 117 * These configure the range of local port addresses assigned to 118 * "unspecified" outgoing connections/packets/whatever. 119 */ 120 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 121 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 122 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 123 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 124 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 125 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 126 127 /* 128 * Reserved ports accessible only to root. There are significant 129 * security considerations that must be accounted for when changing these, 130 * but the security benefits can be great. Please be careful. 131 */ 132 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 133 VNET_DEFINE(int, ipport_reservedlow); 134 135 /* Enable random ephemeral port allocation by default. 
*/ 136 VNET_DEFINE(int, ipport_randomized) = 1; 137 138 #ifdef INET 139 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 140 struct in_addr faddr, u_int fport_arg, 141 struct in_addr laddr, u_int lport_arg, 142 int lookupflags, uint8_t numa_domain, int fib); 143 144 #define RANGECHK(var, min, max) \ 145 if ((var) < (min)) { (var) = (min); } \ 146 else if ((var) > (max)) { (var) = (max); } 147 148 static int 149 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 150 { 151 int error; 152 153 error = sysctl_handle_int(oidp, arg1, arg2, req); 154 if (error == 0) { 155 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 156 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 157 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 158 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 159 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 160 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 161 } 162 return (error); 163 } 164 165 #undef RANGECHK 166 167 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 168 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 169 "IP Ports"); 170 171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 172 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 173 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 174 ""); 175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 176 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 177 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 178 ""); 179 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 180 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 181 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 182 ""); 183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 184 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 185 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 186 ""); 187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 
188 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 189 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 190 ""); 191 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 192 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 193 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 194 ""); 195 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 196 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 197 &VNET_NAME(ipport_reservedhigh), 0, ""); 198 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 199 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 200 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 201 CTLFLAG_VNET | CTLFLAG_RW, 202 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 203 204 #ifdef RATELIMIT 205 counter_u64_t rate_limit_new; 206 counter_u64_t rate_limit_chg; 207 counter_u64_t rate_limit_active; 208 counter_u64_t rate_limit_alloc_fail; 209 counter_u64_t rate_limit_set_ok; 210 211 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 212 "IP Rate Limiting"); 213 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 214 &rate_limit_active, "Active rate limited connections"); 215 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 216 &rate_limit_alloc_fail, "Rate limited connection failures"); 217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 218 &rate_limit_set_ok, "Rate limited setting succeeded"); 219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 220 &rate_limit_new, "Total Rate limit new attempts"); 221 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 222 &rate_limit_chg, "Total Rate limited change attempts"); 223 #endif /* RATELIMIT */ 224 225 #endif /* INET */ 226 227 VNET_DEFINE(uint32_t, in_pcbhashseed); 228 static void 229 in_pcbhashseed_init(void) 230 { 231 232 V_in_pcbhashseed = arc4random(); 233 } 234 
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, 235 in_pcbhashseed_init, NULL); 236 237 #ifdef INET 238 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 1; 239 #define V_connect_inaddr_wild VNET(connect_inaddr_wild) 240 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild, 241 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0, 242 "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)"); 243 #endif 244 245 static void in_pcbremhash(struct inpcb *); 246 247 /* 248 * in_pcb.c: manage the Protocol Control Blocks. 249 * 250 * NOTE: It is assumed that most of these functions will be called with 251 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 252 * functions often modify hash chains or addresses in pcbs. 253 */ 254 255 static struct inpcblbgroup * 256 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port, 257 const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib) 258 { 259 struct inpcblbgroup *grp; 260 size_t bytes; 261 262 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 263 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 264 if (grp == NULL) 265 return (NULL); 266 LIST_INIT(&grp->il_pending); 267 grp->il_cred = crhold(cred); 268 grp->il_vflag = vflag; 269 grp->il_lport = port; 270 grp->il_numa_domain = numa_domain; 271 grp->il_fibnum = fib; 272 grp->il_dependladdr = *addr; 273 grp->il_inpsiz = size; 274 return (grp); 275 } 276 277 static void 278 in_pcblbgroup_free_deferred(epoch_context_t ctx) 279 { 280 struct inpcblbgroup *grp; 281 282 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 283 crfree(grp->il_cred); 284 free(grp, M_PCB); 285 } 286 287 static void 288 in_pcblbgroup_free(struct inpcblbgroup *grp) 289 { 290 KASSERT(LIST_EMPTY(&grp->il_pending), 291 ("local group %p still has pending inps", grp)); 292 293 CK_LIST_REMOVE(grp, il_list); 294 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 295 } 296 297 static struct 
inpcblbgroup * 298 in_pcblbgroup_find(struct inpcb *inp) 299 { 300 struct inpcbinfo *pcbinfo; 301 struct inpcblbgroup *grp; 302 struct inpcblbgrouphead *hdr; 303 304 INP_LOCK_ASSERT(inp); 305 306 pcbinfo = inp->inp_pcbinfo; 307 INP_HASH_LOCK_ASSERT(pcbinfo); 308 309 hdr = &pcbinfo->ipi_lbgrouphashbase[ 310 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 311 CK_LIST_FOREACH(grp, hdr, il_list) { 312 struct inpcb *inp1; 313 314 for (unsigned int i = 0; i < grp->il_inpcnt; i++) { 315 if (inp == grp->il_inp[i]) 316 goto found; 317 } 318 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { 319 if (inp == inp1) 320 goto found; 321 } 322 } 323 found: 324 return (grp); 325 } 326 327 static void 328 in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp) 329 { 330 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 331 ("invalid local group size %d and count %d", grp->il_inpsiz, 332 grp->il_inpcnt)); 333 INP_WLOCK_ASSERT(inp); 334 335 if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp && 336 !SOLISTENING(inp->inp_socket)) { 337 /* 338 * If this is a TCP socket, it should not be visible to lbgroup 339 * lookups until listen() has been called. 340 */ 341 LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list); 342 } else { 343 grp->il_inp[grp->il_inpcnt] = inp; 344 345 /* 346 * Synchronize with in_pcblookup_lbgroup(): make sure that we 347 * don't expose a null slot to the lookup path. 
348 */ 349 atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1); 350 } 351 352 inp->inp_flags |= INP_INLBGROUP; 353 } 354 355 static struct inpcblbgroup * 356 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 357 struct inpcblbgroup *old_grp, int size) 358 { 359 struct inpcblbgroup *grp; 360 int i; 361 362 grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag, 363 old_grp->il_lport, &old_grp->il_dependladdr, size, 364 old_grp->il_numa_domain, old_grp->il_fibnum); 365 if (grp == NULL) 366 return (NULL); 367 368 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 369 ("invalid new local group size %d and old local group count %d", 370 grp->il_inpsiz, old_grp->il_inpcnt)); 371 372 for (i = 0; i < old_grp->il_inpcnt; ++i) 373 grp->il_inp[i] = old_grp->il_inp[i]; 374 grp->il_inpcnt = old_grp->il_inpcnt; 375 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 376 LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb, 377 inp_lbgroup_list); 378 in_pcblbgroup_free(old_grp); 379 return (grp); 380 } 381 382 /* 383 * Add PCB to load balance group for SO_REUSEPORT_LB option. 384 */ 385 static int 386 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) 387 { 388 const static struct timeval interval = { 60, 0 }; 389 static struct timeval lastprint; 390 struct inpcbinfo *pcbinfo; 391 struct inpcblbgrouphead *hdr; 392 struct inpcblbgroup *grp; 393 uint32_t idx; 394 int fib; 395 396 pcbinfo = inp->inp_pcbinfo; 397 398 INP_WLOCK_ASSERT(inp); 399 INP_HASH_WLOCK_ASSERT(pcbinfo); 400 401 fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ? 402 inp->inp_inc.inc_fibnum : RT_ALL_FIBS; 403 404 #ifdef INET6 405 /* 406 * Don't allow IPv4 mapped INET6 wild socket. 
407 */ 408 if ((inp->inp_vflag & INP_IPV4) && 409 inp->inp_laddr.s_addr == INADDR_ANY && 410 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 411 return (0); 412 } 413 #endif 414 415 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); 416 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 417 CK_LIST_FOREACH(grp, hdr, il_list) { 418 if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && 419 grp->il_vflag == inp->inp_vflag && 420 grp->il_lport == inp->inp_lport && 421 grp->il_numa_domain == numa_domain && 422 grp->il_fibnum == fib && 423 memcmp(&grp->il_dependladdr, 424 &inp->inp_inc.inc_ie.ie_dependladdr, 425 sizeof(grp->il_dependladdr)) == 0) { 426 break; 427 } 428 } 429 if (grp == NULL) { 430 /* Create new load balance group. */ 431 grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag, 432 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 433 INPCBLBGROUP_SIZMIN, numa_domain, fib); 434 if (grp == NULL) 435 return (ENOBUFS); 436 in_pcblbgroup_insert(grp, inp); 437 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 438 } else if (grp->il_inpcnt == grp->il_inpsiz) { 439 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 440 if (ratecheck(&lastprint, &interval)) 441 printf("lb group port %d, limit reached\n", 442 ntohs(grp->il_lport)); 443 return (0); 444 } 445 446 /* Expand this local group. */ 447 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 448 if (grp == NULL) 449 return (ENOBUFS); 450 in_pcblbgroup_insert(grp, inp); 451 } else { 452 in_pcblbgroup_insert(grp, inp); 453 } 454 return (0); 455 } 456 457 /* 458 * Remove PCB from load balance group. 
459 */ 460 static void 461 in_pcbremlbgrouphash(struct inpcb *inp) 462 { 463 struct inpcbinfo *pcbinfo; 464 struct inpcblbgrouphead *hdr; 465 struct inpcblbgroup *grp; 466 struct inpcb *inp1; 467 int i; 468 469 pcbinfo = inp->inp_pcbinfo; 470 471 INP_WLOCK_ASSERT(inp); 472 MPASS(inp->inp_flags & INP_INLBGROUP); 473 INP_HASH_WLOCK_ASSERT(pcbinfo); 474 475 hdr = &pcbinfo->ipi_lbgrouphashbase[ 476 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 477 CK_LIST_FOREACH(grp, hdr, il_list) { 478 for (i = 0; i < grp->il_inpcnt; ++i) { 479 if (grp->il_inp[i] != inp) 480 continue; 481 482 if (grp->il_inpcnt == 1 && 483 LIST_EMPTY(&grp->il_pending)) { 484 /* We are the last, free this local group. */ 485 in_pcblbgroup_free(grp); 486 } else { 487 grp->il_inp[i] = 488 grp->il_inp[grp->il_inpcnt - 1]; 489 490 /* 491 * Synchronize with in_pcblookup_lbgroup(). 492 */ 493 atomic_store_rel_int(&grp->il_inpcnt, 494 grp->il_inpcnt - 1); 495 } 496 inp->inp_flags &= ~INP_INLBGROUP; 497 return; 498 } 499 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { 500 if (inp == inp1) { 501 LIST_REMOVE(inp, inp_lbgroup_list); 502 inp->inp_flags &= ~INP_INLBGROUP; 503 return; 504 } 505 } 506 } 507 __assert_unreachable(); 508 } 509 510 int 511 in_pcblbgroup_numa(struct inpcb *inp, int arg) 512 { 513 struct inpcbinfo *pcbinfo; 514 int error; 515 uint8_t numa_domain; 516 517 switch (arg) { 518 case TCP_REUSPORT_LB_NUMA_NODOM: 519 numa_domain = M_NODOM; 520 break; 521 case TCP_REUSPORT_LB_NUMA_CURDOM: 522 numa_domain = PCPU_GET(domain); 523 break; 524 default: 525 if (arg < 0 || arg >= vm_ndomains) 526 return (EINVAL); 527 numa_domain = arg; 528 } 529 530 pcbinfo = inp->inp_pcbinfo; 531 INP_WLOCK_ASSERT(inp); 532 INP_HASH_WLOCK(pcbinfo); 533 if (in_pcblbgroup_find(inp) != NULL) { 534 /* Remove it from the old group. */ 535 in_pcbremlbgrouphash(inp); 536 /* Add it to the new group based on numa domain. 
*/ 537 in_pcbinslbgrouphash(inp, numa_domain); 538 error = 0; 539 } else { 540 error = ENOENT; 541 } 542 INP_HASH_WUNLOCK(pcbinfo); 543 return (error); 544 } 545 546 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 547 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 548 549 /* 550 * Initialize an inpcbinfo - a per-VNET instance of connections db. 551 */ 552 void 553 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 554 u_int hash_nelements, u_int porthash_nelements) 555 { 556 557 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); 558 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 559 NULL, MTX_DEF); 560 #ifdef VIMAGE 561 pcbinfo->ipi_vnet = curvnet; 562 #endif 563 CK_LIST_INIT(&pcbinfo->ipi_listhead); 564 pcbinfo->ipi_count = 0; 565 pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB, 566 &pcbinfo->ipi_hashmask); 567 pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB, 568 &pcbinfo->ipi_hashmask); 569 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 570 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 571 &pcbinfo->ipi_porthashmask); 572 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 573 &pcbinfo->ipi_lbgrouphashmask); 574 pcbinfo->ipi_zone = pcbstor->ips_zone; 575 pcbinfo->ipi_portzone = pcbstor->ips_portzone; 576 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 577 } 578 579 /* 580 * Destroy an inpcbinfo. 
581 */ 582 void 583 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 584 { 585 586 KASSERT(pcbinfo->ipi_count == 0, 587 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 588 589 hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask); 590 hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask); 591 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 592 pcbinfo->ipi_porthashmask); 593 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 594 pcbinfo->ipi_lbgrouphashmask); 595 mtx_destroy(&pcbinfo->ipi_hash_lock); 596 mtx_destroy(&pcbinfo->ipi_lock); 597 } 598 599 /* 600 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 601 */ 602 static void inpcb_fini(void *, int); 603 void 604 in_pcbstorage_init(void *arg) 605 { 606 struct inpcbstorage *pcbstor = arg; 607 608 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 609 pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit, 610 inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); 611 pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, 612 sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 613 uma_zone_set_smr(pcbstor->ips_portzone, 614 uma_zone_get_smr(pcbstor->ips_zone)); 615 } 616 617 /* 618 * Destroy a pcbstorage - used by unloadable protocols. 619 */ 620 void 621 in_pcbstorage_destroy(void *arg) 622 { 623 struct inpcbstorage *pcbstor = arg; 624 625 uma_zdestroy(pcbstor->ips_zone); 626 uma_zdestroy(pcbstor->ips_portzone); 627 } 628 629 /* 630 * Allocate a PCB and associate it with the socket. 631 * On success return with the PCB locked. 
632 */ 633 int 634 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 635 { 636 struct inpcb *inp; 637 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 638 int error; 639 #endif 640 641 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 642 if (inp == NULL) 643 return (ENOBUFS); 644 bzero(&inp->inp_start_zero, inp_zero_size); 645 #ifdef NUMA 646 inp->inp_numa_domain = M_NODOM; 647 #endif 648 inp->inp_pcbinfo = pcbinfo; 649 inp->inp_socket = so; 650 inp->inp_cred = crhold(so->so_cred); 651 inp->inp_inc.inc_fibnum = so->so_fibnum; 652 #ifdef MAC 653 error = mac_inpcb_init(inp, M_NOWAIT); 654 if (error != 0) 655 goto out; 656 mac_inpcb_create(so, inp); 657 #endif 658 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 659 error = ipsec_init_pcbpolicy(inp); 660 if (error != 0) { 661 #ifdef MAC 662 mac_inpcb_destroy(inp); 663 #endif 664 goto out; 665 } 666 #endif /*IPSEC*/ 667 #ifdef INET6 668 if (INP_SOCKAF(so) == AF_INET6) { 669 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; 670 if (V_ip6_v6only) 671 inp->inp_flags |= IN6P_IPV6_V6ONLY; 672 #ifdef INET 673 else 674 inp->inp_vflag |= INP_IPV4; 675 #endif 676 if (V_ip6_auto_flowlabel) 677 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 678 inp->in6p_hops = -1; /* use kernel default */ 679 } 680 #endif 681 #if defined(INET) && defined(INET6) 682 else 683 #endif 684 #ifdef INET 685 inp->inp_vflag |= INP_IPV4; 686 #endif 687 inp->inp_smr = SMR_SEQ_INVALID; 688 689 /* 690 * Routes in inpcb's can cache L2 as well; they are guaranteed 691 * to be cleaned up. 692 */ 693 inp->inp_route.ro_flags = RT_LLE_CACHE; 694 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. 
*/ 695 INP_WLOCK(inp); 696 INP_INFO_WLOCK(pcbinfo); 697 pcbinfo->ipi_count++; 698 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 699 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); 700 INP_INFO_WUNLOCK(pcbinfo); 701 so->so_pcb = inp; 702 703 return (0); 704 705 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 706 out: 707 crfree(inp->inp_cred); 708 #ifdef INVARIANTS 709 inp->inp_cred = NULL; 710 #endif 711 uma_zfree_smr(pcbinfo->ipi_zone, inp); 712 return (error); 713 #endif 714 } 715 716 #ifdef INET 717 int 718 in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags, 719 struct ucred *cred) 720 { 721 int anonport, error; 722 723 KASSERT(sin == NULL || sin->sin_family == AF_INET, 724 ("%s: invalid address family for %p", __func__, sin)); 725 KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in), 726 ("%s: invalid address length for %p", __func__, sin)); 727 INP_WLOCK_ASSERT(inp); 728 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 729 730 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 731 return (EINVAL); 732 anonport = sin == NULL || sin->sin_port == 0; 733 error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr, 734 &inp->inp_lport, flags, cred); 735 if (error) 736 return (error); 737 if (in_pcbinshash(inp) != 0) { 738 inp->inp_laddr.s_addr = INADDR_ANY; 739 inp->inp_lport = 0; 740 inp->inp_flags &= ~INP_BOUNDFIB; 741 return (EAGAIN); 742 } 743 if (anonport) 744 inp->inp_flags |= INP_ANONPORT; 745 return (0); 746 } 747 #endif 748 749 #if defined(INET) || defined(INET6) 750 /* 751 * Assign a local port like in_pcb_lport(), but also used with connect() 752 * and a foreign address and port. If fsa is non-NULL, choose a local port 753 * that is unused with those, otherwise one that is completely unused. 754 * lsa can be NULL for IPv6. 
755 */ 756 int 757 in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa, 758 u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, 759 int lookupflags) 760 { 761 struct inpcbinfo *pcbinfo; 762 struct inpcb *tmpinp; 763 unsigned short *lastport; 764 int count, error; 765 u_short aux, first, last, lport; 766 #ifdef INET 767 struct in_addr laddr, faddr; 768 #endif 769 #ifdef INET6 770 struct in6_addr *laddr6, *faddr6; 771 #endif 772 773 pcbinfo = inp->inp_pcbinfo; 774 775 /* 776 * Because no actual state changes occur here, a global write lock on 777 * the pcbinfo isn't required. 778 */ 779 INP_LOCK_ASSERT(inp); 780 INP_HASH_LOCK_ASSERT(pcbinfo); 781 782 if (inp->inp_flags & INP_HIGHPORT) { 783 first = V_ipport_hifirstauto; /* sysctl */ 784 last = V_ipport_hilastauto; 785 lastport = &pcbinfo->ipi_lasthi; 786 } else if (inp->inp_flags & INP_LOWPORT) { 787 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 788 if (error) 789 return (error); 790 first = V_ipport_lowfirstauto; /* 1023 */ 791 last = V_ipport_lowlastauto; /* 600 */ 792 lastport = &pcbinfo->ipi_lastlow; 793 } else { 794 first = V_ipport_firstauto; /* sysctl */ 795 last = V_ipport_lastauto; 796 lastport = &pcbinfo->ipi_lastport; 797 } 798 799 /* 800 * Instead of having two loops further down counting up or down 801 * make sure that first is always <= last and go with only one 802 * code path implementing all logic. 
803 */ 804 if (first > last) { 805 aux = first; 806 first = last; 807 last = aux; 808 } 809 810 #ifdef INET 811 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ 812 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 813 if (lsa != NULL) 814 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 815 if (fsa != NULL) 816 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 817 } 818 #endif 819 #ifdef INET6 820 laddr6 = NULL; 821 if ((inp->inp_vflag & INP_IPV6) != 0) { 822 if (lsa != NULL) 823 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 824 if (fsa != NULL) 825 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 826 } 827 #endif 828 829 tmpinp = NULL; 830 lport = *lportp; 831 832 if (V_ipport_randomized) 833 *lastport = first + (arc4random() % (last - first)); 834 835 count = last - first; 836 837 do { 838 if (count-- < 0) /* completely used? */ 839 return (EADDRNOTAVAIL); 840 ++*lastport; 841 if (*lastport < first || *lastport > last) 842 *lastport = first; 843 lport = htons(*lastport); 844 845 if (fsa != NULL) { 846 #ifdef INET 847 if (lsa->sa_family == AF_INET) { 848 tmpinp = in_pcblookup_hash_locked(pcbinfo, 849 faddr, fport, laddr, lport, lookupflags, 850 M_NODOM, RT_ALL_FIBS); 851 } 852 #endif 853 #ifdef INET6 854 if (lsa->sa_family == AF_INET6) { 855 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 856 faddr6, fport, laddr6, lport, lookupflags, 857 M_NODOM, RT_ALL_FIBS); 858 } 859 #endif 860 } else { 861 #ifdef INET6 862 if ((inp->inp_vflag & INP_IPV6) != 0) { 863 tmpinp = in6_pcblookup_local(pcbinfo, 864 &inp->in6p_laddr, lport, RT_ALL_FIBS, 865 lookupflags, cred); 866 #ifdef INET 867 if (tmpinp == NULL && 868 (inp->inp_vflag & INP_IPV4)) 869 tmpinp = in_pcblookup_local(pcbinfo, 870 laddr, lport, RT_ALL_FIBS, 871 lookupflags, cred); 872 #endif 873 } 874 #endif 875 #if defined(INET) && defined(INET6) 876 else 877 #endif 878 #ifdef INET 879 tmpinp = in_pcblookup_local(pcbinfo, laddr, 880 lport, RT_ALL_FIBS, lookupflags, cred); 881 #endif 882 } 883 } while 
(tmpinp != NULL); 884 885 *lportp = lport; 886 887 return (0); 888 } 889 890 /* 891 * Select a local port (number) to use. 892 */ 893 int 894 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 895 struct ucred *cred, int lookupflags) 896 { 897 struct sockaddr_in laddr; 898 899 if (laddrp) { 900 bzero(&laddr, sizeof(laddr)); 901 laddr.sin_family = AF_INET; 902 laddr.sin_addr = *laddrp; 903 } 904 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 905 NULL, lportp, NULL, 0, cred, lookupflags)); 906 } 907 #endif /* INET || INET6 */ 908 909 #ifdef INET 910 /* 911 * Determine whether the inpcb can be bound to the specified address/port tuple. 912 */ 913 static int 914 in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr, 915 const u_short lport, const int fib, int sooptions, int lookupflags, 916 struct ucred *cred) 917 { 918 int reuseport, reuseport_lb; 919 920 INP_LOCK_ASSERT(inp); 921 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 922 923 reuseport = (sooptions & SO_REUSEPORT); 924 reuseport_lb = (sooptions & SO_REUSEPORT_LB); 925 926 if (IN_MULTICAST(ntohl(laddr.s_addr))) { 927 /* 928 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 929 * allow complete duplication of binding if 930 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 931 * and a multicast address is bound on both 932 * new and duplicated sockets. 933 */ 934 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0) 935 reuseport = SO_REUSEADDR | SO_REUSEPORT; 936 /* 937 * XXX: How to deal with SO_REUSEPORT_LB here? 938 * Treat same as SO_REUSEPORT for now. 939 */ 940 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0) 941 reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB; 942 } else if (!in_nullhost(laddr)) { 943 struct sockaddr_in sin; 944 945 memset(&sin, 0, sizeof(sin)); 946 sin.sin_family = AF_INET; 947 sin.sin_len = sizeof(sin); 948 sin.sin_addr = laddr; 949 950 /* 951 * Is the address a local IP address? 
952 * If INP_BINDANY is set, then the socket may be bound 953 * to any endpoint address, local or not. 954 */ 955 if ((inp->inp_flags & INP_BINDANY) == 0 && 956 ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0) 957 return (EADDRNOTAVAIL); 958 } 959 960 if (lport != 0) { 961 struct inpcb *t; 962 963 if (ntohs(lport) <= V_ipport_reservedhigh && 964 ntohs(lport) >= V_ipport_reservedlow && 965 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 966 return (EACCES); 967 968 if (!IN_MULTICAST(ntohl(laddr.s_addr)) && 969 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 970 /* 971 * If a socket owned by a different user is already 972 * bound to this port, fail. In particular, SO_REUSE* 973 * can only be used to share a port among sockets owned 974 * by the same user. 975 * 976 * However, we can share a port with a connected socket 977 * which has a unique 4-tuple. 978 */ 979 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, 980 RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred); 981 if (t != NULL && 982 (inp->inp_socket->so_type != SOCK_STREAM || 983 in_nullhost(t->inp_faddr)) && 984 (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) 985 return (EADDRINUSE); 986 } 987 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib, 988 lookupflags, cred); 989 if (t != NULL && ((reuseport | reuseport_lb) & 990 t->inp_socket->so_options) == 0) { 991 #ifdef INET6 992 if (!in_nullhost(laddr) || 993 !in_nullhost(t->inp_laddr) || 994 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 995 (t->inp_vflag & INP_IPV6PROTO) == 0) 996 #endif 997 return (EADDRINUSE); 998 } 999 } 1000 return (0); 1001 } 1002 1003 /* 1004 * Set up a bind operation on a PCB, performing port allocation 1005 * as required, but do not actually modify the PCB. Callers can 1006 * either complete the bind by setting inp_laddr/inp_lport and 1007 * calling in_pcbinshash(), or they can just use the resulting 1008 * port and address to authorise the sending of a once-off packet. 
1009 * 1010 * On error, the values of *laddrp and *lportp are not changed. 1011 */ 1012 int 1013 in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, 1014 u_short *lportp, int flags, struct ucred *cred) 1015 { 1016 struct socket *so = inp->inp_socket; 1017 struct in_addr laddr; 1018 u_short lport = 0; 1019 int error, fib, lookupflags, sooptions; 1020 1021 /* 1022 * No state changes, so read locks are sufficient here. 1023 */ 1024 INP_LOCK_ASSERT(inp); 1025 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 1026 1027 laddr.s_addr = *laddrp; 1028 if (sin != NULL && laddr.s_addr != INADDR_ANY) 1029 return (EINVAL); 1030 1031 lookupflags = 0; 1032 sooptions = atomic_load_int(&so->so_options); 1033 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0) 1034 lookupflags = INPLOOKUP_WILDCARD; 1035 if (sin == NULL) { 1036 if ((error = prison_local_ip4(cred, &laddr)) != 0) 1037 return (error); 1038 } else { 1039 KASSERT(sin->sin_family == AF_INET, 1040 ("%s: invalid family for address %p", __func__, sin)); 1041 KASSERT(sin->sin_len == sizeof(*sin), 1042 ("%s: invalid length for address %p", __func__, sin)); 1043 1044 error = prison_local_ip4(cred, &sin->sin_addr); 1045 if (error) 1046 return (error); 1047 if (sin->sin_port != *lportp) { 1048 /* Don't allow the port to change. */ 1049 if (*lportp != 0) 1050 return (EINVAL); 1051 lport = sin->sin_port; 1052 } 1053 laddr = sin->sin_addr; 1054 1055 fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum : 1056 RT_ALL_FIBS; 1057 1058 /* See if this address/port combo is available. 
*/ 1059 error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions, 1060 lookupflags, cred); 1061 if (error != 0) 1062 return (error); 1063 } 1064 if (*lportp != 0) 1065 lport = *lportp; 1066 if (lport == 0) { 1067 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); 1068 if (error != 0) 1069 return (error); 1070 } 1071 *laddrp = laddr.s_addr; 1072 *lportp = lport; 1073 if ((flags & INPBIND_FIB) != 0) 1074 inp->inp_flags |= INP_BOUNDFIB; 1075 return (0); 1076 } 1077 1078 /* 1079 * Connect from a socket to a specified address. 1080 * Both address and port must be specified in argument sin. 1081 * If don't have a local address for this socket yet, 1082 * then pick one. 1083 */ 1084 int 1085 in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) 1086 { 1087 u_short lport, fport; 1088 in_addr_t laddr, faddr; 1089 int anonport, error; 1090 1091 INP_WLOCK_ASSERT(inp); 1092 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1093 KASSERT(in_nullhost(inp->inp_faddr), 1094 ("%s: inp is already connected", __func__)); 1095 1096 lport = inp->inp_lport; 1097 laddr = inp->inp_laddr.s_addr; 1098 anonport = (lport == 0); 1099 error = in_pcbconnect_setup(inp, sin, &laddr, &lport, &faddr, &fport, 1100 cred); 1101 if (error) 1102 return (error); 1103 1104 inp->inp_faddr.s_addr = faddr; 1105 inp->inp_fport = fport; 1106 1107 /* Do the initial binding of the local address if required. 
*/ 1108 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { 1109 inp->inp_lport = lport; 1110 inp->inp_laddr.s_addr = laddr; 1111 if (in_pcbinshash(inp) != 0) { 1112 inp->inp_laddr.s_addr = inp->inp_faddr.s_addr = 1113 INADDR_ANY; 1114 inp->inp_lport = inp->inp_fport = 0; 1115 return (EAGAIN); 1116 } 1117 } else { 1118 inp->inp_lport = lport; 1119 inp->inp_laddr.s_addr = laddr; 1120 if ((inp->inp_flags & INP_INHASHLIST) != 0) 1121 in_pcbrehash(inp); 1122 else 1123 in_pcbinshash(inp); 1124 } 1125 #ifdef ROUTE_MPATH 1126 if (CALC_FLOWID_OUTBOUND) { 1127 uint32_t hash_val, hash_type; 1128 1129 hash_val = fib4_calc_software_hash(inp->inp_laddr, 1130 inp->inp_faddr, 0, fport, 1131 inp->inp_socket->so_proto->pr_protocol, &hash_type); 1132 1133 inp->inp_flowid = hash_val; 1134 inp->inp_flowtype = hash_type; 1135 } 1136 #endif 1137 if (anonport) 1138 inp->inp_flags |= INP_ANONPORT; 1139 return (0); 1140 } 1141 1142 /* 1143 * Do proper source address selection on an unbound socket in case 1144 * of connect. Take jails into account as well. 1145 */ 1146 int 1147 in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr, 1148 struct in_addr *laddr, struct ucred *cred) 1149 { 1150 struct ifaddr *ifa; 1151 struct sockaddr *sa; 1152 struct sockaddr_in *sin, dst; 1153 struct nhop_object *nh; 1154 int error; 1155 1156 NET_EPOCH_ASSERT(); 1157 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1158 1159 /* 1160 * Bypass source address selection and use the primary jail IP 1161 * if requested. 1162 */ 1163 if (!prison_saddrsel_ip4(cred, laddr)) 1164 return (0); 1165 1166 error = 0; 1167 1168 nh = NULL; 1169 bzero(&dst, sizeof(dst)); 1170 sin = &dst; 1171 sin->sin_family = AF_INET; 1172 sin->sin_len = sizeof(struct sockaddr_in); 1173 sin->sin_addr.s_addr = faddr->s_addr; 1174 1175 /* 1176 * If route is known our src addr is taken from the i/f, 1177 * else punt. 1178 * 1179 * Find out route to destination. 
1180 */ 1181 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1182 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1183 0, NHR_NONE, 0); 1184 1185 /* 1186 * If we found a route, use the address corresponding to 1187 * the outgoing interface. 1188 * 1189 * Otherwise assume faddr is reachable on a directly connected 1190 * network and try to find a corresponding interface to take 1191 * the source address from. 1192 */ 1193 if (nh == NULL || nh->nh_ifp == NULL) { 1194 struct in_ifaddr *ia; 1195 struct ifnet *ifp; 1196 1197 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1198 inp->inp_socket->so_fibnum)); 1199 if (ia == NULL) { 1200 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1201 inp->inp_socket->so_fibnum)); 1202 } 1203 if (ia == NULL) { 1204 error = ENETUNREACH; 1205 goto done; 1206 } 1207 1208 if (!prison_flag(cred, PR_IP4)) { 1209 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1210 goto done; 1211 } 1212 1213 ifp = ia->ia_ifp; 1214 ia = NULL; 1215 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1216 sa = ifa->ifa_addr; 1217 if (sa->sa_family != AF_INET) 1218 continue; 1219 sin = (struct sockaddr_in *)sa; 1220 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1221 ia = (struct in_ifaddr *)ifa; 1222 break; 1223 } 1224 } 1225 if (ia != NULL) { 1226 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1227 goto done; 1228 } 1229 1230 /* 3. As a last resort return the 'default' jail address. */ 1231 error = prison_get_ip4(cred, laddr); 1232 goto done; 1233 } 1234 1235 /* 1236 * If the outgoing interface on the route found is not 1237 * a loopback interface, use the address from that interface. 1238 * In case of jails do those three steps: 1239 * 1. check if the interface address belongs to the jail. If so use it. 1240 * 2. check if we have any address on the outgoing interface 1241 * belonging to this jail. If so use it. 1242 * 3. as a last resort return the 'default' jail address. 
1243 */ 1244 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1245 struct in_ifaddr *ia; 1246 struct ifnet *ifp; 1247 1248 /* If not jailed, use the default returned. */ 1249 if (!prison_flag(cred, PR_IP4)) { 1250 ia = (struct in_ifaddr *)nh->nh_ifa; 1251 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1252 goto done; 1253 } 1254 1255 /* Jailed. */ 1256 /* 1. Check if the iface address belongs to the jail. */ 1257 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1258 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1259 ia = (struct in_ifaddr *)nh->nh_ifa; 1260 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1261 goto done; 1262 } 1263 1264 /* 1265 * 2. Check if we have any address on the outgoing interface 1266 * belonging to this jail. 1267 */ 1268 ia = NULL; 1269 ifp = nh->nh_ifp; 1270 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1271 sa = ifa->ifa_addr; 1272 if (sa->sa_family != AF_INET) 1273 continue; 1274 sin = (struct sockaddr_in *)sa; 1275 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1276 ia = (struct in_ifaddr *)ifa; 1277 break; 1278 } 1279 } 1280 if (ia != NULL) { 1281 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1282 goto done; 1283 } 1284 1285 /* 3. As a last resort return the 'default' jail address. */ 1286 error = prison_get_ip4(cred, laddr); 1287 goto done; 1288 } 1289 1290 /* 1291 * The outgoing interface is marked with 'loopback net', so a route 1292 * to ourselves is here. 1293 * Try to find the interface of the destination address and then 1294 * take the address from there. That interface is not necessarily 1295 * a loopback interface. 1296 * In case of jails, check that it is an address of the jail 1297 * and if we cannot find, fall back to the 'default' jail address. 
1298 */ 1299 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1300 struct in_ifaddr *ia; 1301 1302 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1303 inp->inp_socket->so_fibnum)); 1304 if (ia == NULL) 1305 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1306 inp->inp_socket->so_fibnum)); 1307 if (ia == NULL) 1308 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1309 1310 if (!prison_flag(cred, PR_IP4)) { 1311 if (ia == NULL) { 1312 error = ENETUNREACH; 1313 goto done; 1314 } 1315 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1316 goto done; 1317 } 1318 1319 /* Jailed. */ 1320 if (ia != NULL) { 1321 struct ifnet *ifp; 1322 1323 ifp = ia->ia_ifp; 1324 ia = NULL; 1325 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1326 sa = ifa->ifa_addr; 1327 if (sa->sa_family != AF_INET) 1328 continue; 1329 sin = (struct sockaddr_in *)sa; 1330 if (prison_check_ip4(cred, 1331 &sin->sin_addr) == 0) { 1332 ia = (struct in_ifaddr *)ifa; 1333 break; 1334 } 1335 } 1336 if (ia != NULL) { 1337 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1338 goto done; 1339 } 1340 } 1341 1342 /* 3. As a last resort return the 'default' jail address. */ 1343 error = prison_get_ip4(cred, laddr); 1344 goto done; 1345 } 1346 1347 done: 1348 if (error == 0 && laddr->s_addr == INADDR_ANY) 1349 return (EHOSTUNREACH); 1350 return (error); 1351 } 1352 1353 /* 1354 * Set up for a connect from a socket to the specified address. 1355 * On entry, *laddrp and *lportp should contain the current local 1356 * address and port for the PCB; these are updated to the values 1357 * that should be placed in inp_laddr and inp_lport to complete 1358 * the connect. 1359 * 1360 * On success, *faddrp and *fportp will be set to the remote address 1361 * and port. These are not updated in the error case. 
 */
int
in_pcbconnect_setup(const struct inpcb *inp, struct sockaddr_in *sin,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct ucred *cred)
{
	struct in_ifaddr *ia;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	/* A destination port is mandatory. */
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	/* Work on local copies; outputs are only written on success. */
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
	if (V_connect_inaddr_wild && !CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		}
	} else if (faddr.s_addr == INADDR_ANY) {
		return (ENETUNREACH);
	}
	if (laddr.s_addr == INADDR_ANY) {
		/*
		 * The error is not returned right away: the multicast
		 * handling below may still replace both laddr and error.
		 */
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				/*
				 * Pick the first address on that interface
				 * which passes the jail check.
				 */
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp &&
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0)
						break;
				}
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
			}
		}
		if (error)
			return (error);
	}

	if (lport != 0) {
		/* Local port is fixed: the full 4-tuple must be unused. */
		if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
		    fport, laddr, lport, 0, M_NODOM, RT_ALL_FIBS) != NULL)
			return (EADDRINUSE);
	} else {
		struct sockaddr_in lsin, fsin;

		/* No local port yet: have one picked for this destination. */
		bzero(&lsin, sizeof(lsin));
		bzero(&fsin, sizeof(fsin));
		lsin.sin_family = AF_INET;
		lsin.sin_addr = laddr;
		fsin.sin_family = AF_INET;
		fsin.sin_addr = faddr;
		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
		    &lport, (struct sockaddr *)& fsin, fport, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}

/*
 * Disconnect an inpcb: remove it from the hash and clear its local address,
 * foreign address and foreign port.  The local port is left as is.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	in_pcbremhash_locked(inp);

	/* See the comment in in_pcbinshash(). */
	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
}
#endif /* INET */

/*
 * For an inpcb flagged INP_INLBGROUP, look up its load-balancing group,
 * unlink the inpcb from the inp_lbgroup_list it is on and hand it to
 * in_pcblbgroup_insert().  All of this is done under the hash write lock.
 * Nothing is done for other inpcbs.
 */
void
in_pcblisten(struct inpcb *inp)
{
	struct inpcblbgroup *grp;

	INP_WLOCK_ASSERT(inp);

	if ((inp->inp_flags & INP_INLBGROUP) != 0) {
		struct inpcbinfo *pcbinfo;

		pcbinfo = inp->inp_pcbinfo;
		INP_HASH_WLOCK(pcbinfo);
		grp = in_pcblbgroup_find(inp);
		LIST_REMOVE(inp, inp_lbgroup_list);
		in_pcblbgroup_insert(grp, inp);
		INP_HASH_WUNLOCK(pcbinfo);
	}
}

/*
 * inpcb hash lookups are protected by SMR section.
 *
 * Once desired pcb has been found, switching from SMR section to a pcb
 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
 * here because SMR is a critical section.
 * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
 */

/*
 * Acquire the inpcb lock: a read lock for INPLOOKUP_RLOCKPCB, a write lock
 * for any other value of 'lock'.
 */
void
inp_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
}

/*
 * Release the inpcb lock; 'lock' selects read vs. write unlock the same way
 * as in inp_lock().
 */
void
inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
}

/*
 * Non-blocking variant of inp_lock().  Returns the result of
 * rw_try_rlock() or rw_try_wlock().
 */
int
inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
}

/*
 * Switch from the SMR section to the inpcb lock.
 *
 * Must be called inside the SMR section of the inpcb's pcbinfo.  The SMR
 * section is exited on every return path.  Returns true with the inpcb
 * locked as requested, or false with no lock held when the inpcb has any of
 * 'ignflags' set, when no reference could be acquired, or when dropping our
 * temporary reference released the inpcb.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: the lock is available without sleeping. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: take a reference so that the inpcb pointer stays
	 * usable after leaving SMR, then block on the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		/* in_pcbrele() returning true means the inpcb is gone. */
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock for sure didn't go
		 * through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}

/*
 * Public wrapper around _inp_smr_lock() that ignores both freed and
 * dropped inpcbs.
 */
bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	/*
	 * in_pcblookup() family of functions ignore not only freed entries,
	 * that may be found due to lockless access to the hash, but dropped
	 * entries, too.
	 */
	return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
}

/*
 * inp_next() - inpcb hash/list traversal iterator
 *
 * Requires initialized struct inpcb_iterator for context.
 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
 *
 * - Iterator can have either write-lock or read-lock semantics, that can not
 *   be changed later.
 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
 *   a single hash slot.  Note: only rip_input() does the latter.
 * - Iterator may have optional bool matching function.  The matching function
 *   will be executed for each inpcb in the SMR context, so it can not acquire
 *   locks and can safely access only immutable fields of inpcb.
 *
 * A fresh initialized iterator has NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic.  In all following calls the context pointer
 * shall hold the current inpcb pointer.  The KPI user is not supposed to
 * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 * and write NULL to its context.  After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list. This is violated by in_pcbrehash().
 */
/* First element of either the all-pcbs list or the given exact-hash slot. */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
/* Successor of inp on the list selected by hash. */
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash_exact))
/* Assert that inp is locked in the mode the iterator was set up with. */
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * _inp_smr_lock() has left the SMR section,
				 * so re-enter it and start over from the
				 * current head of the list.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		/*
		 * On success _inp_smr_lock() has already exited SMR; exit
		 * here only when the list was exhausted.
		 */
		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	/* (Re)start from the inpcb returned last, which is still locked. */
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		/* End of list: 'found' stores NULL into the iterator. */
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See the comment in _inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Unlock the previous inpcb and make the new one (or NULL) current. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}

/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
 *
 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 */
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	old = refcount_acquire(&inp->inp_refcount);
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}

/*
 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 * freeing the pcb, if the reference was very last.
 *
 * Returns false if other references remain; the inpcb then stays read
 * locked.  Returns true if this was the last reference; in that case the
 * credential is released, the inpcb is unlocked and handed to
 * uma_zfree_smr(), and the caller must not touch it again.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* The last reference may only go away after in_pcbfree(). */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Write-locked counterpart of in_pcbrele_rlocked(), with the same return
 * value semantics.
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* The last reference may only go away after in_pcbfree(). */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Dispatch to in_pcbrele_rlocked() for INPLOOKUP_RLOCKPCB and to
 * in_pcbrele_wlocked() otherwise.
 */
bool
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
}

/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket.  If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 * Almost all work, including removal from global lists, is done in this
 * context, where the pcbinfo lock is held.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	/* Unlink from the global list and update the pcbinfo counters. */
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Mark as freed and break the link with the socket in both directions. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Saved here; freed below, after the inpcb reference is dropped. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/*
	 * in_pcbrele_wlocked() unlocks the inpcb itself when it drops the
	 * last reference; otherwise the unlock has to be done here.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}

/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock.  But they all are disposed the same.
 *
 * 'size' is not used.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	INP_LOCK_DESTROY(inp);
}

/*
 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 * port reservation, and preventing it from being returned by inpcb lookups.
1897 * 1898 * It is used by TCP to mark an inpcb as unused and avoid future packet 1899 * delivery or event notification when a socket remains open but TCP has 1900 * closed. This might occur as a result of a shutdown()-initiated TCP close 1901 * or a RST on the wire, and allows the port binding to be reused while still 1902 * maintaining the invariant that so_pcb always points to a valid inpcb until 1903 * in_pcbdetach(). 1904 * 1905 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by 1906 * in_pcbpurgeif0()? 1907 */ 1908 void 1909 in_pcbdrop(struct inpcb *inp) 1910 { 1911 1912 INP_WLOCK_ASSERT(inp); 1913 1914 inp->inp_flags |= INP_DROPPED; 1915 if (inp->inp_flags & INP_INHASHLIST) 1916 in_pcbremhash(inp); 1917 } 1918 1919 #ifdef INET 1920 /* 1921 * Common routines to return the socket addresses associated with inpcbs. 1922 */ 1923 int 1924 in_getsockaddr(struct socket *so, struct sockaddr *sa) 1925 { 1926 struct inpcb *inp; 1927 1928 inp = sotoinpcb(so); 1929 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1930 1931 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 1932 .sin_len = sizeof(struct sockaddr_in), 1933 .sin_family = AF_INET, 1934 .sin_port = inp->inp_lport, 1935 .sin_addr = inp->inp_laddr, 1936 }; 1937 1938 return (0); 1939 } 1940 1941 int 1942 in_getpeeraddr(struct socket *so, struct sockaddr *sa) 1943 { 1944 struct inpcb *inp; 1945 1946 inp = sotoinpcb(so); 1947 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1948 1949 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 1950 .sin_len = sizeof(struct sockaddr_in), 1951 .sin_family = AF_INET, 1952 .sin_port = inp->inp_fport, 1953 .sin_addr = inp->inp_faddr, 1954 }; 1955 1956 return (0); 1957 } 1958 1959 static bool 1960 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 1961 { 1962 1963 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) 1964 return (true); 1965 else 1966 return (false); 1967 } 1968 1969 void 1970 in_pcbpurgeif0(struct inpcbinfo 
*pcbinfo, struct ifnet *ifp) 1971 { 1972 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 1973 inp_v4_multi_match, NULL); 1974 struct inpcb *inp; 1975 struct in_multi *inm; 1976 struct in_mfilter *imf; 1977 struct ip_moptions *imo; 1978 1979 IN_MULTI_LOCK_ASSERT(); 1980 1981 while ((inp = inp_next(&inpi)) != NULL) { 1982 INP_WLOCK_ASSERT(inp); 1983 1984 imo = inp->inp_moptions; 1985 /* 1986 * Unselect the outgoing interface if it is being 1987 * detached. 1988 */ 1989 if (imo->imo_multicast_ifp == ifp) 1990 imo->imo_multicast_ifp = NULL; 1991 1992 /* 1993 * Drop multicast group membership if we joined 1994 * through the interface being detached. 1995 * 1996 * XXX This can all be deferred to an epoch_call 1997 */ 1998 restart: 1999 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 2000 if ((inm = imf->imf_inm) == NULL) 2001 continue; 2002 if (inm->inm_ifp != ifp) 2003 continue; 2004 ip_mfilter_remove(&imo->imo_head, imf); 2005 in_leavegroup_locked(inm, NULL); 2006 ip_mfilter_free(imf); 2007 goto restart; 2008 } 2009 } 2010 } 2011 2012 /* 2013 * Lookup a PCB based on the local address and port. Caller must hold the 2014 * hash lock. No inpcb locks or references are acquired. 
2015 */ 2016 #define INP_LOOKUP_MAPPED_PCB_COST 3 2017 struct inpcb * 2018 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2019 u_short lport, int fib, int lookupflags, struct ucred *cred) 2020 { 2021 struct inpcb *inp; 2022 #ifdef INET6 2023 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 2024 #else 2025 int matchwild = 3; 2026 #endif 2027 int wildcard; 2028 2029 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2030 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2031 KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs), 2032 ("%s: invalid fib %d", __func__, fib)); 2033 2034 INP_HASH_LOCK_ASSERT(pcbinfo); 2035 2036 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { 2037 struct inpcbhead *head; 2038 /* 2039 * Look for an unconnected (wildcard foreign addr) PCB that 2040 * matches the local address and port we're looking for. 2041 */ 2042 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2043 pcbinfo->ipi_hashmask)]; 2044 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2045 #ifdef INET6 2046 /* XXX inp locking */ 2047 if ((inp->inp_vflag & INP_IPV4) == 0) 2048 continue; 2049 #endif 2050 if (inp->inp_faddr.s_addr == INADDR_ANY && 2051 inp->inp_laddr.s_addr == laddr.s_addr && 2052 inp->inp_lport == lport && (fib == RT_ALL_FIBS || 2053 inp->inp_inc.inc_fibnum == fib)) { 2054 /* 2055 * Found? 2056 */ 2057 if (prison_equal_ip4(cred->cr_prison, 2058 inp->inp_cred->cr_prison)) 2059 return (inp); 2060 } 2061 } 2062 /* 2063 * Not found. 2064 */ 2065 return (NULL); 2066 } else { 2067 struct inpcbporthead *porthash; 2068 struct inpcbport *phd; 2069 struct inpcb *match = NULL; 2070 /* 2071 * Best fit PCB lookup. 2072 * 2073 * First see if this local port is in use by looking on the 2074 * port hash list. 
2075 */ 2076 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 2077 pcbinfo->ipi_porthashmask)]; 2078 CK_LIST_FOREACH(phd, porthash, phd_hash) { 2079 if (phd->phd_port == lport) 2080 break; 2081 } 2082 if (phd != NULL) { 2083 /* 2084 * Port is in use by one or more PCBs. Look for best 2085 * fit. 2086 */ 2087 CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 2088 wildcard = 0; 2089 if (!prison_equal_ip4(inp->inp_cred->cr_prison, 2090 cred->cr_prison)) 2091 continue; 2092 if (fib != RT_ALL_FIBS && 2093 inp->inp_inc.inc_fibnum != fib) 2094 continue; 2095 #ifdef INET6 2096 /* XXX inp locking */ 2097 if ((inp->inp_vflag & INP_IPV4) == 0) 2098 continue; 2099 /* 2100 * We never select the PCB that has 2101 * INP_IPV6 flag and is bound to :: if 2102 * we have another PCB which is bound 2103 * to 0.0.0.0. If a PCB has the 2104 * INP_IPV6 flag, then we set its cost 2105 * higher than IPv4 only PCBs. 2106 * 2107 * Note that the case only happens 2108 * when a socket is bound to ::, under 2109 * the condition that the use of the 2110 * mapped address is allowed. 
				 */
				if ((inp->inp_vflag & INP_IPV6) != 0)
					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					if (matchwild == 0)
						break;
				}
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST

/*
 * Return true when the load-balance group is acceptable for the requested
 * NUMA domain and FIB.  M_NODOM and RT_ALL_FIBS act as "match anything".
 */
static bool
in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
{
	return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
	    (fib == RT_ALL_FIBS || fib == grp->il_fibnum));
}

/*
 * Find the best IPv4 load-balance group for the given local address and port
 * and pick one of its member PCBs by hashing (faddr, lport, fport) modulo the
 * group's current member count.
 *
 * Returns NULL when no group qualifies or when the selected group currently
 * has no members.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		/*
		 * Within each category the first candidate is remembered, and
		 * a later one replaces it only if it also matches the
		 * requested NUMA domain and FIB.
		 */
		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_match(grp, domain, fib))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_match(grp, domain, fib))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_wild = grp;
			}
		}
	}

	/* Pick the highest-ranking category that produced a candidate. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}

/*
 * Return true when the PCB is an IPv4 PCB whose full 4-tuple equals the
 * given foreign/local address and port.
 */
static bool
in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (false);
#endif
	if (inp->inp_faddr.s_addr == faddr.s_addr &&
	    inp->inp_laddr.s_addr == laddr.s_addr &&
	    inp->inp_fport == fport &&
	    inp->inp_lport == lport)
		return (true);
	return (false);
}

/*
 * Walk the exact-match hash chain for the 4-tuple and return the first PCB
 * that matches it, or NULL.  The returned PCB is not locked here.
 */
static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}

/* Result of matching a PCB against a wildcard (listening) lookup. */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,	/* PCB does not match */
	INPLOOKUP_MATCH_WILD = 1,	/* PCB's local address is INADDR_ANY */
	INPLOOKUP_MATCH_LADDR = 2,	/* PCB's local address equals laddr */
} inp_lookup_match_t;

/*
 * Classify how an unconnected IPv4 PCB matches the local address, port and
 * FIB.  PCBs with a foreign address set, a different local port, or a
 * different FIB (unless fib is RT_ALL_FIBS) never match.
 */
static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
    u_short lport, int fib)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (INPLOOKUP_MATCH_NONE);
#endif
	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
		return (INPLOOKUP_MATCH_NONE);
	if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
		return (INPLOOKUP_MATCH_NONE);
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		return (INPLOOKUP_MATCH_WILD);
	if (inp->inp_laddr.s_addr == laddr.s_addr)
		return (INPLOOKUP_MATCH_LADDR);
	return (INPLOOKUP_MATCH_NONE);
}

/* Sentinel return value: the caller must redo the lookup serialized. */
#define	INP_LOOKUP_AGAIN	((struct inpcb *)(uintptr_t)-1)

/*
 * Wildcard lookup performed inside an SMR read section.  The first PCB on
 * the chain that matches is locked and re-checked; on success it is returned
 * locked according to lockflags.  If that PCB cannot be locked or no longer
 * qualifies, INP_LOOKUP_AGAIN is returned.  NULL means no PCB matched.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Re-check now that the PCB is locked. */
			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us. Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}

/*
 * Wildcard lookup with the hash lock held.  Scans the whole chain and
 * returns the best candidate per the ordering documented below, or NULL.
 * The returned PCB is not locked here.
 */
static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 * 1. jailed, non-wild.
	 * 2. jailed, wild.
	 * 3. non-jailed, non-wild.
	 * 4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/*
			 * Once a non-jailed exact match is recorded, further
			 * non-jailed PCBs cannot improve the result.
			 */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			if (injail)
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
				if (injail)
					jail_wild = inp;
				else
					local_wild = inp;
		}
	}
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}

/*
 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes
 * that the caller has either locked the hash list, which usually happens
 * for bind(2) operations, or is in SMR section, which happens when sorting
 * out incoming packets.
 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(faddr.s_addr != INADDR_ANY,
	    ("%s: invalid foreign address", __func__));
	KASSERT(laddr.s_addr != INADDR_ANY,
	    ("%s: invalid local address", __func__));
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* A fully-specified connection always wins. */
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL)
		return (inp);

	/* Otherwise try load-balance groups first, then plain listeners. */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp == NULL) {
			inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr,
			    lport, fib);
		}
	}

	return (inp);
}

/*
 * Serialized lookup: take the hash write lock, run the locked lookup, and
 * return the PCB locked as requested by the lock bits in lookupflags.
 *
 * If the PCB lock cannot be taken without blocking, a reference is held
 * while the hash lock is dropped and the PCB lock is acquired; NULL is
 * returned if in_pcbrele() then reports true.
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}

/*
 * Lookup that starts in an SMR read section.  A candidate found there is
 * locked and then re-validated; whenever locking or re-validation fails the
 * lookup is redone through in_pcblookup_hash().  Returns the PCB locked per
 * lookupflags, or NULL when nothing matches.
 */
static struct inpcb *
in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us. Fall back to a precise search.
		 */
		return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
		    lookupflags, numa_domain, fib));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp != NULL) {
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
			    fib, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
			    lport, lookupflags, numa_domain, fib));
		}
	}

	/* Nothing was found (and thus nothing locked): leave the section. */
	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}

/*
 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
 * from which a pre-calculated hash value may be extracted.
 */
struct inpcb *
in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
    struct in_addr laddr, u_int lport, int lookupflags,
    struct ifnet *ifp)
{
	int fib;

	/* With INPLOOKUP_FIB the lookup is restricted to the interface's FIB. */
	fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, M_NODOM, fib));
}

/*
 * Same as in_pcblookup(), but the FIB (when INPLOOKUP_FIB is set) and the
 * NUMA domain are taken from the mbuf's packet header; ifp is unused.
 */
struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    struct ifnet *ifp __unused, struct mbuf *m)
{
	int fib;

	M_ASSERTPKTHDR(m);
	fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, m->m_pkthdr.numa_domain, fib));
}
#endif /* INET */

/*
 * Report whether the PCB's credential has the given prison flag set.
 */
static bool
in_pcbjailed(const struct inpcb *inp, unsigned int flag)
{
	return (prison_flag(inp->inp_cred, flag) != 0);
}

/*
 * Insert the PCB into a hash chain using ordering rules which ensure that
 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
 *
 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs
 * always appear last no matter whether they are jailed.
 */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		/* Unbound v4-mapped v6 PCB: append at the very end. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/* Skip past the jailed PCBs at the front of the chain. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* First jailed PCB on a chain of non-jailed ones. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Continue from "last" past PCBs with a specific address. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}

#ifdef INET6
/*
 * See the comment above _in_pcbinshash_wild().
 */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Skip past the jailed PCBs at the front of the chain. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Continue from "last" past PCBs with a specific address. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
#endif

/*
 * Insert PCB onto various hash lists.
 */
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
	    ("in_pcbinshash: INP_INHASHLIST"));

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/* PCBs with a foreign address go on the exact table, others on wild. */
	if (connected)
		pcbhash = &pcbinfo->ipi_hash_exact[hash];
	else
		pcbhash = &pcbinfo->ipi_hash_wild[hash];

	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Add entry to load balance group.
	 * Only do this if SO_REUSEPORT_LB is set.
	 */
	if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) {
		int error = in_pcbinslbgrouphash(inp, M_NODOM);
		if (error != 0)
			return (error);
	}

	/*
	 * Go through port list and look for a head for this lport.
	 */
	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport)
			break;
	}

	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
		if (phd == NULL) {
			/* Undo the LB group insertion done above. */
			if ((inp->inp_flags & INP_INLBGROUP) != 0)
				in_pcbremlbgrouphash(inp);
			return (ENOMEM);
		}
		phd->phd_port = inp->inp_lport;
		CK_LIST_INIT(&phd->phd_pcblist);
		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}
	inp->inp_phd = phd;
	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);

	/*
	 * The PCB may have been disconnected in the past. Before we can safely
	 * make it visible in the hash table, we must wait for all readers which
	 * may be traversing this PCB to finish.
	 */
	if (inp->inp_smr != SMR_SEQ_INVALID) {
		smr_wait(pcbinfo->ipi_smr, inp->inp_smr);
		inp->inp_smr = SMR_SEQ_INVALID;
	}

	if (connected)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact);
	else {
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV6) != 0)
			_in6_pcbinshash_wild(pcbhash, inp);
		else
#endif
			_in_pcbinshash_wild(pcbhash, inp);
	}
	inp->inp_flags |= INP_INHASHLIST;

	return (0);
}

/*
 * Remove the PCB from its load-balance group (if it is in one), from the
 * exact or wild hash chain it is on, and from its per-port PCB list.  The
 * port head is unlinked and freed once its PCB list becomes empty.
 * Requires the PCB write lock and the hash write lock.
 */
void
in_pcbremhash_locked(struct inpcb *inp)
{
	struct inpcbport *phd = inp->inp_phd;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	MPASS(inp->inp_flags & INP_INHASHLIST);

	if ((inp->inp_flags & INP_INLBGROUP) != 0)
		in_pcbremlbgrouphash(inp);
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	} else
#endif
	{
		if (in_nullhost(inp->inp_faddr))
			CK_LIST_REMOVE(inp, inp_hash_wild);
		else
			CK_LIST_REMOVE(inp, inp_hash_exact);
	}
	CK_LIST_REMOVE(inp, inp_portlist);
	if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
		CK_LIST_REMOVE(phd, phd_hash);
		uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd);
	}
	inp->inp_flags &= ~INP_INHASHLIST;
}

/*
 * Convenience wrapper: take the hash write lock around
 * in_pcbremhash_locked().
 */
static void
in_pcbremhash(struct inpcb *inp)
{
	INP_HASH_WLOCK(inp->inp_pcbinfo);
	in_pcbremhash_locked(inp);
	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
}

/*
 * Move PCB to the proper hash bucket when { faddr, fport } have been
 * changed. NOTE: This does not handle the case of the lport changing (the
 * hashed port list would have to be updated as well), so the lport must
 * not change after in_pcbinshash() has been called.
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	uint32_t hash;
	bool connected;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);
	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("%s: !INP_INHASHLIST", __func__));
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp was disconnected", __func__));

#ifdef INET6
	if (inp->inp_vflag & INP_IPV6) {
		hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr);
	} else
#endif
	{
		hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport,
		    inp->inp_fport, pcbinfo->ipi_hashmask);
		connected = !in_nullhost(inp->inp_faddr);
	}

	/*
	 * When rehashing, the caller must ensure that either the new or the old
	 * foreign address was unspecified.
	 */
	/* Hence the PCB is removed from the table opposite to its new state. */
	if (connected)
		CK_LIST_REMOVE(inp, inp_hash_wild);
	else
		CK_LIST_REMOVE(inp, inp_hash_exact);

	if (connected) {
		head = &pcbinfo->ipi_hash_exact[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact);
	} else {
		head = &pcbinfo->ipi_hash_wild[hash];
		CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild);
	}
}

/*
 * Check for alternatives when higher level complains
 * about service problems. For now, invalidate cached
 * routing information. If the route was created dynamically
 * (by a redirect), time to try a default gateway again.
 */
void
in_losing(struct inpcb *inp)
{

	RO_INVALIDATE_CACHE(&inp->inp_route);
	return;
}

/*
 * A set label operation has occurred at the socket layer, propagate the
 * label change into the in_pcb for the socket.
 */
void
in_pcbsosetlabel(struct socket *so)
{
#ifdef MAC
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL"));

	INP_WLOCK(inp);
	SOCK_LOCK(so);
	mac_inpcb_sosetlabel(so, inp);
	SOCK_UNLOCK(so);
	INP_WUNLOCK(inp);
#endif
}

/*
 * Function wrappers around the INP_* locking macros.
 */
void
inp_wlock(struct inpcb *inp)
{

	INP_WLOCK(inp);
}

void
inp_wunlock(struct inpcb *inp)
{

	INP_WUNLOCK(inp);
}

void
inp_rlock(struct inpcb *inp)
{

	INP_RLOCK(inp);
}

void
inp_runlock(struct inpcb *inp)
{

	INP_RUNLOCK(inp);
}

#ifdef INVARIANT_SUPPORT
/* Assert that the PCB is write-locked. */
void
inp_lock_assert(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
}

/* Assert that the PCB is not locked. */
void
inp_unlock_assert(struct inpcb *inp)
{

	INP_UNLOCK_ASSERT(inp);
}
#endif

/*
 * Call func(inp, arg) on every PCB produced by an INP_ALL_ITERATOR over
 * pcbinfo; the iterator is set up with INPLOOKUP_WLOCKPCB.
 */
void
inp_apply_all(struct inpcbinfo *pcbinfo,
    void (*func)(struct inpcb *, void *), void *arg)
{
	struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo,
	    INPLOOKUP_WLOCKPCB);
	struct inpcb *inp;

	while ((inp = inp_next(&inpi)) != NULL)
		func(inp, arg);
}

/*
 * Return the socket attached to the PCB.  The PCB must be write-locked.
 */
struct socket *
inp_inpcbtosocket(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	return (inp->inp_socket);
}

/*
 * Copy the PCB's IPv4 local/foreign addresses and ports, exactly as stored
 * in the PCB, into the caller's variables.  The PCB must be locked.
 */
void
inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
    uint32_t *faddr, uint16_t *fp)
{

	INP_LOCK_ASSERT(inp);
	*laddr = inp->inp_laddr.s_addr;
	*faddr = inp->inp_faddr.s_addr;
	*lp = inp->inp_lport;
	*fp = inp->inp_fport;
}

/*
 * Create an external-format (``xinpcb'') structure using the information in
 * the kernel-format in_pcb structure pointed to by inp. This is done to
 * reduce the spew of irrelevant information over this interface, to isolate
 * user code from changes in the kernel structure, and potentially to provide
 * information-hiding if we decide that some of this information should be
 * hidden from users.
2985 */ 2986 void 2987 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2988 { 2989 2990 bzero(xi, sizeof(*xi)); 2991 xi->xi_len = sizeof(struct xinpcb); 2992 if (inp->inp_socket) 2993 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2994 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2995 xi->inp_gencnt = inp->inp_gencnt; 2996 xi->inp_flow = inp->inp_flow; 2997 xi->inp_flowid = inp->inp_flowid; 2998 xi->inp_flowtype = inp->inp_flowtype; 2999 xi->inp_flags = inp->inp_flags; 3000 xi->inp_flags2 = inp->inp_flags2; 3001 xi->in6p_cksum = inp->in6p_cksum; 3002 xi->in6p_hops = inp->in6p_hops; 3003 xi->inp_ip_tos = inp->inp_ip_tos; 3004 xi->inp_vflag = inp->inp_vflag; 3005 xi->inp_ip_ttl = inp->inp_ip_ttl; 3006 xi->inp_ip_p = inp->inp_ip_p; 3007 xi->inp_ip_minttl = inp->inp_ip_minttl; 3008 } 3009 3010 int 3011 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 3012 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 3013 { 3014 struct sockopt sopt; 3015 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 3016 INPLOOKUP_WLOCKPCB); 3017 struct inpcb *inp; 3018 struct sockopt_parameters *params; 3019 struct socket *so; 3020 int error; 3021 char buf[1024]; 3022 3023 if (req->oldptr != NULL || req->oldlen != 0) 3024 return (EINVAL); 3025 if (req->newptr == NULL) 3026 return (EPERM); 3027 if (req->newlen > sizeof(buf)) 3028 return (ENOMEM); 3029 error = SYSCTL_IN(req, buf, req->newlen); 3030 if (error != 0) 3031 return (error); 3032 if (req->newlen < sizeof(struct sockopt_parameters)) 3033 return (EINVAL); 3034 params = (struct sockopt_parameters *)buf; 3035 sopt.sopt_level = params->sop_level; 3036 sopt.sopt_name = params->sop_optname; 3037 sopt.sopt_dir = SOPT_SET; 3038 sopt.sopt_val = params->sop_optval; 3039 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 3040 sopt.sopt_td = NULL; 3041 #ifdef INET6 3042 if (params->sop_inc.inc_flags & INC_ISIPV6) { 3043 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 
3044 params->sop_inc.inc6_laddr.s6_addr16[1] = 3045 htons(params->sop_inc.inc6_zoneid & 0xffff); 3046 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 3047 params->sop_inc.inc6_faddr.s6_addr16[1] = 3048 htons(params->sop_inc.inc6_zoneid & 0xffff); 3049 } 3050 #endif 3051 if (params->sop_inc.inc_lport != htons(0) && 3052 params->sop_inc.inc_fport != htons(0)) { 3053 #ifdef INET6 3054 if (params->sop_inc.inc_flags & INC_ISIPV6) 3055 inpi.hash = INP6_PCBHASH( 3056 ¶ms->sop_inc.inc6_faddr, 3057 params->sop_inc.inc_lport, 3058 params->sop_inc.inc_fport, 3059 pcbinfo->ipi_hashmask); 3060 else 3061 #endif 3062 inpi.hash = INP_PCBHASH( 3063 ¶ms->sop_inc.inc_faddr, 3064 params->sop_inc.inc_lport, 3065 params->sop_inc.inc_fport, 3066 pcbinfo->ipi_hashmask); 3067 } 3068 while ((inp = inp_next(&inpi)) != NULL) 3069 if (inp->inp_gencnt == params->sop_id) { 3070 if (inp->inp_flags & INP_DROPPED) { 3071 INP_WUNLOCK(inp); 3072 return (ECONNRESET); 3073 } 3074 so = inp->inp_socket; 3075 KASSERT(so != NULL, ("inp_socket == NULL")); 3076 soref(so); 3077 if (params->sop_level == SOL_SOCKET) { 3078 INP_WUNLOCK(inp); 3079 error = sosetopt(so, &sopt); 3080 } else 3081 error = (*ctloutput_set)(inp, &sopt); 3082 sorele(so); 3083 break; 3084 } 3085 if (inp == NULL) 3086 error = ESRCH; 3087 return (error); 3088 } 3089 3090 #ifdef DDB 3091 static void 3092 db_print_indent(int indent) 3093 { 3094 int i; 3095 3096 for (i = 0; i < indent; i++) 3097 db_printf(" "); 3098 } 3099 3100 static void 3101 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 3102 { 3103 char faddr_str[48], laddr_str[48]; 3104 3105 db_print_indent(indent); 3106 db_printf("%s at %p\n", name, inc); 3107 3108 indent += 2; 3109 3110 #ifdef INET6 3111 if (inc->inc_flags & INC_ISIPV6) { 3112 /* IPv6. */ 3113 ip6_sprintf(laddr_str, &inc->inc6_laddr); 3114 ip6_sprintf(faddr_str, &inc->inc6_faddr); 3115 } else 3116 #endif 3117 { 3118 /* IPv4. 
*/ 3119 inet_ntoa_r(inc->inc_laddr, laddr_str); 3120 inet_ntoa_r(inc->inc_faddr, faddr_str); 3121 } 3122 db_print_indent(indent); 3123 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 3124 ntohs(inc->inc_lport)); 3125 db_print_indent(indent); 3126 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 3127 ntohs(inc->inc_fport)); 3128 } 3129 3130 static void 3131 db_print_inpflags(int inp_flags) 3132 { 3133 int comma; 3134 3135 comma = 0; 3136 if (inp_flags & INP_RECVOPTS) { 3137 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 3138 comma = 1; 3139 } 3140 if (inp_flags & INP_RECVRETOPTS) { 3141 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 3142 comma = 1; 3143 } 3144 if (inp_flags & INP_RECVDSTADDR) { 3145 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 3146 comma = 1; 3147 } 3148 if (inp_flags & INP_ORIGDSTADDR) { 3149 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 3150 comma = 1; 3151 } 3152 if (inp_flags & INP_HDRINCL) { 3153 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 3154 comma = 1; 3155 } 3156 if (inp_flags & INP_HIGHPORT) { 3157 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 3158 comma = 1; 3159 } 3160 if (inp_flags & INP_LOWPORT) { 3161 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 3162 comma = 1; 3163 } 3164 if (inp_flags & INP_ANONPORT) { 3165 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 3166 comma = 1; 3167 } 3168 if (inp_flags & INP_RECVIF) { 3169 db_printf("%sINP_RECVIF", comma ? ", " : ""); 3170 comma = 1; 3171 } 3172 if (inp_flags & INP_MTUDISC) { 3173 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 3174 comma = 1; 3175 } 3176 if (inp_flags & INP_RECVTTL) { 3177 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 3178 comma = 1; 3179 } 3180 if (inp_flags & INP_DONTFRAG) { 3181 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 3182 comma = 1; 3183 } 3184 if (inp_flags & INP_RECVTOS) { 3185 db_printf("%sINP_RECVTOS", comma ? 
", " : ""); 3186 comma = 1; 3187 } 3188 if (inp_flags & IN6P_IPV6_V6ONLY) { 3189 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 3190 comma = 1; 3191 } 3192 if (inp_flags & IN6P_PKTINFO) { 3193 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 3194 comma = 1; 3195 } 3196 if (inp_flags & IN6P_HOPLIMIT) { 3197 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 3198 comma = 1; 3199 } 3200 if (inp_flags & IN6P_HOPOPTS) { 3201 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 3202 comma = 1; 3203 } 3204 if (inp_flags & IN6P_DSTOPTS) { 3205 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 3206 comma = 1; 3207 } 3208 if (inp_flags & IN6P_RTHDR) { 3209 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 3210 comma = 1; 3211 } 3212 if (inp_flags & IN6P_RTHDRDSTOPTS) { 3213 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 3214 comma = 1; 3215 } 3216 if (inp_flags & IN6P_TCLASS) { 3217 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 3218 comma = 1; 3219 } 3220 if (inp_flags & IN6P_AUTOFLOWLABEL) { 3221 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 3222 comma = 1; 3223 } 3224 if (inp_flags & INP_ONESBCAST) { 3225 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 3226 comma = 1; 3227 } 3228 if (inp_flags & INP_DROPPED) { 3229 db_printf("%sINP_DROPPED", comma ? ", " : ""); 3230 comma = 1; 3231 } 3232 if (inp_flags & INP_SOCKREF) { 3233 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 3234 comma = 1; 3235 } 3236 if (inp_flags & IN6P_RFC2292) { 3237 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 3238 comma = 1; 3239 } 3240 if (inp_flags & IN6P_MTU) { 3241 db_printf("IN6P_MTU%s", comma ? ", " : ""); 3242 comma = 1; 3243 } 3244 } 3245 3246 static void 3247 db_print_inpvflag(u_char inp_vflag) 3248 { 3249 int comma; 3250 3251 comma = 0; 3252 if (inp_vflag & INP_IPV4) { 3253 db_printf("%sINP_IPV4", comma ? ", " : ""); 3254 comma = 1; 3255 } 3256 if (inp_vflag & INP_IPV6) { 3257 db_printf("%sINP_IPV6", comma ? 
", " : ""); 3258 comma = 1; 3259 } 3260 if (inp_vflag & INP_IPV6PROTO) { 3261 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 3262 comma = 1; 3263 } 3264 } 3265 3266 static void 3267 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 3268 { 3269 3270 db_print_indent(indent); 3271 db_printf("%s at %p\n", name, inp); 3272 3273 indent += 2; 3274 3275 db_print_indent(indent); 3276 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 3277 3278 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 3279 3280 db_print_indent(indent); 3281 db_printf("inp_label: %p inp_flags: 0x%x (", 3282 inp->inp_label, inp->inp_flags); 3283 db_print_inpflags(inp->inp_flags); 3284 db_printf(")\n"); 3285 3286 db_print_indent(indent); 3287 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 3288 inp->inp_vflag); 3289 db_print_inpvflag(inp->inp_vflag); 3290 db_printf(")\n"); 3291 3292 db_print_indent(indent); 3293 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3294 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3295 3296 db_print_indent(indent); 3297 #ifdef INET6 3298 if (inp->inp_vflag & INP_IPV6) { 3299 db_printf("in6p_options: %p in6p_outputopts: %p " 3300 "in6p_moptions: %p\n", inp->in6p_options, 3301 inp->in6p_outputopts, inp->in6p_moptions); 3302 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3303 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3304 inp->in6p_hops); 3305 } else 3306 #endif 3307 { 3308 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3309 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3310 inp->inp_options, inp->inp_moptions); 3311 } 3312 3313 db_print_indent(indent); 3314 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3315 (uintmax_t)inp->inp_gencnt); 3316 } 3317 3318 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3319 { 3320 struct inpcb *inp; 3321 3322 if (!have_addr) { 3323 db_printf("usage: show inpcb <addr>\n"); 3324 return; 3325 } 3326 inp = (struct inpcb *)addr; 3327 3328 db_print_inpcb(inp, "inpcb", 0); 3329 } 3330 #endif /* 
DDB */ 3331 3332 #ifdef RATELIMIT 3333 /* 3334 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3335 * if any. 3336 */ 3337 int 3338 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3339 { 3340 union if_snd_tag_modify_params params = { 3341 .rate_limit.max_rate = max_pacing_rate, 3342 .rate_limit.flags = M_NOWAIT, 3343 }; 3344 struct m_snd_tag *mst; 3345 int error; 3346 3347 mst = inp->inp_snd_tag; 3348 if (mst == NULL) 3349 return (EINVAL); 3350 3351 if (mst->sw->snd_tag_modify == NULL) { 3352 error = EOPNOTSUPP; 3353 } else { 3354 error = mst->sw->snd_tag_modify(mst, ¶ms); 3355 } 3356 return (error); 3357 } 3358 3359 /* 3360 * Query existing TX rate limit based on the existing 3361 * "inp->inp_snd_tag", if any. 3362 */ 3363 int 3364 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3365 { 3366 union if_snd_tag_query_params params = { }; 3367 struct m_snd_tag *mst; 3368 int error; 3369 3370 mst = inp->inp_snd_tag; 3371 if (mst == NULL) 3372 return (EINVAL); 3373 3374 if (mst->sw->snd_tag_query == NULL) { 3375 error = EOPNOTSUPP; 3376 } else { 3377 error = mst->sw->snd_tag_query(mst, ¶ms); 3378 if (error == 0 && p_max_pacing_rate != NULL) 3379 *p_max_pacing_rate = params.rate_limit.max_rate; 3380 } 3381 return (error); 3382 } 3383 3384 /* 3385 * Query existing TX queue level based on the existing 3386 * "inp->inp_snd_tag", if any. 
3387 */ 3388 int 3389 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3390 { 3391 union if_snd_tag_query_params params = { }; 3392 struct m_snd_tag *mst; 3393 int error; 3394 3395 mst = inp->inp_snd_tag; 3396 if (mst == NULL) 3397 return (EINVAL); 3398 3399 if (mst->sw->snd_tag_query == NULL) 3400 return (EOPNOTSUPP); 3401 3402 error = mst->sw->snd_tag_query(mst, ¶ms); 3403 if (error == 0 && p_txqueue_level != NULL) 3404 *p_txqueue_level = params.rate_limit.queue_level; 3405 return (error); 3406 } 3407 3408 /* 3409 * Allocate a new TX rate limit send tag from the network interface 3410 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3411 */ 3412 int 3413 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3414 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3415 3416 { 3417 union if_snd_tag_alloc_params params = { 3418 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 3419 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3420 .rate_limit.hdr.flowid = flowid, 3421 .rate_limit.hdr.flowtype = flowtype, 3422 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3423 .rate_limit.max_rate = max_pacing_rate, 3424 .rate_limit.flags = M_NOWAIT, 3425 }; 3426 int error; 3427 3428 INP_WLOCK_ASSERT(inp); 3429 3430 /* 3431 * If there is already a send tag, or the INP is being torn 3432 * down, allocating a new send tag is not allowed. Else send 3433 * tags may leak. 
3434 */ 3435 if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) 3436 return (EINVAL); 3437 3438 error = m_snd_tag_alloc(ifp, ¶ms, st); 3439 #ifdef INET 3440 if (error == 0) { 3441 counter_u64_add(rate_limit_set_ok, 1); 3442 counter_u64_add(rate_limit_active, 1); 3443 } else if (error != EOPNOTSUPP) 3444 counter_u64_add(rate_limit_alloc_fail, 1); 3445 #endif 3446 return (error); 3447 } 3448 3449 void 3450 in_pcbdetach_tag(struct m_snd_tag *mst) 3451 { 3452 3453 m_snd_tag_rele(mst); 3454 #ifdef INET 3455 counter_u64_add(rate_limit_active, -1); 3456 #endif 3457 } 3458 3459 /* 3460 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3461 * if any: 3462 */ 3463 void 3464 in_pcbdetach_txrtlmt(struct inpcb *inp) 3465 { 3466 struct m_snd_tag *mst; 3467 3468 INP_WLOCK_ASSERT(inp); 3469 3470 mst = inp->inp_snd_tag; 3471 inp->inp_snd_tag = NULL; 3472 3473 if (mst == NULL) 3474 return; 3475 3476 m_snd_tag_rele(mst); 3477 #ifdef INET 3478 counter_u64_add(rate_limit_active, -1); 3479 #endif 3480 } 3481 3482 int 3483 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3484 { 3485 int error; 3486 3487 /* 3488 * If the existing send tag is for the wrong interface due to 3489 * a route change, first drop the existing tag. Set the 3490 * CHANGED flag so that we will keep trying to allocate a new 3491 * tag if we fail to allocate one this time. 3492 */ 3493 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3494 in_pcbdetach_txrtlmt(inp); 3495 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3496 } 3497 3498 /* 3499 * NOTE: When attaching to a network interface a reference is 3500 * made to ensure the network interface doesn't go away until 3501 * all ratelimit connections are gone. The network interface 3502 * pointers compared below represent valid network interfaces, 3503 * except when comparing towards NULL. 
3504 */ 3505 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3506 error = 0; 3507 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3508 if (inp->inp_snd_tag != NULL) 3509 in_pcbdetach_txrtlmt(inp); 3510 error = 0; 3511 } else if (inp->inp_snd_tag == NULL) { 3512 /* 3513 * In order to utilize packet pacing with RSS, we need 3514 * to wait until there is a valid RSS hash before we 3515 * can proceed: 3516 */ 3517 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3518 error = EAGAIN; 3519 } else { 3520 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3521 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3522 } 3523 } else { 3524 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3525 } 3526 if (error == 0 || error == EOPNOTSUPP) 3527 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3528 3529 return (error); 3530 } 3531 3532 /* 3533 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3534 * is set in the fast path and will attach/detach/modify the TX rate 3535 * limit send tag based on the socket's so_max_pacing_rate value. 3536 */ 3537 void 3538 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3539 { 3540 struct socket *socket; 3541 uint32_t max_pacing_rate; 3542 bool did_upgrade; 3543 3544 if (inp == NULL) 3545 return; 3546 3547 socket = inp->inp_socket; 3548 if (socket == NULL) 3549 return; 3550 3551 if (!INP_WLOCKED(inp)) { 3552 /* 3553 * NOTE: If the write locking fails, we need to bail 3554 * out and use the non-ratelimited ring for the 3555 * transmit until there is a new chance to get the 3556 * write lock. 3557 */ 3558 if (!INP_TRY_UPGRADE(inp)) 3559 return; 3560 did_upgrade = 1; 3561 } else { 3562 did_upgrade = 0; 3563 } 3564 3565 /* 3566 * NOTE: The so_max_pacing_rate value is read unlocked, 3567 * because atomic updates are not required since the variable 3568 * is checked at every mbuf we send. It is assumed that the 3569 * variable read itself will be atomic. 
3570 */ 3571 max_pacing_rate = socket->so_max_pacing_rate; 3572 3573 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3574 3575 if (did_upgrade) 3576 INP_DOWNGRADE(inp); 3577 } 3578 3579 /* 3580 * Track route changes for TX rate limiting. 3581 */ 3582 void 3583 in_pcboutput_eagain(struct inpcb *inp) 3584 { 3585 bool did_upgrade; 3586 3587 if (inp == NULL) 3588 return; 3589 3590 if (inp->inp_snd_tag == NULL) 3591 return; 3592 3593 if (!INP_WLOCKED(inp)) { 3594 /* 3595 * NOTE: If the write locking fails, we need to bail 3596 * out and use the non-ratelimited ring for the 3597 * transmit until there is a new chance to get the 3598 * write lock. 3599 */ 3600 if (!INP_TRY_UPGRADE(inp)) 3601 return; 3602 did_upgrade = 1; 3603 } else { 3604 did_upgrade = 0; 3605 } 3606 3607 /* detach rate limiting */ 3608 in_pcbdetach_txrtlmt(inp); 3609 3610 /* make sure new mbuf send tag allocation is made */ 3611 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3612 3613 if (did_upgrade) 3614 INP_DOWNGRADE(inp); 3615 } 3616 3617 #ifdef INET 3618 static void 3619 rl_init(void *st) 3620 { 3621 rate_limit_new = counter_u64_alloc(M_WAITOK); 3622 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3623 rate_limit_active = counter_u64_alloc(M_WAITOK); 3624 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3625 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3626 } 3627 3628 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3629 #endif 3630 #endif /* RATELIMIT */ 3631